[mirror_qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
62
63 /***********************************************************/
64 /* ram save/restore */
65
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67  * worked for pages that were filled with the same char. We switched
68  * it to only search for the zero value, and renamed it to avoid
69  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
70 */
71
72 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO 0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE 0x08
76 #define RAM_SAVE_FLAG_EOS 0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE 0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
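/*
 * The page offsets put on the wire are target-page aligned, so their low
 * bits are free to carry the flags above; e.g. a zero page is announced
 * as save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_ZERO).
 */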
81
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
83 {
84 return buffer_is_zero(p, size);
85 }
86
87 XBZRLECacheStats xbzrle_counters;
88
89 /* This struct contains the XBZRLE cache and a static page
90    used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
104
105 static void XBZRLE_cache_lock(void)
106 {
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
109 }
110 }
111
112 static void XBZRLE_cache_unlock(void)
113 {
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
116 }
117 }
118
119 /**
120 * xbzrle_cache_resize: resize the xbzrle cache
121 *
122 * This function is called from migrate_params_apply in main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock.
126 *
127 * Returns 0 for success or -1 for error
128 *
129 * @new_size: new cache size
130 * @errp: set *errp if the check failed, with reason
131 */
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
133 {
134 PageCache *new_cache;
135 int64_t ret = 0;
136
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
142 }
143
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
147 }
148
149 XBZRLE_cache_lock();
150
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
156 }
157
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
160 }
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
164 }
165
166 bool ramblock_is_ignored(RAMBlock *block)
167 {
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
170 }
171
172 #undef RAMBLOCK_FOREACH
173
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
175 {
176 RAMBlock *block;
177 int ret = 0;
178
179 RCU_READ_LOCK_GUARD();
180
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
185 }
186 }
187 return ret;
188 }
189
190 static void ramblock_recv_map_init(void)
191 {
192 RAMBlock *rb;
193
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 }
198 }
199
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
201 {
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
204 }
205
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
207 {
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
209 }
210
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
212 {
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
218 {
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
222 }
223
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
225
226 /*
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
228 *
229 * Returns the number of bytes sent (>0) on success, or <0 on error.
230 */
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
233 {
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
237
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
241 }
242
243 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
244
245 /*
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see below
248 * comment). So extend it a bit beforehand.
249 */
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
251
252 /*
253 * Always use little endian when sending the bitmap. This is
254 * required when the source and destination VMs are not using the
255 * same endianness. (Note: big endian won't work.)
256 */
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
258
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
261
262 /*
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
267 */
268 size = ROUND_UP(size, 8);
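    /*
     * E.g. a block of 17 pages needs DIV_ROUND_UP(17, 8) = 3 bytes of
     * bitmap, which the ROUND_UP above pads to 8 bytes on the wire.
     */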
269
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
272 /*
273 * Mark as an end, in case the middle part is screwed up due to
274 * some "mysterious" reason.
275 */
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
278
279 g_free(le_bitmap);
280
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
283 }
284
285 return size + sizeof(size);
286 }
287
288 /*
289 * An outstanding page request, on the source, having been received
290 * and queued
291 */
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
296
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
298 };
299
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have had too many dirty pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
331
332 /* compression statistics since the beginning of the period */
333 /* number of times no free thread was available to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* number of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* number of compressed pages */
338 uint64_t compress_pages_prev;
339
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
353 };
354 typedef struct RAMState RAMState;
355
356 static RAMState *ram_state;
357
358 static NotifierWithReturnList precopy_notifier_list;
359
360 void precopy_infrastructure_init(void)
361 {
362 notifier_with_return_list_init(&precopy_notifier_list);
363 }
364
365 void precopy_add_notifier(NotifierWithReturn *n)
366 {
367 notifier_with_return_list_add(&precopy_notifier_list, n);
368 }
369
370 void precopy_remove_notifier(NotifierWithReturn *n)
371 {
372 notifier_with_return_remove(n);
373 }
374
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376 {
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
380
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 }
383
384 uint64_t ram_bytes_remaining(void)
385 {
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
387 0;
388 }
389
390 MigrationStats ram_counters;
391
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
400 };
401 typedef struct PageSearchStatus PageSearchStatus;
402
403 CompressionStats compression_counters;
404
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
414
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
418 };
419 typedef struct CompressParam CompressParam;
420
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
430 };
431 typedef struct DecompressParam DecompressParam;
432
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437  * comp_done_lock is used together with comp_done_cond.
438 */
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
443
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
449
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
452
453 static void *do_data_compress(void *opaque)
454 {
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
459
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
467
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
470
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
476
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
480 }
481 }
482 qemu_mutex_unlock(&param->mutex);
483
484 return NULL;
485 }
486
487 static void compress_threads_save_cleanup(void)
488 {
489 int i, thread_count;
490
491 if (!migrate_use_compression() || !comp_param) {
492 return;
493 }
494
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
497 /*
498 * we use it as an indicator which shows if the thread is
499 * properly init'd or not
500 */
501 if (!comp_param[i].file) {
502 break;
503 }
504
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
509
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
517 }
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
524 }
525
526 static int compress_threads_save_setup(void)
527 {
528 int i, thread_count;
529
530 if (!migrate_use_compression()) {
531 return 0;
532 }
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
542 }
543
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
548 }
549
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
552 */
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
561 }
562 return 0;
563
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
567 }
568
569 /**
570 * save_page_header: write page header to wire
571 *
572 * If the block differs from the last block sent, it also writes the block identification
573 *
574 * Returns the number of bytes written
575 *
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page
579 * in the lower bits, it contains flags
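 *
 * Wire layout: a be64 word holding (offset | flags); when the
 * RAM_SAVE_FLAG_CONTINUE flag is clear, it is followed by one byte with
 * the length of block->idstr and then the idstr bytes themselves.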
580 */
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
583 {
584 size_t size, len;
585
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
588 }
589 qemu_put_be64(f, offset);
590 size = 8;
591
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
598 }
599 return size;
600 }
601
602 /**
603 * mig_throttle_guest_down: throttle down the guest
604 *
605 * Reduce amount of guest cpu execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
610 */
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
613 {
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
619
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
622
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by the guest, which
632  * would make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
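            /*
             * E.g. with throttle_now = 20 the guest keeps cpu_now = 80; if
             * bytes_dirty_period is twice bytes_dirty_threshold, then
             * cpu_ideal = 40 and the throttle rises by at most 40, still
             * capped by pct_increment.
             */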
637 }
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
639 }
640 }
641
642 /**
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
644 *
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
647 *
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
653 */
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
655 {
656 if (!rs->xbzrle_enabled) {
657 return;
658 }
659
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
664 }
665
666 #define ENCODING_FLAG_XBZRLE 0x1
667
668 /**
669 * save_xbzrle_page: compress and send current page
670 *
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
674 *
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
681 */
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
685 {
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
688
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
700 }
701 }
702 return -1;
703 }
704
705 /*
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
709 *
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
715 */
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
718
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
721
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
726
727 /*
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
730 */
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
733 /*
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
737 */
738 *current_data = prev_cached_page;
739 }
740
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
749 }
750
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
758 /*
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
762 */
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
765
766 return 1;
767 }
768
769 /**
770 * migration_bitmap_find_dirty: find the next dirty page from start
771 *
772 * Returns the page offset within memory region of the start of a dirty page
773 *
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
777 */
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
781 {
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
784
785 if (ramblock_is_ignored(rb)) {
786 return size;
787 }
788
789 return find_next_bit(bitmap, size, start);
790 }
791
792 static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
795 {
796 uint8_t shift;
797 hwaddr size, start;
798
799 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
800 return;
801 }
802
803 shift = rb->clear_bmap_shift;
804 /*
805 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
806 * can make things easier sometimes since then the start address
807  * of the small chunk will always be aligned to 64 pages, so the
808  * bitmap will always be aligned to an unsigned long. We should
809 * even be able to remove this restriction but I'm simply
810 * keeping it.
811 */
812 assert(shift >= 6);
813
814 size = 1ULL << (TARGET_PAGE_BITS + shift);
815 start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
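    /*
     * E.g. with shift = 18 and 4 KiB target pages, each clear chunk
     * covers 1 GiB and start is rounded down to that boundary.
     */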
816 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
817 memory_region_clear_dirty_bitmap(rb->mr, start, size);
818 }
819
820 static void
821 migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
822 RAMBlock *rb,
823 unsigned long start,
824 unsigned long npages)
825 {
826 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
827 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
828 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
829
830 /*
831 * Clear pages from start to start + npages - 1, so the end boundary is
832 * exclusive.
833 */
834 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
835 migration_clear_memory_region_dirty_bitmap(rs, rb, i);
836 }
837 }
838
839 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
840 RAMBlock *rb,
841 unsigned long page)
842 {
843 bool ret;
844
845 /*
846 * Clear dirty bitmap if needed. This _must_ be called before we
847  * send any page in the chunk because we need to make sure we can
848  * capture further page content changes when we sync the dirty log
849  * the next time. So as long as we are going to send any page in
850  * the chunk, we clear the remote dirty bitmap for the whole chunk.
851  * Clearing it earlier won't be a problem, but clearing it too late will.
852 */
853 migration_clear_memory_region_dirty_bitmap(rs, rb, page);
854
855 ret = test_and_clear_bit(page, rb->bmap);
856 if (ret) {
857 rs->migration_dirty_pages--;
858 }
859
860 return ret;
861 }
862
863 /* Called with RCU critical section */
864 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
865 {
866 uint64_t new_dirty_pages =
867 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
868
869 rs->migration_dirty_pages += new_dirty_pages;
870 rs->num_dirty_pages_period += new_dirty_pages;
871 }
872
873 /**
874 * ram_pagesize_summary: calculate all the pagesizes of a VM
875 *
876 * Returns a summary bitmap of the page sizes of all RAMBlocks
877 *
878 * For VMs with just normal pages this is equivalent to the host page
879 * size. If it's got some huge pages then it's the OR of all the
880 * different page sizes.
881 */
882 uint64_t ram_pagesize_summary(void)
883 {
884 RAMBlock *block;
885 uint64_t summary = 0;
886
887 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
888 summary |= block->page_size;
889 }
890
891 return summary;
892 }
893
894 uint64_t ram_get_total_transferred_pages(void)
895 {
896 return ram_counters.normal + ram_counters.duplicate +
897 compression_counters.pages + xbzrle_counters.pages;
898 }
899
900 static void migration_update_rates(RAMState *rs, int64_t end_time)
901 {
902 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
903 double compressed_size;
904
905 /* calculate period counters */
906 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
907 / (end_time - rs->time_last_bitmap_sync);
908
909 if (!page_count) {
910 return;
911 }
912
913 if (migrate_use_xbzrle()) {
914 double encoded_size, unencoded_size;
915
916 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
917 rs->xbzrle_cache_miss_prev) / page_count;
918 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
919 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
920 TARGET_PAGE_SIZE;
921 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
922 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
923 xbzrle_counters.encoding_rate = 0;
924 } else {
925 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
926 }
927 rs->xbzrle_pages_prev = xbzrle_counters.pages;
928 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
929 }
930
931 if (migrate_use_compression()) {
932 compression_counters.busy_rate = (double)(compression_counters.busy -
933 rs->compress_thread_busy_prev) / page_count;
934 rs->compress_thread_busy_prev = compression_counters.busy;
935
936 compressed_size = compression_counters.compressed_size -
937 rs->compressed_size_prev;
938 if (compressed_size) {
939 double uncompressed_size = (compression_counters.pages -
940 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
941
942 /* Compression-Ratio = Uncompressed-size / Compressed-size */
943 compression_counters.compression_rate =
944 uncompressed_size / compressed_size;
945
946 rs->compress_pages_prev = compression_counters.pages;
947 rs->compressed_size_prev = compression_counters.compressed_size;
948 }
949 }
950 }
951
952 static void migration_trigger_throttle(RAMState *rs)
953 {
954 MigrationState *s = migrate_get_current();
955 uint64_t threshold = s->parameters.throttle_trigger_threshold;
956
957 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
958 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
959 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
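    /*
     * E.g. with threshold = 50, dirtying more bytes in the period than
     * half of what was transferred in that period counts as "too fast".
     */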
960
961 /* During block migration the auto-converge logic incorrectly detects
962 * that ram migration makes no progress. Avoid this by disabling the
963 * throttling logic during the bulk phase of block migration. */
964 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
965 /* The following detection logic can be refined later. For now:
966 Check to see if the ratio between dirtied bytes and the approx.
967 amount of bytes that just got transferred since the last time
968 we were in this routine reaches the threshold. If that happens
969 twice, start or increase throttling. */
970
971 if ((bytes_dirty_period > bytes_dirty_threshold) &&
972 (++rs->dirty_rate_high_cnt >= 2)) {
973 trace_migration_throttle();
974 rs->dirty_rate_high_cnt = 0;
975 mig_throttle_guest_down(bytes_dirty_period,
976 bytes_dirty_threshold);
977 }
978 }
979 }
980
981 static void migration_bitmap_sync(RAMState *rs)
982 {
983 RAMBlock *block;
984 int64_t end_time;
985
986 ram_counters.dirty_sync_count++;
987
988 if (!rs->time_last_bitmap_sync) {
989 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
990 }
991
992 trace_migration_bitmap_sync_start();
993 memory_global_dirty_log_sync();
994
995 qemu_mutex_lock(&rs->bitmap_mutex);
996 WITH_RCU_READ_LOCK_GUARD() {
997 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
998 ramblock_sync_dirty_bitmap(rs, block);
999 }
1000 ram_counters.remaining = ram_bytes_remaining();
1001 }
1002 qemu_mutex_unlock(&rs->bitmap_mutex);
1003
1004 memory_global_after_dirty_log_sync();
1005 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1006
1007 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1008
1009 /* more than 1 second = 1000 milliseconds */
1010 if (end_time > rs->time_last_bitmap_sync + 1000) {
1011 migration_trigger_throttle(rs);
1012
1013 migration_update_rates(rs, end_time);
1014
1015 rs->target_page_count_prev = rs->target_page_count;
1016
1017 /* reset period counters */
1018 rs->time_last_bitmap_sync = end_time;
1019 rs->num_dirty_pages_period = 0;
1020 rs->bytes_xfer_prev = ram_counters.transferred;
1021 }
1022 if (migrate_use_events()) {
1023 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1024 }
1025 }
1026
1027 static void migration_bitmap_sync_precopy(RAMState *rs)
1028 {
1029 Error *local_err = NULL;
1030
1031 /*
1032 * The current notifier usage is just an optimization for migration, so we
1033 * don't stop the normal migration process in the error case.
1034 */
1035 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1036 error_report_err(local_err);
1037 local_err = NULL;
1038 }
1039
1040 migration_bitmap_sync(rs);
1041
1042 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1043 error_report_err(local_err);
1044 }
1045 }
1046
1047 /**
1048 * save_zero_page_to_file: send the zero page to the file
1049 *
1050 * Returns the size of data written to the file, 0 means the page is not
1051 * a zero page
1052 *
1053 * @rs: current RAM state
1054 * @file: the file where the data is saved
1055 * @block: block that contains the page we want to send
1056 * @offset: offset inside the block for the page
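 *
 * On the wire a zero page is just the page header, sent with
 * RAM_SAVE_FLAG_ZERO set, followed by a single zero byte.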
1057 */
1058 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1059 RAMBlock *block, ram_addr_t offset)
1060 {
1061 uint8_t *p = block->host + offset;
1062 int len = 0;
1063
1064 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1065 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1066 qemu_put_byte(file, 0);
1067 len += 1;
1068 }
1069 return len;
1070 }
1071
1072 /**
1073 * save_zero_page: send the zero page to the stream
1074 *
1075 * Returns the number of pages written.
1076 *
1077 * @rs: current RAM state
1078 * @block: block that contains the page we want to send
1079 * @offset: offset inside the block for the page
1080 */
1081 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1082 {
1083 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1084
1085 if (len) {
1086 ram_counters.duplicate++;
1087 ram_counters.transferred += len;
1088 return 1;
1089 }
1090 return -1;
1091 }
1092
1093 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1094 {
1095 if (!migrate_release_ram() || !migration_in_postcopy()) {
1096 return;
1097 }
1098
1099 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1100 }
1101
1102 /*
1103 * @pages: the number of pages written by the control path,
1104 * < 0 - error
1105 * > 0 - number of pages written
1106 *
1107 * Return true if the page has been saved, otherwise false is returned.
1108 */
1109 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1110 int *pages)
1111 {
1112 uint64_t bytes_xmit = 0;
1113 int ret;
1114
1115 *pages = -1;
1116 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1117 &bytes_xmit);
1118 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1119 return false;
1120 }
1121
1122 if (bytes_xmit) {
1123 ram_counters.transferred += bytes_xmit;
1124 *pages = 1;
1125 }
1126
1127 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1128 return true;
1129 }
1130
1131 if (bytes_xmit > 0) {
1132 ram_counters.normal++;
1133 } else if (bytes_xmit == 0) {
1134 ram_counters.duplicate++;
1135 }
1136
1137 return true;
1138 }
1139
1140 /*
1141 * directly send the page to the stream
1142 *
1143 * Returns the number of pages written.
1144 *
1145 * @rs: current RAM state
1146 * @block: block that contains the page we want to send
1147 * @offset: offset inside the block for the page
1148 * @buf: the page to be sent
1149 * @async: send the page asynchronously
1150 */
1151 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1152 uint8_t *buf, bool async)
1153 {
1154 ram_counters.transferred += save_page_header(rs, rs->f, block,
1155 offset | RAM_SAVE_FLAG_PAGE);
1156 if (async) {
1157 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1158 migrate_release_ram() &
1159 migration_in_postcopy());
1160 } else {
1161 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1162 }
1163 ram_counters.transferred += TARGET_PAGE_SIZE;
1164 ram_counters.normal++;
1165 return 1;
1166 }
1167
1168 /**
1169 * ram_save_page: send the given page to the stream
1170 *
1171 * Returns the number of pages written.
1172 * < 0 - error
1173 * >=0 - Number of pages written - this might legally be 0
1174 * if xbzrle noticed the page was the same.
1175 *
1176 * @rs: current RAM state
1177 * @block: block that contains the page we want to send
1178 * @offset: offset inside the block for the page
1179 * @last_stage: if we are at the completion stage
1180 */
1181 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1182 {
1183 int pages = -1;
1184 uint8_t *p;
1185 bool send_async = true;
1186 RAMBlock *block = pss->block;
1187 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1188 ram_addr_t current_addr = block->offset + offset;
1189
1190 p = block->host + offset;
1191 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1192
1193 XBZRLE_cache_lock();
1194 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1195 pages = save_xbzrle_page(rs, &p, current_addr, block,
1196 offset, last_stage);
1197 if (!last_stage) {
1198 /* Can't send this cached data async, since the cache page
1199 * might get updated before it gets to the wire
1200 */
1201 send_async = false;
1202 }
1203 }
1204
1205 /* XBZRLE overflow or normal page */
1206 if (pages == -1) {
1207 pages = save_normal_page(rs, block, offset, p, send_async);
1208 }
1209
1210 XBZRLE_cache_unlock();
1211
1212 return pages;
1213 }
1214
1215 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1216 ram_addr_t offset)
1217 {
1218 if (multifd_queue_page(rs->f, block, offset) < 0) {
1219 return -1;
1220 }
1221 ram_counters.normal++;
1222
1223 return 1;
1224 }
1225
1226 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1227 ram_addr_t offset, uint8_t *source_buf)
1228 {
1229 RAMState *rs = ram_state;
1230 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1231 bool zero_page = false;
1232 int ret;
1233
1234 if (save_zero_page_to_file(rs, f, block, offset)) {
1235 zero_page = true;
1236 goto exit;
1237 }
1238
1239 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1240
1241 /*
1242 * copy it to an internal buffer to avoid it being modified by the VM
1243  * so that we can catch errors during compression and
1244 * decompression
1245 */
1246 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1247 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1248 if (ret < 0) {
1249 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1250 error_report("compressed data failed!");
1251 return false;
1252 }
1253
1254 exit:
1255 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1256 return zero_page;
1257 }
1258
1259 static void
1260 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1261 {
1262 ram_counters.transferred += bytes_xmit;
1263
1264 if (param->zero_page) {
1265 ram_counters.duplicate++;
1266 return;
1267 }
1268
1269 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1270 compression_counters.compressed_size += bytes_xmit - 8;
1271 compression_counters.pages++;
1272 }
1273
1274 static bool save_page_use_compression(RAMState *rs);
1275
1276 static void flush_compressed_data(RAMState *rs)
1277 {
1278 int idx, len, thread_count;
1279
1280 if (!save_page_use_compression(rs)) {
1281 return;
1282 }
1283 thread_count = migrate_compress_threads();
1284
1285 qemu_mutex_lock(&comp_done_lock);
1286 for (idx = 0; idx < thread_count; idx++) {
1287 while (!comp_param[idx].done) {
1288 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1289 }
1290 }
1291 qemu_mutex_unlock(&comp_done_lock);
1292
1293 for (idx = 0; idx < thread_count; idx++) {
1294 qemu_mutex_lock(&comp_param[idx].mutex);
1295 if (!comp_param[idx].quit) {
1296 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1297 /*
1298 * it's safe to fetch zero_page without holding comp_done_lock
1299 * as there is no further request submitted to the thread,
1300 * i.e., the thread should be waiting for a request at this point.
1301 */
1302 update_compress_thread_counts(&comp_param[idx], len);
1303 }
1304 qemu_mutex_unlock(&comp_param[idx].mutex);
1305 }
1306 }
1307
1308 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1309 ram_addr_t offset)
1310 {
1311 param->block = block;
1312 param->offset = offset;
1313 }
1314
1315 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1316 ram_addr_t offset)
1317 {
1318 int idx, thread_count, bytes_xmit = -1, pages = -1;
1319 bool wait = migrate_compress_wait_thread();
1320
1321 thread_count = migrate_compress_threads();
1322 qemu_mutex_lock(&comp_done_lock);
1323 retry:
1324 for (idx = 0; idx < thread_count; idx++) {
1325 if (comp_param[idx].done) {
1326 comp_param[idx].done = false;
1327 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1328 qemu_mutex_lock(&comp_param[idx].mutex);
1329 set_compress_params(&comp_param[idx], block, offset);
1330 qemu_cond_signal(&comp_param[idx].cond);
1331 qemu_mutex_unlock(&comp_param[idx].mutex);
1332 pages = 1;
1333 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1334 break;
1335 }
1336 }
1337
1338 /*
1339 * wait for the free thread if the user specifies 'compress-wait-thread',
1340 * otherwise we will post the page out in the main thread as a normal page.
1341 */
1342 if (pages < 0 && wait) {
1343 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1344 goto retry;
1345 }
1346 qemu_mutex_unlock(&comp_done_lock);
1347
1348 return pages;
1349 }
1350
1351 /**
1352 * find_dirty_block: find the next dirty page and update any state
1353 * associated with the search process.
1354 *
1355 * Returns true if a page is found
1356 *
1357 * @rs: current RAM state
1358 * @pss: data about the state of the current dirty page scan
1359 * @again: set to false if the search has scanned the whole of RAM
1360 */
1361 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1362 {
1363 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1364 if (pss->complete_round && pss->block == rs->last_seen_block &&
1365 pss->page >= rs->last_page) {
1366 /*
1367 * We've been once around the RAM and haven't found anything.
1368 * Give up.
1369 */
1370 *again = false;
1371 return false;
1372 }
1373 if (!offset_in_ramblock(pss->block,
1374 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1375 /* Didn't find anything in this RAM Block */
1376 pss->page = 0;
1377 pss->block = QLIST_NEXT_RCU(pss->block, next);
1378 if (!pss->block) {
1379 /*
1380 * If memory migration starts over, we will meet a dirtied page
1381  * which may still exist in the compression threads' ring, so we
1382  * should flush the compressed data to make sure the new page
1383  * is not overwritten by the old one in the destination.
1384  *
1385  * Also, if xbzrle is on, stop using the data compression at this
1386 * point. In theory, xbzrle can do better than compression.
1387 */
1388 flush_compressed_data(rs);
1389
1390 /* Hit the end of the list */
1391 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1392 /* Flag that we've looped */
1393 pss->complete_round = true;
1394 /* After the first round, enable XBZRLE. */
1395 if (migrate_use_xbzrle()) {
1396 rs->xbzrle_enabled = true;
1397 }
1398 }
1399 /* Didn't find anything this time, but try again on the new block */
1400 *again = true;
1401 return false;
1402 } else {
1403 /* Can go around again, but... */
1404 *again = true;
1405 /* We've found something so probably don't need to */
1406 return true;
1407 }
1408 }
1409
1410 /**
1411 * unqueue_page: gets a page off the queue
1412 *
1413 * Helper for 'get_queued_page' - gets a page off the queue
1414 *
1415 * Returns the block of the page (or NULL if none available)
1416 *
1417 * @rs: current RAM state
1418 * @offset: used to return the offset within the RAMBlock
1419 */
1420 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1421 {
1422 RAMBlock *block = NULL;
1423
1424 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1425 return NULL;
1426 }
1427
1428 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1429 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1430 struct RAMSrcPageRequest *entry =
1431 QSIMPLEQ_FIRST(&rs->src_page_requests);
1432 block = entry->rb;
1433 *offset = entry->offset;
1434
1435 if (entry->len > TARGET_PAGE_SIZE) {
1436 entry->len -= TARGET_PAGE_SIZE;
1437 entry->offset += TARGET_PAGE_SIZE;
1438 } else {
1439 memory_region_unref(block->mr);
1440 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1441 g_free(entry);
1442 migration_consume_urgent_request();
1443 }
1444 }
1445
1446 return block;
1447 }
1448
1449 #if defined(__linux__)
1450 /**
1451 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1452 * is found, return RAM block pointer and page offset
1453 *
1454 * Returns pointer to the RAMBlock containing faulting page,
1455 * NULL if no write faults are pending
1456 *
1457 * @rs: current RAM state
1458 * @offset: page offset from the beginning of the block
1459 */
1460 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1461 {
1462 struct uffd_msg uffd_msg;
1463 void *page_address;
1464 RAMBlock *block;
1465 int res;
1466
1467 if (!migrate_background_snapshot()) {
1468 return NULL;
1469 }
1470
1471 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1472 if (res <= 0) {
1473 return NULL;
1474 }
1475
1476 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1477 block = qemu_ram_block_from_host(page_address, false, offset);
1478 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1479 return block;
1480 }
1481
1482 /**
1483 * ram_save_release_protection: release UFFD write protection after
1484 * a range of pages has been saved
1485 *
1486 * @rs: current RAM state
1487 * @pss: page-search-status structure
1488 * @start_page: index of the first page in the range relative to pss->block
1489 *
1490 * Returns 0 on success, negative value in case of an error
1491 */
1492 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1493 unsigned long start_page)
1494 {
1495 int res = 0;
1496
1497 /* Check if page is from UFFD-managed region. */
1498 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1499 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1500 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
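        /*
         * E.g. if this pass saved pages 5..8 of the block, start_page is 5,
         * pss->page is 8 and run_length covers 4 target pages of bytes.
         */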
1501
1502 /* Flush async buffers before un-protect. */
1503 qemu_fflush(rs->f);
1504 /* Un-protect memory range. */
1505 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1506 false, false);
1507 }
1508
1509 return res;
1510 }
1511
1512 /* ram_write_tracking_available: check if kernel supports required UFFD features
1513 *
1514 * Returns true if supported, false otherwise
1515 */
1516 bool ram_write_tracking_available(void)
1517 {
1518 uint64_t uffd_features;
1519 int res;
1520
1521 res = uffd_query_features(&uffd_features);
1522 return (res == 0 &&
1523 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1524 }
1525
1526 /* ram_write_tracking_compatible: check if guest configuration is
1527 * compatible with 'write-tracking'
1528 *
1529 * Returns true if compatible, false otherwise
1530 */
1531 bool ram_write_tracking_compatible(void)
1532 {
1533 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1534 int uffd_fd;
1535 RAMBlock *block;
1536 bool ret = false;
1537
1538 /* Open UFFD file descriptor */
1539 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1540 if (uffd_fd < 0) {
1541 return false;
1542 }
1543
1544 RCU_READ_LOCK_GUARD();
1545
1546 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1547 uint64_t uffd_ioctls;
1548
1549 /* Nothing to do with read-only and MMIO-writable regions */
1550 if (block->mr->readonly || block->mr->rom_device) {
1551 continue;
1552 }
1553 /* Try to register block memory via UFFD-IO to track writes */
1554 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1555 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1556 goto out;
1557 }
1558 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1559 goto out;
1560 }
1561 }
1562 ret = true;
1563
1564 out:
1565 uffd_close_fd(uffd_fd);
1566 return ret;
1567 }
1568
1569 /*
1570 * ram_block_populate_pages: populate memory in the RAM block by reading
1571 * a byte from the beginning of each page.
1572 *
1573 * Since it's solely used for userfault_fd WP feature, here we just
1574 * hardcode page size to qemu_real_host_page_size.
1575 *
1576 * @block: RAM block to populate
1577 */
1578 static void ram_block_populate_pages(RAMBlock *block)
1579 {
1580 char *ptr = (char *) block->host;
1581
1582 for (ram_addr_t offset = 0; offset < block->used_length;
1583 offset += qemu_real_host_page_size) {
1584 char tmp = *(ptr + offset);
1585
1586 /* Don't optimize the read out */
1587 asm volatile("" : "+r" (tmp));
1588 }
1589 }
1590
1591 /*
1592 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1593 */
1594 void ram_write_tracking_prepare(void)
1595 {
1596 RAMBlock *block;
1597
1598 RCU_READ_LOCK_GUARD();
1599
1600 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1601 /* Nothing to do with read-only and MMIO-writable regions */
1602 if (block->mr->readonly || block->mr->rom_device) {
1603 continue;
1604 }
1605
1606 /*
1607 * Populate pages of the RAM block before enabling userfault_fd
1608 * write protection.
1609 *
1610 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1611 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1612 * pages with pte_none() entries in page table.
1613 */
1614 ram_block_populate_pages(block);
1615 }
1616 }
1617
1618 /*
1619 * ram_write_tracking_start: start UFFD-WP memory tracking
1620 *
1621 * Returns 0 for success or negative value in case of error
1622 */
1623 int ram_write_tracking_start(void)
1624 {
1625 int uffd_fd;
1626 RAMState *rs = ram_state;
1627 RAMBlock *block;
1628
1629 /* Open UFFD file descriptor */
1630 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1631 if (uffd_fd < 0) {
1632 return uffd_fd;
1633 }
1634 rs->uffdio_fd = uffd_fd;
1635
1636 RCU_READ_LOCK_GUARD();
1637
1638 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639 /* Nothing to do with read-only and MMIO-writable regions */
1640 if (block->mr->readonly || block->mr->rom_device) {
1641 continue;
1642 }
1643
1644 /* Register block memory with UFFD to track writes */
1645 if (uffd_register_memory(rs->uffdio_fd, block->host,
1646 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1647 goto fail;
1648 }
1649 /* Apply UFFD write protection to the block memory range */
1650 if (uffd_change_protection(rs->uffdio_fd, block->host,
1651 block->max_length, true, false)) {
1652 goto fail;
1653 }
1654 block->flags |= RAM_UF_WRITEPROTECT;
1655 memory_region_ref(block->mr);
1656
1657 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1658 block->host, block->max_length);
1659 }
1660
1661 return 0;
1662
1663 fail:
1664 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1665
1666 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1667 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1668 continue;
1669 }
1670 /*
1671 * In case some memory block failed to be write-protected
1672 * remove protection and unregister all succeeded RAM blocks
1673 */
1674 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675 false, false);
1676 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1677 /* Cleanup flags and remove reference */
1678 block->flags &= ~RAM_UF_WRITEPROTECT;
1679 memory_region_unref(block->mr);
1680 }
1681
1682 uffd_close_fd(uffd_fd);
1683 rs->uffdio_fd = -1;
1684 return -1;
1685 }
1686
1687 /**
1688 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1689 */
1690 void ram_write_tracking_stop(void)
1691 {
1692 RAMState *rs = ram_state;
1693 RAMBlock *block;
1694
1695 RCU_READ_LOCK_GUARD();
1696
1697 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1698 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1699 continue;
1700 }
1701 /* Remove protection and unregister all affected RAM blocks */
1702 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1703 false, false);
1704 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1705
1706 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1707 block->host, block->max_length);
1708
1709 /* Cleanup flags and remove reference */
1710 block->flags &= ~RAM_UF_WRITEPROTECT;
1711 memory_region_unref(block->mr);
1712 }
1713
1714 /* Finally close UFFD file descriptor */
1715 uffd_close_fd(rs->uffdio_fd);
1716 rs->uffdio_fd = -1;
1717 }
1718
1719 #else
1720 /* No target OS support, stubs just fail or ignore */
1721
1722 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1723 {
1724 (void) rs;
1725 (void) offset;
1726
1727 return NULL;
1728 }
1729
1730 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1731 unsigned long start_page)
1732 {
1733 (void) rs;
1734 (void) pss;
1735 (void) start_page;
1736
1737 return 0;
1738 }
1739
1740 bool ram_write_tracking_available(void)
1741 {
1742 return false;
1743 }
1744
1745 bool ram_write_tracking_compatible(void)
1746 {
1747 assert(0);
1748 return false;
1749 }
1750
1751 int ram_write_tracking_start(void)
1752 {
1753 assert(0);
1754 return -1;
1755 }
1756
1757 void ram_write_tracking_stop(void)
1758 {
1759 assert(0);
1760 }
1761 #endif /* defined(__linux__) */
1762
1763 /**
1764 * get_queued_page: unqueue a page from the postcopy requests
1765 *
1766 * Skips pages that are already sent (!dirty)
1767 *
1768 * Returns true if a queued page is found
1769 *
1770 * @rs: current RAM state
1771 * @pss: data about the state of the current dirty page scan
1772 */
1773 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1774 {
1775 RAMBlock *block;
1776 ram_addr_t offset;
1777 bool dirty;
1778
1779 do {
1780 block = unqueue_page(rs, &offset);
1781 /*
1782 * We're sending this page, and since it's postcopy nothing else
1783 * will dirty it, and we must make sure it doesn't get sent again
1784 * even if this queue request was received after the background
1785 * search already sent it.
1786 */
1787 if (block) {
1788 unsigned long page;
1789
1790 page = offset >> TARGET_PAGE_BITS;
1791 dirty = test_bit(page, block->bmap);
1792 if (!dirty) {
1793 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1794 page);
1795 } else {
1796 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1797 }
1798 }
1799
1800 } while (block && !dirty);
1801
1802 if (!block) {
1803 /*
1804 * Poll write faults too if background snapshot is enabled; that's
1805 * when vCPUs can get blocked by the write-protected pages.
1806 */
1807 block = poll_fault_page(rs, &offset);
1808 }
1809
1810 if (block) {
1811 /*
1812 * We want the background search to continue from the queued page
1813 * since the guest is likely to want other pages near to the page
1814 * it just requested.
1815 */
1816 pss->block = block;
1817 pss->page = offset >> TARGET_PAGE_BITS;
1818
1819 /*
1820 * This unqueued page would break the "one round" check, even if
1821  * it is really rare.
1822 */
1823 pss->complete_round = false;
1824 }
1825
1826 return !!block;
1827 }
1828
1829 /**
1830 * migration_page_queue_free: drop any remaining pages in the ram
1831 * request queue
1832 *
1833 * It should be empty at the end anyway, but in error cases there may
1834 * be some left. In case any page is left, we drop it.
1835 *
1836 */
1837 static void migration_page_queue_free(RAMState *rs)
1838 {
1839 struct RAMSrcPageRequest *mspr, *next_mspr;
1840 /* This queue generally should be empty - but in the case of a failed
1841 * migration it might have some entries left over.
1842 */
1843 RCU_READ_LOCK_GUARD();
1844 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1845 memory_region_unref(mspr->rb->mr);
1846 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1847 g_free(mspr);
1848 }
1849 }
1850
1851 /**
1852 * ram_save_queue_pages: queue the page for transmission
1853 *
1854 * A request from postcopy destination for example.
1855 *
1856 * Returns zero on success or negative on error
1857 *
1858 * @rbname: Name of the RAMBlock of the request. NULL means the
1859  * same as the last one.
1860 * @start: starting address from the start of the RAMBlock
1861 * @len: length (in bytes) to send
1862 */
1863 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1864 {
1865 RAMBlock *ramblock;
1866 RAMState *rs = ram_state;
1867
1868 ram_counters.postcopy_requests++;
1869 RCU_READ_LOCK_GUARD();
1870
1871 if (!rbname) {
1872 /* Reuse last RAMBlock */
1873 ramblock = rs->last_req_rb;
1874
1875 if (!ramblock) {
1876 /*
1877 * Shouldn't happen, we can't reuse the last RAMBlock if
1878 * it's the 1st request.
1879 */
1880 error_report("ram_save_queue_pages no previous block");
1881 return -1;
1882 }
1883 } else {
1884 ramblock = qemu_ram_block_by_name(rbname);
1885
1886 if (!ramblock) {
1887 /* We shouldn't be asked for a non-existent RAMBlock */
1888 error_report("ram_save_queue_pages no block '%s'", rbname);
1889 return -1;
1890 }
1891 rs->last_req_rb = ramblock;
1892 }
1893 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1894 if (!offset_in_ramblock(ramblock, start + len - 1)) {
1895 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1896 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1897 __func__, start, len, ramblock->used_length);
1898 return -1;
1899 }
1900
1901 struct RAMSrcPageRequest *new_entry =
1902 g_malloc0(sizeof(struct RAMSrcPageRequest));
1903 new_entry->rb = ramblock;
1904 new_entry->offset = start;
1905 new_entry->len = len;
1906
1907 memory_region_ref(ramblock->mr);
1908 qemu_mutex_lock(&rs->src_page_req_mutex);
1909 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1910 migration_make_urgent_request();
1911 qemu_mutex_unlock(&rs->src_page_req_mutex);
1912
1913 return 0;
1914 }
1915
1916 static bool save_page_use_compression(RAMState *rs)
1917 {
1918 if (!migrate_use_compression()) {
1919 return false;
1920 }
1921
1922 /*
1923 * If xbzrle is enabled (e.g., after the first round of migration), stop
1924 * using the data compression. In theory, xbzrle can do better than
1925 * compression.
1926 */
1927 if (rs->xbzrle_enabled) {
1928 return false;
1929 }
1930
1931 return true;
1932 }
1933
1934 /*
1935 * try to compress the page before posting it out, return true if the page
1936 * has been properly handled by compression, otherwise needs other
1937 * paths to handle it
1938 */
1939 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1940 {
1941 if (!save_page_use_compression(rs)) {
1942 return false;
1943 }
1944
1945 /*
1946 * When starting the process of a new block, the first page of
1947 * the block should be sent out before other pages in the same
1948 * block, and all the pages in the last block should have been sent
1949 * out already; keeping this order is important, because the 'cont'
1950 * flag is used to avoid resending the block name.
1951 *
1952 * We post the first page as a normal page as compression will take
1953 * much CPU resource.
1954 */
1955 if (block != rs->last_sent_block) {
1956 flush_compressed_data(rs);
1957 return false;
1958 }
1959
1960 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1961 return true;
1962 }
1963
1964 compression_counters.busy++;
1965 return false;
1966 }
1967
1968 /**
1969 * ram_save_target_page: save one target page
1970 *
1971 * Returns the number of pages written
1972 *
1973 * @rs: current RAM state
1974 * @pss: data about the page we want to send
1975 * @last_stage: if we are at the completion stage
1976 */
1977 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1978 bool last_stage)
1979 {
1980 RAMBlock *block = pss->block;
1981 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1982 int res;
1983
1984 if (control_save_page(rs, block, offset, &res)) {
1985 return res;
1986 }
1987
1988 if (save_compress_page(rs, block, offset)) {
1989 return 1;
1990 }
1991
1992 res = save_zero_page(rs, block, offset);
1993 if (res > 0) {
1994 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1995 * page would be stale
1996 */
1997 if (!save_page_use_compression(rs)) {
1998 XBZRLE_cache_lock();
1999 xbzrle_cache_zero_page(rs, block->offset + offset);
2000 XBZRLE_cache_unlock();
2001 }
2002 ram_release_pages(block->idstr, offset, res);
2003 return res;
2004 }
2005
2006 /*
2007 * Do not use multifd for:
2008 * 1. Compression, as the first page in a new block should be posted out
2009 * before sending the compressed page
2010 * 2. Postcopy, as one whole host page should be placed atomically
2011 */
2012 if (!save_page_use_compression(rs) && migrate_use_multifd()
2013 && !migration_in_postcopy()) {
2014 return ram_save_multifd_page(rs, block, offset);
2015 }
2016
2017 return ram_save_page(rs, pss, last_stage);
2018 }
2019
2020 /**
2021 * ram_save_host_page: save a whole host page
2022 *
2023 * Starting at the page indicated by @pss, send pages up to the end of
2024 * the current host page. It's valid for the starting page to be in the
2025 * middle of a host page, in which case the remainder of it is sent.
2026 * Only dirty target pages are sent. Note that the host page size may
2027 * be a huge page for this block.
2028 * The saving stops at the boundary of the used_length of the block
2029 * if the RAMBlock isn't a multiple of the host page size.
2030 *
2031 * Returns the number of pages written or negative on error
2032 *
2033 * @rs: current RAM state
2035 * @pss: data about the page we want to send
2036 * @last_stage: if we are at the completion stage
2037 */
2038 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2039 bool last_stage)
2040 {
2041 int tmppages, pages = 0;
2042 size_t pagesize_bits =
2043 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2044 unsigned long hostpage_boundary =
2045 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
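    /*
     * As a worked example, for a 2MiB hugetlbfs-backed block with 4KiB
     * target pages, pagesize_bits is 512; a pss->page of 1000 then gives
     * a hostpage_boundary of 1024.
     */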
2046 unsigned long start_page = pss->page;
2047 int res;
2048
2049 if (ramblock_is_ignored(pss->block)) {
2050 error_report("block %s should not be migrated !", pss->block->idstr);
2051 return 0;
2052 }
2053
2054 do {
2055 /* Check if the page is dirty and, if it is, send it */
2056 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2057 tmppages = ram_save_target_page(rs, pss, last_stage);
2058 if (tmppages < 0) {
2059 return tmppages;
2060 }
2061
2062 pages += tmppages;
2063 /*
2064 * Allow rate limiting to happen in the middle of huge pages if
2065 * something is sent in the current iteration.
2066 */
2067 if (pagesize_bits > 1 && tmppages > 0) {
2068 migration_rate_limit();
2069 }
2070 }
2071 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2072 } while ((pss->page < hostpage_boundary) &&
2073 offset_in_ramblock(pss->block,
2074 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2075 /* The offset we leave with is the min boundary of host page and block */
2076 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2077
2078 res = ram_save_release_protection(rs, pss, start_page);
2079 return (res < 0 ? res : pages);
2080 }
2081
2082 /**
2083 * ram_find_and_save_block: finds a dirty page and sends it to f
2084 *
2085 * Called within an RCU critical section.
2086 *
2087 * Returns the number of pages written where zero means no dirty pages,
2088 * or negative on error
2089 *
2090 * @rs: current RAM state
2091 * @last_stage: if we are at the completion stage
2092 *
2093 * On systems where host-page-size > target-page-size it will send all the
2094 * pages in a host page that are dirty.
2095 */
2096
2097 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2098 {
2099 PageSearchStatus pss;
2100 int pages = 0;
2101 bool again, found;
2102
2103 /* No dirty page as there is zero RAM */
2104 if (!ram_bytes_total()) {
2105 return pages;
2106 }
2107
2108 pss.block = rs->last_seen_block;
2109 pss.page = rs->last_page;
2110 pss.complete_round = false;
2111
2112 if (!pss.block) {
2113 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2114 }
2115
2116 do {
2117 again = true;
2118 found = get_queued_page(rs, &pss);
2119
2120 if (!found) {
2121 /* priority queue empty, so just search for something dirty */
2122 found = find_dirty_block(rs, &pss, &again);
2123 }
2124
2125 if (found) {
2126 pages = ram_save_host_page(rs, &pss, last_stage);
2127 }
2128 } while (!pages && again);
2129
2130 rs->last_seen_block = pss.block;
2131 rs->last_page = pss.page;
2132
2133 return pages;
2134 }
2135
2136 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2137 {
2138 uint64_t pages = size / TARGET_PAGE_SIZE;
2139
2140 if (zero) {
2141 ram_counters.duplicate += pages;
2142 } else {
2143 ram_counters.normal += pages;
2144 ram_counters.transferred += size;
2145 qemu_update_position(f, size);
2146 }
2147 }
2148
2149 static uint64_t ram_bytes_total_common(bool count_ignored)
2150 {
2151 RAMBlock *block;
2152 uint64_t total = 0;
2153
2154 RCU_READ_LOCK_GUARD();
2155
2156 if (count_ignored) {
2157 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2158 total += block->used_length;
2159 }
2160 } else {
2161 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2162 total += block->used_length;
2163 }
2164 }
2165 return total;
2166 }
2167
2168 uint64_t ram_bytes_total(void)
2169 {
2170 return ram_bytes_total_common(false);
2171 }
2172
2173 static void xbzrle_load_setup(void)
2174 {
2175 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2176 }
2177
2178 static void xbzrle_load_cleanup(void)
2179 {
2180 g_free(XBZRLE.decoded_buf);
2181 XBZRLE.decoded_buf = NULL;
2182 }
2183
2184 static void ram_state_cleanup(RAMState **rsp)
2185 {
2186 if (*rsp) {
2187 migration_page_queue_free(*rsp);
2188 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2189 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2190 g_free(*rsp);
2191 *rsp = NULL;
2192 }
2193 }
2194
2195 static void xbzrle_cleanup(void)
2196 {
2197 XBZRLE_cache_lock();
2198 if (XBZRLE.cache) {
2199 cache_fini(XBZRLE.cache);
2200 g_free(XBZRLE.encoded_buf);
2201 g_free(XBZRLE.current_buf);
2202 g_free(XBZRLE.zero_target_page);
2203 XBZRLE.cache = NULL;
2204 XBZRLE.encoded_buf = NULL;
2205 XBZRLE.current_buf = NULL;
2206 XBZRLE.zero_target_page = NULL;
2207 }
2208 XBZRLE_cache_unlock();
2209 }
2210
2211 static void ram_save_cleanup(void *opaque)
2212 {
2213 RAMState **rsp = opaque;
2214 RAMBlock *block;
2215
2216 /* We don't use dirty log with background snapshots */
2217 if (!migrate_background_snapshot()) {
2218 /* The caller holds the iothread lock or is in a bh, so there is
2219 * no write race against the migration bitmap
2220 */
2221 memory_global_dirty_log_stop();
2222 }
2223
2224 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2225 g_free(block->clear_bmap);
2226 block->clear_bmap = NULL;
2227 g_free(block->bmap);
2228 block->bmap = NULL;
2229 }
2230
2231 xbzrle_cleanup();
2232 compress_threads_save_cleanup();
2233 ram_state_cleanup(rsp);
2234 }
2235
2236 static void ram_state_reset(RAMState *rs)
2237 {
2238 rs->last_seen_block = NULL;
2239 rs->last_sent_block = NULL;
2240 rs->last_page = 0;
2241 rs->last_version = ram_list.version;
2242 rs->xbzrle_enabled = false;
2243 }
2244
2245 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2246
2247 /*
2248 * 'expected' is the value you expect the bitmap mostly to be full
2249 * of; it won't bother printing lines that are all this value.
2250 * If 'todump' is null the migration bitmap is dumped.
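 * Each printed line covers up to 128 pages and looks roughly like
 *   0x00000080 : ..1.....1.......   ('1' = set bit, '.' = clear bit).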
2251 */
2252 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2253 unsigned long pages)
2254 {
2255 int64_t cur;
2256 int64_t linelen = 128;
2257 char linebuf[129];
2258
2259 for (cur = 0; cur < pages; cur += linelen) {
2260 int64_t curb;
2261 bool found = false;
2262 /*
2263 * Last line; catch the case where the line length
2264 * is longer than remaining ram
2265 */
2266 if (cur + linelen > pages) {
2267 linelen = pages - cur;
2268 }
2269 for (curb = 0; curb < linelen; curb++) {
2270 bool thisbit = test_bit(cur + curb, todump);
2271 linebuf[curb] = thisbit ? '1' : '.';
2272 found = found || (thisbit != expected);
2273 }
2274 if (found) {
2275 linebuf[curb] = '\0';
2276 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2277 }
2278 }
2279 }
2280
2281 /* **** functions for postcopy ***** */
2282
2283 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2284 {
2285 struct RAMBlock *block;
2286
2287 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2288 unsigned long *bitmap = block->bmap;
2289 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2290 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2291
2292 while (run_start < range) {
2293 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2294 ram_discard_range(block->idstr,
2295 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2296 ((ram_addr_t)(run_end - run_start))
2297 << TARGET_PAGE_BITS);
2298 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2299 }
2300 }
2301 }
2302
2303 /**
2304 * postcopy_send_discard_bm_ram: discard a RAMBlock
2305 *
2306 * Returns zero on success
2307 *
2308 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2309 *
2310 * @ms: current migration state
2311 * @block: RAMBlock to discard
2312 */
2313 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2314 {
2315 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2316 unsigned long current;
2317 unsigned long *bitmap = block->bmap;
2318
2319 for (current = 0; current < end; ) {
2320 unsigned long one = find_next_bit(bitmap, end, current);
2321 unsigned long zero, discard_length;
2322
2323 if (one >= end) {
2324 break;
2325 }
2326
2327 zero = find_next_zero_bit(bitmap, end, one + 1);
2328
2329 if (zero >= end) {
2330 discard_length = end - one;
2331 } else {
2332 discard_length = zero - one;
2333 }
2334 postcopy_discard_send_range(ms, one, discard_length);
2335 current = one + discard_length;
2336 }
2337
2338 return 0;
2339 }
2340
2341 /**
2342 * postcopy_each_ram_send_discard: discard all RAMBlocks
2343 *
2344 * Returns 0 for success or negative for error
2345 *
2346 * Utility for the outgoing postcopy code.
2347 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2348 * passing it bitmap indexes and name.
2349 * (qemu_ram_foreach_block ends up passing unscaled lengths
2350 * which would mean postcopy code would have to deal with target page)
2351 *
2352 * @ms: current migration state
2353 */
2354 static int postcopy_each_ram_send_discard(MigrationState *ms)
2355 {
2356 struct RAMBlock *block;
2357 int ret;
2358
2359 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2360 postcopy_discard_send_init(ms, block->idstr);
2361
2362 /*
2363 * Postcopy sends chunks of bitmap over the wire, but it
2364 * just needs indexes at this point; this avoids it having
2365 * target-page-specific code.
2366 */
2367 ret = postcopy_send_discard_bm_ram(ms, block);
2368 postcopy_discard_send_finish(ms);
2369 if (ret) {
2370 return ret;
2371 }
2372 }
2373
2374 return 0;
2375 }
2376
2377 /**
2378 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2379 *
2380 * Helper for postcopy_chunk_hostpages; it's called twice to
2381 * canonicalize the two bitmaps, which are similar but one is
2382 * inverted.
2383 *
2384 * Postcopy requires that all target pages in a hostpage are dirty or
2385 * clean, not a mix. This function canonicalizes the bitmaps.
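 *
 * For example, with 2MiB host pages and 4KiB target pages (host_ratio of
 * 512), a dirty run starting at target page 1300 causes the whole host
 * page spanning target pages 1024..1535 to be re-marked dirty, so it is
 * handled as one unit.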
2386 *
2387 * @ms: current migration state
2388 * @block: block that contains the page we want to canonicalize
2389 */
2390 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2391 {
2392 RAMState *rs = ram_state;
2393 unsigned long *bitmap = block->bmap;
2394 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2395 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2396 unsigned long run_start;
2397
2398 if (block->page_size == TARGET_PAGE_SIZE) {
2399 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2400 return;
2401 }
2402
2403 /* Find a dirty page */
2404 run_start = find_next_bit(bitmap, pages, 0);
2405
2406 while (run_start < pages) {
2407
2408 /*
2409 * If this run of dirty pages starts or ends in the middle of a host
2410 * page, then we need to fix up (fully dirty) that host page.
2411 */
2412 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2413 /* Find the end of this run */
2414 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2415 /*
2416 * If the end isn't at the start of a host page, then the
2417 * run doesn't finish at the end of a host page
2418 * and we need to discard.
2419 */
2420 }
2421
2422 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2423 unsigned long page;
2424 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2425 host_ratio);
2426 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2427
2428 /* Clean up the bitmap */
2429 for (page = fixup_start_addr;
2430 page < fixup_start_addr + host_ratio; page++) {
2431 /*
2432 * Remark them as dirty, updating the count for any pages
2433 * that weren't previously dirty.
2434 */
2435 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2436 }
2437 }
2438
2439 /* Find the next dirty page for the next iteration */
2440 run_start = find_next_bit(bitmap, pages, run_start);
2441 }
2442 }
2443
2444 /**
2445 * postcopy_chunk_hostpages: discard any partially sent host page
2446 *
2447 * Utility for the outgoing postcopy code.
2448 *
2449 * Discard any partially sent host-page size chunks, mark any partially
2450 * dirty host-page size chunks as all dirty. In this case the host-page
2451 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2452 *
2453 * Returns zero on success
2454 *
2455 * @ms: current migration state
2456 * @block: block we want to work with
2457 */
2458 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2459 {
2460 postcopy_discard_send_init(ms, block->idstr);
2461
2462 /*
2463 * Ensure that all partially dirty host pages are made fully dirty.
2464 */
2465 postcopy_chunk_hostpages_pass(ms, block);
2466
2467 postcopy_discard_send_finish(ms);
2468 return 0;
2469 }
2470
2471 /**
2472 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2473 *
2474 * Returns zero on success
2475 *
2476 * Transmit the set of pages to be discarded after precopy to the target;
2477 * these are pages that:
2478 * a) have been previously transmitted but are now dirty again
2479 * b) have never been transmitted; this ensures that
2480 * any pages on the destination that have been mapped by background
2481 * tasks get discarded (transparent huge pages are the specific concern)
2482 * Hopefully this is pretty sparse
2483 *
2484 * @ms: current migration state
2485 */
2486 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2487 {
2488 RAMState *rs = ram_state;
2489 RAMBlock *block;
2490 int ret;
2491
2492 RCU_READ_LOCK_GUARD();
2493
2494 /* This should be our last sync, the src is now paused */
2495 migration_bitmap_sync(rs);
2496
2497 /* Easiest way to make sure we don't resume in the middle of a host-page */
2498 rs->last_seen_block = NULL;
2499 rs->last_sent_block = NULL;
2500 rs->last_page = 0;
2501
2502 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2503 /* Deal with TPS != HPS and huge pages */
2504 ret = postcopy_chunk_hostpages(ms, block);
2505 if (ret) {
2506 return ret;
2507 }
2508
2509 #ifdef DEBUG_POSTCOPY
2510 ram_debug_dump_bitmap(block->bmap, true,
2511 block->used_length >> TARGET_PAGE_BITS);
2512 #endif
2513 }
2514 trace_ram_postcopy_send_discard_bitmap();
2515
2516 return postcopy_each_ram_send_discard(ms);
2517 }
2518
2519 /**
2520 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2521 *
2522 * Returns zero on success
2523 *
2524 * @rbname: name of the RAMBlock of the request. NULL means the
2525 * same as the last one.
2526 * @start: starting offset within the RAMBlock (bytes)
2527 * @length: length to discard (bytes)
2528 */
2529 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2530 {
2531 trace_ram_discard_range(rbname, start, length);
2532
2533 RCU_READ_LOCK_GUARD();
2534 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2535
2536 if (!rb) {
2537 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2538 return -1;
2539 }
2540
2541 /*
2542 * On source VM, we don't need to update the received bitmap since
2543 * we don't even have one.
2544 */
2545 if (rb->receivedmap) {
2546 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2547 length >> qemu_target_page_bits());
2548 }
2549
2550 return ram_block_discard_range(rb, start, length);
2551 }
2552
2553 /*
2554 * For every allocation, we try not to crash the VM if the
2555 * allocation fails.
2556 */
2557 static int xbzrle_init(void)
2558 {
2559 Error *local_err = NULL;
2560
2561 if (!migrate_use_xbzrle()) {
2562 return 0;
2563 }
2564
2565 XBZRLE_cache_lock();
2566
2567 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2568 if (!XBZRLE.zero_target_page) {
2569 error_report("%s: Error allocating zero page", __func__);
2570 goto err_out;
2571 }
2572
2573 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2574 TARGET_PAGE_SIZE, &local_err);
2575 if (!XBZRLE.cache) {
2576 error_report_err(local_err);
2577 goto free_zero_page;
2578 }
2579
2580 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2581 if (!XBZRLE.encoded_buf) {
2582 error_report("%s: Error allocating encoded_buf", __func__);
2583 goto free_cache;
2584 }
2585
2586 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2587 if (!XBZRLE.current_buf) {
2588 error_report("%s: Error allocating current_buf", __func__);
2589 goto free_encoded_buf;
2590 }
2591
2592 /* We are all good */
2593 XBZRLE_cache_unlock();
2594 return 0;
2595
2596 free_encoded_buf:
2597 g_free(XBZRLE.encoded_buf);
2598 XBZRLE.encoded_buf = NULL;
2599 free_cache:
2600 cache_fini(XBZRLE.cache);
2601 XBZRLE.cache = NULL;
2602 free_zero_page:
2603 g_free(XBZRLE.zero_target_page);
2604 XBZRLE.zero_target_page = NULL;
2605 err_out:
2606 XBZRLE_cache_unlock();
2607 return -ENOMEM;
2608 }
2609
2610 static int ram_state_init(RAMState **rsp)
2611 {
2612 *rsp = g_try_new0(RAMState, 1);
2613
2614 if (!*rsp) {
2615 error_report("%s: Init ramstate fail", __func__);
2616 return -1;
2617 }
2618
2619 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2620 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2621 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2622
2623 /*
2624 * Count the total number of pages used by ram blocks not including any
2625 * gaps due to alignment or unplugs.
2626 * This must match the initial values of the dirty bitmap.
2627 */
2628 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2629 ram_state_reset(*rsp);
2630
2631 return 0;
2632 }
2633
2634 static void ram_list_init_bitmaps(void)
2635 {
2636 MigrationState *ms = migrate_get_current();
2637 RAMBlock *block;
2638 unsigned long pages;
2639 uint8_t shift;
2640
2641 /* Skip setting bitmap if there is no RAM */
2642 if (ram_bytes_total()) {
2643 shift = ms->clear_bitmap_shift;
2644 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2645 error_report("clear_bitmap_shift (%u) too big, using "
2646 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2647 shift = CLEAR_BITMAP_SHIFT_MAX;
2648 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2649 error_report("clear_bitmap_shift (%u) too small, using "
2650 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2651 shift = CLEAR_BITMAP_SHIFT_MIN;
2652 }
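    /*
     * For example, with a shift of 18 and 4KiB target pages, each
     * clear_bmap bit covers 2^18 target pages, i.e. 1GiB of guest
     * memory per deferred bitmap-clear chunk.
     */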
2653
2654 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2655 pages = block->max_length >> TARGET_PAGE_BITS;
2656 /*
2657 * The initial dirty bitmap for migration must be set with all
2658 * ones to make sure we'll migrate every guest RAM page to the
2659 * destination.
2660 * Here we set RAMBlock.bmap all to 1 because when restarting a
2661 * new migration after a failed one, ram_list.
2662 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2663 * guest memory.
2664 */
2665 block->bmap = bitmap_new(pages);
2666 bitmap_set(block->bmap, 0, pages);
2667 block->clear_bmap_shift = shift;
2668 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2669 }
2670 }
2671 }
2672
2673 static void ram_init_bitmaps(RAMState *rs)
2674 {
2675 /* For memory_global_dirty_log_start below. */
2676 qemu_mutex_lock_iothread();
2677 qemu_mutex_lock_ramlist();
2678
2679 WITH_RCU_READ_LOCK_GUARD() {
2680 ram_list_init_bitmaps();
2681 /* We don't use dirty log with background snapshots */
2682 if (!migrate_background_snapshot()) {
2683 memory_global_dirty_log_start();
2684 migration_bitmap_sync_precopy(rs);
2685 }
2686 }
2687 qemu_mutex_unlock_ramlist();
2688 qemu_mutex_unlock_iothread();
2689 }
2690
2691 static int ram_init_all(RAMState **rsp)
2692 {
2693 if (ram_state_init(rsp)) {
2694 return -1;
2695 }
2696
2697 if (xbzrle_init()) {
2698 ram_state_cleanup(rsp);
2699 return -1;
2700 }
2701
2702 ram_init_bitmaps(*rsp);
2703
2704 return 0;
2705 }
2706
2707 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2708 {
2709 RAMBlock *block;
2710 uint64_t pages = 0;
2711
2712 /*
2713 * Postcopy is not using xbzrle/compression, so no need for that.
2714 * Also, since the source is already halted, we don't need to care
2715 * about dirty page logging either.
2716 */
2717
2718 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2719 pages += bitmap_count_one(block->bmap,
2720 block->used_length >> TARGET_PAGE_BITS);
2721 }
2722
2723 /* This may not be aligned with current bitmaps. Recalculate. */
2724 rs->migration_dirty_pages = pages;
2725
2726 ram_state_reset(rs);
2727
2728 /* Update RAMState cache of output QEMUFile */
2729 rs->f = out;
2730
2731 trace_ram_state_resume_prepare(pages);
2732 }
2733
2734 /*
2735 * This function clears bits of the free pages reported by the caller from the
2736 * migration dirty bitmap. @addr is the host address corresponding to the
2737 * start of the contiguous guest free pages, and @len is the total bytes of
2738 * those pages.
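 *
 * The typical caller is the virtio-balloon free page hinting path, which
 * reports ranges of guest memory the guest considers free so they can be
 * skipped during the bulk copy.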
2739 */
2740 void qemu_guest_free_page_hint(void *addr, size_t len)
2741 {
2742 RAMBlock *block;
2743 ram_addr_t offset;
2744 size_t used_len, start, npages;
2745 MigrationState *s = migrate_get_current();
2746
2747 /* This function is currently expected to be used during live migration */
2748 if (!migration_is_setup_or_active(s->state)) {
2749 return;
2750 }
2751
2752 for (; len > 0; len -= used_len, addr += used_len) {
2753 block = qemu_ram_block_from_host(addr, false, &offset);
2754 if (unlikely(!block || offset >= block->used_length)) {
2755 /*
2756 * The implementation might not support RAMBlock resize during
2757 * live migration, but it could happen in theory with future
2758 * updates. So we add a check here to capture that case.
2759 */
2760 error_report_once("%s unexpected error", __func__);
2761 return;
2762 }
2763
2764 if (len <= block->used_length - offset) {
2765 used_len = len;
2766 } else {
2767 used_len = block->used_length - offset;
2768 }
2769
2770 start = offset >> TARGET_PAGE_BITS;
2771 npages = used_len >> TARGET_PAGE_BITS;
2772
2773 qemu_mutex_lock(&ram_state->bitmap_mutex);
2774 /*
2775 * The skipped free pages are equivalent to having been sent from clear_bmap's
2776 * perspective, so clear the bits from the memory region bitmap which
2777 * are initially set. Otherwise those skipped pages will be sent in
2778 * the next round after syncing from the memory region bitmap.
2779 */
2780 migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
2781 start, npages);
2782 ram_state->migration_dirty_pages -=
2783 bitmap_count_one_with_offset(block->bmap, start, npages);
2784 bitmap_clear(block->bmap, start, npages);
2785 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2786 }
2787 }
2788
2789 /*
2790 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2791 * a long-running RCU critical section. When rcu-reclaims in the code
2792 * start to become numerous it will be necessary to reduce the
2793 * granularity of these critical sections.
2794 */
2795
2796 /**
2797 * ram_save_setup: Setup RAM for migration
2798 *
2799 * Returns zero to indicate success and negative for error
2800 *
2801 * @f: QEMUFile where to send the data
2802 * @opaque: RAMState pointer
2803 */
2804 static int ram_save_setup(QEMUFile *f, void *opaque)
2805 {
2806 RAMState **rsp = opaque;
2807 RAMBlock *block;
2808
2809 if (compress_threads_save_setup()) {
2810 return -1;
2811 }
2812
2813 /* migration has already set up the bitmap, reuse it. */
2814 if (!migration_in_colo_state()) {
2815 if (ram_init_all(rsp) != 0) {
2816 compress_threads_save_cleanup();
2817 return -1;
2818 }
2819 }
2820 (*rsp)->f = f;
2821
2822 WITH_RCU_READ_LOCK_GUARD() {
2823 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2824
2825 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2826 qemu_put_byte(f, strlen(block->idstr));
2827 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2828 qemu_put_be64(f, block->used_length);
2829 if (migrate_postcopy_ram() && block->page_size !=
2830 qemu_host_page_size) {
2831 qemu_put_be64(f, block->page_size);
2832 }
2833 if (migrate_ignore_shared()) {
2834 qemu_put_be64(f, block->mr->addr);
2835 }
2836 }
2837 }
2838
2839 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2840 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2841
2842 multifd_send_sync_main(f);
2843 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2844 qemu_fflush(f);
2845
2846 return 0;
2847 }
2848
2849 /**
2850 * ram_save_iterate: iterative stage for migration
2851 *
2852 * Returns zero to indicate success and negative for error
2853 *
2854 * @f: QEMUFile where to send the data
2855 * @opaque: RAMState pointer
2856 */
2857 static int ram_save_iterate(QEMUFile *f, void *opaque)
2858 {
2859 RAMState **temp = opaque;
2860 RAMState *rs = *temp;
2861 int ret = 0;
2862 int i;
2863 int64_t t0;
2864 int done = 0;
2865
2866 if (blk_mig_bulk_active()) {
2867 /* Avoid transferring ram during bulk phase of block migration as
2868 * the bulk phase will usually take a long time and transferring
2869 * ram updates during that time is pointless. */
2870 goto out;
2871 }
2872
2873 /*
2874 * We'll hold this lock for a while, but that's okay for two reasons.
2875 * Firstly, the only other thread that may take it is whoever calls
2876 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2877 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2878 * guarantees that we'll release it on a regular basis.
2879 */
2880 qemu_mutex_lock(&rs->bitmap_mutex);
2881 WITH_RCU_READ_LOCK_GUARD() {
2882 if (ram_list.version != rs->last_version) {
2883 ram_state_reset(rs);
2884 }
2885
2886 /* Read version before ram_list.blocks */
2887 smp_rmb();
2888
2889 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2890
2891 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2892 i = 0;
2893 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2894 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2895 int pages;
2896
2897 if (qemu_file_get_error(f)) {
2898 break;
2899 }
2900
2901 pages = ram_find_and_save_block(rs, false);
2902 /* no more pages to send */
2903 if (pages == 0) {
2904 done = 1;
2905 break;
2906 }
2907
2908 if (pages < 0) {
2909 qemu_file_set_error(f, pages);
2910 break;
2911 }
2912
2913 rs->target_page_count += pages;
2914
2915 /*
2916 * During postcopy, it is necessary to make sure one whole host
2917 * page is sent in one chunk.
2918 */
2919 if (migrate_postcopy_ram()) {
2920 flush_compressed_data(rs);
2921 }
2922
2923 /*
2924 * We want to check in the 1st loop, just in case it was the 1st
2925 * time and we had to sync the dirty bitmap.
2926 * qemu_clock_get_ns() is a bit expensive, so we only check every
2927 * few iterations.
2928 */
2929 if ((i & 63) == 0) {
2930 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2931 1000000;
2932 if (t1 > MAX_WAIT) {
2933 trace_ram_save_iterate_big_wait(t1, i);
2934 break;
2935 }
2936 }
2937 i++;
2938 }
2939 }
2940 qemu_mutex_unlock(&rs->bitmap_mutex);
2941
2942 /*
2943 * Must occur before EOS (or any QEMUFile operation)
2944 * because of RDMA protocol.
2945 */
2946 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2947
2948 out:
2949 if (ret >= 0
2950 && migration_is_setup_or_active(migrate_get_current()->state)) {
2951 multifd_send_sync_main(rs->f);
2952 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2953 qemu_fflush(f);
2954 ram_counters.transferred += 8;
2955
2956 ret = qemu_file_get_error(f);
2957 }
2958 if (ret < 0) {
2959 return ret;
2960 }
2961
2962 return done;
2963 }
2964
2965 /**
2966 * ram_save_complete: function called to send the remaining amount of ram
2967 *
2968 * Returns zero to indicate success or negative on error
2969 *
2970 * Called with iothread lock
2971 *
2972 * @f: QEMUFile where to send the data
2973 * @opaque: RAMState pointer
2974 */
2975 static int ram_save_complete(QEMUFile *f, void *opaque)
2976 {
2977 RAMState **temp = opaque;
2978 RAMState *rs = *temp;
2979 int ret = 0;
2980
2981 WITH_RCU_READ_LOCK_GUARD() {
2982 if (!migration_in_postcopy()) {
2983 migration_bitmap_sync_precopy(rs);
2984 }
2985
2986 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2987
2988 /* try transferring iterative blocks of memory */
2989
2990 /* flush all remaining blocks regardless of rate limiting */
2991 while (true) {
2992 int pages;
2993
2994 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2995 /* no more blocks to send */
2996 if (pages == 0) {
2997 break;
2998 }
2999 if (pages < 0) {
3000 ret = pages;
3001 break;
3002 }
3003 }
3004
3005 flush_compressed_data(rs);
3006 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3007 }
3008
3009 if (ret >= 0) {
3010 multifd_send_sync_main(rs->f);
3011 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3012 qemu_fflush(f);
3013 }
3014
3015 return ret;
3016 }
3017
3018 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3019 uint64_t *res_precopy_only,
3020 uint64_t *res_compatible,
3021 uint64_t *res_postcopy_only)
3022 {
3023 RAMState **temp = opaque;
3024 RAMState *rs = *temp;
3025 uint64_t remaining_size;
3026
3027 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3028
3029 if (!migration_in_postcopy() &&
3030 remaining_size < max_size) {
3031 qemu_mutex_lock_iothread();
3032 WITH_RCU_READ_LOCK_GUARD() {
3033 migration_bitmap_sync_precopy(rs);
3034 }
3035 qemu_mutex_unlock_iothread();
3036 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3037 }
3038
3039 if (migrate_postcopy_ram()) {
3040 /* We can do postcopy, and all the data is postcopiable */
3041 *res_compatible += remaining_size;
3042 } else {
3043 *res_precopy_only += remaining_size;
3044 }
3045 }
3046
3047 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3048 {
3049 unsigned int xh_len;
3050 int xh_flags;
3051 uint8_t *loaded_data;
3052
3053 /* extract RLE header */
3054 xh_flags = qemu_get_byte(f);
3055 xh_len = qemu_get_be16(f);
3056
3057 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3058 error_report("Failed to load XBZRLE page - wrong compression!");
3059 return -1;
3060 }
3061
3062 if (xh_len > TARGET_PAGE_SIZE) {
3063 error_report("Failed to load XBZRLE page - len overflow!");
3064 return -1;
3065 }
3066 loaded_data = XBZRLE.decoded_buf;
3067 /* load data and decode */
3068 /* it can change loaded_data to point to an internal buffer */
3069 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3070
3071 /* decode RLE */
3072 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3073 TARGET_PAGE_SIZE) == -1) {
3074 error_report("Failed to load XBZRLE page - decode error!");
3075 return -1;
3076 }
3077
3078 return 0;
3079 }
3080
3081 /**
3082 * ram_block_from_stream: read a RAMBlock id from the migration stream
3083 *
3084 * Must be called from within a rcu critical section.
3085 *
3086 * Returns a pointer from within the RCU-protected ram_list.
3087 *
3088 * @f: QEMUFile where to read the data from
3089 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3090 */
3091 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3092 {
3093 static RAMBlock *block;
3094 char id[256];
3095 uint8_t len;
3096
3097 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3098 if (!block) {
3099 error_report("Ack, bad migration stream!");
3100 return NULL;
3101 }
3102 return block;
3103 }
3104
3105 len = qemu_get_byte(f);
3106 qemu_get_buffer(f, (uint8_t *)id, len);
3107 id[len] = 0;
3108
3109 block = qemu_ram_block_by_name(id);
3110 if (!block) {
3111 error_report("Can't find block %s", id);
3112 return NULL;
3113 }
3114
3115 if (ramblock_is_ignored(block)) {
3116 error_report("block %s should not be migrated !", id);
3117 return NULL;
3118 }
3119
3120 return block;
3121 }
3122
3123 static inline void *host_from_ram_block_offset(RAMBlock *block,
3124 ram_addr_t offset)
3125 {
3126 if (!offset_in_ramblock(block, offset)) {
3127 return NULL;
3128 }
3129
3130 return block->host + offset;
3131 }
3132
3133 static void *host_page_from_ram_block_offset(RAMBlock *block,
3134 ram_addr_t offset)
3135 {
3136 /* Note: Explicitly no check against offset_in_ramblock(). */
3137 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3138 block->page_size);
3139 }
3140
3141 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3142 ram_addr_t offset)
3143 {
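    /*
     * block->page_size is a power of two, so masking with (page_size - 1)
     * keeps only the byte offset within the host page; e.g. with 2MiB
     * host pages only the low 21 bits of (block->host + offset) remain.
     */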
3144 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3145 }
3146
3147 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3148 ram_addr_t offset, bool record_bitmap)
3149 {
3150 if (!offset_in_ramblock(block, offset)) {
3151 return NULL;
3152 }
3153 if (!block->colo_cache) {
3154 error_report("%s: colo_cache is NULL in block :%s",
3155 __func__, block->idstr);
3156 return NULL;
3157 }
3158
3159 /*
3160 * During a colo checkpoint, we need a bitmap of these migrated pages.
3161 * It helps us decide which pages in the ram cache should be flushed
3162 * into the VM's RAM later.
3163 */
3164 if (record_bitmap &&
3165 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3166 ram_state->migration_dirty_pages++;
3167 }
3168 return block->colo_cache + offset;
3169 }
3170
3171 /**
3172 * ram_handle_compressed: handle the zero page case
3173 *
3174 * If a page (or a whole RDMA chunk) has been
3175 * determined to be zero, then zap it.
3176 *
3177 * @host: host address for the zero page
3178 * @ch: what the page is filled with; we only support zero
3179 * @size: size of the zero page
3180 */
3181 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3182 {
3183 if (ch != 0 || !is_zero_range(host, size)) {
3184 memset(host, ch, size);
3185 }
3186 }
3187
3188 /* return the size after decompression, or negative value on error */
3189 static int
3190 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3191 const uint8_t *source, size_t source_len)
3192 {
3193 int err;
3194
3195 err = inflateReset(stream);
3196 if (err != Z_OK) {
3197 return -1;
3198 }
3199
3200 stream->avail_in = source_len;
3201 stream->next_in = (uint8_t *)source;
3202 stream->avail_out = dest_len;
3203 stream->next_out = dest;
3204
3205 err = inflate(stream, Z_NO_FLUSH);
3206 if (err != Z_STREAM_END) {
3207 return -1;
3208 }
3209
3210 return stream->total_out;
3211 }
3212
3213 static void *do_data_decompress(void *opaque)
3214 {
3215 DecompressParam *param = opaque;
3216 unsigned long pagesize;
3217 uint8_t *des;
3218 int len, ret;
3219
3220 qemu_mutex_lock(&param->mutex);
3221 while (!param->quit) {
3222 if (param->des) {
3223 des = param->des;
3224 len = param->len;
3225 param->des = 0;
3226 qemu_mutex_unlock(&param->mutex);
3227
3228 pagesize = TARGET_PAGE_SIZE;
3229
3230 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3231 param->compbuf, len);
3232 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3233 error_report("decompress data failed");
3234 qemu_file_set_error(decomp_file, ret);
3235 }
3236
3237 qemu_mutex_lock(&decomp_done_lock);
3238 param->done = true;
3239 qemu_cond_signal(&decomp_done_cond);
3240 qemu_mutex_unlock(&decomp_done_lock);
3241
3242 qemu_mutex_lock(&param->mutex);
3243 } else {
3244 qemu_cond_wait(&param->cond, &param->mutex);
3245 }
3246 }
3247 qemu_mutex_unlock(&param->mutex);
3248
3249 return NULL;
3250 }
3251
3252 static int wait_for_decompress_done(void)
3253 {
3254 int idx, thread_count;
3255
3256 if (!migrate_use_compression()) {
3257 return 0;
3258 }
3259
3260 thread_count = migrate_decompress_threads();
3261 qemu_mutex_lock(&decomp_done_lock);
3262 for (idx = 0; idx < thread_count; idx++) {
3263 while (!decomp_param[idx].done) {
3264 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3265 }
3266 }
3267 qemu_mutex_unlock(&decomp_done_lock);
3268 return qemu_file_get_error(decomp_file);
3269 }
3270
3271 static void compress_threads_load_cleanup(void)
3272 {
3273 int i, thread_count;
3274
3275 if (!migrate_use_compression()) {
3276 return;
3277 }
3278 thread_count = migrate_decompress_threads();
3279 for (i = 0; i < thread_count; i++) {
3280 /*
3281 * we use it as an indicator of whether the thread has been
3282 * properly initialized or not
3283 */
3284 if (!decomp_param[i].compbuf) {
3285 break;
3286 }
3287
3288 qemu_mutex_lock(&decomp_param[i].mutex);
3289 decomp_param[i].quit = true;
3290 qemu_cond_signal(&decomp_param[i].cond);
3291 qemu_mutex_unlock(&decomp_param[i].mutex);
3292 }
3293 for (i = 0; i < thread_count; i++) {
3294 if (!decomp_param[i].compbuf) {
3295 break;
3296 }
3297
3298 qemu_thread_join(decompress_threads + i);
3299 qemu_mutex_destroy(&decomp_param[i].mutex);
3300 qemu_cond_destroy(&decomp_param[i].cond);
3301 inflateEnd(&decomp_param[i].stream);
3302 g_free(decomp_param[i].compbuf);
3303 decomp_param[i].compbuf = NULL;
3304 }
3305 g_free(decompress_threads);
3306 g_free(decomp_param);
3307 decompress_threads = NULL;
3308 decomp_param = NULL;
3309 decomp_file = NULL;
3310 }
3311
3312 static int compress_threads_load_setup(QEMUFile *f)
3313 {
3314 int i, thread_count;
3315
3316 if (!migrate_use_compression()) {
3317 return 0;
3318 }
3319
3320 thread_count = migrate_decompress_threads();
3321 decompress_threads = g_new0(QemuThread, thread_count);
3322 decomp_param = g_new0(DecompressParam, thread_count);
3323 qemu_mutex_init(&decomp_done_lock);
3324 qemu_cond_init(&decomp_done_cond);
3325 decomp_file = f;
3326 for (i = 0; i < thread_count; i++) {
3327 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3328 goto exit;
3329 }
3330
3331 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3332 qemu_mutex_init(&decomp_param[i].mutex);
3333 qemu_cond_init(&decomp_param[i].cond);
3334 decomp_param[i].done = true;
3335 decomp_param[i].quit = false;
3336 qemu_thread_create(decompress_threads + i, "decompress",
3337 do_data_decompress, decomp_param + i,
3338 QEMU_THREAD_JOINABLE);
3339 }
3340 return 0;
3341 exit:
3342 compress_threads_load_cleanup();
3343 return -1;
3344 }
3345
3346 static void decompress_data_with_multi_threads(QEMUFile *f,
3347 void *host, int len)
3348 {
3349 int idx, thread_count;
3350
3351 thread_count = migrate_decompress_threads();
3352 QEMU_LOCK_GUARD(&decomp_done_lock);
3353 while (true) {
3354 for (idx = 0; idx < thread_count; idx++) {
3355 if (decomp_param[idx].done) {
3356 decomp_param[idx].done = false;
3357 qemu_mutex_lock(&decomp_param[idx].mutex);
3358 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3359 decomp_param[idx].des = host;
3360 decomp_param[idx].len = len;
3361 qemu_cond_signal(&decomp_param[idx].cond);
3362 qemu_mutex_unlock(&decomp_param[idx].mutex);
3363 break;
3364 }
3365 }
3366 if (idx < thread_count) {
3367 break;
3368 } else {
3369 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3370 }
3371 }
3372 }
3373
3374 static void colo_init_ram_state(void)
3375 {
3376 ram_state_init(&ram_state);
3377 }
3378
3379 /*
3380 * colo cache: this is for the secondary VM; we cache the whole
3381 * memory of the secondary VM. The global lock needs to be held
3382 * to call this helper.
3383 */
3384 int colo_init_ram_cache(void)
3385 {
3386 RAMBlock *block;
3387
3388 WITH_RCU_READ_LOCK_GUARD() {
3389 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3390 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3391 NULL, false, false);
3392 if (!block->colo_cache) {
3393 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3394 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3395 block->used_length);
3396 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3397 if (block->colo_cache) {
3398 qemu_anon_ram_free(block->colo_cache, block->used_length);
3399 block->colo_cache = NULL;
3400 }
3401 }
3402 return -errno;
3403 }
3404 }
3405 }
3406
3407 /*
3408 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3409 * to decide which pages in the cache should be flushed into the SVM's RAM.
3410 * Here we use the same name 'ram_bitmap' as for migration.
3411 */
3412 if (ram_bytes_total()) {
3413 RAMBlock *block;
3414
3415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3417 block->bmap = bitmap_new(pages);
3418 }
3419 }
3420
3421 colo_init_ram_state();
3422 return 0;
3423 }
3424
3425 /* TODO: duplicated with ram_init_bitmaps */
3426 void colo_incoming_start_dirty_log(void)
3427 {
3428 RAMBlock *block = NULL;
3429 /* For memory_global_dirty_log_start below. */
3430 qemu_mutex_lock_iothread();
3431 qemu_mutex_lock_ramlist();
3432
3433 memory_global_dirty_log_sync();
3434 WITH_RCU_READ_LOCK_GUARD() {
3435 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3436 ramblock_sync_dirty_bitmap(ram_state, block);
3437 /* Discard this dirty bitmap record */
3438 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3439 }
3440 memory_global_dirty_log_start();
3441 }
3442 ram_state->migration_dirty_pages = 0;
3443 qemu_mutex_unlock_ramlist();
3444 qemu_mutex_unlock_iothread();
3445 }
3446
3447 /* The global lock needs to be held to call this helper */
3448 void colo_release_ram_cache(void)
3449 {
3450 RAMBlock *block;
3451
3452 memory_global_dirty_log_stop();
3453 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3454 g_free(block->bmap);
3455 block->bmap = NULL;
3456 }
3457
3458 WITH_RCU_READ_LOCK_GUARD() {
3459 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3460 if (block->colo_cache) {
3461 qemu_anon_ram_free(block->colo_cache, block->used_length);
3462 block->colo_cache = NULL;
3463 }
3464 }
3465 }
3466 ram_state_cleanup(&ram_state);
3467 }
3468
3469 /**
3470 * ram_load_setup: Setup RAM for migration incoming side
3471 *
3472 * Returns zero to indicate success and negative for error
3473 *
3474 * @f: QEMUFile where to receive the data
3475 * @opaque: RAMState pointer
3476 */
3477 static int ram_load_setup(QEMUFile *f, void *opaque)
3478 {
3479 if (compress_threads_load_setup(f)) {
3480 return -1;
3481 }
3482
3483 xbzrle_load_setup();
3484 ramblock_recv_map_init();
3485
3486 return 0;
3487 }
3488
3489 static int ram_load_cleanup(void *opaque)
3490 {
3491 RAMBlock *rb;
3492
3493 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3494 qemu_ram_block_writeback(rb);
3495 }
3496
3497 xbzrle_load_cleanup();
3498 compress_threads_load_cleanup();
3499
3500 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3501 g_free(rb->receivedmap);
3502 rb->receivedmap = NULL;
3503 }
3504
3505 return 0;
3506 }
3507
3508 /**
3509 * ram_postcopy_incoming_init: allocate postcopy data structures
3510 *
3511 * Returns 0 for success and negative if there was one error
3512 *
3513 * @mis: current migration incoming state
3514 *
3515 * Allocate data structures etc needed by incoming migration with
3516 * postcopy-ram. postcopy-ram's similarly named
3517 * postcopy_ram_incoming_init does the work.
3518 */
3519 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3520 {
3521 return postcopy_ram_incoming_init(mis);
3522 }
3523
3524 /**
3525 * ram_load_postcopy: load a page in postcopy case
3526 *
3527 * Returns 0 for success or -errno in case of error
3528 *
3529 * Called in postcopy mode by ram_load().
3530 * rcu_read_lock is taken prior to this being called.
3531 *
3532 * @f: QEMUFile where to send the data
3533 */
3534 static int ram_load_postcopy(QEMUFile *f)
3535 {
3536 int flags = 0, ret = 0;
3537 bool place_needed = false;
3538 bool matches_target_page_size = false;
3539 MigrationIncomingState *mis = migration_incoming_get_current();
3540 /* Temporary page that is later 'placed' */
3541 void *postcopy_host_page = mis->postcopy_tmp_page;
3542 void *host_page = NULL;
3543 bool all_zero = true;
3544 int target_pages = 0;
3545
3546 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3547 ram_addr_t addr;
3548 void *page_buffer = NULL;
3549 void *place_source = NULL;
3550 RAMBlock *block = NULL;
3551 uint8_t ch;
3552 int len;
3553
3554 addr = qemu_get_be64(f);
3555
3556 /*
3557 * If there is a qemu file error, we should stop here; "addr"
3558 * may be invalid
3559 */
3560 ret = qemu_file_get_error(f);
3561 if (ret) {
3562 break;
3563 }
3564
3565 flags = addr & ~TARGET_PAGE_MASK;
3566 addr &= TARGET_PAGE_MASK;
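        /*
         * The 64-bit value on the wire carries the page-aligned address in
         * its upper bits and the RAM_SAVE_FLAG_* bits below the target page
         * size; e.g. with 4KiB target pages the low 12 bits hold the flags.
         */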
3567
3568 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3569 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3570 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3571 block = ram_block_from_stream(f, flags);
3572 if (!block) {
3573 ret = -EINVAL;
3574 break;
3575 }
3576
3577 /*
3578 * Relying on used_length is racy and can result in false positives.
3579 * We might place pages beyond used_length in case RAM was shrunk
3580 * while in postcopy, which is fine - trying to place via
3581 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3582 */
3583 if (!block->host || addr >= block->postcopy_length) {
3584 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3585 ret = -EINVAL;
3586 break;
3587 }
3588 target_pages++;
3589 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3590 /*
3591 * Postcopy requires that we place whole host pages atomically;
3592 * these may be huge pages for RAMBlocks that are backed by
3593 * hugetlbfs.
3594 * To make it atomic, the data is read into a temporary page
3595 * that's moved into place later.
3596 * The migration protocol uses, possibly smaller, target pages;
3597 * however, the source ensures it always sends all the components
3598 * of a host page in one chunk.
3599 */
3600 page_buffer = postcopy_host_page +
3601 host_page_offset_from_ram_block_offset(block, addr);
3602 /* If all TP are zero then we can optimise the place */
3603 if (target_pages == 1) {
3604 host_page = host_page_from_ram_block_offset(block, addr);
3605 } else if (host_page != host_page_from_ram_block_offset(block,
3606 addr)) {
3607 /* not the 1st TP within the HP */
3608 error_report("Non-same host page %p/%p", host_page,
3609 host_page_from_ram_block_offset(block, addr));
3610 ret = -EINVAL;
3611 break;
3612 }
3613
3614 /*
3615 * If it's the last part of a host page then we place the host
3616 * page
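 * (e.g. for a 2MiB hugetlbfs-backed block with 4KiB target pages, that
 * is the 512th target page received for this host page)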
3617 */
3618 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3619 place_needed = true;
3620 }
3621 place_source = postcopy_host_page;
3622 }
3623
3624 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3625 case RAM_SAVE_FLAG_ZERO:
3626 ch = qemu_get_byte(f);
3627 /*
3628 * We can skip setting page_buffer when
3629 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3630 */
3631 if (ch || !matches_target_page_size) {
3632 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3633 }
3634 if (ch) {
3635 all_zero = false;
3636 }
3637 break;
3638
3639 case RAM_SAVE_FLAG_PAGE:
3640 all_zero = false;
3641 if (!matches_target_page_size) {
3642 /* For huge pages, we always use temporary buffer */
3643 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3644 } else {
3645 /*
3646 * For small pages that matches target page size, we
3647 * avoid the qemu_file copy. Instead we directly use
3648 * the buffer of QEMUFile to place the page. Note: we
3649 * cannot do any QEMUFile operation before using that
3650 * buffer to make sure the buffer is valid when
3651 * placing the page.
3652 */
3653 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3654 TARGET_PAGE_SIZE);
3655 }
3656 break;
3657 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3658 all_zero = false;
3659 len = qemu_get_be32(f);
3660 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3661 error_report("Invalid compressed data length: %d", len);
3662 ret = -EINVAL;
3663 break;
3664 }
3665 decompress_data_with_multi_threads(f, page_buffer, len);
3666 break;
3667
3668 case RAM_SAVE_FLAG_EOS:
3669 /* normal exit */
3670 multifd_recv_sync_main();
3671 break;
3672 default:
3673 error_report("Unknown combination of migration flags: 0x%x"
3674 " (postcopy mode)", flags);
3675 ret = -EINVAL;
3676 break;
3677 }
3678
3679 /* Got the whole host page, wait for decompress before placing. */
3680 if (place_needed) {
3681 ret |= wait_for_decompress_done();
3682 }
3683
3684 /* Detect for any possible file errors */
3685 if (!ret && qemu_file_get_error(f)) {
3686 ret = qemu_file_get_error(f);
3687 }
3688
3689 if (!ret && place_needed) {
3690 if (all_zero) {
3691 ret = postcopy_place_page_zero(mis, host_page, block);
3692 } else {
3693 ret = postcopy_place_page(mis, host_page, place_source,
3694 block);
3695 }
3696 place_needed = false;
3697 target_pages = 0;
3698 /* Assume we have a zero page until we detect something different */
3699 all_zero = true;
3700 }
3701 }
3702
3703 return ret;
3704 }
3705
3706 static bool postcopy_is_advised(void)
3707 {
3708 PostcopyState ps = postcopy_state_get();
3709 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3710 }
3711
3712 static bool postcopy_is_running(void)
3713 {
3714 PostcopyState ps = postcopy_state_get();
3715 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3716 }
3717
3718 /*
3719 * Flush content of RAM cache into SVM's memory.
3720 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3721 */
3722 void colo_flush_ram_cache(void)
3723 {
3724 RAMBlock *block = NULL;
3725 void *dst_host;
3726 void *src_host;
3727 unsigned long offset = 0;
3728
3729 memory_global_dirty_log_sync();
3730 qemu_mutex_lock(&ram_state->bitmap_mutex);
3731 WITH_RCU_READ_LOCK_GUARD() {
3732 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3733 ramblock_sync_dirty_bitmap(ram_state, block);
3734 }
3735 }
3736
3737 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3738 WITH_RCU_READ_LOCK_GUARD() {
3739 block = QLIST_FIRST_RCU(&ram_list.blocks);
3740
3741 while (block) {
3742 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3743
3744 if (!offset_in_ramblock(block,
3745 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3746 offset = 0;
3747 block = QLIST_NEXT_RCU(block, next);
3748 } else {
3749 migration_bitmap_clear_dirty(ram_state, block, offset);
3750 dst_host = block->host
3751 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3752 src_host = block->colo_cache
3753 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3754 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3755 }
3756 }
3757 }
3758 trace_colo_flush_ram_cache_end();
3759 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3760 }
3761
3762 /**
3763 * ram_load_precopy: load pages in precopy case
3764 *
3765 * Returns 0 for success or -errno in case of error
3766 *
3767 * Called in precopy mode by ram_load().
3768 * rcu_read_lock is taken prior to this being called.
3769 *
3770 * @f: QEMUFile where to send the data
3771 */
3772 static int ram_load_precopy(QEMUFile *f)
3773 {
3774 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3775 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3776 bool postcopy_advised = postcopy_is_advised();
3777 if (!migrate_use_compression()) {
3778 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3779 }
3780
3781 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3782 ram_addr_t addr, total_ram_bytes;
3783 void *host = NULL, *host_bak = NULL;
3784 uint8_t ch;
3785
3786 /*
3787 * Yield periodically to let main loop run, but an iteration of
3788 * the main loop is expensive, so only do it every so many iterations
3789 */
3790 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3791 aio_co_schedule(qemu_get_current_aio_context(),
3792 qemu_coroutine_self());
3793 qemu_coroutine_yield();
3794 }
3795 i++;
3796
3797 addr = qemu_get_be64(f);
3798 flags = addr & ~TARGET_PAGE_MASK;
3799 addr &= TARGET_PAGE_MASK;
3800
3801 if (flags & invalid_flags) {
3802 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3803 error_report("Received an unexpected compressed page");
3804 }
3805
3806 ret = -EINVAL;
3807 break;
3808 }
3809
3810 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3811 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3812 RAMBlock *block = ram_block_from_stream(f, flags);
3813
3814 host = host_from_ram_block_offset(block, addr);
3815 /*
3816 * After going into COLO stage, we should not load the page
3817 * into SVM's memory directly; we put it into colo_cache first.
3818 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3819 * Previously, we copied all this memory in the COLO preparation
3820 * stage, where the VM had to be stopped, a time-consuming process.
3821 * Here we optimize it with a trick: back up every page during the
3822 * migration process while COLO is enabled. Although this affects
3823 * the speed of the migration, it clearly reduces the downtime of
3824 * backing up all of the SVM's memory in the COLO preparation stage.
3825 */
3826 if (migration_incoming_colo_enabled()) {
3827 if (migration_incoming_in_colo_state()) {
3828 /* In COLO stage, put all pages into cache temporarily */
3829 host = colo_cache_from_block_offset(block, addr, true);
3830 } else {
3831 /*
3832 * In the migration stage but before the COLO stage,
3833 * put all pages into both the cache and SVM's memory.
3834 */
3835 host_bak = colo_cache_from_block_offset(block, addr, false);
3836 }
3837 }
3838 if (!host) {
3839 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3840 ret = -EINVAL;
3841 break;
3842 }
3843 if (!migration_incoming_in_colo_state()) {
3844 ramblock_recv_bitmap_set(block, host);
3845 }
3846
3847 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3848 }
3849
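/*
 * RAM_SAVE_FLAG_CONTINUE only tells us that the page belongs to the
 * same RAMBlock as the previous one (so its name was not resent); it
 * was already consumed by ram_block_from_stream() above, so mask it
 * out before dispatching on the remaining flags.
 */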
3850 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3851 case RAM_SAVE_FLAG_MEM_SIZE:
3852 /* Synchronize RAM block list */
3853 total_ram_bytes = addr;
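/*
 * Each block record in this chunk is: one byte of idstr length, the
 * idstr bytes, a be64 used_length, then optionally a be64 page size
 * and a be64 GPA, exactly mirroring what the code below reads.
 */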
3854 while (!ret && total_ram_bytes) {
3855 RAMBlock *block;
3856 char id[256];
3857 ram_addr_t length;
3858
3859 len = qemu_get_byte(f);
3860 qemu_get_buffer(f, (uint8_t *)id, len);
3861 id[len] = 0;
3862 length = qemu_get_be64(f);
3863
3864 block = qemu_ram_block_by_name(id);
3865 if (block && !qemu_ram_is_migratable(block)) {
3866 error_report("block %s should not be migrated !", id);
3867 ret = -EINVAL;
3868 } else if (block) {
3869 if (length != block->used_length) {
3870 Error *local_err = NULL;
3871
3872 ret = qemu_ram_resize(block, length,
3873 &local_err);
3874 if (local_err) {
3875 error_report_err(local_err);
3876 }
3877 }
3878 /* For postcopy we need to check hugepage sizes match */
3879 if (postcopy_advised && migrate_postcopy_ram() &&
3880 block->page_size != qemu_host_page_size) {
3881 uint64_t remote_page_size = qemu_get_be64(f);
3882 if (remote_page_size != block->page_size) {
3883 error_report("Mismatched RAM page size %s "
3884 "(local) %zd != %" PRId64,
3885 id, block->page_size,
3886 remote_page_size);
3887 ret = -EINVAL;
3888 }
3889 }
3890 if (migrate_ignore_shared()) {
3891 hwaddr addr = qemu_get_be64(f);
3892 if (ramblock_is_ignored(block) &&
3893 block->mr->addr != addr) {
3894 error_report("Mismatched GPAs for block %s "
3895 "%" PRId64 "!= %" PRId64,
3896 id, (uint64_t)addr,
3897 (uint64_t)block->mr->addr);
3898 ret = -EINVAL;
3899 }
3900 }
3901 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3902 block->idstr);
3903 } else {
3904 error_report("Unknown ramblock \"%s\", cannot "
3905 "accept migration", id);
3906 ret = -EINVAL;
3907 }
3908
3909 total_ram_bytes -= length;
3910 }
3911 break;
3912
3913 case RAM_SAVE_FLAG_ZERO:
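/*
 * A zero-page record carries only a single fill byte;
 * ram_handle_compressed() fills the page with it (and skips the
 * memset entirely when the byte is zero and the page is already
 * clear).
 */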
3914 ch = qemu_get_byte(f);
3915 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3916 break;
3917
3918 case RAM_SAVE_FLAG_PAGE:
3919 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3920 break;
3921
3922 case RAM_SAVE_FLAG_COMPRESS_PAGE:
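/*
 * compressBound(TARGET_PAGE_SIZE) is zlib's worst-case compressed
 * size for a single page, so any larger length in the stream must
 * be bogus.
 */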
3923 len = qemu_get_be32(f);
3924 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3925 error_report("Invalid compressed data length: %d", len);
3926 ret = -EINVAL;
3927 break;
3928 }
3929 decompress_data_with_multi_threads(f, host, len);
3930 break;
3931
3932 case RAM_SAVE_FLAG_XBZRLE:
3933 if (load_xbzrle(f, addr, host) < 0) {
3934 error_report("Failed to decompress XBZRLE page at "
3935 RAM_ADDR_FMT, addr);
3936 ret = -EINVAL;
3937 break;
3938 }
3939 break;
3940 case RAM_SAVE_FLAG_EOS:
3941 /* normal exit */
3942 multifd_recv_sync_main();
3943 break;
3944 default:
3945 if (flags & RAM_SAVE_FLAG_HOOK) {
3946 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3947 } else {
3948 error_report("Unknown combination of migration flags: 0x%x",
3949 flags);
3950 ret = -EINVAL;
3951 }
3952 }
3953 if (!ret) {
3954 ret = qemu_file_get_error(f);
3955 }
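/*
 * When COLO is enabled but not yet in the COLO stage, host_bak points
 * into colo_cache (see above): also back the freshly loaded page up
 * there so the SVM keeps a copy of its RAM.
 */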
3956 if (!ret && host_bak) {
3957 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3958 }
3959 }
3960
3961 ret |= wait_for_decompress_done();
3962 return ret;
3963 }
3964
3965 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3966 {
3967 int ret = 0;
3968 static uint64_t seq_iter;
3969 /*
3970 * If the system is running in postcopy mode, page inserts into host
3971 * memory must be atomic.
3972 */
3973 bool postcopy_running = postcopy_is_running();
3974
3975 seq_iter++;
3976
3977 if (version_id != 4) {
3978 return -EINVAL;
3979 }
3980
3981 /*
3982 * This RCU critical section can be very long running.
3983 * When RCU reclaims in the code start to become numerous,
3984 * it will be necessary to reduce the granularity of this
3985 * critical section.
3986 */
3987 WITH_RCU_READ_LOCK_GUARD() {
3988 if (postcopy_running) {
3989 ret = ram_load_postcopy(f);
3990 } else {
3991 ret = ram_load_precopy(f);
3992 }
3993 }
3994 trace_ram_load_complete(ret, seq_iter);
3995
3996 return ret;
3997 }
3998
3999 static bool ram_has_postcopy(void *opaque)
4000 {
4001 RAMBlock *rb;
4002 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4003 if (ramblock_is_pmem(rb)) {
4004 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4005 "is not supported now!", rb->idstr, rb->host);
4006 return false;
4007 }
4008 }
4009
4010 return migrate_postcopy_ram();
4011 }
4012
4013 /* Sync all the dirty bitmaps with the destination VM. */
4014 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4015 {
4016 RAMBlock *block;
4017 QEMUFile *file = s->to_dst_file;
4018 int ramblock_count = 0;
4019
4020 trace_ram_dirty_bitmap_sync_start();
4021
4022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4023 qemu_savevm_send_recv_bitmap(file, block->idstr);
4024 trace_ram_dirty_bitmap_request(block->idstr);
4025 ramblock_count++;
4026 }
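/*
 * Each request above is answered by the destination with its received
 * bitmap; the return-path thread feeds that into
 * ram_dirty_bitmap_reload(), which posts rp_sem once per ramblock.
 */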
4027
4028 trace_ram_dirty_bitmap_sync_wait();
4029
4030 /* Wait until all the ramblocks' dirty bitmaps have been synced */
4031 while (ramblock_count--) {
4032 qemu_sem_wait(&s->rp_state.rp_sem);
4033 }
4034
4035 trace_ram_dirty_bitmap_sync_complete();
4036
4037 return 0;
4038 }
4039
4040 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4041 {
4042 qemu_sem_post(&s->rp_state.rp_sem);
4043 }
4044
4045 /*
4046 * Read the received bitmap and invert it to form the initial dirty
4047 * bitmap. This is only used when a postcopy migration has been
4048 * paused and wants to resume from a middle point.
4049 */
4050 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4051 {
4052 int ret = -EINVAL;
4053 /* from_dst_file is always valid because we're within rp_thread */
4054 QEMUFile *file = s->rp_state.from_dst_file;
4055 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4056 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4057 uint64_t size, end_mark;
4058
4059 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4060
4061 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4062 error_report("%s: incorrect state %s", __func__,
4063 MigrationStatus_str(s->state));
4064 return -EINVAL;
4065 }
4066
4067 /*
4068 * Note: see comments in ramblock_recv_bitmap_send() on why we
4069 * need the endianness conversion and the padding.
4070 */
4071 local_size = ROUND_UP(local_size, 8);
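/*
 * For example (assuming 4KiB target pages), a 1GiB ramblock has
 * nbits = 262144, so local_size = 32768 bytes, which is already a
 * multiple of 8.
 */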
4072
4073 /* Add paddings */
4074 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4075
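/*
 * On-wire layout sent by ramblock_recv_bitmap_send(): a be64 size,
 * then the little-endian bitmap bytes themselves, terminated by a
 * be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).
 */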
4076 size = qemu_get_be64(file);
4077
4078 /* The size of the bitmap should match our ramblock's */
4079 if (size != local_size) {
4080 error_report("%s: ramblock '%s' bitmap size mismatch "
4081 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4082 block->idstr, size, local_size);
4083 ret = -EINVAL;
4084 goto out;
4085 }
4086
4087 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4088 end_mark = qemu_get_be64(file);
4089
4090 ret = qemu_file_get_error(file);
4091 if (ret || size != local_size) {
4092 error_report("%s: read bitmap failed for ramblock '%s': %d"
4093 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4094 __func__, block->idstr, ret, local_size, size);
4095 ret = -EIO;
4096 goto out;
4097 }
4098
4099 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4100 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4101 __func__, block->idstr, end_mark);
4102 ret = -EINVAL;
4103 goto out;
4104 }
4105
4106 /*
4107 * Endianness conversion. We are in postcopy (though paused), so
4108 * the dirty bitmap won't change and we can modify it directly.
4109 */
4110 bitmap_from_le(block->bmap, le_bitmap, nbits);
4111
4112 /*
4113 * What we received is the "received bitmap". Invert it to form the
4114 * initial dirty bitmap for this ramblock.
4115 */
4116 bitmap_complement(block->bmap, block->bmap, nbits);
4117
4118 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4119
4120 /*
4121 * We have successfully synced the bitmap for the current ramblock.
4122 * If this is the last one to sync, we need to notify the main send thread.
4123 */
4124 ram_dirty_bitmap_reload_notify(s);
4125
4126 ret = 0;
4127 out:
4128 g_free(le_bitmap);
4129 return ret;
4130 }
4131
4132 static int ram_resume_prepare(MigrationState *s, void *opaque)
4133 {
4134 RAMState *rs = *(RAMState **)opaque;
4135 int ret;
4136
4137 ret = ram_dirty_bitmap_sync_all(s, rs);
4138 if (ret) {
4139 return ret;
4140 }
4141
4142 ram_state_resume_prepare(rs, s->to_dst_file);
4143
4144 return 0;
4145 }
4146
4147 static SaveVMHandlers savevm_ram_handlers = {
4148 .save_setup = ram_save_setup,
4149 .save_live_iterate = ram_save_iterate,
4150 .save_live_complete_postcopy = ram_save_complete,
4151 .save_live_complete_precopy = ram_save_complete,
4152 .has_postcopy = ram_has_postcopy,
4153 .save_live_pending = ram_save_pending,
4154 .load_state = ram_load,
4155 .save_cleanup = ram_save_cleanup,
4156 .load_setup = ram_load_setup,
4157 .load_cleanup = ram_load_cleanup,
4158 .resume_prepare = ram_resume_prepare,
4159 };
4160
4161 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4162 size_t old_size, size_t new_size)
4163 {
4164 PostcopyState ps = postcopy_state_get();
4165 ram_addr_t offset;
4166 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4167 Error *err = NULL;
4168
4169 if (ramblock_is_ignored(rb)) {
4170 return;
4171 }
4172
4173 if (!migration_is_idle()) {
4174 /*
4175 * Precopy code on the source cannot deal with the size of RAM blocks
4176 * changing at random points in time - especially after sending the
4177 * RAM block sizes in the migration stream, they must no longer change.
4178 * Abort and indicate a proper reason.
4179 */
4180 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4181 migrate_set_error(migrate_get_current(), err);
4182 error_free(err);
4183 migration_cancel();
4184 }
4185
4186 switch (ps) {
4187 case POSTCOPY_INCOMING_ADVISE:
4188 /*
4189 * Update what ram_postcopy_incoming_init()->init_range() does at the
4190 * time postcopy was advised. Syncing RAM blocks with the source will
4191 * result in RAM resizes.
4192 */
4193 if (old_size < new_size) {
4194 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4195 error_report("RAM block '%s' discard of resized RAM failed",
4196 rb->idstr);
4197 }
4198 }
4199 rb->postcopy_length = new_size;
4200 break;
4201 case POSTCOPY_INCOMING_NONE:
4202 case POSTCOPY_INCOMING_RUNNING:
4203 case POSTCOPY_INCOMING_END:
4204 /*
4205 * Once our guest is running, postcopy no longer cares about
4206 * resizes. When growing, the new memory was not available on the
4207 * source, so no handler is needed.
4208 */
4209 break;
4210 default:
4211 error_report("RAM block '%s' resized during postcopy state: %d",
4212 rb->idstr, ps);
4213 exit(-1);
4214 }
4215 }
4216
4217 static RAMBlockNotifier ram_mig_ram_notifier = {
4218 .ram_block_resized = ram_mig_ram_block_resized,
4219 };
4220
4221 void ram_mig_init(void)
4222 {
4223 qemu_mutex_init(&XBZRLE.lock);
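/* The "ram" section is registered as version 4; ram_load() rejects anything else. */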
4224 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4225 ram_block_notifier_add(&ram_mig_ram_notifier);
4226 }