]> git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
Merge remote-tracking branch 'remotes/ericb/tags/pull-nbd-2022-01-28' into staging
[mirror_qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
60
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64
65 /***********************************************************/
66 /* ram save/restore */
67
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that where filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
72 */
73
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
83
84 XBZRLECacheStats xbzrle_counters;
85
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
101
102 static void XBZRLE_cache_lock(void)
103 {
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
106 }
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114 }
115
116 /**
117 * xbzrle_cache_resize: resize the xbzrle cache
118 *
119 * This function is called from migrate_params_apply in main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
123 *
124 * Returns 0 for success or -1 for error
125 *
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
128 */
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
130 {
131 PageCache *new_cache;
132 int64_t ret = 0;
133
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
139 }
140
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
144 }
145
146 XBZRLE_cache_lock();
147
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
153 }
154
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
157 }
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
161 }
162
163 bool ramblock_is_ignored(RAMBlock *block)
164 {
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
167 }
168
169 #undef RAMBLOCK_FOREACH
170
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
172 {
173 RAMBlock *block;
174 int ret = 0;
175
176 RCU_READ_LOCK_GUARD();
177
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
182 }
183 }
184 return ret;
185 }
186
187 static void ramblock_recv_map_init(void)
188 {
189 RAMBlock *rb;
190
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
194 }
195 }
196
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
198 {
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
201 }
202
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
204 {
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
206 }
207
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
209 {
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
211 }
212
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
215 {
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
219 }
220
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
222
223 /*
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
225 *
226 * Returns >0 if success with sent bytes, or <0 if error.
227 */
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
230 {
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
234
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
238 }
239
240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
241
242 /*
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit before hand.
246 */
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
248
249 /*
250 * Always use little endian when sending the bitmap. This is
251 * required that when source and destination VMs are not using the
252 * same endianness. (Note: big endian won't work.)
253 */
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
255
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
258
259 /*
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
264 */
265 size = ROUND_UP(size, 8);
266
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
269 /*
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
272 */
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
275
276 g_free(le_bitmap);
277
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
280 }
281
282 return size + sizeof(size);
283 }
284
285 /*
286 * An outstanding page request, on the source, having been received
287 * and queued
288 */
289 struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
293
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
295 };
296
297 /* State of RAM for migration */
298 struct RAMState {
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
309 /* last ram version we have seen */
310 uint32_t last_version;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
328 /* Are we on the last stage of migration */
329 bool last_stage;
330 /* compression statistics since the beginning of the period */
331 /* amount of count that no free thread to compress data */
332 uint64_t compress_thread_busy_prev;
333 /* amount bytes after compression */
334 uint64_t compressed_size_prev;
335 /* amount of compressed pages */
336 uint64_t compress_pages_prev;
337
338 /* total handled target pages at the beginning of period */
339 uint64_t target_page_count_prev;
340 /* total handled target pages since start */
341 uint64_t target_page_count;
342 /* number of dirty bits in the bitmap */
343 uint64_t migration_dirty_pages;
344 /* Protects modification of the bitmap and migration dirty pages */
345 QemuMutex bitmap_mutex;
346 /* The RAMBlock used in the last src_page_requests */
347 RAMBlock *last_req_rb;
348 /* Queue of outstanding page requests from the destination */
349 QemuMutex src_page_req_mutex;
350 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
351 };
352 typedef struct RAMState RAMState;
353
354 static RAMState *ram_state;
355
356 static NotifierWithReturnList precopy_notifier_list;
357
358 /* Whether postcopy has queued requests? */
359 static bool postcopy_has_request(RAMState *rs)
360 {
361 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
362 }
363
364 void precopy_infrastructure_init(void)
365 {
366 notifier_with_return_list_init(&precopy_notifier_list);
367 }
368
369 void precopy_add_notifier(NotifierWithReturn *n)
370 {
371 notifier_with_return_list_add(&precopy_notifier_list, n);
372 }
373
374 void precopy_remove_notifier(NotifierWithReturn *n)
375 {
376 notifier_with_return_remove(n);
377 }
378
379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
380 {
381 PrecopyNotifyData pnd;
382 pnd.reason = reason;
383 pnd.errp = errp;
384
385 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
386 }
387
388 uint64_t ram_bytes_remaining(void)
389 {
390 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
391 0;
392 }
393
394 MigrationStats ram_counters;
395
396 static void ram_transferred_add(uint64_t bytes)
397 {
398 if (runstate_is_running()) {
399 ram_counters.precopy_bytes += bytes;
400 } else if (migration_in_postcopy()) {
401 ram_counters.postcopy_bytes += bytes;
402 } else {
403 ram_counters.downtime_bytes += bytes;
404 }
405 ram_counters.transferred += bytes;
406 }
407
408 /* used by the search for pages to send */
409 struct PageSearchStatus {
410 /* Current block being searched */
411 RAMBlock *block;
412 /* Current page to search from */
413 unsigned long page;
414 /* Set once we wrap around */
415 bool complete_round;
416 };
417 typedef struct PageSearchStatus PageSearchStatus;
418
419 CompressionStats compression_counters;
420
421 struct CompressParam {
422 bool done;
423 bool quit;
424 bool zero_page;
425 QEMUFile *file;
426 QemuMutex mutex;
427 QemuCond cond;
428 RAMBlock *block;
429 ram_addr_t offset;
430
431 /* internally used fields */
432 z_stream stream;
433 uint8_t *originbuf;
434 };
435 typedef struct CompressParam CompressParam;
436
437 struct DecompressParam {
438 bool done;
439 bool quit;
440 QemuMutex mutex;
441 QemuCond cond;
442 void *des;
443 uint8_t *compbuf;
444 int len;
445 z_stream stream;
446 };
447 typedef struct DecompressParam DecompressParam;
448
449 static CompressParam *comp_param;
450 static QemuThread *compress_threads;
451 /* comp_done_cond is used to wake up the migration thread when
452 * one of the compression threads has finished the compression.
453 * comp_done_lock is used to co-work with comp_done_cond.
454 */
455 static QemuMutex comp_done_lock;
456 static QemuCond comp_done_cond;
457 /* The empty QEMUFileOps will be used by file in CompressParam */
458 static const QEMUFileOps empty_ops = { };
459
460 static QEMUFile *decomp_file;
461 static DecompressParam *decomp_param;
462 static QemuThread *decompress_threads;
463 static QemuMutex decomp_done_lock;
464 static QemuCond decomp_done_cond;
465
466 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
467 ram_addr_t offset, uint8_t *source_buf);
468
469 static void *do_data_compress(void *opaque)
470 {
471 CompressParam *param = opaque;
472 RAMBlock *block;
473 ram_addr_t offset;
474 bool zero_page;
475
476 qemu_mutex_lock(&param->mutex);
477 while (!param->quit) {
478 if (param->block) {
479 block = param->block;
480 offset = param->offset;
481 param->block = NULL;
482 qemu_mutex_unlock(&param->mutex);
483
484 zero_page = do_compress_ram_page(param->file, &param->stream,
485 block, offset, param->originbuf);
486
487 qemu_mutex_lock(&comp_done_lock);
488 param->done = true;
489 param->zero_page = zero_page;
490 qemu_cond_signal(&comp_done_cond);
491 qemu_mutex_unlock(&comp_done_lock);
492
493 qemu_mutex_lock(&param->mutex);
494 } else {
495 qemu_cond_wait(&param->cond, &param->mutex);
496 }
497 }
498 qemu_mutex_unlock(&param->mutex);
499
500 return NULL;
501 }
502
503 static void compress_threads_save_cleanup(void)
504 {
505 int i, thread_count;
506
507 if (!migrate_use_compression() || !comp_param) {
508 return;
509 }
510
511 thread_count = migrate_compress_threads();
512 for (i = 0; i < thread_count; i++) {
513 /*
514 * we use it as a indicator which shows if the thread is
515 * properly init'd or not
516 */
517 if (!comp_param[i].file) {
518 break;
519 }
520
521 qemu_mutex_lock(&comp_param[i].mutex);
522 comp_param[i].quit = true;
523 qemu_cond_signal(&comp_param[i].cond);
524 qemu_mutex_unlock(&comp_param[i].mutex);
525
526 qemu_thread_join(compress_threads + i);
527 qemu_mutex_destroy(&comp_param[i].mutex);
528 qemu_cond_destroy(&comp_param[i].cond);
529 deflateEnd(&comp_param[i].stream);
530 g_free(comp_param[i].originbuf);
531 qemu_fclose(comp_param[i].file);
532 comp_param[i].file = NULL;
533 }
534 qemu_mutex_destroy(&comp_done_lock);
535 qemu_cond_destroy(&comp_done_cond);
536 g_free(compress_threads);
537 g_free(comp_param);
538 compress_threads = NULL;
539 comp_param = NULL;
540 }
541
542 static int compress_threads_save_setup(void)
543 {
544 int i, thread_count;
545
546 if (!migrate_use_compression()) {
547 return 0;
548 }
549 thread_count = migrate_compress_threads();
550 compress_threads = g_new0(QemuThread, thread_count);
551 comp_param = g_new0(CompressParam, thread_count);
552 qemu_cond_init(&comp_done_cond);
553 qemu_mutex_init(&comp_done_lock);
554 for (i = 0; i < thread_count; i++) {
555 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
556 if (!comp_param[i].originbuf) {
557 goto exit;
558 }
559
560 if (deflateInit(&comp_param[i].stream,
561 migrate_compress_level()) != Z_OK) {
562 g_free(comp_param[i].originbuf);
563 goto exit;
564 }
565
566 /* comp_param[i].file is just used as a dummy buffer to save data,
567 * set its ops to empty.
568 */
569 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
570 comp_param[i].done = true;
571 comp_param[i].quit = false;
572 qemu_mutex_init(&comp_param[i].mutex);
573 qemu_cond_init(&comp_param[i].cond);
574 qemu_thread_create(compress_threads + i, "compress",
575 do_data_compress, comp_param + i,
576 QEMU_THREAD_JOINABLE);
577 }
578 return 0;
579
580 exit:
581 compress_threads_save_cleanup();
582 return -1;
583 }
584
585 /**
586 * save_page_header: write page header to wire
587 *
588 * If this is the 1st block, it also writes the block identification
589 *
590 * Returns the number of bytes written
591 *
592 * @f: QEMUFile where to send the data
593 * @block: block that contains the page we want to send
594 * @offset: offset inside the block for the page
595 * in the lower bits, it contains flags
596 */
597 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
598 ram_addr_t offset)
599 {
600 size_t size, len;
601
602 if (block == rs->last_sent_block) {
603 offset |= RAM_SAVE_FLAG_CONTINUE;
604 }
605 qemu_put_be64(f, offset);
606 size = 8;
607
608 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
609 len = strlen(block->idstr);
610 qemu_put_byte(f, len);
611 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
612 size += 1 + len;
613 rs->last_sent_block = block;
614 }
615 return size;
616 }
617
618 /**
619 * mig_throttle_guest_down: throttle down the guest
620 *
621 * Reduce amount of guest cpu execution to hopefully slow down memory
622 * writes. If guest dirty memory rate is reduced below the rate at
623 * which we can transfer pages to the destination then we should be
624 * able to complete migration. Some workloads dirty memory way too
625 * fast and will not effectively converge, even with auto-converge.
626 */
627 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
628 uint64_t bytes_dirty_threshold)
629 {
630 MigrationState *s = migrate_get_current();
631 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
632 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
633 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
634 int pct_max = s->parameters.max_cpu_throttle;
635
636 uint64_t throttle_now = cpu_throttle_get_percentage();
637 uint64_t cpu_now, cpu_ideal, throttle_inc;
638
639 /* We have not started throttling yet. Let's start it. */
640 if (!cpu_throttle_active()) {
641 cpu_throttle_set(pct_initial);
642 } else {
643 /* Throttling already on, just increase the rate */
644 if (!pct_tailslow) {
645 throttle_inc = pct_increment;
646 } else {
647 /* Compute the ideal CPU percentage used by Guest, which may
648 * make the dirty rate match the dirty rate threshold. */
649 cpu_now = 100 - throttle_now;
650 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
651 bytes_dirty_period);
652 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
653 }
654 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
655 }
656 }
657
658 void mig_throttle_counter_reset(void)
659 {
660 RAMState *rs = ram_state;
661
662 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
663 rs->num_dirty_pages_period = 0;
664 rs->bytes_xfer_prev = ram_counters.transferred;
665 }
666
667 /**
668 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
669 *
670 * @rs: current RAM state
671 * @current_addr: address for the zero page
672 *
673 * Update the xbzrle cache to reflect a page that's been sent as all 0.
674 * The important thing is that a stale (not-yet-0'd) page be replaced
675 * by the new data.
676 * As a bonus, if the page wasn't in the cache it gets added so that
677 * when a small write is made into the 0'd page it gets XBZRLE sent.
678 */
679 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
680 {
681 if (!rs->xbzrle_enabled) {
682 return;
683 }
684
685 /* We don't care if this fails to allocate a new cache page
686 * as long as it updated an old one */
687 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
688 ram_counters.dirty_sync_count);
689 }
690
691 #define ENCODING_FLAG_XBZRLE 0x1
692
693 /**
694 * save_xbzrle_page: compress and send current page
695 *
696 * Returns: 1 means that we wrote the page
697 * 0 means that page is identical to the one already sent
698 * -1 means that xbzrle would be longer than normal
699 *
700 * @rs: current RAM state
701 * @current_data: pointer to the address of the page contents
702 * @current_addr: addr of the page
703 * @block: block that contains the page we want to send
704 * @offset: offset inside the block for the page
705 */
706 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
707 ram_addr_t current_addr, RAMBlock *block,
708 ram_addr_t offset)
709 {
710 int encoded_len = 0, bytes_xbzrle;
711 uint8_t *prev_cached_page;
712
713 if (!cache_is_cached(XBZRLE.cache, current_addr,
714 ram_counters.dirty_sync_count)) {
715 xbzrle_counters.cache_miss++;
716 if (!rs->last_stage) {
717 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
718 ram_counters.dirty_sync_count) == -1) {
719 return -1;
720 } else {
721 /* update *current_data when the page has been
722 inserted into cache */
723 *current_data = get_cached_data(XBZRLE.cache, current_addr);
724 }
725 }
726 return -1;
727 }
728
729 /*
730 * Reaching here means the page has hit the xbzrle cache, no matter what
731 * encoding result it is (normal encoding, overflow or skipping the page),
732 * count the page as encoded. This is used to calculate the encoding rate.
733 *
734 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
735 * 2nd page turns out to be skipped (i.e. no new bytes written to the
736 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
737 * skipped page included. In this way, the encoding rate can tell if the
738 * guest page is good for xbzrle encoding.
739 */
740 xbzrle_counters.pages++;
741 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
742
743 /* save current buffer into memory */
744 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
745
746 /* XBZRLE encoding (if there is no overflow) */
747 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
748 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
749 TARGET_PAGE_SIZE);
750
751 /*
752 * Update the cache contents, so that it corresponds to the data
753 * sent, in all cases except where we skip the page.
754 */
755 if (!rs->last_stage && encoded_len != 0) {
756 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
757 /*
758 * In the case where we couldn't compress, ensure that the caller
759 * sends the data from the cache, since the guest might have
760 * changed the RAM since we copied it.
761 */
762 *current_data = prev_cached_page;
763 }
764
765 if (encoded_len == 0) {
766 trace_save_xbzrle_page_skipping();
767 return 0;
768 } else if (encoded_len == -1) {
769 trace_save_xbzrle_page_overflow();
770 xbzrle_counters.overflow++;
771 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
772 return -1;
773 }
774
775 /* Send XBZRLE based compressed page */
776 bytes_xbzrle = save_page_header(rs, rs->f, block,
777 offset | RAM_SAVE_FLAG_XBZRLE);
778 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
779 qemu_put_be16(rs->f, encoded_len);
780 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
781 bytes_xbzrle += encoded_len + 1 + 2;
782 /*
783 * Like compressed_size (please see update_compress_thread_counts),
784 * the xbzrle encoded bytes don't count the 8 byte header with
785 * RAM_SAVE_FLAG_CONTINUE.
786 */
787 xbzrle_counters.bytes += bytes_xbzrle - 8;
788 ram_transferred_add(bytes_xbzrle);
789
790 return 1;
791 }
792
793 /**
794 * migration_bitmap_find_dirty: find the next dirty page from start
795 *
796 * Returns the page offset within memory region of the start of a dirty page
797 *
798 * @rs: current RAM state
799 * @rb: RAMBlock where to search for dirty pages
800 * @start: page where we start the search
801 */
802 static inline
803 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
804 unsigned long start)
805 {
806 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
807 unsigned long *bitmap = rb->bmap;
808
809 if (ramblock_is_ignored(rb)) {
810 return size;
811 }
812
813 return find_next_bit(bitmap, size, start);
814 }
815
816 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
817 unsigned long page)
818 {
819 uint8_t shift;
820 hwaddr size, start;
821
822 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
823 return;
824 }
825
826 shift = rb->clear_bmap_shift;
827 /*
828 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
829 * can make things easier sometimes since then start address
830 * of the small chunk will always be 64 pages aligned so the
831 * bitmap will always be aligned to unsigned long. We should
832 * even be able to remove this restriction but I'm simply
833 * keeping it.
834 */
835 assert(shift >= 6);
836
837 size = 1ULL << (TARGET_PAGE_BITS + shift);
838 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
839 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
840 memory_region_clear_dirty_bitmap(rb->mr, start, size);
841 }
842
843 static void
844 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
845 unsigned long start,
846 unsigned long npages)
847 {
848 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
849 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
850 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
851
852 /*
853 * Clear pages from start to start + npages - 1, so the end boundary is
854 * exclusive.
855 */
856 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
857 migration_clear_memory_region_dirty_bitmap(rb, i);
858 }
859 }
860
861 /*
862 * colo_bitmap_find_diry:find contiguous dirty pages from start
863 *
864 * Returns the page offset within memory region of the start of the contiguout
865 * dirty page
866 *
867 * @rs: current RAM state
868 * @rb: RAMBlock where to search for dirty pages
869 * @start: page where we start the search
870 * @num: the number of contiguous dirty pages
871 */
872 static inline
873 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
874 unsigned long start, unsigned long *num)
875 {
876 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
877 unsigned long *bitmap = rb->bmap;
878 unsigned long first, next;
879
880 *num = 0;
881
882 if (ramblock_is_ignored(rb)) {
883 return size;
884 }
885
886 first = find_next_bit(bitmap, size, start);
887 if (first >= size) {
888 return first;
889 }
890 next = find_next_zero_bit(bitmap, size, first + 1);
891 assert(next >= first);
892 *num = next - first;
893 return first;
894 }
895
896 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
897 RAMBlock *rb,
898 unsigned long page)
899 {
900 bool ret;
901
902 /*
903 * Clear dirty bitmap if needed. This _must_ be called before we
904 * send any of the page in the chunk because we need to make sure
905 * we can capture further page content changes when we sync dirty
906 * log the next time. So as long as we are going to send any of
907 * the page in the chunk we clear the remote dirty bitmap for all.
908 * Clearing it earlier won't be a problem, but too late will.
909 */
910 migration_clear_memory_region_dirty_bitmap(rb, page);
911
912 ret = test_and_clear_bit(page, rb->bmap);
913 if (ret) {
914 rs->migration_dirty_pages--;
915 }
916
917 return ret;
918 }
919
920 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
921 void *opaque)
922 {
923 const hwaddr offset = section->offset_within_region;
924 const hwaddr size = int128_get64(section->size);
925 const unsigned long start = offset >> TARGET_PAGE_BITS;
926 const unsigned long npages = size >> TARGET_PAGE_BITS;
927 RAMBlock *rb = section->mr->ram_block;
928 uint64_t *cleared_bits = opaque;
929
930 /*
931 * We don't grab ram_state->bitmap_mutex because we expect to run
932 * only when starting migration or during postcopy recovery where
933 * we don't have concurrent access.
934 */
935 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
936 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
937 }
938 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
939 bitmap_clear(rb->bmap, start, npages);
940 }
941
942 /*
943 * Exclude all dirty pages from migration that fall into a discarded range as
944 * managed by a RamDiscardManager responsible for the mapped memory region of
945 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
946 *
947 * Discarded pages ("logically unplugged") have undefined content and must
948 * not get migrated, because even reading these pages for migration might
949 * result in undesired behavior.
950 *
951 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
952 *
953 * Note: The result is only stable while migrating (precopy/postcopy).
954 */
955 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
956 {
957 uint64_t cleared_bits = 0;
958
959 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
960 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
961 MemoryRegionSection section = {
962 .mr = rb->mr,
963 .offset_within_region = 0,
964 .size = int128_make64(qemu_ram_get_used_length(rb)),
965 };
966
967 ram_discard_manager_replay_discarded(rdm, &section,
968 dirty_bitmap_clear_section,
969 &cleared_bits);
970 }
971 return cleared_bits;
972 }
973
974 /*
975 * Check if a host-page aligned page falls into a discarded range as managed by
976 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
977 *
978 * Note: The result is only stable while migrating (precopy/postcopy).
979 */
980 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
981 {
982 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
983 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
984 MemoryRegionSection section = {
985 .mr = rb->mr,
986 .offset_within_region = start,
987 .size = int128_make64(qemu_ram_pagesize(rb)),
988 };
989
990 return !ram_discard_manager_is_populated(rdm, &section);
991 }
992 return false;
993 }
994
995 /* Called with RCU critical section */
996 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
997 {
998 uint64_t new_dirty_pages =
999 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1000
1001 rs->migration_dirty_pages += new_dirty_pages;
1002 rs->num_dirty_pages_period += new_dirty_pages;
1003 }
1004
1005 /**
1006 * ram_pagesize_summary: calculate all the pagesizes of a VM
1007 *
1008 * Returns a summary bitmap of the page sizes of all RAMBlocks
1009 *
1010 * For VMs with just normal pages this is equivalent to the host page
1011 * size. If it's got some huge pages then it's the OR of all the
1012 * different page sizes.
1013 */
1014 uint64_t ram_pagesize_summary(void)
1015 {
1016 RAMBlock *block;
1017 uint64_t summary = 0;
1018
1019 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1020 summary |= block->page_size;
1021 }
1022
1023 return summary;
1024 }
1025
1026 uint64_t ram_get_total_transferred_pages(void)
1027 {
1028 return ram_counters.normal + ram_counters.duplicate +
1029 compression_counters.pages + xbzrle_counters.pages;
1030 }
1031
1032 static void migration_update_rates(RAMState *rs, int64_t end_time)
1033 {
1034 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1035 double compressed_size;
1036
1037 /* calculate period counters */
1038 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1039 / (end_time - rs->time_last_bitmap_sync);
1040
1041 if (!page_count) {
1042 return;
1043 }
1044
1045 if (migrate_use_xbzrle()) {
1046 double encoded_size, unencoded_size;
1047
1048 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1049 rs->xbzrle_cache_miss_prev) / page_count;
1050 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1051 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1052 TARGET_PAGE_SIZE;
1053 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1054 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1055 xbzrle_counters.encoding_rate = 0;
1056 } else {
1057 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1058 }
1059 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1060 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1061 }
1062
1063 if (migrate_use_compression()) {
1064 compression_counters.busy_rate = (double)(compression_counters.busy -
1065 rs->compress_thread_busy_prev) / page_count;
1066 rs->compress_thread_busy_prev = compression_counters.busy;
1067
1068 compressed_size = compression_counters.compressed_size -
1069 rs->compressed_size_prev;
1070 if (compressed_size) {
1071 double uncompressed_size = (compression_counters.pages -
1072 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1073
1074 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1075 compression_counters.compression_rate =
1076 uncompressed_size / compressed_size;
1077
1078 rs->compress_pages_prev = compression_counters.pages;
1079 rs->compressed_size_prev = compression_counters.compressed_size;
1080 }
1081 }
1082 }
1083
1084 static void migration_trigger_throttle(RAMState *rs)
1085 {
1086 MigrationState *s = migrate_get_current();
1087 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1088
1089 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1090 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1091 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1092
1093 /* During block migration the auto-converge logic incorrectly detects
1094 * that ram migration makes no progress. Avoid this by disabling the
1095 * throttling logic during the bulk phase of block migration. */
1096 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1097 /* The following detection logic can be refined later. For now:
1098 Check to see if the ratio between dirtied bytes and the approx.
1099 amount of bytes that just got transferred since the last time
1100 we were in this routine reaches the threshold. If that happens
1101 twice, start or increase throttling. */
1102
1103 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1104 (++rs->dirty_rate_high_cnt >= 2)) {
1105 trace_migration_throttle();
1106 rs->dirty_rate_high_cnt = 0;
1107 mig_throttle_guest_down(bytes_dirty_period,
1108 bytes_dirty_threshold);
1109 }
1110 }
1111 }
1112
1113 static void migration_bitmap_sync(RAMState *rs)
1114 {
1115 RAMBlock *block;
1116 int64_t end_time;
1117
1118 ram_counters.dirty_sync_count++;
1119
1120 if (!rs->time_last_bitmap_sync) {
1121 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1122 }
1123
1124 trace_migration_bitmap_sync_start();
1125 memory_global_dirty_log_sync();
1126
1127 qemu_mutex_lock(&rs->bitmap_mutex);
1128 WITH_RCU_READ_LOCK_GUARD() {
1129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1130 ramblock_sync_dirty_bitmap(rs, block);
1131 }
1132 ram_counters.remaining = ram_bytes_remaining();
1133 }
1134 qemu_mutex_unlock(&rs->bitmap_mutex);
1135
1136 memory_global_after_dirty_log_sync();
1137 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1138
1139 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1140
1141 /* more than 1 second = 1000 millisecons */
1142 if (end_time > rs->time_last_bitmap_sync + 1000) {
1143 migration_trigger_throttle(rs);
1144
1145 migration_update_rates(rs, end_time);
1146
1147 rs->target_page_count_prev = rs->target_page_count;
1148
1149 /* reset period counters */
1150 rs->time_last_bitmap_sync = end_time;
1151 rs->num_dirty_pages_period = 0;
1152 rs->bytes_xfer_prev = ram_counters.transferred;
1153 }
1154 if (migrate_use_events()) {
1155 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1156 }
1157 }
1158
1159 static void migration_bitmap_sync_precopy(RAMState *rs)
1160 {
1161 Error *local_err = NULL;
1162
1163 /*
1164 * The current notifier usage is just an optimization to migration, so we
1165 * don't stop the normal migration process in the error case.
1166 */
1167 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1168 error_report_err(local_err);
1169 local_err = NULL;
1170 }
1171
1172 migration_bitmap_sync(rs);
1173
1174 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1175 error_report_err(local_err);
1176 }
1177 }
1178
1179 static void ram_release_page(const char *rbname, uint64_t offset)
1180 {
1181 if (!migrate_release_ram() || !migration_in_postcopy()) {
1182 return;
1183 }
1184
1185 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1186 }
1187
1188 /**
1189 * save_zero_page_to_file: send the zero page to the file
1190 *
1191 * Returns the size of data written to the file, 0 means the page is not
1192 * a zero page
1193 *
1194 * @rs: current RAM state
1195 * @file: the file where the data is saved
1196 * @block: block that contains the page we want to send
1197 * @offset: offset inside the block for the page
1198 */
1199 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1200 RAMBlock *block, ram_addr_t offset)
1201 {
1202 uint8_t *p = block->host + offset;
1203 int len = 0;
1204
1205 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1206 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1207 qemu_put_byte(file, 0);
1208 len += 1;
1209 ram_release_page(block->idstr, offset);
1210 }
1211 return len;
1212 }
1213
1214 /**
1215 * save_zero_page: send the zero page to the stream
1216 *
1217 * Returns the number of pages written.
1218 *
1219 * @rs: current RAM state
1220 * @block: block that contains the page we want to send
1221 * @offset: offset inside the block for the page
1222 */
1223 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1224 {
1225 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1226
1227 if (len) {
1228 ram_counters.duplicate++;
1229 ram_transferred_add(len);
1230 return 1;
1231 }
1232 return -1;
1233 }
1234
1235 /*
1236 * @pages: the number of pages written by the control path,
1237 * < 0 - error
1238 * > 0 - number of pages written
1239 *
1240 * Return true if the pages has been saved, otherwise false is returned.
1241 */
1242 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1243 int *pages)
1244 {
1245 uint64_t bytes_xmit = 0;
1246 int ret;
1247
1248 *pages = -1;
1249 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1250 &bytes_xmit);
1251 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1252 return false;
1253 }
1254
1255 if (bytes_xmit) {
1256 ram_transferred_add(bytes_xmit);
1257 *pages = 1;
1258 }
1259
1260 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1261 return true;
1262 }
1263
1264 if (bytes_xmit > 0) {
1265 ram_counters.normal++;
1266 } else if (bytes_xmit == 0) {
1267 ram_counters.duplicate++;
1268 }
1269
1270 return true;
1271 }
1272
1273 /*
1274 * directly send the page to the stream
1275 *
1276 * Returns the number of pages written.
1277 *
1278 * @rs: current RAM state
1279 * @block: block that contains the page we want to send
1280 * @offset: offset inside the block for the page
1281 * @buf: the page to be sent
1282 * @async: send to page asyncly
1283 */
1284 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1285 uint8_t *buf, bool async)
1286 {
1287 ram_transferred_add(save_page_header(rs, rs->f, block,
1288 offset | RAM_SAVE_FLAG_PAGE));
1289 if (async) {
1290 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1291 migrate_release_ram() &
1292 migration_in_postcopy());
1293 } else {
1294 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1295 }
1296 ram_transferred_add(TARGET_PAGE_SIZE);
1297 ram_counters.normal++;
1298 return 1;
1299 }
1300
1301 /**
1302 * ram_save_page: send the given page to the stream
1303 *
1304 * Returns the number of pages written.
1305 * < 0 - error
1306 * >=0 - Number of pages written - this might legally be 0
1307 * if xbzrle noticed the page was the same.
1308 *
1309 * @rs: current RAM state
1310 * @block: block that contains the page we want to send
1311 * @offset: offset inside the block for the page
1312 */
1313 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1314 {
1315 int pages = -1;
1316 uint8_t *p;
1317 bool send_async = true;
1318 RAMBlock *block = pss->block;
1319 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1320 ram_addr_t current_addr = block->offset + offset;
1321
1322 p = block->host + offset;
1323 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1324
1325 XBZRLE_cache_lock();
1326 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1327 pages = save_xbzrle_page(rs, &p, current_addr, block,
1328 offset);
1329 if (!rs->last_stage) {
1330 /* Can't send this cached data async, since the cache page
1331 * might get updated before it gets to the wire
1332 */
1333 send_async = false;
1334 }
1335 }
1336
1337 /* XBZRLE overflow or normal page */
1338 if (pages == -1) {
1339 pages = save_normal_page(rs, block, offset, p, send_async);
1340 }
1341
1342 XBZRLE_cache_unlock();
1343
1344 return pages;
1345 }
1346
1347 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1348 ram_addr_t offset)
1349 {
1350 if (multifd_queue_page(rs->f, block, offset) < 0) {
1351 return -1;
1352 }
1353 ram_counters.normal++;
1354
1355 return 1;
1356 }
1357
1358 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1359 ram_addr_t offset, uint8_t *source_buf)
1360 {
1361 RAMState *rs = ram_state;
1362 uint8_t *p = block->host + offset;
1363 int ret;
1364
1365 if (save_zero_page_to_file(rs, f, block, offset)) {
1366 return true;
1367 }
1368
1369 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1370
1371 /*
1372 * copy it to a internal buffer to avoid it being modified by VM
1373 * so that we can catch up the error during compression and
1374 * decompression
1375 */
1376 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1377 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1378 if (ret < 0) {
1379 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1380 error_report("compressed data failed!");
1381 }
1382 return false;
1383 }
1384
1385 static void
1386 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1387 {
1388 ram_transferred_add(bytes_xmit);
1389
1390 if (param->zero_page) {
1391 ram_counters.duplicate++;
1392 return;
1393 }
1394
1395 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1396 compression_counters.compressed_size += bytes_xmit - 8;
1397 compression_counters.pages++;
1398 }
1399
1400 static bool save_page_use_compression(RAMState *rs);
1401
1402 static void flush_compressed_data(RAMState *rs)
1403 {
1404 int idx, len, thread_count;
1405
1406 if (!save_page_use_compression(rs)) {
1407 return;
1408 }
1409 thread_count = migrate_compress_threads();
1410
1411 qemu_mutex_lock(&comp_done_lock);
1412 for (idx = 0; idx < thread_count; idx++) {
1413 while (!comp_param[idx].done) {
1414 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1415 }
1416 }
1417 qemu_mutex_unlock(&comp_done_lock);
1418
1419 for (idx = 0; idx < thread_count; idx++) {
1420 qemu_mutex_lock(&comp_param[idx].mutex);
1421 if (!comp_param[idx].quit) {
1422 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1423 /*
1424 * it's safe to fetch zero_page without holding comp_done_lock
1425 * as there is no further request submitted to the thread,
1426 * i.e, the thread should be waiting for a request at this point.
1427 */
1428 update_compress_thread_counts(&comp_param[idx], len);
1429 }
1430 qemu_mutex_unlock(&comp_param[idx].mutex);
1431 }
1432 }
1433
1434 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1435 ram_addr_t offset)
1436 {
1437 param->block = block;
1438 param->offset = offset;
1439 }
1440
1441 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1442 ram_addr_t offset)
1443 {
1444 int idx, thread_count, bytes_xmit = -1, pages = -1;
1445 bool wait = migrate_compress_wait_thread();
1446
1447 thread_count = migrate_compress_threads();
1448 qemu_mutex_lock(&comp_done_lock);
1449 retry:
1450 for (idx = 0; idx < thread_count; idx++) {
1451 if (comp_param[idx].done) {
1452 comp_param[idx].done = false;
1453 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1454 qemu_mutex_lock(&comp_param[idx].mutex);
1455 set_compress_params(&comp_param[idx], block, offset);
1456 qemu_cond_signal(&comp_param[idx].cond);
1457 qemu_mutex_unlock(&comp_param[idx].mutex);
1458 pages = 1;
1459 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1460 break;
1461 }
1462 }
1463
1464 /*
1465 * wait for the free thread if the user specifies 'compress-wait-thread',
1466 * otherwise we will post the page out in the main thread as normal page.
1467 */
1468 if (pages < 0 && wait) {
1469 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1470 goto retry;
1471 }
1472 qemu_mutex_unlock(&comp_done_lock);
1473
1474 return pages;
1475 }
1476
1477 /**
1478 * find_dirty_block: find the next dirty page and update any state
1479 * associated with the search process.
1480 *
1481 * Returns true if a page is found
1482 *
1483 * @rs: current RAM state
1484 * @pss: data about the state of the current dirty page scan
1485 * @again: set to false if the search has scanned the whole of RAM
1486 */
1487 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1488 {
1489 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1490 if (pss->complete_round && pss->block == rs->last_seen_block &&
1491 pss->page >= rs->last_page) {
1492 /*
1493 * We've been once around the RAM and haven't found anything.
1494 * Give up.
1495 */
1496 *again = false;
1497 return false;
1498 }
1499 if (!offset_in_ramblock(pss->block,
1500 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1501 /* Didn't find anything in this RAM Block */
1502 pss->page = 0;
1503 pss->block = QLIST_NEXT_RCU(pss->block, next);
1504 if (!pss->block) {
1505 /*
1506 * If memory migration starts over, we will meet a dirtied page
1507 * which may still exists in compression threads's ring, so we
1508 * should flush the compressed data to make sure the new page
1509 * is not overwritten by the old one in the destination.
1510 *
1511 * Also If xbzrle is on, stop using the data compression at this
1512 * point. In theory, xbzrle can do better than compression.
1513 */
1514 flush_compressed_data(rs);
1515
1516 /* Hit the end of the list */
1517 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1518 /* Flag that we've looped */
1519 pss->complete_round = true;
1520 /* After the first round, enable XBZRLE. */
1521 if (migrate_use_xbzrle()) {
1522 rs->xbzrle_enabled = true;
1523 }
1524 }
1525 /* Didn't find anything this time, but try again on the new block */
1526 *again = true;
1527 return false;
1528 } else {
1529 /* Can go around again, but... */
1530 *again = true;
1531 /* We've found something so probably don't need to */
1532 return true;
1533 }
1534 }
1535
1536 /**
1537 * unqueue_page: gets a page of the queue
1538 *
1539 * Helper for 'get_queued_page' - gets a page off the queue
1540 *
1541 * Returns the block of the page (or NULL if none available)
1542 *
1543 * @rs: current RAM state
1544 * @offset: used to return the offset within the RAMBlock
1545 */
1546 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1547 {
1548 struct RAMSrcPageRequest *entry;
1549 RAMBlock *block = NULL;
1550 size_t page_size;
1551
1552 if (!postcopy_has_request(rs)) {
1553 return NULL;
1554 }
1555
1556 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1557
1558 /*
1559 * This should _never_ change even after we take the lock, because no one
1560 * should be taking anything off the request list other than us.
1561 */
1562 assert(postcopy_has_request(rs));
1563
1564 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1565 block = entry->rb;
1566 *offset = entry->offset;
1567 page_size = qemu_ram_pagesize(block);
1568 /* Each page request should only be multiple page size of the ramblock */
1569 assert((entry->len % page_size) == 0);
1570
1571 if (entry->len > page_size) {
1572 entry->len -= page_size;
1573 entry->offset += page_size;
1574 } else {
1575 memory_region_unref(block->mr);
1576 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1577 g_free(entry);
1578 migration_consume_urgent_request();
1579 }
1580
1581 trace_unqueue_page(block->idstr, *offset,
1582 test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1583
1584 return block;
1585 }
1586
1587 #if defined(__linux__)
1588 /**
1589 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1590 * is found, return RAM block pointer and page offset
1591 *
1592 * Returns pointer to the RAMBlock containing faulting page,
1593 * NULL if no write faults are pending
1594 *
1595 * @rs: current RAM state
1596 * @offset: page offset from the beginning of the block
1597 */
1598 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1599 {
1600 struct uffd_msg uffd_msg;
1601 void *page_address;
1602 RAMBlock *block;
1603 int res;
1604
1605 if (!migrate_background_snapshot()) {
1606 return NULL;
1607 }
1608
1609 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1610 if (res <= 0) {
1611 return NULL;
1612 }
1613
1614 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1615 block = qemu_ram_block_from_host(page_address, false, offset);
1616 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1617 return block;
1618 }
1619
1620 /**
1621 * ram_save_release_protection: release UFFD write protection after
1622 * a range of pages has been saved
1623 *
1624 * @rs: current RAM state
1625 * @pss: page-search-status structure
1626 * @start_page: index of the first page in the range relative to pss->block
1627 *
1628 * Returns 0 on success, negative value in case of an error
1629 */
1630 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1631 unsigned long start_page)
1632 {
1633 int res = 0;
1634
1635 /* Check if page is from UFFD-managed region. */
1636 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1637 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1638 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1639
1640 /* Flush async buffers before un-protect. */
1641 qemu_fflush(rs->f);
1642 /* Un-protect memory range. */
1643 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1644 false, false);
1645 }
1646
1647 return res;
1648 }
1649
1650 /* ram_write_tracking_available: check if kernel supports required UFFD features
1651 *
1652 * Returns true if supports, false otherwise
1653 */
1654 bool ram_write_tracking_available(void)
1655 {
1656 uint64_t uffd_features;
1657 int res;
1658
1659 res = uffd_query_features(&uffd_features);
1660 return (res == 0 &&
1661 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1662 }
1663
1664 /* ram_write_tracking_compatible: check if guest configuration is
1665 * compatible with 'write-tracking'
1666 *
1667 * Returns true if compatible, false otherwise
1668 */
1669 bool ram_write_tracking_compatible(void)
1670 {
1671 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1672 int uffd_fd;
1673 RAMBlock *block;
1674 bool ret = false;
1675
1676 /* Open UFFD file descriptor */
1677 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1678 if (uffd_fd < 0) {
1679 return false;
1680 }
1681
1682 RCU_READ_LOCK_GUARD();
1683
1684 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1685 uint64_t uffd_ioctls;
1686
1687 /* Nothing to do with read-only and MMIO-writable regions */
1688 if (block->mr->readonly || block->mr->rom_device) {
1689 continue;
1690 }
1691 /* Try to register block memory via UFFD-IO to track writes */
1692 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1693 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1694 goto out;
1695 }
1696 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1697 goto out;
1698 }
1699 }
1700 ret = true;
1701
1702 out:
1703 uffd_close_fd(uffd_fd);
1704 return ret;
1705 }
1706
1707 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1708 ram_addr_t size)
1709 {
1710 /*
1711 * We read one byte of each page; this will preallocate page tables if
1712 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1713 * where no page was populated yet. This might require adaption when
1714 * supporting other mappings, like shmem.
1715 */
1716 for (; offset < size; offset += block->page_size) {
1717 char tmp = *((char *)block->host + offset);
1718
1719 /* Don't optimize the read out */
1720 asm volatile("" : "+r" (tmp));
1721 }
1722 }
1723
1724 static inline int populate_read_section(MemoryRegionSection *section,
1725 void *opaque)
1726 {
1727 const hwaddr size = int128_get64(section->size);
1728 hwaddr offset = section->offset_within_region;
1729 RAMBlock *block = section->mr->ram_block;
1730
1731 populate_read_range(block, offset, size);
1732 return 0;
1733 }
1734
1735 /*
1736 * ram_block_populate_read: preallocate page tables and populate pages in the
1737 * RAM block by reading a byte of each page.
1738 *
1739 * Since it's solely used for userfault_fd WP feature, here we just
1740 * hardcode page size to qemu_real_host_page_size.
1741 *
1742 * @block: RAM block to populate
1743 */
1744 static void ram_block_populate_read(RAMBlock *rb)
1745 {
1746 /*
1747 * Skip populating all pages that fall into a discarded range as managed by
1748 * a RamDiscardManager responsible for the mapped memory region of the
1749 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1750 * must not get populated automatically. We don't have to track
1751 * modifications via userfaultfd WP reliably, because these pages will
1752 * not be part of the migration stream either way -- see
1753 * ramblock_dirty_bitmap_exclude_discarded_pages().
1754 *
1755 * Note: The result is only stable while migrating (precopy/postcopy).
1756 */
1757 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1758 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1759 MemoryRegionSection section = {
1760 .mr = rb->mr,
1761 .offset_within_region = 0,
1762 .size = rb->mr->size,
1763 };
1764
1765 ram_discard_manager_replay_populated(rdm, &section,
1766 populate_read_section, NULL);
1767 } else {
1768 populate_read_range(rb, 0, rb->used_length);
1769 }
1770 }
1771
1772 /*
1773 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1774 */
1775 void ram_write_tracking_prepare(void)
1776 {
1777 RAMBlock *block;
1778
1779 RCU_READ_LOCK_GUARD();
1780
1781 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1782 /* Nothing to do with read-only and MMIO-writable regions */
1783 if (block->mr->readonly || block->mr->rom_device) {
1784 continue;
1785 }
1786
1787 /*
1788 * Populate pages of the RAM block before enabling userfault_fd
1789 * write protection.
1790 *
1791 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1792 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1793 * pages with pte_none() entries in page table.
1794 */
1795 ram_block_populate_read(block);
1796 }
1797 }
1798
1799 /*
1800 * ram_write_tracking_start: start UFFD-WP memory tracking
1801 *
1802 * Returns 0 for success or negative value in case of error
1803 */
1804 int ram_write_tracking_start(void)
1805 {
1806 int uffd_fd;
1807 RAMState *rs = ram_state;
1808 RAMBlock *block;
1809
1810 /* Open UFFD file descriptor */
1811 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1812 if (uffd_fd < 0) {
1813 return uffd_fd;
1814 }
1815 rs->uffdio_fd = uffd_fd;
1816
1817 RCU_READ_LOCK_GUARD();
1818
1819 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1820 /* Nothing to do with read-only and MMIO-writable regions */
1821 if (block->mr->readonly || block->mr->rom_device) {
1822 continue;
1823 }
1824
1825 /* Register block memory with UFFD to track writes */
1826 if (uffd_register_memory(rs->uffdio_fd, block->host,
1827 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1828 goto fail;
1829 }
1830 /* Apply UFFD write protection to the block memory range */
1831 if (uffd_change_protection(rs->uffdio_fd, block->host,
1832 block->max_length, true, false)) {
1833 goto fail;
1834 }
1835 block->flags |= RAM_UF_WRITEPROTECT;
1836 memory_region_ref(block->mr);
1837
1838 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1839 block->host, block->max_length);
1840 }
1841
1842 return 0;
1843
1844 fail:
1845 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1846
1847 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1848 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1849 continue;
1850 }
1851 /*
1852 * In case some memory block failed to be write-protected
1853 * remove protection and unregister all succeeded RAM blocks
1854 */
1855 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1856 false, false);
1857 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1858 /* Cleanup flags and remove reference */
1859 block->flags &= ~RAM_UF_WRITEPROTECT;
1860 memory_region_unref(block->mr);
1861 }
1862
1863 uffd_close_fd(uffd_fd);
1864 rs->uffdio_fd = -1;
1865 return -1;
1866 }
1867
1868 /**
1869 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1870 */
1871 void ram_write_tracking_stop(void)
1872 {
1873 RAMState *rs = ram_state;
1874 RAMBlock *block;
1875
1876 RCU_READ_LOCK_GUARD();
1877
1878 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1879 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1880 continue;
1881 }
1882 /* Remove protection and unregister all affected RAM blocks */
1883 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1884 false, false);
1885 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1886
1887 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1888 block->host, block->max_length);
1889
1890 /* Cleanup flags and remove reference */
1891 block->flags &= ~RAM_UF_WRITEPROTECT;
1892 memory_region_unref(block->mr);
1893 }
1894
1895 /* Finally close UFFD file descriptor */
1896 uffd_close_fd(rs->uffdio_fd);
1897 rs->uffdio_fd = -1;
1898 }
1899
1900 #else
1901 /* No target OS support, stubs just fail or ignore */
1902
1903 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1904 {
1905 (void) rs;
1906 (void) offset;
1907
1908 return NULL;
1909 }
1910
1911 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1912 unsigned long start_page)
1913 {
1914 (void) rs;
1915 (void) pss;
1916 (void) start_page;
1917
1918 return 0;
1919 }
1920
1921 bool ram_write_tracking_available(void)
1922 {
1923 return false;
1924 }
1925
1926 bool ram_write_tracking_compatible(void)
1927 {
1928 assert(0);
1929 return false;
1930 }
1931
1932 int ram_write_tracking_start(void)
1933 {
1934 assert(0);
1935 return -1;
1936 }
1937
1938 void ram_write_tracking_stop(void)
1939 {
1940 assert(0);
1941 }
1942 #endif /* defined(__linux__) */
1943
1944 /**
1945 * get_queued_page: unqueue a page from the postcopy requests
1946 *
1947 * Skips pages that are already sent (!dirty)
1948 *
1949 * Returns true if a queued page is found
1950 *
1951 * @rs: current RAM state
1952 * @pss: data about the state of the current dirty page scan
1953 */
1954 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1955 {
1956 RAMBlock *block;
1957 ram_addr_t offset;
1958
1959 block = unqueue_page(rs, &offset);
1960
1961 if (!block) {
1962 /*
1963 * Poll write faults too if background snapshot is enabled; that's
1964 * when we have vcpus got blocked by the write protected pages.
1965 */
1966 block = poll_fault_page(rs, &offset);
1967 }
1968
1969 if (block) {
1970 /*
1971 * We want the background search to continue from the queued page
1972 * since the guest is likely to want other pages near to the page
1973 * it just requested.
1974 */
1975 pss->block = block;
1976 pss->page = offset >> TARGET_PAGE_BITS;
1977
1978 /*
1979 * This unqueued page would break the "one round" check, even is
1980 * really rare.
1981 */
1982 pss->complete_round = false;
1983 }
1984
1985 return !!block;
1986 }
1987
1988 /**
1989 * migration_page_queue_free: drop any remaining pages in the ram
1990 * request queue
1991 *
1992 * It should be empty at the end anyway, but in error cases there may
1993 * be some left. in case that there is any page left, we drop it.
1994 *
1995 */
1996 static void migration_page_queue_free(RAMState *rs)
1997 {
1998 struct RAMSrcPageRequest *mspr, *next_mspr;
1999 /* This queue generally should be empty - but in the case of a failed
2000 * migration might have some droppings in.
2001 */
2002 RCU_READ_LOCK_GUARD();
2003 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2004 memory_region_unref(mspr->rb->mr);
2005 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2006 g_free(mspr);
2007 }
2008 }
2009
2010 /**
2011 * ram_save_queue_pages: queue the page for transmission
2012 *
2013 * A request from postcopy destination for example.
2014 *
2015 * Returns zero on success or negative on error
2016 *
2017 * @rbname: Name of the RAMBLock of the request. NULL means the
2018 * same that last one.
2019 * @start: starting address from the start of the RAMBlock
2020 * @len: length (in bytes) to send
2021 */
2022 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2023 {
2024 RAMBlock *ramblock;
2025 RAMState *rs = ram_state;
2026
2027 ram_counters.postcopy_requests++;
2028 RCU_READ_LOCK_GUARD();
2029
2030 if (!rbname) {
2031 /* Reuse last RAMBlock */
2032 ramblock = rs->last_req_rb;
2033
2034 if (!ramblock) {
2035 /*
2036 * Shouldn't happen, we can't reuse the last RAMBlock if
2037 * it's the 1st request.
2038 */
2039 error_report("ram_save_queue_pages no previous block");
2040 return -1;
2041 }
2042 } else {
2043 ramblock = qemu_ram_block_by_name(rbname);
2044
2045 if (!ramblock) {
2046 /* We shouldn't be asked for a non-existent RAMBlock */
2047 error_report("ram_save_queue_pages no block '%s'", rbname);
2048 return -1;
2049 }
2050 rs->last_req_rb = ramblock;
2051 }
2052 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2053 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2054 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2055 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2056 __func__, start, len, ramblock->used_length);
2057 return -1;
2058 }
2059
2060 struct RAMSrcPageRequest *new_entry =
2061 g_malloc0(sizeof(struct RAMSrcPageRequest));
2062 new_entry->rb = ramblock;
2063 new_entry->offset = start;
2064 new_entry->len = len;
2065
2066 memory_region_ref(ramblock->mr);
2067 qemu_mutex_lock(&rs->src_page_req_mutex);
2068 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2069 migration_make_urgent_request();
2070 qemu_mutex_unlock(&rs->src_page_req_mutex);
2071
2072 return 0;
2073 }
2074
2075 static bool save_page_use_compression(RAMState *rs)
2076 {
2077 if (!migrate_use_compression()) {
2078 return false;
2079 }
2080
2081 /*
2082 * If xbzrle is enabled (e.g., after first round of migration), stop
2083 * using the data compression. In theory, xbzrle can do better than
2084 * compression.
2085 */
2086 if (rs->xbzrle_enabled) {
2087 return false;
2088 }
2089
2090 return true;
2091 }
2092
2093 /*
2094 * try to compress the page before posting it out, return true if the page
2095 * has been properly handled by compression, otherwise needs other
2096 * paths to handle it
2097 */
2098 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2099 {
2100 if (!save_page_use_compression(rs)) {
2101 return false;
2102 }
2103
2104 /*
2105 * When starting the process of a new block, the first page of
2106 * the block should be sent out before other pages in the same
2107 * block, and all the pages in last block should have been sent
2108 * out, keeping this order is important, because the 'cont' flag
2109 * is used to avoid resending the block name.
2110 *
2111 * We post the fist page as normal page as compression will take
2112 * much CPU resource.
2113 */
2114 if (block != rs->last_sent_block) {
2115 flush_compressed_data(rs);
2116 return false;
2117 }
2118
2119 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2120 return true;
2121 }
2122
2123 compression_counters.busy++;
2124 return false;
2125 }
2126
2127 /**
2128 * ram_save_target_page: save one target page
2129 *
2130 * Returns the number of pages written
2131 *
2132 * @rs: current RAM state
2133 * @pss: data about the page we want to send
2134 */
2135 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2136 {
2137 RAMBlock *block = pss->block;
2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2139 int res;
2140
2141 if (control_save_page(rs, block, offset, &res)) {
2142 return res;
2143 }
2144
2145 if (save_compress_page(rs, block, offset)) {
2146 return 1;
2147 }
2148
2149 res = save_zero_page(rs, block, offset);
2150 if (res > 0) {
2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2152 * page would be stale
2153 */
2154 if (!save_page_use_compression(rs)) {
2155 XBZRLE_cache_lock();
2156 xbzrle_cache_zero_page(rs, block->offset + offset);
2157 XBZRLE_cache_unlock();
2158 }
2159 return res;
2160 }
2161
2162 /*
2163 * Do not use multifd for:
2164 * 1. Compression as the first page in the new block should be posted out
2165 * before sending the compressed page
2166 * 2. In postcopy as one whole host page should be placed
2167 */
2168 if (!save_page_use_compression(rs) && migrate_use_multifd()
2169 && !migration_in_postcopy()) {
2170 return ram_save_multifd_page(rs, block, offset);
2171 }
2172
2173 return ram_save_page(rs, pss);
2174 }
2175
2176 /**
2177 * ram_save_host_page: save a whole host page
2178 *
2179 * Starting at *offset send pages up to the end of the current host
2180 * page. It's valid for the initial offset to point into the middle of
2181 * a host page in which case the remainder of the hostpage is sent.
2182 * Only dirty target pages are sent. Note that the host page size may
2183 * be a huge page for this block.
2184 * The saving stops at the boundary of the used_length of the block
2185 * if the RAMBlock isn't a multiple of the host page size.
2186 *
2187 * Returns the number of pages written or negative on error
2188 *
2189 * @rs: current RAM state
2190 * @pss: data about the page we want to send
2191 */
2192 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2193 {
2194 int tmppages, pages = 0;
2195 size_t pagesize_bits =
2196 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2197 unsigned long hostpage_boundary =
2198 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2199 unsigned long start_page = pss->page;
2200 int res;
2201
2202 if (ramblock_is_ignored(pss->block)) {
2203 error_report("block %s should not be migrated !", pss->block->idstr);
2204 return 0;
2205 }
2206
2207 do {
2208 /* Check the pages is dirty and if it is send it */
2209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2210 tmppages = ram_save_target_page(rs, pss);
2211 if (tmppages < 0) {
2212 return tmppages;
2213 }
2214
2215 pages += tmppages;
2216 /*
2217 * Allow rate limiting to happen in the middle of huge pages if
2218 * something is sent in the current iteration.
2219 */
2220 if (pagesize_bits > 1 && tmppages > 0) {
2221 migration_rate_limit();
2222 }
2223 }
2224 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2225 } while ((pss->page < hostpage_boundary) &&
2226 offset_in_ramblock(pss->block,
2227 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2228 /* The offset we leave with is the min boundary of host page and block */
2229 pss->page = MIN(pss->page, hostpage_boundary);
2230
2231 res = ram_save_release_protection(rs, pss, start_page);
2232 return (res < 0 ? res : pages);
2233 }
2234
2235 /**
2236 * ram_find_and_save_block: finds a dirty page and sends it to f
2237 *
2238 * Called within an RCU critical section.
2239 *
2240 * Returns the number of pages written where zero means no dirty pages,
2241 * or negative on error
2242 *
2243 * @rs: current RAM state
2244 *
2245 * On systems where host-page-size > target-page-size it will send all the
2246 * pages in a host page that are dirty.
2247 */
2248 static int ram_find_and_save_block(RAMState *rs)
2249 {
2250 PageSearchStatus pss;
2251 int pages = 0;
2252 bool again, found;
2253
2254 /* No dirty page as there is zero RAM */
2255 if (!ram_bytes_total()) {
2256 return pages;
2257 }
2258
2259 pss.block = rs->last_seen_block;
2260 pss.page = rs->last_page;
2261 pss.complete_round = false;
2262
2263 if (!pss.block) {
2264 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2265 }
2266
2267 do {
2268 again = true;
2269 found = get_queued_page(rs, &pss);
2270
2271 if (!found) {
2272 /* priority queue empty, so just search for something dirty */
2273 found = find_dirty_block(rs, &pss, &again);
2274 }
2275
2276 if (found) {
2277 pages = ram_save_host_page(rs, &pss);
2278 }
2279 } while (!pages && again);
2280
2281 rs->last_seen_block = pss.block;
2282 rs->last_page = pss.page;
2283
2284 return pages;
2285 }
2286
2287 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2288 {
2289 uint64_t pages = size / TARGET_PAGE_SIZE;
2290
2291 if (zero) {
2292 ram_counters.duplicate += pages;
2293 } else {
2294 ram_counters.normal += pages;
2295 ram_transferred_add(size);
2296 qemu_update_position(f, size);
2297 }
2298 }
2299
2300 static uint64_t ram_bytes_total_common(bool count_ignored)
2301 {
2302 RAMBlock *block;
2303 uint64_t total = 0;
2304
2305 RCU_READ_LOCK_GUARD();
2306
2307 if (count_ignored) {
2308 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2309 total += block->used_length;
2310 }
2311 } else {
2312 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2313 total += block->used_length;
2314 }
2315 }
2316 return total;
2317 }
2318
2319 uint64_t ram_bytes_total(void)
2320 {
2321 return ram_bytes_total_common(false);
2322 }
2323
2324 static void xbzrle_load_setup(void)
2325 {
2326 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2327 }
2328
2329 static void xbzrle_load_cleanup(void)
2330 {
2331 g_free(XBZRLE.decoded_buf);
2332 XBZRLE.decoded_buf = NULL;
2333 }
2334
2335 static void ram_state_cleanup(RAMState **rsp)
2336 {
2337 if (*rsp) {
2338 migration_page_queue_free(*rsp);
2339 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2340 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2341 g_free(*rsp);
2342 *rsp = NULL;
2343 }
2344 }
2345
2346 static void xbzrle_cleanup(void)
2347 {
2348 XBZRLE_cache_lock();
2349 if (XBZRLE.cache) {
2350 cache_fini(XBZRLE.cache);
2351 g_free(XBZRLE.encoded_buf);
2352 g_free(XBZRLE.current_buf);
2353 g_free(XBZRLE.zero_target_page);
2354 XBZRLE.cache = NULL;
2355 XBZRLE.encoded_buf = NULL;
2356 XBZRLE.current_buf = NULL;
2357 XBZRLE.zero_target_page = NULL;
2358 }
2359 XBZRLE_cache_unlock();
2360 }
2361
2362 static void ram_save_cleanup(void *opaque)
2363 {
2364 RAMState **rsp = opaque;
2365 RAMBlock *block;
2366
2367 /* We don't use dirty log with background snapshots */
2368 if (!migrate_background_snapshot()) {
2369 /* caller have hold iothread lock or is in a bh, so there is
2370 * no writing race against the migration bitmap
2371 */
2372 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2373 /*
2374 * do not stop dirty log without starting it, since
2375 * memory_global_dirty_log_stop will assert that
2376 * memory_global_dirty_log_start/stop used in pairs
2377 */
2378 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2379 }
2380 }
2381
2382 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2383 g_free(block->clear_bmap);
2384 block->clear_bmap = NULL;
2385 g_free(block->bmap);
2386 block->bmap = NULL;
2387 }
2388
2389 xbzrle_cleanup();
2390 compress_threads_save_cleanup();
2391 ram_state_cleanup(rsp);
2392 }
2393
2394 static void ram_state_reset(RAMState *rs)
2395 {
2396 rs->last_seen_block = NULL;
2397 rs->last_sent_block = NULL;
2398 rs->last_page = 0;
2399 rs->last_version = ram_list.version;
2400 rs->xbzrle_enabled = false;
2401 }
2402
2403 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2404
2405 /* **** functions for postcopy ***** */
2406
2407 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2408 {
2409 struct RAMBlock *block;
2410
2411 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2412 unsigned long *bitmap = block->bmap;
2413 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2414 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2415
2416 while (run_start < range) {
2417 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2418 ram_discard_range(block->idstr,
2419 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2420 ((ram_addr_t)(run_end - run_start))
2421 << TARGET_PAGE_BITS);
2422 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2423 }
2424 }
2425 }
2426
2427 /**
2428 * postcopy_send_discard_bm_ram: discard a RAMBlock
2429 *
2430 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2431 *
2432 * @ms: current migration state
2433 * @block: RAMBlock to discard
2434 */
2435 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2436 {
2437 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2438 unsigned long current;
2439 unsigned long *bitmap = block->bmap;
2440
2441 for (current = 0; current < end; ) {
2442 unsigned long one = find_next_bit(bitmap, end, current);
2443 unsigned long zero, discard_length;
2444
2445 if (one >= end) {
2446 break;
2447 }
2448
2449 zero = find_next_zero_bit(bitmap, end, one + 1);
2450
2451 if (zero >= end) {
2452 discard_length = end - one;
2453 } else {
2454 discard_length = zero - one;
2455 }
2456 postcopy_discard_send_range(ms, one, discard_length);
2457 current = one + discard_length;
2458 }
2459 }
2460
2461 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2462
2463 /**
2464 * postcopy_each_ram_send_discard: discard all RAMBlocks
2465 *
2466 * Utility for the outgoing postcopy code.
2467 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2468 * passing it bitmap indexes and name.
2469 * (qemu_ram_foreach_block ends up passing unscaled lengths
2470 * which would mean postcopy code would have to deal with target page)
2471 *
2472 * @ms: current migration state
2473 */
2474 static void postcopy_each_ram_send_discard(MigrationState *ms)
2475 {
2476 struct RAMBlock *block;
2477
2478 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2479 postcopy_discard_send_init(ms, block->idstr);
2480
2481 /*
2482 * Deal with TPS != HPS and huge pages. It discard any partially sent
2483 * host-page size chunks, mark any partially dirty host-page size
2484 * chunks as all dirty. In this case the host-page is the host-page
2485 * for the particular RAMBlock, i.e. it might be a huge page.
2486 */
2487 postcopy_chunk_hostpages_pass(ms, block);
2488
2489 /*
2490 * Postcopy sends chunks of bitmap over the wire, but it
2491 * just needs indexes at this point, avoids it having
2492 * target page specific code.
2493 */
2494 postcopy_send_discard_bm_ram(ms, block);
2495 postcopy_discard_send_finish(ms);
2496 }
2497 }
2498
2499 /**
2500 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2501 *
2502 * Helper for postcopy_chunk_hostpages; it's called twice to
2503 * canonicalize the two bitmaps, that are similar, but one is
2504 * inverted.
2505 *
2506 * Postcopy requires that all target pages in a hostpage are dirty or
2507 * clean, not a mix. This function canonicalizes the bitmaps.
2508 *
2509 * @ms: current migration state
2510 * @block: block that contains the page we want to canonicalize
2511 */
2512 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2513 {
2514 RAMState *rs = ram_state;
2515 unsigned long *bitmap = block->bmap;
2516 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2517 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2518 unsigned long run_start;
2519
2520 if (block->page_size == TARGET_PAGE_SIZE) {
2521 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2522 return;
2523 }
2524
2525 /* Find a dirty page */
2526 run_start = find_next_bit(bitmap, pages, 0);
2527
2528 while (run_start < pages) {
2529
2530 /*
2531 * If the start of this run of pages is in the middle of a host
2532 * page, then we need to fixup this host page.
2533 */
2534 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2535 /* Find the end of this run */
2536 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2537 /*
2538 * If the end isn't at the start of a host page, then the
2539 * run doesn't finish at the end of a host page
2540 * and we need to discard.
2541 */
2542 }
2543
2544 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2545 unsigned long page;
2546 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2547 host_ratio);
2548 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2549
2550 /* Clean up the bitmap */
2551 for (page = fixup_start_addr;
2552 page < fixup_start_addr + host_ratio; page++) {
2553 /*
2554 * Remark them as dirty, updating the count for any pages
2555 * that weren't previously dirty.
2556 */
2557 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2558 }
2559 }
2560
2561 /* Find the next dirty page for the next iteration */
2562 run_start = find_next_bit(bitmap, pages, run_start);
2563 }
2564 }
2565
2566 /**
2567 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2568 *
2569 * Transmit the set of pages to be discarded after precopy to the target
2570 * these are pages that:
2571 * a) Have been previously transmitted but are now dirty again
2572 * b) Pages that have never been transmitted, this ensures that
2573 * any pages on the destination that have been mapped by background
2574 * tasks get discarded (transparent huge pages is the specific concern)
2575 * Hopefully this is pretty sparse
2576 *
2577 * @ms: current migration state
2578 */
2579 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2580 {
2581 RAMState *rs = ram_state;
2582
2583 RCU_READ_LOCK_GUARD();
2584
2585 /* This should be our last sync, the src is now paused */
2586 migration_bitmap_sync(rs);
2587
2588 /* Easiest way to make sure we don't resume in the middle of a host-page */
2589 rs->last_seen_block = NULL;
2590 rs->last_sent_block = NULL;
2591 rs->last_page = 0;
2592
2593 postcopy_each_ram_send_discard(ms);
2594
2595 trace_ram_postcopy_send_discard_bitmap();
2596 }
2597
2598 /**
2599 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2600 *
2601 * Returns zero on success
2602 *
2603 * @rbname: name of the RAMBlock of the request. NULL means the
2604 * same that last one.
2605 * @start: RAMBlock starting page
2606 * @length: RAMBlock size
2607 */
2608 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2609 {
2610 trace_ram_discard_range(rbname, start, length);
2611
2612 RCU_READ_LOCK_GUARD();
2613 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2614
2615 if (!rb) {
2616 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2617 return -1;
2618 }
2619
2620 /*
2621 * On source VM, we don't need to update the received bitmap since
2622 * we don't even have one.
2623 */
2624 if (rb->receivedmap) {
2625 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2626 length >> qemu_target_page_bits());
2627 }
2628
2629 return ram_block_discard_range(rb, start, length);
2630 }
2631
2632 /*
2633 * For every allocation, we will try not to crash the VM if the
2634 * allocation failed.
2635 */
2636 static int xbzrle_init(void)
2637 {
2638 Error *local_err = NULL;
2639
2640 if (!migrate_use_xbzrle()) {
2641 return 0;
2642 }
2643
2644 XBZRLE_cache_lock();
2645
2646 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2647 if (!XBZRLE.zero_target_page) {
2648 error_report("%s: Error allocating zero page", __func__);
2649 goto err_out;
2650 }
2651
2652 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2653 TARGET_PAGE_SIZE, &local_err);
2654 if (!XBZRLE.cache) {
2655 error_report_err(local_err);
2656 goto free_zero_page;
2657 }
2658
2659 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2660 if (!XBZRLE.encoded_buf) {
2661 error_report("%s: Error allocating encoded_buf", __func__);
2662 goto free_cache;
2663 }
2664
2665 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2666 if (!XBZRLE.current_buf) {
2667 error_report("%s: Error allocating current_buf", __func__);
2668 goto free_encoded_buf;
2669 }
2670
2671 /* We are all good */
2672 XBZRLE_cache_unlock();
2673 return 0;
2674
2675 free_encoded_buf:
2676 g_free(XBZRLE.encoded_buf);
2677 XBZRLE.encoded_buf = NULL;
2678 free_cache:
2679 cache_fini(XBZRLE.cache);
2680 XBZRLE.cache = NULL;
2681 free_zero_page:
2682 g_free(XBZRLE.zero_target_page);
2683 XBZRLE.zero_target_page = NULL;
2684 err_out:
2685 XBZRLE_cache_unlock();
2686 return -ENOMEM;
2687 }
2688
2689 static int ram_state_init(RAMState **rsp)
2690 {
2691 *rsp = g_try_new0(RAMState, 1);
2692
2693 if (!*rsp) {
2694 error_report("%s: Init ramstate fail", __func__);
2695 return -1;
2696 }
2697
2698 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2699 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2700 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2701
2702 /*
2703 * Count the total number of pages used by ram blocks not including any
2704 * gaps due to alignment or unplugs.
2705 * This must match with the initial values of dirty bitmap.
2706 */
2707 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2708 ram_state_reset(*rsp);
2709
2710 return 0;
2711 }
2712
2713 static void ram_list_init_bitmaps(void)
2714 {
2715 MigrationState *ms = migrate_get_current();
2716 RAMBlock *block;
2717 unsigned long pages;
2718 uint8_t shift;
2719
2720 /* Skip setting bitmap if there is no RAM */
2721 if (ram_bytes_total()) {
2722 shift = ms->clear_bitmap_shift;
2723 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2724 error_report("clear_bitmap_shift (%u) too big, using "
2725 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2726 shift = CLEAR_BITMAP_SHIFT_MAX;
2727 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2728 error_report("clear_bitmap_shift (%u) too small, using "
2729 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2730 shift = CLEAR_BITMAP_SHIFT_MIN;
2731 }
2732
2733 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2734 pages = block->max_length >> TARGET_PAGE_BITS;
2735 /*
2736 * The initial dirty bitmap for migration must be set with all
2737 * ones to make sure we'll migrate every guest RAM page to
2738 * destination.
2739 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2740 * new migration after a failed migration, ram_list.
2741 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2742 * guest memory.
2743 */
2744 block->bmap = bitmap_new(pages);
2745 bitmap_set(block->bmap, 0, pages);
2746 block->clear_bmap_shift = shift;
2747 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2748 }
2749 }
2750 }
2751
2752 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2753 {
2754 unsigned long pages;
2755 RAMBlock *rb;
2756
2757 RCU_READ_LOCK_GUARD();
2758
2759 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2760 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2761 rs->migration_dirty_pages -= pages;
2762 }
2763 }
2764
2765 static void ram_init_bitmaps(RAMState *rs)
2766 {
2767 /* For memory_global_dirty_log_start below. */
2768 qemu_mutex_lock_iothread();
2769 qemu_mutex_lock_ramlist();
2770
2771 WITH_RCU_READ_LOCK_GUARD() {
2772 ram_list_init_bitmaps();
2773 /* We don't use dirty log with background snapshots */
2774 if (!migrate_background_snapshot()) {
2775 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2776 migration_bitmap_sync_precopy(rs);
2777 }
2778 }
2779 qemu_mutex_unlock_ramlist();
2780 qemu_mutex_unlock_iothread();
2781
2782 /*
2783 * After an eventual first bitmap sync, fixup the initial bitmap
2784 * containing all 1s to exclude any discarded pages from migration.
2785 */
2786 migration_bitmap_clear_discarded_pages(rs);
2787 }
2788
2789 static int ram_init_all(RAMState **rsp)
2790 {
2791 if (ram_state_init(rsp)) {
2792 return -1;
2793 }
2794
2795 if (xbzrle_init()) {
2796 ram_state_cleanup(rsp);
2797 return -1;
2798 }
2799
2800 ram_init_bitmaps(*rsp);
2801
2802 return 0;
2803 }
2804
2805 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2806 {
2807 RAMBlock *block;
2808 uint64_t pages = 0;
2809
2810 /*
2811 * Postcopy is not using xbzrle/compression, so no need for that.
2812 * Also, since source are already halted, we don't need to care
2813 * about dirty page logging as well.
2814 */
2815
2816 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2817 pages += bitmap_count_one(block->bmap,
2818 block->used_length >> TARGET_PAGE_BITS);
2819 }
2820
2821 /* This may not be aligned with current bitmaps. Recalculate. */
2822 rs->migration_dirty_pages = pages;
2823
2824 ram_state_reset(rs);
2825
2826 /* Update RAMState cache of output QEMUFile */
2827 rs->f = out;
2828
2829 trace_ram_state_resume_prepare(pages);
2830 }
2831
2832 /*
2833 * This function clears bits of the free pages reported by the caller from the
2834 * migration dirty bitmap. @addr is the host address corresponding to the
2835 * start of the continuous guest free pages, and @len is the total bytes of
2836 * those pages.
2837 */
2838 void qemu_guest_free_page_hint(void *addr, size_t len)
2839 {
2840 RAMBlock *block;
2841 ram_addr_t offset;
2842 size_t used_len, start, npages;
2843 MigrationState *s = migrate_get_current();
2844
2845 /* This function is currently expected to be used during live migration */
2846 if (!migration_is_setup_or_active(s->state)) {
2847 return;
2848 }
2849
2850 for (; len > 0; len -= used_len, addr += used_len) {
2851 block = qemu_ram_block_from_host(addr, false, &offset);
2852 if (unlikely(!block || offset >= block->used_length)) {
2853 /*
2854 * The implementation might not support RAMBlock resize during
2855 * live migration, but it could happen in theory with future
2856 * updates. So we add a check here to capture that case.
2857 */
2858 error_report_once("%s unexpected error", __func__);
2859 return;
2860 }
2861
2862 if (len <= block->used_length - offset) {
2863 used_len = len;
2864 } else {
2865 used_len = block->used_length - offset;
2866 }
2867
2868 start = offset >> TARGET_PAGE_BITS;
2869 npages = used_len >> TARGET_PAGE_BITS;
2870
2871 qemu_mutex_lock(&ram_state->bitmap_mutex);
2872 /*
2873 * The skipped free pages are equavalent to be sent from clear_bmap's
2874 * perspective, so clear the bits from the memory region bitmap which
2875 * are initially set. Otherwise those skipped pages will be sent in
2876 * the next round after syncing from the memory region bitmap.
2877 */
2878 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2879 ram_state->migration_dirty_pages -=
2880 bitmap_count_one_with_offset(block->bmap, start, npages);
2881 bitmap_clear(block->bmap, start, npages);
2882 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2883 }
2884 }
2885
2886 /*
2887 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2888 * long-running RCU critical section. When rcu-reclaims in the code
2889 * start to become numerous it will be necessary to reduce the
2890 * granularity of these critical sections.
2891 */
2892
2893 /**
2894 * ram_save_setup: Setup RAM for migration
2895 *
2896 * Returns zero to indicate success and negative for error
2897 *
2898 * @f: QEMUFile where to send the data
2899 * @opaque: RAMState pointer
2900 */
2901 static int ram_save_setup(QEMUFile *f, void *opaque)
2902 {
2903 RAMState **rsp = opaque;
2904 RAMBlock *block;
2905
2906 if (compress_threads_save_setup()) {
2907 return -1;
2908 }
2909
2910 /* migration has already setup the bitmap, reuse it. */
2911 if (!migration_in_colo_state()) {
2912 if (ram_init_all(rsp) != 0) {
2913 compress_threads_save_cleanup();
2914 return -1;
2915 }
2916 }
2917 (*rsp)->f = f;
2918
2919 WITH_RCU_READ_LOCK_GUARD() {
2920 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2921
2922 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2923 qemu_put_byte(f, strlen(block->idstr));
2924 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2925 qemu_put_be64(f, block->used_length);
2926 if (migrate_postcopy_ram() && block->page_size !=
2927 qemu_host_page_size) {
2928 qemu_put_be64(f, block->page_size);
2929 }
2930 if (migrate_ignore_shared()) {
2931 qemu_put_be64(f, block->mr->addr);
2932 }
2933 }
2934 }
2935
2936 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2937 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2938
2939 multifd_send_sync_main(f);
2940 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2941 qemu_fflush(f);
2942
2943 return 0;
2944 }
2945
2946 /**
2947 * ram_save_iterate: iterative stage for migration
2948 *
2949 * Returns zero to indicate success and negative for error
2950 *
2951 * @f: QEMUFile where to send the data
2952 * @opaque: RAMState pointer
2953 */
2954 static int ram_save_iterate(QEMUFile *f, void *opaque)
2955 {
2956 RAMState **temp = opaque;
2957 RAMState *rs = *temp;
2958 int ret = 0;
2959 int i;
2960 int64_t t0;
2961 int done = 0;
2962
2963 if (blk_mig_bulk_active()) {
2964 /* Avoid transferring ram during bulk phase of block migration as
2965 * the bulk phase will usually take a long time and transferring
2966 * ram updates during that time is pointless. */
2967 goto out;
2968 }
2969
2970 /*
2971 * We'll take this lock a little bit long, but it's okay for two reasons.
2972 * Firstly, the only possible other thread to take it is who calls
2973 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2974 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2975 * guarantees that we'll at least released it in a regular basis.
2976 */
2977 qemu_mutex_lock(&rs->bitmap_mutex);
2978 WITH_RCU_READ_LOCK_GUARD() {
2979 if (ram_list.version != rs->last_version) {
2980 ram_state_reset(rs);
2981 }
2982
2983 /* Read version before ram_list.blocks */
2984 smp_rmb();
2985
2986 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2987
2988 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2989 i = 0;
2990 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2991 postcopy_has_request(rs)) {
2992 int pages;
2993
2994 if (qemu_file_get_error(f)) {
2995 break;
2996 }
2997
2998 pages = ram_find_and_save_block(rs);
2999 /* no more pages to sent */
3000 if (pages == 0) {
3001 done = 1;
3002 break;
3003 }
3004
3005 if (pages < 0) {
3006 qemu_file_set_error(f, pages);
3007 break;
3008 }
3009
3010 rs->target_page_count += pages;
3011
3012 /*
3013 * During postcopy, it is necessary to make sure one whole host
3014 * page is sent in one chunk.
3015 */
3016 if (migrate_postcopy_ram()) {
3017 flush_compressed_data(rs);
3018 }
3019
3020 /*
3021 * we want to check in the 1st loop, just in case it was the 1st
3022 * time and we had to sync the dirty bitmap.
3023 * qemu_clock_get_ns() is a bit expensive, so we only check each
3024 * some iterations
3025 */
3026 if ((i & 63) == 0) {
3027 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3028 1000000;
3029 if (t1 > MAX_WAIT) {
3030 trace_ram_save_iterate_big_wait(t1, i);
3031 break;
3032 }
3033 }
3034 i++;
3035 }
3036 }
3037 qemu_mutex_unlock(&rs->bitmap_mutex);
3038
3039 /*
3040 * Must occur before EOS (or any QEMUFile operation)
3041 * because of RDMA protocol.
3042 */
3043 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3044
3045 out:
3046 if (ret >= 0
3047 && migration_is_setup_or_active(migrate_get_current()->state)) {
3048 multifd_send_sync_main(rs->f);
3049 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3050 qemu_fflush(f);
3051 ram_transferred_add(8);
3052
3053 ret = qemu_file_get_error(f);
3054 }
3055 if (ret < 0) {
3056 return ret;
3057 }
3058
3059 return done;
3060 }
3061
3062 /**
3063 * ram_save_complete: function called to send the remaining amount of ram
3064 *
3065 * Returns zero to indicate success or negative on error
3066 *
3067 * Called with iothread lock
3068 *
3069 * @f: QEMUFile where to send the data
3070 * @opaque: RAMState pointer
3071 */
3072 static int ram_save_complete(QEMUFile *f, void *opaque)
3073 {
3074 RAMState **temp = opaque;
3075 RAMState *rs = *temp;
3076 int ret = 0;
3077
3078 rs->last_stage = !migration_in_colo_state();
3079
3080 WITH_RCU_READ_LOCK_GUARD() {
3081 if (!migration_in_postcopy()) {
3082 migration_bitmap_sync_precopy(rs);
3083 }
3084
3085 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3086
3087 /* try transferring iterative blocks of memory */
3088
3089 /* flush all remaining blocks regardless of rate limiting */
3090 while (true) {
3091 int pages;
3092
3093 pages = ram_find_and_save_block(rs);
3094 /* no more blocks to sent */
3095 if (pages == 0) {
3096 break;
3097 }
3098 if (pages < 0) {
3099 ret = pages;
3100 break;
3101 }
3102 }
3103
3104 flush_compressed_data(rs);
3105 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3106 }
3107
3108 if (ret >= 0) {
3109 multifd_send_sync_main(rs->f);
3110 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3111 qemu_fflush(f);
3112 }
3113
3114 return ret;
3115 }
3116
3117 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3118 uint64_t *res_precopy_only,
3119 uint64_t *res_compatible,
3120 uint64_t *res_postcopy_only)
3121 {
3122 RAMState **temp = opaque;
3123 RAMState *rs = *temp;
3124 uint64_t remaining_size;
3125
3126 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3127
3128 if (!migration_in_postcopy() &&
3129 remaining_size < max_size) {
3130 qemu_mutex_lock_iothread();
3131 WITH_RCU_READ_LOCK_GUARD() {
3132 migration_bitmap_sync_precopy(rs);
3133 }
3134 qemu_mutex_unlock_iothread();
3135 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3136 }
3137
3138 if (migrate_postcopy_ram()) {
3139 /* We can do postcopy, and all the data is postcopiable */
3140 *res_compatible += remaining_size;
3141 } else {
3142 *res_precopy_only += remaining_size;
3143 }
3144 }
3145
3146 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3147 {
3148 unsigned int xh_len;
3149 int xh_flags;
3150 uint8_t *loaded_data;
3151
3152 /* extract RLE header */
3153 xh_flags = qemu_get_byte(f);
3154 xh_len = qemu_get_be16(f);
3155
3156 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3157 error_report("Failed to load XBZRLE page - wrong compression!");
3158 return -1;
3159 }
3160
3161 if (xh_len > TARGET_PAGE_SIZE) {
3162 error_report("Failed to load XBZRLE page - len overflow!");
3163 return -1;
3164 }
3165 loaded_data = XBZRLE.decoded_buf;
3166 /* load data and decode */
3167 /* it can change loaded_data to point to an internal buffer */
3168 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3169
3170 /* decode RLE */
3171 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3172 TARGET_PAGE_SIZE) == -1) {
3173 error_report("Failed to load XBZRLE page - decode error!");
3174 return -1;
3175 }
3176
3177 return 0;
3178 }
3179
3180 /**
3181 * ram_block_from_stream: read a RAMBlock id from the migration stream
3182 *
3183 * Must be called from within a rcu critical section.
3184 *
3185 * Returns a pointer from within the RCU-protected ram_list.
3186 *
3187 * @f: QEMUFile where to read the data from
3188 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3189 */
3190 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3191 {
3192 static RAMBlock *block;
3193 char id[256];
3194 uint8_t len;
3195
3196 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3197 if (!block) {
3198 error_report("Ack, bad migration stream!");
3199 return NULL;
3200 }
3201 return block;
3202 }
3203
3204 len = qemu_get_byte(f);
3205 qemu_get_buffer(f, (uint8_t *)id, len);
3206 id[len] = 0;
3207
3208 block = qemu_ram_block_by_name(id);
3209 if (!block) {
3210 error_report("Can't find block %s", id);
3211 return NULL;
3212 }
3213
3214 if (ramblock_is_ignored(block)) {
3215 error_report("block %s should not be migrated !", id);
3216 return NULL;
3217 }
3218
3219 return block;
3220 }
3221
3222 static inline void *host_from_ram_block_offset(RAMBlock *block,
3223 ram_addr_t offset)
3224 {
3225 if (!offset_in_ramblock(block, offset)) {
3226 return NULL;
3227 }
3228
3229 return block->host + offset;
3230 }
3231
3232 static void *host_page_from_ram_block_offset(RAMBlock *block,
3233 ram_addr_t offset)
3234 {
3235 /* Note: Explicitly no check against offset_in_ramblock(). */
3236 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3237 block->page_size);
3238 }
3239
3240 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3241 ram_addr_t offset)
3242 {
3243 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3244 }
3245
3246 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3247 ram_addr_t offset, bool record_bitmap)
3248 {
3249 if (!offset_in_ramblock(block, offset)) {
3250 return NULL;
3251 }
3252 if (!block->colo_cache) {
3253 error_report("%s: colo_cache is NULL in block :%s",
3254 __func__, block->idstr);
3255 return NULL;
3256 }
3257
3258 /*
3259 * During colo checkpoint, we need bitmap of these migrated pages.
3260 * It help us to decide which pages in ram cache should be flushed
3261 * into VM's RAM later.
3262 */
3263 if (record_bitmap &&
3264 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3265 ram_state->migration_dirty_pages++;
3266 }
3267 return block->colo_cache + offset;
3268 }
3269
3270 /**
3271 * ram_handle_compressed: handle the zero page case
3272 *
3273 * If a page (or a whole RDMA chunk) has been
3274 * determined to be zero, then zap it.
3275 *
3276 * @host: host address for the zero page
3277 * @ch: what the page is filled from. We only support zero
3278 * @size: size of the zero page
3279 */
3280 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3281 {
3282 if (ch != 0 || !buffer_is_zero(host, size)) {
3283 memset(host, ch, size);
3284 }
3285 }
3286
3287 /* return the size after decompression, or negative value on error */
3288 static int
3289 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3290 const uint8_t *source, size_t source_len)
3291 {
3292 int err;
3293
3294 err = inflateReset(stream);
3295 if (err != Z_OK) {
3296 return -1;
3297 }
3298
3299 stream->avail_in = source_len;
3300 stream->next_in = (uint8_t *)source;
3301 stream->avail_out = dest_len;
3302 stream->next_out = dest;
3303
3304 err = inflate(stream, Z_NO_FLUSH);
3305 if (err != Z_STREAM_END) {
3306 return -1;
3307 }
3308
3309 return stream->total_out;
3310 }
3311
3312 static void *do_data_decompress(void *opaque)
3313 {
3314 DecompressParam *param = opaque;
3315 unsigned long pagesize;
3316 uint8_t *des;
3317 int len, ret;
3318
3319 qemu_mutex_lock(&param->mutex);
3320 while (!param->quit) {
3321 if (param->des) {
3322 des = param->des;
3323 len = param->len;
3324 param->des = 0;
3325 qemu_mutex_unlock(&param->mutex);
3326
3327 pagesize = TARGET_PAGE_SIZE;
3328
3329 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3330 param->compbuf, len);
3331 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3332 error_report("decompress data failed");
3333 qemu_file_set_error(decomp_file, ret);
3334 }
3335
3336 qemu_mutex_lock(&decomp_done_lock);
3337 param->done = true;
3338 qemu_cond_signal(&decomp_done_cond);
3339 qemu_mutex_unlock(&decomp_done_lock);
3340
3341 qemu_mutex_lock(&param->mutex);
3342 } else {
3343 qemu_cond_wait(&param->cond, &param->mutex);
3344 }
3345 }
3346 qemu_mutex_unlock(&param->mutex);
3347
3348 return NULL;
3349 }
3350
3351 static int wait_for_decompress_done(void)
3352 {
3353 int idx, thread_count;
3354
3355 if (!migrate_use_compression()) {
3356 return 0;
3357 }
3358
3359 thread_count = migrate_decompress_threads();
3360 qemu_mutex_lock(&decomp_done_lock);
3361 for (idx = 0; idx < thread_count; idx++) {
3362 while (!decomp_param[idx].done) {
3363 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3364 }
3365 }
3366 qemu_mutex_unlock(&decomp_done_lock);
3367 return qemu_file_get_error(decomp_file);
3368 }
3369
3370 static void compress_threads_load_cleanup(void)
3371 {
3372 int i, thread_count;
3373
3374 if (!migrate_use_compression()) {
3375 return;
3376 }
3377 thread_count = migrate_decompress_threads();
3378 for (i = 0; i < thread_count; i++) {
3379 /*
3380 * we use it as a indicator which shows if the thread is
3381 * properly init'd or not
3382 */
3383 if (!decomp_param[i].compbuf) {
3384 break;
3385 }
3386
3387 qemu_mutex_lock(&decomp_param[i].mutex);
3388 decomp_param[i].quit = true;
3389 qemu_cond_signal(&decomp_param[i].cond);
3390 qemu_mutex_unlock(&decomp_param[i].mutex);
3391 }
3392 for (i = 0; i < thread_count; i++) {
3393 if (!decomp_param[i].compbuf) {
3394 break;
3395 }
3396
3397 qemu_thread_join(decompress_threads + i);
3398 qemu_mutex_destroy(&decomp_param[i].mutex);
3399 qemu_cond_destroy(&decomp_param[i].cond);
3400 inflateEnd(&decomp_param[i].stream);
3401 g_free(decomp_param[i].compbuf);
3402 decomp_param[i].compbuf = NULL;
3403 }
3404 g_free(decompress_threads);
3405 g_free(decomp_param);
3406 decompress_threads = NULL;
3407 decomp_param = NULL;
3408 decomp_file = NULL;
3409 }
3410
3411 static int compress_threads_load_setup(QEMUFile *f)
3412 {
3413 int i, thread_count;
3414
3415 if (!migrate_use_compression()) {
3416 return 0;
3417 }
3418
3419 thread_count = migrate_decompress_threads();
3420 decompress_threads = g_new0(QemuThread, thread_count);
3421 decomp_param = g_new0(DecompressParam, thread_count);
3422 qemu_mutex_init(&decomp_done_lock);
3423 qemu_cond_init(&decomp_done_cond);
3424 decomp_file = f;
3425 for (i = 0; i < thread_count; i++) {
3426 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3427 goto exit;
3428 }
3429
3430 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3431 qemu_mutex_init(&decomp_param[i].mutex);
3432 qemu_cond_init(&decomp_param[i].cond);
3433 decomp_param[i].done = true;
3434 decomp_param[i].quit = false;
3435 qemu_thread_create(decompress_threads + i, "decompress",
3436 do_data_decompress, decomp_param + i,
3437 QEMU_THREAD_JOINABLE);
3438 }
3439 return 0;
3440 exit:
3441 compress_threads_load_cleanup();
3442 return -1;
3443 }
3444
3445 static void decompress_data_with_multi_threads(QEMUFile *f,
3446 void *host, int len)
3447 {
3448 int idx, thread_count;
3449
3450 thread_count = migrate_decompress_threads();
3451 QEMU_LOCK_GUARD(&decomp_done_lock);
3452 while (true) {
3453 for (idx = 0; idx < thread_count; idx++) {
3454 if (decomp_param[idx].done) {
3455 decomp_param[idx].done = false;
3456 qemu_mutex_lock(&decomp_param[idx].mutex);
3457 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3458 decomp_param[idx].des = host;
3459 decomp_param[idx].len = len;
3460 qemu_cond_signal(&decomp_param[idx].cond);
3461 qemu_mutex_unlock(&decomp_param[idx].mutex);
3462 break;
3463 }
3464 }
3465 if (idx < thread_count) {
3466 break;
3467 } else {
3468 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3469 }
3470 }
3471 }
3472
3473 static void colo_init_ram_state(void)
3474 {
3475 ram_state_init(&ram_state);
3476 }
3477
3478 /*
3479 * colo cache: this is for secondary VM, we cache the whole
3480 * memory of the secondary VM, it is need to hold the global lock
3481 * to call this helper.
3482 */
3483 int colo_init_ram_cache(void)
3484 {
3485 RAMBlock *block;
3486
3487 WITH_RCU_READ_LOCK_GUARD() {
3488 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3489 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3490 NULL, false, false);
3491 if (!block->colo_cache) {
3492 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3493 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3494 block->used_length);
3495 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3496 if (block->colo_cache) {
3497 qemu_anon_ram_free(block->colo_cache, block->used_length);
3498 block->colo_cache = NULL;
3499 }
3500 }
3501 return -errno;
3502 }
3503 if (!machine_dump_guest_core(current_machine)) {
3504 qemu_madvise(block->colo_cache, block->used_length,
3505 QEMU_MADV_DONTDUMP);
3506 }
3507 }
3508 }
3509
3510 /*
3511 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3512 * with to decide which page in cache should be flushed into SVM's RAM. Here
3513 * we use the same name 'ram_bitmap' as for migration.
3514 */
3515 if (ram_bytes_total()) {
3516 RAMBlock *block;
3517
3518 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3519 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3520 block->bmap = bitmap_new(pages);
3521 }
3522 }
3523
3524 colo_init_ram_state();
3525 return 0;
3526 }
3527
3528 /* TODO: duplicated with ram_init_bitmaps */
3529 void colo_incoming_start_dirty_log(void)
3530 {
3531 RAMBlock *block = NULL;
3532 /* For memory_global_dirty_log_start below. */
3533 qemu_mutex_lock_iothread();
3534 qemu_mutex_lock_ramlist();
3535
3536 memory_global_dirty_log_sync();
3537 WITH_RCU_READ_LOCK_GUARD() {
3538 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3539 ramblock_sync_dirty_bitmap(ram_state, block);
3540 /* Discard this dirty bitmap record */
3541 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3542 }
3543 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3544 }
3545 ram_state->migration_dirty_pages = 0;
3546 qemu_mutex_unlock_ramlist();
3547 qemu_mutex_unlock_iothread();
3548 }
3549
3550 /* It is need to hold the global lock to call this helper */
3551 void colo_release_ram_cache(void)
3552 {
3553 RAMBlock *block;
3554
3555 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3556 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3557 g_free(block->bmap);
3558 block->bmap = NULL;
3559 }
3560
3561 WITH_RCU_READ_LOCK_GUARD() {
3562 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3563 if (block->colo_cache) {
3564 qemu_anon_ram_free(block->colo_cache, block->used_length);
3565 block->colo_cache = NULL;
3566 }
3567 }
3568 }
3569 ram_state_cleanup(&ram_state);
3570 }
3571
3572 /**
3573 * ram_load_setup: Setup RAM for migration incoming side
3574 *
3575 * Returns zero to indicate success and negative for error
3576 *
3577 * @f: QEMUFile where to receive the data
3578 * @opaque: RAMState pointer
3579 */
3580 static int ram_load_setup(QEMUFile *f, void *opaque)
3581 {
3582 if (compress_threads_load_setup(f)) {
3583 return -1;
3584 }
3585
3586 xbzrle_load_setup();
3587 ramblock_recv_map_init();
3588
3589 return 0;
3590 }
3591
3592 static int ram_load_cleanup(void *opaque)
3593 {
3594 RAMBlock *rb;
3595
3596 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3597 qemu_ram_block_writeback(rb);
3598 }
3599
3600 xbzrle_load_cleanup();
3601 compress_threads_load_cleanup();
3602
3603 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3604 g_free(rb->receivedmap);
3605 rb->receivedmap = NULL;
3606 }
3607
3608 return 0;
3609 }
3610
3611 /**
3612 * ram_postcopy_incoming_init: allocate postcopy data structures
3613 *
3614 * Returns 0 for success and negative if there was one error
3615 *
3616 * @mis: current migration incoming state
3617 *
3618 * Allocate data structures etc needed by incoming migration with
3619 * postcopy-ram. postcopy-ram's similarly names
3620 * postcopy_ram_incoming_init does the work.
3621 */
3622 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3623 {
3624 return postcopy_ram_incoming_init(mis);
3625 }
3626
3627 /**
3628 * ram_load_postcopy: load a page in postcopy case
3629 *
3630 * Returns 0 for success or -errno in case of error
3631 *
3632 * Called in postcopy mode by ram_load().
3633 * rcu_read_lock is taken prior to this being called.
3634 *
3635 * @f: QEMUFile where to send the data
3636 */
3637 static int ram_load_postcopy(QEMUFile *f)
3638 {
3639 int flags = 0, ret = 0;
3640 bool place_needed = false;
3641 bool matches_target_page_size = false;
3642 MigrationIncomingState *mis = migration_incoming_get_current();
3643 /* Temporary page that is later 'placed' */
3644 void *postcopy_host_page = mis->postcopy_tmp_page;
3645 void *host_page = NULL;
3646 bool all_zero = true;
3647 int target_pages = 0;
3648
3649 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3650 ram_addr_t addr;
3651 void *page_buffer = NULL;
3652 void *place_source = NULL;
3653 RAMBlock *block = NULL;
3654 uint8_t ch;
3655 int len;
3656
3657 addr = qemu_get_be64(f);
3658
3659 /*
3660 * If qemu file error, we should stop here, and then "addr"
3661 * may be invalid
3662 */
3663 ret = qemu_file_get_error(f);
3664 if (ret) {
3665 break;
3666 }
3667
3668 flags = addr & ~TARGET_PAGE_MASK;
3669 addr &= TARGET_PAGE_MASK;
3670
3671 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3672 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3673 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3674 block = ram_block_from_stream(f, flags);
3675 if (!block) {
3676 ret = -EINVAL;
3677 break;
3678 }
3679
3680 /*
3681 * Relying on used_length is racy and can result in false positives.
3682 * We might place pages beyond used_length in case RAM was shrunk
3683 * while in postcopy, which is fine - trying to place via
3684 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3685 */
3686 if (!block->host || addr >= block->postcopy_length) {
3687 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3688 ret = -EINVAL;
3689 break;
3690 }
3691 target_pages++;
3692 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3693 /*
3694 * Postcopy requires that we place whole host pages atomically;
3695 * these may be huge pages for RAMBlocks that are backed by
3696 * hugetlbfs.
3697 * To make it atomic, the data is read into a temporary page
3698 * that's moved into place later.
3699 * The migration protocol uses, possibly smaller, target-pages
3700 * however the source ensures it always sends all the components
3701 * of a host page in one chunk.
3702 */
3703 page_buffer = postcopy_host_page +
3704 host_page_offset_from_ram_block_offset(block, addr);
3705 /* If all TP are zero then we can optimise the place */
3706 if (target_pages == 1) {
3707 host_page = host_page_from_ram_block_offset(block, addr);
3708 } else if (host_page != host_page_from_ram_block_offset(block,
3709 addr)) {
3710 /* not the 1st TP within the HP */
3711 error_report("Non-same host page %p/%p", host_page,
3712 host_page_from_ram_block_offset(block, addr));
3713 ret = -EINVAL;
3714 break;
3715 }
3716
3717 /*
3718 * If it's the last part of a host page then we place the host
3719 * page
3720 */
3721 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3722 place_needed = true;
3723 }
3724 place_source = postcopy_host_page;
3725 }
3726
3727 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3728 case RAM_SAVE_FLAG_ZERO:
3729 ch = qemu_get_byte(f);
3730 /*
3731 * Can skip to set page_buffer when
3732 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3733 */
3734 if (ch || !matches_target_page_size) {
3735 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3736 }
3737 if (ch) {
3738 all_zero = false;
3739 }
3740 break;
3741
3742 case RAM_SAVE_FLAG_PAGE:
3743 all_zero = false;
3744 if (!matches_target_page_size) {
3745 /* For huge pages, we always use temporary buffer */
3746 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3747 } else {
3748 /*
3749 * For small pages that matches target page size, we
3750 * avoid the qemu_file copy. Instead we directly use
3751 * the buffer of QEMUFile to place the page. Note: we
3752 * cannot do any QEMUFile operation before using that
3753 * buffer to make sure the buffer is valid when
3754 * placing the page.
3755 */
3756 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3757 TARGET_PAGE_SIZE);
3758 }
3759 break;
3760 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3761 all_zero = false;
3762 len = qemu_get_be32(f);
3763 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3764 error_report("Invalid compressed data length: %d", len);
3765 ret = -EINVAL;
3766 break;
3767 }
3768 decompress_data_with_multi_threads(f, page_buffer, len);
3769 break;
3770
3771 case RAM_SAVE_FLAG_EOS:
3772 /* normal exit */
3773 multifd_recv_sync_main();
3774 break;
3775 default:
3776 error_report("Unknown combination of migration flags: 0x%x"
3777 " (postcopy mode)", flags);
3778 ret = -EINVAL;
3779 break;
3780 }
3781
3782 /* Got the whole host page, wait for decompress before placing. */
3783 if (place_needed) {
3784 ret |= wait_for_decompress_done();
3785 }
3786
3787 /* Detect for any possible file errors */
3788 if (!ret && qemu_file_get_error(f)) {
3789 ret = qemu_file_get_error(f);
3790 }
3791
3792 if (!ret && place_needed) {
3793 if (all_zero) {
3794 ret = postcopy_place_page_zero(mis, host_page, block);
3795 } else {
3796 ret = postcopy_place_page(mis, host_page, place_source,
3797 block);
3798 }
3799 place_needed = false;
3800 target_pages = 0;
3801 /* Assume we have a zero page until we detect something different */
3802 all_zero = true;
3803 }
3804 }
3805
3806 return ret;
3807 }
3808
3809 static bool postcopy_is_advised(void)
3810 {
3811 PostcopyState ps = postcopy_state_get();
3812 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3813 }
3814
3815 static bool postcopy_is_running(void)
3816 {
3817 PostcopyState ps = postcopy_state_get();
3818 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3819 }
3820
3821 /*
3822 * Flush content of RAM cache into SVM's memory.
3823 * Only flush the pages that be dirtied by PVM or SVM or both.
3824 */
3825 void colo_flush_ram_cache(void)
3826 {
3827 RAMBlock *block = NULL;
3828 void *dst_host;
3829 void *src_host;
3830 unsigned long offset = 0;
3831
3832 memory_global_dirty_log_sync();
3833 WITH_RCU_READ_LOCK_GUARD() {
3834 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3835 ramblock_sync_dirty_bitmap(ram_state, block);
3836 }
3837 }
3838
3839 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3840 WITH_RCU_READ_LOCK_GUARD() {
3841 block = QLIST_FIRST_RCU(&ram_list.blocks);
3842
3843 while (block) {
3844 unsigned long num = 0;
3845
3846 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3847 if (!offset_in_ramblock(block,
3848 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3849 offset = 0;
3850 num = 0;
3851 block = QLIST_NEXT_RCU(block, next);
3852 } else {
3853 unsigned long i = 0;
3854
3855 for (i = 0; i < num; i++) {
3856 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3857 }
3858 dst_host = block->host
3859 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3860 src_host = block->colo_cache
3861 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3862 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3863 offset += num;
3864 }
3865 }
3866 }
3867 trace_colo_flush_ram_cache_end();
3868 }
3869
3870 /**
3871 * ram_load_precopy: load pages in precopy case
3872 *
3873 * Returns 0 for success or -errno in case of error
3874 *
3875 * Called in precopy mode by ram_load().
3876 * rcu_read_lock is taken prior to this being called.
3877 *
3878 * @f: QEMUFile where to send the data
3879 */
3880 static int ram_load_precopy(QEMUFile *f)
3881 {
3882 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3883 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3884 bool postcopy_advised = postcopy_is_advised();
3885 if (!migrate_use_compression()) {
3886 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3887 }
3888
3889 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3890 ram_addr_t addr, total_ram_bytes;
3891 void *host = NULL, *host_bak = NULL;
3892 uint8_t ch;
3893
3894 /*
3895 * Yield periodically to let main loop run, but an iteration of
3896 * the main loop is expensive, so do it each some iterations
3897 */
3898 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3899 aio_co_schedule(qemu_get_current_aio_context(),
3900 qemu_coroutine_self());
3901 qemu_coroutine_yield();
3902 }
3903 i++;
3904
3905 addr = qemu_get_be64(f);
3906 flags = addr & ~TARGET_PAGE_MASK;
3907 addr &= TARGET_PAGE_MASK;
3908
3909 if (flags & invalid_flags) {
3910 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3911 error_report("Received an unexpected compressed page");
3912 }
3913
3914 ret = -EINVAL;
3915 break;
3916 }
3917
3918 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3919 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3920 RAMBlock *block = ram_block_from_stream(f, flags);
3921
3922 host = host_from_ram_block_offset(block, addr);
3923 /*
3924 * After going into COLO stage, we should not load the page
3925 * into SVM's memory directly, we put them into colo_cache firstly.
3926 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3927 * Previously, we copied all these memory in preparing stage of COLO
3928 * while we need to stop VM, which is a time-consuming process.
3929 * Here we optimize it by a trick, back-up every page while in
3930 * migration process while COLO is enabled, though it affects the
3931 * speed of the migration, but it obviously reduce the downtime of
3932 * back-up all SVM'S memory in COLO preparing stage.
3933 */
3934 if (migration_incoming_colo_enabled()) {
3935 if (migration_incoming_in_colo_state()) {
3936 /* In COLO stage, put all pages into cache temporarily */
3937 host = colo_cache_from_block_offset(block, addr, true);
3938 } else {
3939 /*
3940 * In migration stage but before COLO stage,
3941 * Put all pages into both cache and SVM's memory.
3942 */
3943 host_bak = colo_cache_from_block_offset(block, addr, false);
3944 }
3945 }
3946 if (!host) {
3947 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3948 ret = -EINVAL;
3949 break;
3950 }
3951 if (!migration_incoming_in_colo_state()) {
3952 ramblock_recv_bitmap_set(block, host);
3953 }
3954
3955 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3956 }
3957
3958 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3959 case RAM_SAVE_FLAG_MEM_SIZE:
3960 /* Synchronize RAM block list */
3961 total_ram_bytes = addr;
3962 while (!ret && total_ram_bytes) {
3963 RAMBlock *block;
3964 char id[256];
3965 ram_addr_t length;
3966
3967 len = qemu_get_byte(f);
3968 qemu_get_buffer(f, (uint8_t *)id, len);
3969 id[len] = 0;
3970 length = qemu_get_be64(f);
3971
3972 block = qemu_ram_block_by_name(id);
3973 if (block && !qemu_ram_is_migratable(block)) {
3974 error_report("block %s should not be migrated !", id);
3975 ret = -EINVAL;
3976 } else if (block) {
3977 if (length != block->used_length) {
3978 Error *local_err = NULL;
3979
3980 ret = qemu_ram_resize(block, length,
3981 &local_err);
3982 if (local_err) {
3983 error_report_err(local_err);
3984 }
3985 }
3986 /* For postcopy we need to check hugepage sizes match */
3987 if (postcopy_advised && migrate_postcopy_ram() &&
3988 block->page_size != qemu_host_page_size) {
3989 uint64_t remote_page_size = qemu_get_be64(f);
3990 if (remote_page_size != block->page_size) {
3991 error_report("Mismatched RAM page size %s "
3992 "(local) %zd != %" PRId64,
3993 id, block->page_size,
3994 remote_page_size);
3995 ret = -EINVAL;
3996 }
3997 }
3998 if (migrate_ignore_shared()) {
3999 hwaddr addr = qemu_get_be64(f);
4000 if (ramblock_is_ignored(block) &&
4001 block->mr->addr != addr) {
4002 error_report("Mismatched GPAs for block %s "
4003 "%" PRId64 "!= %" PRId64,
4004 id, (uint64_t)addr,
4005 (uint64_t)block->mr->addr);
4006 ret = -EINVAL;
4007 }
4008 }
4009 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4010 block->idstr);
4011 } else {
4012 error_report("Unknown ramblock \"%s\", cannot "
4013 "accept migration", id);
4014 ret = -EINVAL;
4015 }
4016
4017 total_ram_bytes -= length;
4018 }
4019 break;
4020
4021 case RAM_SAVE_FLAG_ZERO:
4022 ch = qemu_get_byte(f);
4023 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4024 break;
4025
4026 case RAM_SAVE_FLAG_PAGE:
4027 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4028 break;
4029
4030 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4031 len = qemu_get_be32(f);
4032 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4033 error_report("Invalid compressed data length: %d", len);
4034 ret = -EINVAL;
4035 break;
4036 }
4037 decompress_data_with_multi_threads(f, host, len);
4038 break;
4039
4040 case RAM_SAVE_FLAG_XBZRLE:
4041 if (load_xbzrle(f, addr, host) < 0) {
4042 error_report("Failed to decompress XBZRLE page at "
4043 RAM_ADDR_FMT, addr);
4044 ret = -EINVAL;
4045 break;
4046 }
4047 break;
4048 case RAM_SAVE_FLAG_EOS:
4049 /* normal exit */
4050 multifd_recv_sync_main();
4051 break;
4052 default:
4053 if (flags & RAM_SAVE_FLAG_HOOK) {
4054 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4055 } else {
4056 error_report("Unknown combination of migration flags: 0x%x",
4057 flags);
4058 ret = -EINVAL;
4059 }
4060 }
4061 if (!ret) {
4062 ret = qemu_file_get_error(f);
4063 }
4064 if (!ret && host_bak) {
4065 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4066 }
4067 }
4068
4069 ret |= wait_for_decompress_done();
4070 return ret;
4071 }
4072
4073 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4074 {
4075 int ret = 0;
4076 static uint64_t seq_iter;
4077 /*
4078 * If system is running in postcopy mode, page inserts to host memory must
4079 * be atomic
4080 */
4081 bool postcopy_running = postcopy_is_running();
4082
4083 seq_iter++;
4084
4085 if (version_id != 4) {
4086 return -EINVAL;
4087 }
4088
4089 /*
4090 * This RCU critical section can be very long running.
4091 * When RCU reclaims in the code start to become numerous,
4092 * it will be necessary to reduce the granularity of this
4093 * critical section.
4094 */
4095 WITH_RCU_READ_LOCK_GUARD() {
4096 if (postcopy_running) {
4097 ret = ram_load_postcopy(f);
4098 } else {
4099 ret = ram_load_precopy(f);
4100 }
4101 }
4102 trace_ram_load_complete(ret, seq_iter);
4103
4104 return ret;
4105 }
4106
4107 static bool ram_has_postcopy(void *opaque)
4108 {
4109 RAMBlock *rb;
4110 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4111 if (ramblock_is_pmem(rb)) {
4112 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4113 "is not supported now!", rb->idstr, rb->host);
4114 return false;
4115 }
4116 }
4117
4118 return migrate_postcopy_ram();
4119 }
4120
4121 /* Sync all the dirty bitmap with destination VM. */
4122 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4123 {
4124 RAMBlock *block;
4125 QEMUFile *file = s->to_dst_file;
4126 int ramblock_count = 0;
4127
4128 trace_ram_dirty_bitmap_sync_start();
4129
4130 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4131 qemu_savevm_send_recv_bitmap(file, block->idstr);
4132 trace_ram_dirty_bitmap_request(block->idstr);
4133 ramblock_count++;
4134 }
4135
4136 trace_ram_dirty_bitmap_sync_wait();
4137
4138 /* Wait until all the ramblocks' dirty bitmap synced */
4139 while (ramblock_count--) {
4140 qemu_sem_wait(&s->rp_state.rp_sem);
4141 }
4142
4143 trace_ram_dirty_bitmap_sync_complete();
4144
4145 return 0;
4146 }
4147
4148 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4149 {
4150 qemu_sem_post(&s->rp_state.rp_sem);
4151 }
4152
4153 /*
4154 * Read the received bitmap, revert it as the initial dirty bitmap.
4155 * This is only used when the postcopy migration is paused but wants
4156 * to resume from a middle point.
4157 */
4158 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4159 {
4160 int ret = -EINVAL;
4161 /* from_dst_file is always valid because we're within rp_thread */
4162 QEMUFile *file = s->rp_state.from_dst_file;
4163 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4164 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4165 uint64_t size, end_mark;
4166
4167 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4168
4169 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4170 error_report("%s: incorrect state %s", __func__,
4171 MigrationStatus_str(s->state));
4172 return -EINVAL;
4173 }
4174
4175 /*
4176 * Note: see comments in ramblock_recv_bitmap_send() on why we
4177 * need the endianness conversion, and the paddings.
4178 */
4179 local_size = ROUND_UP(local_size, 8);
4180
4181 /* Add paddings */
4182 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4183
4184 size = qemu_get_be64(file);
4185
4186 /* The size of the bitmap should match with our ramblock */
4187 if (size != local_size) {
4188 error_report("%s: ramblock '%s' bitmap size mismatch "
4189 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4190 block->idstr, size, local_size);
4191 ret = -EINVAL;
4192 goto out;
4193 }
4194
4195 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4196 end_mark = qemu_get_be64(file);
4197
4198 ret = qemu_file_get_error(file);
4199 if (ret || size != local_size) {
4200 error_report("%s: read bitmap failed for ramblock '%s': %d"
4201 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4202 __func__, block->idstr, ret, local_size, size);
4203 ret = -EIO;
4204 goto out;
4205 }
4206
4207 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4208 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4209 __func__, block->idstr, end_mark);
4210 ret = -EINVAL;
4211 goto out;
4212 }
4213
4214 /*
4215 * Endianness conversion. We are during postcopy (though paused).
4216 * The dirty bitmap won't change. We can directly modify it.
4217 */
4218 bitmap_from_le(block->bmap, le_bitmap, nbits);
4219
4220 /*
4221 * What we received is "received bitmap". Revert it as the initial
4222 * dirty bitmap for this ramblock.
4223 */
4224 bitmap_complement(block->bmap, block->bmap, nbits);
4225
4226 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4227 ramblock_dirty_bitmap_clear_discarded_pages(block);
4228
4229 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4230 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4231
4232 /*
4233 * We succeeded to sync bitmap for current ramblock. If this is
4234 * the last one to sync, we need to notify the main send thread.
4235 */
4236 ram_dirty_bitmap_reload_notify(s);
4237
4238 ret = 0;
4239 out:
4240 g_free(le_bitmap);
4241 return ret;
4242 }
4243
4244 static int ram_resume_prepare(MigrationState *s, void *opaque)
4245 {
4246 RAMState *rs = *(RAMState **)opaque;
4247 int ret;
4248
4249 ret = ram_dirty_bitmap_sync_all(s, rs);
4250 if (ret) {
4251 return ret;
4252 }
4253
4254 ram_state_resume_prepare(rs, s->to_dst_file);
4255
4256 return 0;
4257 }
4258
4259 static SaveVMHandlers savevm_ram_handlers = {
4260 .save_setup = ram_save_setup,
4261 .save_live_iterate = ram_save_iterate,
4262 .save_live_complete_postcopy = ram_save_complete,
4263 .save_live_complete_precopy = ram_save_complete,
4264 .has_postcopy = ram_has_postcopy,
4265 .save_live_pending = ram_save_pending,
4266 .load_state = ram_load,
4267 .save_cleanup = ram_save_cleanup,
4268 .load_setup = ram_load_setup,
4269 .load_cleanup = ram_load_cleanup,
4270 .resume_prepare = ram_resume_prepare,
4271 };
4272
4273 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4274 size_t old_size, size_t new_size)
4275 {
4276 PostcopyState ps = postcopy_state_get();
4277 ram_addr_t offset;
4278 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4279 Error *err = NULL;
4280
4281 if (ramblock_is_ignored(rb)) {
4282 return;
4283 }
4284
4285 if (!migration_is_idle()) {
4286 /*
4287 * Precopy code on the source cannot deal with the size of RAM blocks
4288 * changing at random points in time - especially after sending the
4289 * RAM block sizes in the migration stream, they must no longer change.
4290 * Abort and indicate a proper reason.
4291 */
4292 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4293 migration_cancel(err);
4294 error_free(err);
4295 }
4296
4297 switch (ps) {
4298 case POSTCOPY_INCOMING_ADVISE:
4299 /*
4300 * Update what ram_postcopy_incoming_init()->init_range() does at the
4301 * time postcopy was advised. Syncing RAM blocks with the source will
4302 * result in RAM resizes.
4303 */
4304 if (old_size < new_size) {
4305 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4306 error_report("RAM block '%s' discard of resized RAM failed",
4307 rb->idstr);
4308 }
4309 }
4310 rb->postcopy_length = new_size;
4311 break;
4312 case POSTCOPY_INCOMING_NONE:
4313 case POSTCOPY_INCOMING_RUNNING:
4314 case POSTCOPY_INCOMING_END:
4315 /*
4316 * Once our guest is running, postcopy does no longer care about
4317 * resizes. When growing, the new memory was not available on the
4318 * source, no handler needed.
4319 */
4320 break;
4321 default:
4322 error_report("RAM block '%s' resized during postcopy state: %d",
4323 rb->idstr, ps);
4324 exit(-1);
4325 }
4326 }
4327
4328 static RAMBlockNotifier ram_mig_ram_notifier = {
4329 .ram_block_resized = ram_mig_ram_block_resized,
4330 };
4331
4332 void ram_mig_init(void)
4333 {
4334 qemu_mutex_init(&XBZRLE.lock);
4335 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4336 ram_block_notifier_add(&ram_mig_ram_notifier);
4337 }