[mirror_qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
60
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
62
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
66
67 /***********************************************************/
68 /* ram save/restore */
69
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71 * worked for pages that were filled with the same char. We switched
72 * it to only search for the zero value, and renamed it to avoid
73 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
74 */
75
76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO 0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE 0x08
80 #define RAM_SAVE_FLAG_EOS 0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE 0x40
83 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
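
/*
 * For illustration (assuming a 4 KiB target page, so the low 12 bits of any
 * page offset are zero): these flags are OR'ed into the low bits of the page
 * offset written by save_page_header(). A page at offset 0x3000 sent as a
 * zero page is therefore announced on the wire as the be64 value 0x3002
 * (0x3000 | RAM_SAVE_FLAG_ZERO).
 */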
85
86 XBZRLECacheStats xbzrle_counters;
87
88 /* struct contains XBZRLE cache and a static page
89 used by the compression */
90 static struct {
91 /* buffer used for XBZRLE encoding */
92 uint8_t *encoded_buf;
93 /* buffer for storing page content */
94 uint8_t *current_buf;
95 /* Cache for XBZRLE, Protected by lock. */
96 PageCache *cache;
97 QemuMutex lock;
98 /* it will store a page full of zeros */
99 uint8_t *zero_target_page;
100 /* buffer used for XBZRLE decoding */
101 uint8_t *decoded_buf;
102 } XBZRLE;
103
104 static void XBZRLE_cache_lock(void)
105 {
106 if (migrate_use_xbzrle()) {
107 qemu_mutex_lock(&XBZRLE.lock);
108 }
109 }
110
111 static void XBZRLE_cache_unlock(void)
112 {
113 if (migrate_use_xbzrle()) {
114 qemu_mutex_unlock(&XBZRLE.lock);
115 }
116 }
117
118 /**
119 * xbzrle_cache_resize: resize the xbzrle cache
120 *
121 * This function is called from migrate_params_apply in the main
122 * thread, possibly while a migration is in progress. A running
123 * migration may be using the cache and might finish during this call,
124 * hence changes to the cache are protected by XBZRLE.lock.
125 *
126 * Returns 0 for success or -1 for error
127 *
128 * @new_size: new cache size
129 * @errp: *errp is set with the reason if the check fails
130 */
131 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
132 {
133 PageCache *new_cache;
134 int64_t ret = 0;
135
136 /* Check for truncation */
137 if (new_size != (size_t)new_size) {
138 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
139 "exceeding address space");
140 return -1;
141 }
142
143 if (new_size == migrate_xbzrle_cache_size()) {
144 /* nothing to do */
145 return 0;
146 }
147
148 XBZRLE_cache_lock();
149
150 if (XBZRLE.cache != NULL) {
151 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
152 if (!new_cache) {
153 ret = -1;
154 goto out;
155 }
156
157 cache_fini(XBZRLE.cache);
158 XBZRLE.cache = new_cache;
159 }
160 out:
161 XBZRLE_cache_unlock();
162 return ret;
163 }
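
/*
 * Usage sketch (the management-API names below are assumptions, not defined
 * in this file): this resize path is reached from migrate_params_apply()
 * when the 'xbzrle-cache-size' migration parameter changes, e.g. via QMP:
 *
 *   { "execute": "migrate-set-parameters",
 *     "arguments": { "xbzrle-cache-size": 536870912 } }
 *
 * which would request a 512 MiB cache.
 */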
164
165 bool ramblock_is_ignored(RAMBlock *block)
166 {
167 return !qemu_ram_is_migratable(block) ||
168 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 }
170
171 #undef RAMBLOCK_FOREACH
172
173 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
174 {
175 RAMBlock *block;
176 int ret = 0;
177
178 RCU_READ_LOCK_GUARD();
179
180 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
181 ret = func(block, opaque);
182 if (ret) {
183 break;
184 }
185 }
186 return ret;
187 }
188
189 static void ramblock_recv_map_init(void)
190 {
191 RAMBlock *rb;
192
193 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
194 assert(!rb->receivedmap);
195 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
196 }
197 }
198
199 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
200 {
201 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
202 rb->receivedmap);
203 }
204
205 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
206 {
207 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 }
209
210 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
211 {
212 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 }
214
215 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
216 size_t nr)
217 {
218 bitmap_set_atomic(rb->receivedmap,
219 ramblock_recv_bitmap_offset(host_addr, rb),
220 nr);
221 }
222
223 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224
225 /*
226 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
227 *
228 * Returns >0 if success with sent bytes, or <0 if error.
229 */
230 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
231 const char *block_name)
232 {
233 RAMBlock *block = qemu_ram_block_by_name(block_name);
234 unsigned long *le_bitmap, nbits;
235 uint64_t size;
236
237 if (!block) {
238 error_report("%s: invalid block name: %s", __func__, block_name);
239 return -1;
240 }
241
242 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243
244 /*
245 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
246 * machines we may need 4 more bytes for padding (see below
247 * comment). So extend it a bit beforehand.
248 */
249 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250
251 /*
252 * Always use little endian when sending the bitmap. This is
253 * required so that it works even when source and destination VMs are
254 * not using the same endianness. (Note: big endian won't work.)
255 */
256 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
257
258 /* Size of the bitmap, in bytes */
259 size = DIV_ROUND_UP(nbits, 8);
260
261 /*
262 * size is always aligned to 8 bytes for 64bit machines, but it
263 * may not be true for 32bit machines. We need this padding to
264 * make sure the migration can survive even between 32bit and
265 * 64bit machines.
266 */
267 size = ROUND_UP(size, 8);
268
269 qemu_put_be64(file, size);
270 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
271 /*
272 * Mark as an end, in case the middle part is screwed up due to
273 * some "mysterious" reason.
274 */
275 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
276 qemu_fflush(file);
277
278 g_free(le_bitmap);
279
280 if (qemu_file_get_error(file)) {
281 return qemu_file_get_error(file);
282 }
283
284 return size + sizeof(size);
285 }
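
/*
 * A sketch of the stream layout produced above, assuming a RAMBlock of 10
 * target pages on a 64-bit host: nbits = 10, so size = DIV_ROUND_UP(10, 8)
 * = 2 bytes, rounded up to 8. The wire then carries:
 *
 *   be64 size (= 8) | 8 bytes of little-endian bitmap | be64 ending marker
 *   (RAMBLOCK_RECV_BITMAP_ENDING = 0x0123456789abcdef)
 *
 * and the function returns size + sizeof(size) = 16.
 */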
286
287 /*
288 * An outstanding page request, on the source, having been received
289 * and queued
290 */
291 struct RAMSrcPageRequest {
292 RAMBlock *rb;
293 hwaddr offset;
294 hwaddr len;
295
296 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 };
298
299 /* State of RAM for migration */
300 struct RAMState {
301 /* QEMUFile used for this migration */
302 QEMUFile *f;
303 /* UFFD file descriptor, used in 'write-tracking' migration */
304 int uffdio_fd;
305 /* Last block that we have visited searching for dirty pages */
306 RAMBlock *last_seen_block;
307 /* Last block from where we have sent data */
308 RAMBlock *last_sent_block;
309 /* Last dirty target page we have sent */
310 ram_addr_t last_page;
311 /* last ram version we have seen */
312 uint32_t last_version;
314 /* How many times we have dirtied too many pages */
314 int dirty_rate_high_cnt;
315 /* these variables are used for bitmap sync */
316 /* last time we did a full bitmap_sync */
317 int64_t time_last_bitmap_sync;
318 /* bytes transferred at start_time */
319 uint64_t bytes_xfer_prev;
320 /* number of dirty pages since start_time */
321 uint64_t num_dirty_pages_period;
322 /* xbzrle misses since the beginning of the period */
323 uint64_t xbzrle_cache_miss_prev;
324 /* Amount of xbzrle pages since the beginning of the period */
325 uint64_t xbzrle_pages_prev;
326 /* Amount of xbzrle encoded bytes since the beginning of the period */
327 uint64_t xbzrle_bytes_prev;
328 /* Start using XBZRLE (e.g., after the first round). */
329 bool xbzrle_enabled;
330 /* Are we on the last stage of migration */
331 bool last_stage;
332 /* compression statistics since the beginning of the period */
333 /* number of times there was no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* number of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* number of compressed pages */
338 uint64_t compress_pages_prev;
339
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
353 };
354 typedef struct RAMState RAMState;
355
356 static RAMState *ram_state;
357
358 static NotifierWithReturnList precopy_notifier_list;
359
360 /* Whether postcopy has queued requests */
361 static bool postcopy_has_request(RAMState *rs)
362 {
363 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
364 }
365
366 void precopy_infrastructure_init(void)
367 {
368 notifier_with_return_list_init(&precopy_notifier_list);
369 }
370
371 void precopy_add_notifier(NotifierWithReturn *n)
372 {
373 notifier_with_return_list_add(&precopy_notifier_list, n);
374 }
375
376 void precopy_remove_notifier(NotifierWithReturn *n)
377 {
378 notifier_with_return_remove(n);
379 }
380
381 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
382 {
383 PrecopyNotifyData pnd;
384 pnd.reason = reason;
385 pnd.errp = errp;
386
387 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
388 }
389
390 uint64_t ram_bytes_remaining(void)
391 {
392 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
393 0;
394 }
395
396 MigrationStats ram_counters;
397
398 static void ram_transferred_add(uint64_t bytes)
399 {
400 if (runstate_is_running()) {
401 ram_counters.precopy_bytes += bytes;
402 } else if (migration_in_postcopy()) {
403 ram_counters.postcopy_bytes += bytes;
404 } else {
405 ram_counters.downtime_bytes += bytes;
406 }
407 ram_counters.transferred += bytes;
408 }
409
410 /* used by the search for pages to send */
411 struct PageSearchStatus {
412 /* Current block being searched */
413 RAMBlock *block;
414 /* Current page to search from */
415 unsigned long page;
416 /* Set once we wrap around */
417 bool complete_round;
418 /* Whether current page is explicitly requested by postcopy */
419 bool postcopy_requested;
420 };
421 typedef struct PageSearchStatus PageSearchStatus;
422
423 CompressionStats compression_counters;
424
425 struct CompressParam {
426 bool done;
427 bool quit;
428 bool zero_page;
429 QEMUFile *file;
430 QemuMutex mutex;
431 QemuCond cond;
432 RAMBlock *block;
433 ram_addr_t offset;
434
435 /* internally used fields */
436 z_stream stream;
437 uint8_t *originbuf;
438 };
439 typedef struct CompressParam CompressParam;
440
441 struct DecompressParam {
442 bool done;
443 bool quit;
444 QemuMutex mutex;
445 QemuCond cond;
446 void *des;
447 uint8_t *compbuf;
448 int len;
449 z_stream stream;
450 };
451 typedef struct DecompressParam DecompressParam;
452
453 static CompressParam *comp_param;
454 static QemuThread *compress_threads;
455 /* comp_done_cond is used to wake up the migration thread when
456 * one of the compression threads has finished the compression.
457 * comp_done_lock is used together with comp_done_cond.
458 */
459 static QemuMutex comp_done_lock;
460 static QemuCond comp_done_cond;
461
462 static QEMUFile *decomp_file;
463 static DecompressParam *decomp_param;
464 static QemuThread *decompress_threads;
465 static QemuMutex decomp_done_lock;
466 static QemuCond decomp_done_cond;
467
468 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
469 ram_addr_t offset, uint8_t *source_buf);
470
471 static void *do_data_compress(void *opaque)
472 {
473 CompressParam *param = opaque;
474 RAMBlock *block;
475 ram_addr_t offset;
476 bool zero_page;
477
478 qemu_mutex_lock(&param->mutex);
479 while (!param->quit) {
480 if (param->block) {
481 block = param->block;
482 offset = param->offset;
483 param->block = NULL;
484 qemu_mutex_unlock(&param->mutex);
485
486 zero_page = do_compress_ram_page(param->file, &param->stream,
487 block, offset, param->originbuf);
488
489 qemu_mutex_lock(&comp_done_lock);
490 param->done = true;
491 param->zero_page = zero_page;
492 qemu_cond_signal(&comp_done_cond);
493 qemu_mutex_unlock(&comp_done_lock);
494
495 qemu_mutex_lock(&param->mutex);
496 } else {
497 qemu_cond_wait(&param->cond, &param->mutex);
498 }
499 }
500 qemu_mutex_unlock(&param->mutex);
501
502 return NULL;
503 }
504
505 static void compress_threads_save_cleanup(void)
506 {
507 int i, thread_count;
508
509 if (!migrate_use_compression() || !comp_param) {
510 return;
511 }
512
513 thread_count = migrate_compress_threads();
514 for (i = 0; i < thread_count; i++) {
515 /*
516 * we use it as an indicator of whether the thread is
517 * properly initialized or not
518 */
519 if (!comp_param[i].file) {
520 break;
521 }
522
523 qemu_mutex_lock(&comp_param[i].mutex);
524 comp_param[i].quit = true;
525 qemu_cond_signal(&comp_param[i].cond);
526 qemu_mutex_unlock(&comp_param[i].mutex);
527
528 qemu_thread_join(compress_threads + i);
529 qemu_mutex_destroy(&comp_param[i].mutex);
530 qemu_cond_destroy(&comp_param[i].cond);
531 deflateEnd(&comp_param[i].stream);
532 g_free(comp_param[i].originbuf);
533 qemu_fclose(comp_param[i].file);
534 comp_param[i].file = NULL;
535 }
536 qemu_mutex_destroy(&comp_done_lock);
537 qemu_cond_destroy(&comp_done_cond);
538 g_free(compress_threads);
539 g_free(comp_param);
540 compress_threads = NULL;
541 comp_param = NULL;
542 }
543
544 static int compress_threads_save_setup(void)
545 {
546 int i, thread_count;
547
548 if (!migrate_use_compression()) {
549 return 0;
550 }
551 thread_count = migrate_compress_threads();
552 compress_threads = g_new0(QemuThread, thread_count);
553 comp_param = g_new0(CompressParam, thread_count);
554 qemu_cond_init(&comp_done_cond);
555 qemu_mutex_init(&comp_done_lock);
556 for (i = 0; i < thread_count; i++) {
557 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
558 if (!comp_param[i].originbuf) {
559 goto exit;
560 }
561
562 if (deflateInit(&comp_param[i].stream,
563 migrate_compress_level()) != Z_OK) {
564 g_free(comp_param[i].originbuf);
565 goto exit;
566 }
567
568 /* comp_param[i].file is just used as a dummy buffer to save data,
569 * set its ops to empty.
570 */
571 comp_param[i].file = qemu_file_new_output(
572 QIO_CHANNEL(qio_channel_null_new()));
573 comp_param[i].done = true;
574 comp_param[i].quit = false;
575 qemu_mutex_init(&comp_param[i].mutex);
576 qemu_cond_init(&comp_param[i].cond);
577 qemu_thread_create(compress_threads + i, "compress",
578 do_data_compress, comp_param + i,
579 QEMU_THREAD_JOINABLE);
580 }
581 return 0;
582
583 exit:
584 compress_threads_save_cleanup();
585 return -1;
586 }
587
588 /**
589 * save_page_header: write page header to wire
590 *
591 * If this is the 1st block, it also writes the block identification
592 *
593 * Returns the number of bytes written
594 *
595 * @f: QEMUFile where to send the data
596 * @block: block that contains the page we want to send
597 * @offset: offset inside the block for the page
598 * (the lower bits contain flags)
599 */
600 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
601 ram_addr_t offset)
602 {
603 size_t size, len;
604
605 if (block == rs->last_sent_block) {
606 offset |= RAM_SAVE_FLAG_CONTINUE;
607 }
608 qemu_put_be64(f, offset);
609 size = 8;
610
611 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
612 len = strlen(block->idstr);
613 qemu_put_byte(f, len);
614 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
615 size += 1 + len;
616 rs->last_sent_block = block;
617 }
618 return size;
619 }
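
/*
 * Illustrative header layout (the block name is an assumption, e.g. a block
 * called "pc.ram"): when the block differs from the last one sent, the
 * stream carries the be64 (offset | flags), one length byte (6) and the
 * 6-byte idstr "pc.ram", so save_page_header() returns 8 + 1 + 6 = 15.
 * Further pages from the same block get RAM_SAVE_FLAG_CONTINUE and only the
 * 8-byte header.
 */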
620
621 /**
622 * mig_throttle_guest_down: throttle down the guest
623 *
624 * Reduce amount of guest cpu execution to hopefully slow down memory
625 * writes. If guest dirty memory rate is reduced below the rate at
626 * which we can transfer pages to the destination then we should be
627 * able to complete migration. Some workloads dirty memory way too
628 * fast and will not effectively converge, even with auto-converge.
629 */
630 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
631 uint64_t bytes_dirty_threshold)
632 {
633 MigrationState *s = migrate_get_current();
634 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
635 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
636 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
637 int pct_max = s->parameters.max_cpu_throttle;
638
639 uint64_t throttle_now = cpu_throttle_get_percentage();
640 uint64_t cpu_now, cpu_ideal, throttle_inc;
641
642 /* We have not started throttling yet. Let's start it. */
643 if (!cpu_throttle_active()) {
644 cpu_throttle_set(pct_initial);
645 } else {
646 /* Throttling already on, just increase the rate */
647 if (!pct_tailslow) {
648 throttle_inc = pct_increment;
649 } else {
650 /* Compute the ideal CPU percentage used by the guest, which should
651 * make the dirty rate match the dirty rate threshold. */
652 cpu_now = 100 - throttle_now;
653 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
654 bytes_dirty_period);
655 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
656 }
657 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
658 }
659 }
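
/*
 * Worked example of the tail-slow path above (illustrative numbers): with
 * the current throttle at 40%, cpu_now = 60. If the dirty threshold is half
 * of the bytes dirtied this period, cpu_ideal = 60 * 0.5 = 30, so
 * throttle_inc = MIN(60 - 30, cpu_throttle_increment) and the new throttle
 * becomes MIN(40 + throttle_inc, max_cpu_throttle).
 */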
660
661 void mig_throttle_counter_reset(void)
662 {
663 RAMState *rs = ram_state;
664
665 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
666 rs->num_dirty_pages_period = 0;
667 rs->bytes_xfer_prev = ram_counters.transferred;
668 }
669
670 /**
671 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
672 *
673 * @rs: current RAM state
674 * @current_addr: address for the zero page
675 *
676 * Update the xbzrle cache to reflect a page that's been sent as all 0.
677 * The important thing is that a stale (not-yet-0'd) page be replaced
678 * by the new data.
679 * As a bonus, if the page wasn't in the cache it gets added so that
680 * when a small write is made into the 0'd page it gets XBZRLE sent.
681 */
682 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
683 {
684 if (!rs->xbzrle_enabled) {
685 return;
686 }
687
688 /* We don't care if this fails to allocate a new cache page
689 * as long as it updated an old one */
690 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
691 ram_counters.dirty_sync_count);
692 }
693
694 #define ENCODING_FLAG_XBZRLE 0x1
695
696 /**
697 * save_xbzrle_page: compress and send current page
698 *
699 * Returns: 1 means that we wrote the page
700 * 0 means that page is identical to the one already sent
701 * -1 means that xbzrle would be longer than normal
702 *
703 * @rs: current RAM state
704 * @current_data: pointer to the address of the page contents
705 * @current_addr: addr of the page
706 * @block: block that contains the page we want to send
707 * @offset: offset inside the block for the page
708 */
709 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
710 ram_addr_t current_addr, RAMBlock *block,
711 ram_addr_t offset)
712 {
713 int encoded_len = 0, bytes_xbzrle;
714 uint8_t *prev_cached_page;
715
716 if (!cache_is_cached(XBZRLE.cache, current_addr,
717 ram_counters.dirty_sync_count)) {
718 xbzrle_counters.cache_miss++;
719 if (!rs->last_stage) {
720 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
721 ram_counters.dirty_sync_count) == -1) {
722 return -1;
723 } else {
724 /* update *current_data when the page has been
725 inserted into cache */
726 *current_data = get_cached_data(XBZRLE.cache, current_addr);
727 }
728 }
729 return -1;
730 }
731
732 /*
733 * Reaching here means the page has hit the xbzrle cache, no matter what
734 * encoding result it is (normal encoding, overflow or skipping the page),
735 * count the page as encoded. This is used to calculate the encoding rate.
736 *
737 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
738 * 2nd page turns out to be skipped (i.e. no new bytes written to the
739 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
740 * skipped page included. In this way, the encoding rate can tell if the
741 * guest page is good for xbzrle encoding.
742 */
743 xbzrle_counters.pages++;
744 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
745
746 /* save current buffer into memory */
747 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
748
749 /* XBZRLE encoding (if there is no overflow) */
750 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
751 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
752 TARGET_PAGE_SIZE);
753
754 /*
755 * Update the cache contents, so that it corresponds to the data
756 * sent, in all cases except where we skip the page.
757 */
758 if (!rs->last_stage && encoded_len != 0) {
759 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
760 /*
761 * In the case where we couldn't compress, ensure that the caller
762 * sends the data from the cache, since the guest might have
763 * changed the RAM since we copied it.
764 */
765 *current_data = prev_cached_page;
766 }
767
768 if (encoded_len == 0) {
769 trace_save_xbzrle_page_skipping();
770 return 0;
771 } else if (encoded_len == -1) {
772 trace_save_xbzrle_page_overflow();
773 xbzrle_counters.overflow++;
774 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
775 return -1;
776 }
777
778 /* Send XBZRLE based compressed page */
779 bytes_xbzrle = save_page_header(rs, rs->f, block,
780 offset | RAM_SAVE_FLAG_XBZRLE);
781 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
782 qemu_put_be16(rs->f, encoded_len);
783 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
784 bytes_xbzrle += encoded_len + 1 + 2;
785 /*
786 * Like compressed_size (please see update_compress_thread_counts),
787 * the xbzrle encoded bytes don't count the 8 byte header with
788 * RAM_SAVE_FLAG_CONTINUE.
789 */
790 xbzrle_counters.bytes += bytes_xbzrle - 8;
791 ram_transferred_add(bytes_xbzrle);
792
793 return 1;
794 }
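
/*
 * On-wire sketch of an XBZRLE page as produced above: the page header with
 * RAM_SAVE_FLAG_XBZRLE set, one byte ENCODING_FLAG_XBZRLE, a be16 encoded
 * length and then encoded_len bytes of delta, so bytes_xbzrle ends up as
 * header_size + 1 + 2 + encoded_len (the 8-byte header is excluded from
 * xbzrle_counters.bytes, as noted above).
 */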
795
796 /**
797 * migration_bitmap_find_dirty: find the next dirty page from start
798 *
799 * Returns the page offset within memory region of the start of a dirty page
800 *
801 * @rs: current RAM state
802 * @rb: RAMBlock where to search for dirty pages
803 * @start: page where we start the search
804 */
805 static inline
806 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
807 unsigned long start)
808 {
809 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
810 unsigned long *bitmap = rb->bmap;
811
812 if (ramblock_is_ignored(rb)) {
813 return size;
814 }
815
816 return find_next_bit(bitmap, size, start);
817 }
818
819 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
820 unsigned long page)
821 {
822 uint8_t shift;
823 hwaddr size, start;
824
825 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
826 return;
827 }
828
829 shift = rb->clear_bmap_shift;
830 /*
831 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. This
832 * can make things easier sometimes since the start address
833 * of the small chunk will then always be aligned to 64 pages, so
834 * the bitmap will always be aligned to unsigned long. We should
835 * even be able to remove this restriction but I'm simply
836 * keeping it.
837 */
838 assert(shift >= 6);
839
840 size = 1ULL << (TARGET_PAGE_BITS + shift);
841 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
842 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
843 memory_region_clear_dirty_bitmap(rb->mr, start, size);
844 }
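
/*
 * For example, with the minimum clear_bmap_shift of 6 and a 4 KiB target
 * page (assumed here for illustration), one clear_bmap bit covers
 * 1 << (12 + 6) bytes = 256 KiB, i.e. a 64-page chunk that is cleared with
 * a single memory_region_clear_dirty_bitmap() call.
 */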
845
846 static void
847 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
848 unsigned long start,
849 unsigned long npages)
850 {
851 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
852 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
853 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
854
855 /*
856 * Clear pages from start to start + npages - 1, so the end boundary is
857 * exclusive.
858 */
859 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
860 migration_clear_memory_region_dirty_bitmap(rb, i);
861 }
862 }
863
864 /*
865 * colo_bitmap_find_dirty: find contiguous dirty pages from start
866 *
867 * Returns the page offset within memory region of the start of the
868 * contiguous dirty pages
869 *
870 * @rs: current RAM state
871 * @rb: RAMBlock where to search for dirty pages
872 * @start: page where we start the search
873 * @num: the number of contiguous dirty pages
874 */
875 static inline
876 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
877 unsigned long start, unsigned long *num)
878 {
879 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
880 unsigned long *bitmap = rb->bmap;
881 unsigned long first, next;
882
883 *num = 0;
884
885 if (ramblock_is_ignored(rb)) {
886 return size;
887 }
888
889 first = find_next_bit(bitmap, size, start);
890 if (first >= size) {
891 return first;
892 }
893 next = find_next_zero_bit(bitmap, size, first + 1);
894 assert(next >= first);
895 *num = next - first;
896 return first;
897 }
898
899 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
900 RAMBlock *rb,
901 unsigned long page)
902 {
903 bool ret;
904
905 /*
906 * Clear the dirty bitmap if needed. This _must_ be called before we
907 * send any of the pages in the chunk, because we need to make sure
908 * we can capture further page content changes when we sync the dirty
909 * log the next time. So as long as we are going to send any of
910 * the pages in the chunk, we clear the remote dirty bitmap for all
911 * of them. Clearing it earlier won't be a problem; too late will.
912 */
913 migration_clear_memory_region_dirty_bitmap(rb, page);
914
915 ret = test_and_clear_bit(page, rb->bmap);
916 if (ret) {
917 rs->migration_dirty_pages--;
918 }
919
920 return ret;
921 }
922
923 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
924 void *opaque)
925 {
926 const hwaddr offset = section->offset_within_region;
927 const hwaddr size = int128_get64(section->size);
928 const unsigned long start = offset >> TARGET_PAGE_BITS;
929 const unsigned long npages = size >> TARGET_PAGE_BITS;
930 RAMBlock *rb = section->mr->ram_block;
931 uint64_t *cleared_bits = opaque;
932
933 /*
934 * We don't grab ram_state->bitmap_mutex because we expect to run
935 * only when starting migration or during postcopy recovery where
936 * we don't have concurrent access.
937 */
938 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
939 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
940 }
941 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
942 bitmap_clear(rb->bmap, start, npages);
943 }
944
945 /*
946 * Exclude all dirty pages from migration that fall into a discarded range as
947 * managed by a RamDiscardManager responsible for the mapped memory region of
948 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
949 *
950 * Discarded pages ("logically unplugged") have undefined content and must
951 * not get migrated, because even reading these pages for migration might
952 * result in undesired behavior.
953 *
954 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
955 *
956 * Note: The result is only stable while migrating (precopy/postcopy).
957 */
958 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
959 {
960 uint64_t cleared_bits = 0;
961
962 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
963 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
964 MemoryRegionSection section = {
965 .mr = rb->mr,
966 .offset_within_region = 0,
967 .size = int128_make64(qemu_ram_get_used_length(rb)),
968 };
969
970 ram_discard_manager_replay_discarded(rdm, &section,
971 dirty_bitmap_clear_section,
972 &cleared_bits);
973 }
974 return cleared_bits;
975 }
976
977 /*
978 * Check if a host-page aligned page falls into a discarded range as managed by
979 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
980 *
981 * Note: The result is only stable while migrating (precopy/postcopy).
982 */
983 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
984 {
985 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
986 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
987 MemoryRegionSection section = {
988 .mr = rb->mr,
989 .offset_within_region = start,
990 .size = int128_make64(qemu_ram_pagesize(rb)),
991 };
992
993 return !ram_discard_manager_is_populated(rdm, &section);
994 }
995 return false;
996 }
997
998 /* Called with RCU critical section */
999 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1000 {
1001 uint64_t new_dirty_pages =
1002 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1003
1004 rs->migration_dirty_pages += new_dirty_pages;
1005 rs->num_dirty_pages_period += new_dirty_pages;
1006 }
1007
1008 /**
1009 * ram_pagesize_summary: calculate all the pagesizes of a VM
1010 *
1011 * Returns a summary bitmap of the page sizes of all RAMBlocks
1012 *
1013 * For VMs with just normal pages this is equivalent to the host page
1014 * size. If it's got some huge pages then it's the OR of all the
1015 * different page sizes.
1016 */
1017 uint64_t ram_pagesize_summary(void)
1018 {
1019 RAMBlock *block;
1020 uint64_t summary = 0;
1021
1022 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1023 summary |= block->page_size;
1024 }
1025
1026 return summary;
1027 }
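
/*
 * For instance (illustrative values), a guest backed by normal 4 KiB pages
 * plus a 2 MiB hugepage-backed RAMBlock yields a summary of
 * 0x1000 | 0x200000 = 0x201000, letting callers see at a glance that more
 * than one page size is in use.
 */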
1028
1029 uint64_t ram_get_total_transferred_pages(void)
1030 {
1031 return ram_counters.normal + ram_counters.duplicate +
1032 compression_counters.pages + xbzrle_counters.pages;
1033 }
1034
1035 static void migration_update_rates(RAMState *rs, int64_t end_time)
1036 {
1037 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1038 double compressed_size;
1039
1040 /* calculate period counters */
1041 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1042 / (end_time - rs->time_last_bitmap_sync);
1043
1044 if (!page_count) {
1045 return;
1046 }
1047
1048 if (migrate_use_xbzrle()) {
1049 double encoded_size, unencoded_size;
1050
1051 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1052 rs->xbzrle_cache_miss_prev) / page_count;
1053 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1054 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1055 TARGET_PAGE_SIZE;
1056 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1057 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1058 xbzrle_counters.encoding_rate = 0;
1059 } else {
1060 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1061 }
1062 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1063 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1064 }
1065
1066 if (migrate_use_compression()) {
1067 compression_counters.busy_rate = (double)(compression_counters.busy -
1068 rs->compress_thread_busy_prev) / page_count;
1069 rs->compress_thread_busy_prev = compression_counters.busy;
1070
1071 compressed_size = compression_counters.compressed_size -
1072 rs->compressed_size_prev;
1073 if (compressed_size) {
1074 double uncompressed_size = (compression_counters.pages -
1075 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1076
1077 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1078 compression_counters.compression_rate =
1079 uncompressed_size / compressed_size;
1080
1081 rs->compress_pages_prev = compression_counters.pages;
1082 rs->compressed_size_prev = compression_counters.compressed_size;
1083 }
1084 }
1085 }
1086
1087 static void migration_trigger_throttle(RAMState *rs)
1088 {
1089 MigrationState *s = migrate_get_current();
1090 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1091
1092 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1093 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1094 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1095
1096 /* During block migration the auto-converge logic incorrectly detects
1097 * that ram migration makes no progress. Avoid this by disabling the
1098 * throttling logic during the bulk phase of block migration. */
1099 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1100 /* The following detection logic can be refined later. For now:
1101 Check to see if the ratio between dirtied bytes and the approx.
1102 amount of bytes that just got transferred since the last time
1103 we were in this routine reaches the threshold. If that happens
1104 twice, start or increase throttling. */
1105
1106 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1107 (++rs->dirty_rate_high_cnt >= 2)) {
1108 trace_migration_throttle();
1109 rs->dirty_rate_high_cnt = 0;
1110 mig_throttle_guest_down(bytes_dirty_period,
1111 bytes_dirty_threshold);
1112 }
1113 }
1114 }
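
/*
 * Numeric sketch of the trigger above (illustrative values): with
 * auto-converge enabled, throttle_trigger_threshold = 50 and 100 MiB
 * transferred during the period, bytes_dirty_threshold = 50 MiB; dirtying
 * more than that in the same period bumps dirty_rate_high_cnt, and doing so
 * twice in a row calls mig_throttle_guest_down().
 */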
1115
1116 static void migration_bitmap_sync(RAMState *rs)
1117 {
1118 RAMBlock *block;
1119 int64_t end_time;
1120
1121 ram_counters.dirty_sync_count++;
1122
1123 if (!rs->time_last_bitmap_sync) {
1124 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1125 }
1126
1127 trace_migration_bitmap_sync_start();
1128 memory_global_dirty_log_sync();
1129
1130 qemu_mutex_lock(&rs->bitmap_mutex);
1131 WITH_RCU_READ_LOCK_GUARD() {
1132 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1133 ramblock_sync_dirty_bitmap(rs, block);
1134 }
1135 ram_counters.remaining = ram_bytes_remaining();
1136 }
1137 qemu_mutex_unlock(&rs->bitmap_mutex);
1138
1139 memory_global_after_dirty_log_sync();
1140 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1141
1142 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1143
1144 /* more than 1 second = 1000 milliseconds */
1145 if (end_time > rs->time_last_bitmap_sync + 1000) {
1146 migration_trigger_throttle(rs);
1147
1148 migration_update_rates(rs, end_time);
1149
1150 rs->target_page_count_prev = rs->target_page_count;
1151
1152 /* reset period counters */
1153 rs->time_last_bitmap_sync = end_time;
1154 rs->num_dirty_pages_period = 0;
1155 rs->bytes_xfer_prev = ram_counters.transferred;
1156 }
1157 if (migrate_use_events()) {
1158 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1159 }
1160 }
1161
1162 static void migration_bitmap_sync_precopy(RAMState *rs)
1163 {
1164 Error *local_err = NULL;
1165
1166 /*
1167 * The current notifier usage is just an optimization to migration, so we
1168 * don't stop the normal migration process in the error case.
1169 */
1170 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1171 error_report_err(local_err);
1172 local_err = NULL;
1173 }
1174
1175 migration_bitmap_sync(rs);
1176
1177 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1178 error_report_err(local_err);
1179 }
1180 }
1181
1182 static void ram_release_page(const char *rbname, uint64_t offset)
1183 {
1184 if (!migrate_release_ram() || !migration_in_postcopy()) {
1185 return;
1186 }
1187
1188 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1189 }
1190
1191 /**
1192 * save_zero_page_to_file: send the zero page to the file
1193 *
1194 * Returns the size of data written to the file, 0 means the page is not
1195 * a zero page
1196 *
1197 * @rs: current RAM state
1198 * @file: the file where the data is saved
1199 * @block: block that contains the page we want to send
1200 * @offset: offset inside the block for the page
1201 */
1202 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1203 RAMBlock *block, ram_addr_t offset)
1204 {
1205 uint8_t *p = block->host + offset;
1206 int len = 0;
1207
1208 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1209 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1210 qemu_put_byte(file, 0);
1211 len += 1;
1212 ram_release_page(block->idstr, offset);
1213 }
1214 return len;
1215 }
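
/*
 * A zero page is therefore very cheap on the wire: just the page header
 * from save_page_header() plus a single 0x00 byte. With an 8-byte header
 * (the RAM_SAVE_FLAG_CONTINUE case) that is 9 bytes for a whole target page.
 */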
1216
1217 /**
1218 * save_zero_page: send the zero page to the stream
1219 *
1220 * Returns the number of pages written.
1221 *
1222 * @rs: current RAM state
1223 * @block: block that contains the page we want to send
1224 * @offset: offset inside the block for the page
1225 */
1226 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1227 {
1228 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1229
1230 if (len) {
1231 ram_counters.duplicate++;
1232 ram_transferred_add(len);
1233 return 1;
1234 }
1235 return -1;
1236 }
1237
1238 /*
1239 * @pages: the number of pages written by the control path,
1240 * < 0 - error
1241 * > 0 - number of pages written
1242 *
1243 * Return true if the page has been saved, otherwise false is returned.
1244 */
1245 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1246 int *pages)
1247 {
1248 uint64_t bytes_xmit = 0;
1249 int ret;
1250
1251 *pages = -1;
1252 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1253 &bytes_xmit);
1254 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1255 return false;
1256 }
1257
1258 if (bytes_xmit) {
1259 ram_transferred_add(bytes_xmit);
1260 *pages = 1;
1261 }
1262
1263 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1264 return true;
1265 }
1266
1267 if (bytes_xmit > 0) {
1268 ram_counters.normal++;
1269 } else if (bytes_xmit == 0) {
1270 ram_counters.duplicate++;
1271 }
1272
1273 return true;
1274 }
1275
1276 /*
1277 * directly send the page to the stream
1278 *
1279 * Returns the number of pages written.
1280 *
1281 * @rs: current RAM state
1282 * @block: block that contains the page we want to send
1283 * @offset: offset inside the block for the page
1284 * @buf: the page to be sent
1285 * @async: send the page asynchronously
1286 */
1287 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1288 uint8_t *buf, bool async)
1289 {
1290 ram_transferred_add(save_page_header(rs, rs->f, block,
1291 offset | RAM_SAVE_FLAG_PAGE));
1292 if (async) {
1293 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1294 migrate_release_ram() &&
1295 migration_in_postcopy());
1296 } else {
1297 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1298 }
1299 ram_transferred_add(TARGET_PAGE_SIZE);
1300 ram_counters.normal++;
1301 return 1;
1302 }
1303
1304 /**
1305 * ram_save_page: send the given page to the stream
1306 *
1307 * Returns the number of pages written.
1308 * < 0 - error
1309 * >=0 - Number of pages written - this might legally be 0
1310 * if xbzrle noticed the page was the same.
1311 *
1312 * @rs: current RAM state
1313 * @block: block that contains the page we want to send
1314 * @offset: offset inside the block for the page
1315 */
1316 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1317 {
1318 int pages = -1;
1319 uint8_t *p;
1320 bool send_async = true;
1321 RAMBlock *block = pss->block;
1322 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1323 ram_addr_t current_addr = block->offset + offset;
1324
1325 p = block->host + offset;
1326 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1327
1328 XBZRLE_cache_lock();
1329 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1330 pages = save_xbzrle_page(rs, &p, current_addr, block,
1331 offset);
1332 if (!rs->last_stage) {
1333 /* Can't send this cached data async, since the cache page
1334 * might get updated before it gets to the wire
1335 */
1336 send_async = false;
1337 }
1338 }
1339
1340 /* XBZRLE overflow or normal page */
1341 if (pages == -1) {
1342 pages = save_normal_page(rs, block, offset, p, send_async);
1343 }
1344
1345 XBZRLE_cache_unlock();
1346
1347 return pages;
1348 }
1349
1350 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1351 ram_addr_t offset)
1352 {
1353 if (multifd_queue_page(rs->f, block, offset) < 0) {
1354 return -1;
1355 }
1356 ram_counters.normal++;
1357
1358 return 1;
1359 }
1360
1361 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1362 ram_addr_t offset, uint8_t *source_buf)
1363 {
1364 RAMState *rs = ram_state;
1365 uint8_t *p = block->host + offset;
1366 int ret;
1367
1368 if (save_zero_page_to_file(rs, f, block, offset)) {
1369 return true;
1370 }
1371
1372 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1373
1374 /*
1375 * copy it to an internal buffer to avoid it being modified by the VM,
1376 * so that we can catch errors during compression and
1377 * decompression
1378 */
1379 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1380 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1381 if (ret < 0) {
1382 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1383 error_report("compressed data failed!");
1384 }
1385 return false;
1386 }
1387
1388 static void
1389 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1390 {
1391 ram_transferred_add(bytes_xmit);
1392
1393 if (param->zero_page) {
1394 ram_counters.duplicate++;
1395 return;
1396 }
1397
1398 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1399 compression_counters.compressed_size += bytes_xmit - 8;
1400 compression_counters.pages++;
1401 }
1402
1403 static bool save_page_use_compression(RAMState *rs);
1404
1405 static void flush_compressed_data(RAMState *rs)
1406 {
1407 int idx, len, thread_count;
1408
1409 if (!save_page_use_compression(rs)) {
1410 return;
1411 }
1412 thread_count = migrate_compress_threads();
1413
1414 qemu_mutex_lock(&comp_done_lock);
1415 for (idx = 0; idx < thread_count; idx++) {
1416 while (!comp_param[idx].done) {
1417 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1418 }
1419 }
1420 qemu_mutex_unlock(&comp_done_lock);
1421
1422 for (idx = 0; idx < thread_count; idx++) {
1423 qemu_mutex_lock(&comp_param[idx].mutex);
1424 if (!comp_param[idx].quit) {
1425 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1426 /*
1427 * it's safe to fetch zero_page without holding comp_done_lock
1428 * as there is no further request submitted to the thread,
1429 * i.e., the thread should be waiting for a request at this point.
1430 */
1431 update_compress_thread_counts(&comp_param[idx], len);
1432 }
1433 qemu_mutex_unlock(&comp_param[idx].mutex);
1434 }
1435 }
1436
1437 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1438 ram_addr_t offset)
1439 {
1440 param->block = block;
1441 param->offset = offset;
1442 }
1443
1444 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1445 ram_addr_t offset)
1446 {
1447 int idx, thread_count, bytes_xmit = -1, pages = -1;
1448 bool wait = migrate_compress_wait_thread();
1449
1450 thread_count = migrate_compress_threads();
1451 qemu_mutex_lock(&comp_done_lock);
1452 retry:
1453 for (idx = 0; idx < thread_count; idx++) {
1454 if (comp_param[idx].done) {
1455 comp_param[idx].done = false;
1456 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1457 qemu_mutex_lock(&comp_param[idx].mutex);
1458 set_compress_params(&comp_param[idx], block, offset);
1459 qemu_cond_signal(&comp_param[idx].cond);
1460 qemu_mutex_unlock(&comp_param[idx].mutex);
1461 pages = 1;
1462 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1463 break;
1464 }
1465 }
1466
1467 /*
1468 * wait for a free thread if the user specifies 'compress-wait-thread',
1469 * otherwise we will post the page out in the main thread as a normal page.
1470 */
1471 if (pages < 0 && wait) {
1472 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1473 goto retry;
1474 }
1475 qemu_mutex_unlock(&comp_done_lock);
1476
1477 return pages;
1478 }
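
/*
 * Sketch of the behaviour above: if every compression thread is busy and
 * 'compress-wait-thread' is off, the function returns -1 and the caller
 * falls back to sending the page uncompressed from the migration thread;
 * with the option on, it blocks on comp_done_cond until a thread frees up.
 */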
1479
1480 /**
1481 * find_dirty_block: find the next dirty page and update any state
1482 * associated with the search process.
1483 *
1484 * Returns true if a page is found
1485 *
1486 * @rs: current RAM state
1487 * @pss: data about the state of the current dirty page scan
1488 * @again: set to false if the search has scanned the whole of RAM
1489 */
1490 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1491 {
1492 /* This is not a postcopy requested page */
1493 pss->postcopy_requested = false;
1494
1495 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1496 if (pss->complete_round && pss->block == rs->last_seen_block &&
1497 pss->page >= rs->last_page) {
1498 /*
1499 * We've been once around the RAM and haven't found anything.
1500 * Give up.
1501 */
1502 *again = false;
1503 return false;
1504 }
1505 if (!offset_in_ramblock(pss->block,
1506 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1507 /* Didn't find anything in this RAM Block */
1508 pss->page = 0;
1509 pss->block = QLIST_NEXT_RCU(pss->block, next);
1510 if (!pss->block) {
1511 /*
1512 * If memory migration starts over, we will meet a dirtied page
1513 * which may still exist in the compression threads' ring, so we
1514 * should flush the compressed data to make sure the new page
1515 * is not overwritten by the old one in the destination.
1516 *
1517 * Also, if xbzrle is on, stop using the data compression at this
1518 * point. In theory, xbzrle can do better than compression.
1519 */
1520 flush_compressed_data(rs);
1521
1522 /* Hit the end of the list */
1523 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1524 /* Flag that we've looped */
1525 pss->complete_round = true;
1526 /* After the first round, enable XBZRLE. */
1527 if (migrate_use_xbzrle()) {
1528 rs->xbzrle_enabled = true;
1529 }
1530 }
1531 /* Didn't find anything this time, but try again on the new block */
1532 *again = true;
1533 return false;
1534 } else {
1535 /* Can go around again, but... */
1536 *again = true;
1537 /* We've found something so probably don't need to */
1538 return true;
1539 }
1540 }
1541
1542 /**
1543 * unqueue_page: gets a page of the queue
1544 *
1545 * Helper for 'get_queued_page' - gets a page off the queue
1546 *
1547 * Returns the block of the page (or NULL if none available)
1548 *
1549 * @rs: current RAM state
1550 * @offset: used to return the offset within the RAMBlock
1551 */
1552 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1553 {
1554 struct RAMSrcPageRequest *entry;
1555 RAMBlock *block = NULL;
1556 size_t page_size;
1557
1558 if (!postcopy_has_request(rs)) {
1559 return NULL;
1560 }
1561
1562 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1563
1564 /*
1565 * This should _never_ change even after we take the lock, because no one
1566 * should be taking anything off the request list other than us.
1567 */
1568 assert(postcopy_has_request(rs));
1569
1570 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1571 block = entry->rb;
1572 *offset = entry->offset;
1573 page_size = qemu_ram_pagesize(block);
1574 /* Each page request should only be a multiple of the ramblock's page size */
1575 assert((entry->len % page_size) == 0);
1576
1577 if (entry->len > page_size) {
1578 entry->len -= page_size;
1579 entry->offset += page_size;
1580 } else {
1581 memory_region_unref(block->mr);
1582 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1583 g_free(entry);
1584 migration_consume_urgent_request();
1585 }
1586
1587 trace_unqueue_page(block->idstr, *offset,
1588 test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1589
1590 return block;
1591 }
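
/*
 * Example of the request splitting above: a queued request covering four
 * ramblock pages is consumed one page per call; the first three calls
 * shrink entry->len and advance entry->offset by page_size, and the fourth
 * drops the entry, releases the memory region reference and consumes the
 * urgent request.
 */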
1592
1593 #if defined(__linux__)
1594 /**
1595 * poll_fault_page: try to get the next UFFD write fault page and, if a
1596 * pending fault is found, return the RAM block pointer and page offset
1597 *
1598 * Returns pointer to the RAMBlock containing faulting page,
1599 * NULL if no write faults are pending
1600 *
1601 * @rs: current RAM state
1602 * @offset: page offset from the beginning of the block
1603 */
1604 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1605 {
1606 struct uffd_msg uffd_msg;
1607 void *page_address;
1608 RAMBlock *block;
1609 int res;
1610
1611 if (!migrate_background_snapshot()) {
1612 return NULL;
1613 }
1614
1615 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1616 if (res <= 0) {
1617 return NULL;
1618 }
1619
1620 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1621 block = qemu_ram_block_from_host(page_address, false, offset);
1622 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1623 return block;
1624 }
1625
1626 /**
1627 * ram_save_release_protection: release UFFD write protection after
1628 * a range of pages has been saved
1629 *
1630 * @rs: current RAM state
1631 * @pss: page-search-status structure
1632 * @start_page: index of the first page in the range relative to pss->block
1633 *
1634 * Returns 0 on success, negative value in case of an error
1635 */
1636 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1637 unsigned long start_page)
1638 {
1639 int res = 0;
1640
1641 /* Check if page is from UFFD-managed region. */
1642 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1643 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1644 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1645
1646 /* Flush async buffers before un-protect. */
1647 qemu_fflush(rs->f);
1648 /* Un-protect memory range. */
1649 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1650 false, false);
1651 }
1652
1653 return res;
1654 }
1655
1656 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1657 *
1658 * Returns true if supported, false otherwise
1659 */
1660 bool ram_write_tracking_available(void)
1661 {
1662 uint64_t uffd_features;
1663 int res;
1664
1665 res = uffd_query_features(&uffd_features);
1666 return (res == 0 &&
1667 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1668 }
1669
1670 /* ram_write_tracking_compatible: check if guest configuration is
1671 * compatible with 'write-tracking'
1672 *
1673 * Returns true if compatible, false otherwise
1674 */
1675 bool ram_write_tracking_compatible(void)
1676 {
1677 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1678 int uffd_fd;
1679 RAMBlock *block;
1680 bool ret = false;
1681
1682 /* Open UFFD file descriptor */
1683 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1684 if (uffd_fd < 0) {
1685 return false;
1686 }
1687
1688 RCU_READ_LOCK_GUARD();
1689
1690 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1691 uint64_t uffd_ioctls;
1692
1693 /* Nothing to do with read-only and MMIO-writable regions */
1694 if (block->mr->readonly || block->mr->rom_device) {
1695 continue;
1696 }
1697 /* Try to register block memory via UFFD-IO to track writes */
1698 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1699 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1700 goto out;
1701 }
1702 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1703 goto out;
1704 }
1705 }
1706 ret = true;
1707
1708 out:
1709 uffd_close_fd(uffd_fd);
1710 return ret;
1711 }
1712
1713 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1714 ram_addr_t size)
1715 {
1716 /*
1717 * We read one byte of each page; this will preallocate page tables if
1718 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1719 * where no page was populated yet. This might require adaptation when
1720 * supporting other mappings, like shmem.
1721 */
1722 for (; offset < size; offset += block->page_size) {
1723 char tmp = *((char *)block->host + offset);
1724
1725 /* Don't optimize the read out */
1726 asm volatile("" : "+r" (tmp));
1727 }
1728 }
1729
1730 static inline int populate_read_section(MemoryRegionSection *section,
1731 void *opaque)
1732 {
1733 const hwaddr size = int128_get64(section->size);
1734 hwaddr offset = section->offset_within_region;
1735 RAMBlock *block = section->mr->ram_block;
1736
1737 populate_read_range(block, offset, size);
1738 return 0;
1739 }
1740
1741 /*
1742 * ram_block_populate_read: preallocate page tables and populate pages in the
1743 * RAM block by reading a byte of each page.
1744 *
1745 * Since it's solely used for the userfault_fd WP feature, here we just
1746 * hardcode the page size to qemu_real_host_page_size.
1747 *
1748 * @rb: RAM block to populate
1749 */
1750 static void ram_block_populate_read(RAMBlock *rb)
1751 {
1752 /*
1753 * Skip populating all pages that fall into a discarded range as managed by
1754 * a RamDiscardManager responsible for the mapped memory region of the
1755 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1756 * must not get populated automatically. We don't have to track
1757 * modifications via userfaultfd WP reliably, because these pages will
1758 * not be part of the migration stream either way -- see
1759 * ramblock_dirty_bitmap_clear_discarded_pages().
1760 *
1761 * Note: The result is only stable while migrating (precopy/postcopy).
1762 */
1763 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1764 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1765 MemoryRegionSection section = {
1766 .mr = rb->mr,
1767 .offset_within_region = 0,
1768 .size = rb->mr->size,
1769 };
1770
1771 ram_discard_manager_replay_populated(rdm, &section,
1772 populate_read_section, NULL);
1773 } else {
1774 populate_read_range(rb, 0, rb->used_length);
1775 }
1776 }
1777
1778 /*
1779 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1780 */
1781 void ram_write_tracking_prepare(void)
1782 {
1783 RAMBlock *block;
1784
1785 RCU_READ_LOCK_GUARD();
1786
1787 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1788 /* Nothing to do with read-only and MMIO-writable regions */
1789 if (block->mr->readonly || block->mr->rom_device) {
1790 continue;
1791 }
1792
1793 /*
1794 * Populate pages of the RAM block before enabling userfault_fd
1795 * write protection.
1796 *
1797 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1798 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1799 * pages with pte_none() entries in page table.
1800 */
1801 ram_block_populate_read(block);
1802 }
1803 }
1804
1805 /*
1806 * ram_write_tracking_start: start UFFD-WP memory tracking
1807 *
1808 * Returns 0 for success or negative value in case of error
1809 */
1810 int ram_write_tracking_start(void)
1811 {
1812 int uffd_fd;
1813 RAMState *rs = ram_state;
1814 RAMBlock *block;
1815
1816 /* Open UFFD file descriptor */
1817 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1818 if (uffd_fd < 0) {
1819 return uffd_fd;
1820 }
1821 rs->uffdio_fd = uffd_fd;
1822
1823 RCU_READ_LOCK_GUARD();
1824
1825 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1826 /* Nothing to do for read-only and MMIO-writable regions */
1827 if (block->mr->readonly || block->mr->rom_device) {
1828 continue;
1829 }
1830
1831 /* Register block memory with UFFD to track writes */
1832 if (uffd_register_memory(rs->uffdio_fd, block->host,
1833 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1834 goto fail;
1835 }
1836 /* Apply UFFD write protection to the block memory range */
1837 if (uffd_change_protection(rs->uffdio_fd, block->host,
1838 block->max_length, true, false)) {
1839 goto fail;
1840 }
1841 block->flags |= RAM_UF_WRITEPROTECT;
1842 memory_region_ref(block->mr);
1843
1844 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1845 block->host, block->max_length);
1846 }
1847
1848 return 0;
1849
1850 fail:
1851 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1852
1853 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1854 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1855 continue;
1856 }
1857 /*
1858 * In case some memory block failed to be write-protected,
1859 * remove protection from and unregister all RAM blocks that succeeded
1860 */
1861 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1862 false, false);
1863 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1864 /* Cleanup flags and remove reference */
1865 block->flags &= ~RAM_UF_WRITEPROTECT;
1866 memory_region_unref(block->mr);
1867 }
1868
1869 uffd_close_fd(uffd_fd);
1870 rs->uffdio_fd = -1;
1871 return -1;
1872 }
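/*
 * Rough standalone sketch of the kernel-level sequence that the
 * uffd_create_fd()/uffd_register_memory()/uffd_change_protection()
 * helpers used above are assumed to wrap (hypothetical code written
 * directly against the Linux userfaultfd UAPI, not QEMU's wrappers):
 * open a uffd, negotiate the write-protect feature, register the
 * range in WP mode, then arm write protection on it.
 */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uffd_wp_range_sketch(void *addr, uint64_t len)
{
    struct uffdio_api api = {
        .api = UFFD_API,
        .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
    };
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t)addr, .len = len },
        .mode = UFFDIO_REGISTER_MODE_WP,
    };
    struct uffdio_writeprotect wp = {
        .range = { .start = (uintptr_t)addr, .len = len },
        .mode = UFFDIO_WRITEPROTECT_MODE_WP,
    };
    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

    if (fd < 0) {
        return -1;
    }
    if (ioctl(fd, UFFDIO_API, &api) ||
        ioctl(fd, UFFDIO_REGISTER, &reg) ||
        ioctl(fd, UFFDIO_WRITEPROTECT, &wp)) {
        close(fd);
        return -1;
    }
    return fd;   /* the caller polls this fd for write-fault events */
}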
1873
1874 /**
1875 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1876 */
1877 void ram_write_tracking_stop(void)
1878 {
1879 RAMState *rs = ram_state;
1880 RAMBlock *block;
1881
1882 RCU_READ_LOCK_GUARD();
1883
1884 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1885 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1886 continue;
1887 }
1888 /* Remove protection and unregister all affected RAM blocks */
1889 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1890 false, false);
1891 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1892
1893 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1894 block->host, block->max_length);
1895
1896 /* Cleanup flags and remove reference */
1897 block->flags &= ~RAM_UF_WRITEPROTECT;
1898 memory_region_unref(block->mr);
1899 }
1900
1901 /* Finally close UFFD file descriptor */
1902 uffd_close_fd(rs->uffdio_fd);
1903 rs->uffdio_fd = -1;
1904 }
1905
1906 #else
1907 /* No target OS support, stubs just fail or ignore */
1908
1909 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1910 {
1911 (void) rs;
1912 (void) offset;
1913
1914 return NULL;
1915 }
1916
1917 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1918 unsigned long start_page)
1919 {
1920 (void) rs;
1921 (void) pss;
1922 (void) start_page;
1923
1924 return 0;
1925 }
1926
1927 bool ram_write_tracking_available(void)
1928 {
1929 return false;
1930 }
1931
1932 bool ram_write_tracking_compatible(void)
1933 {
1934 assert(0);
1935 return false;
1936 }
1937
1938 int ram_write_tracking_start(void)
1939 {
1940 assert(0);
1941 return -1;
1942 }
1943
1944 void ram_write_tracking_stop(void)
1945 {
1946 assert(0);
1947 }
1948 #endif /* defined(__linux__) */
1949
1950 /**
1951 * get_queued_page: unqueue a page from the postcopy requests
1952 *
1953 * Skips pages that are already sent (!dirty)
1954 *
1955 * Returns true if a queued page is found
1956 *
1957 * @rs: current RAM state
1958 * @pss: data about the state of the current dirty page scan
1959 */
1960 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1961 {
1962 RAMBlock *block;
1963 ram_addr_t offset;
1964
1965 block = unqueue_page(rs, &offset);
1966
1967 if (!block) {
1968 /*
1969 * Poll write faults too if background snapshot is enabled; that's
1970 * when vcpus may be blocked by write-protected pages.
1971 */
1972 block = poll_fault_page(rs, &offset);
1973 }
1974
1975 if (block) {
1976 /*
1977 * We want the background search to continue from the queued page
1978 * since the guest is likely to want other pages near to the page
1979 * it just requested.
1980 */
1981 pss->block = block;
1982 pss->page = offset >> TARGET_PAGE_BITS;
1983
1984 /*
1985 * This unqueued page would break the "one round" check, even is
1986 * really rare.
1987 */
1988 pss->complete_round = false;
1989 pss->postcopy_requested = true;
1990 }
1991
1992 return !!block;
1993 }
1994
1995 /**
1996 * migration_page_queue_free: drop any remaining pages in the ram
1997 * request queue
1998 *
1999 * It should be empty at the end anyway, but in error cases there may
2000 * be some left. If any page is left, we drop it.
2001 *
2002 */
2003 static void migration_page_queue_free(RAMState *rs)
2004 {
2005 struct RAMSrcPageRequest *mspr, *next_mspr;
2006 /* This queue generally should be empty - but in the case of a failed
2007 * migration it might have some leftovers in it.
2008 */
2009 RCU_READ_LOCK_GUARD();
2010 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2011 memory_region_unref(mspr->rb->mr);
2012 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2013 g_free(mspr);
2014 }
2015 }
2016
2017 /**
2018 * ram_save_queue_pages: queue the page for transmission
2019 *
2020 * A request from the postcopy destination, for example.
2021 *
2022 * Returns zero on success or negative on error
2023 *
2024 * @rbname: Name of the RAMBlock of the request. NULL means the
2025 * same as the last one.
2026 * @start: starting address from the start of the RAMBlock
2027 * @len: length (in bytes) to send
2028 */
2029 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2030 {
2031 RAMBlock *ramblock;
2032 RAMState *rs = ram_state;
2033
2034 ram_counters.postcopy_requests++;
2035 RCU_READ_LOCK_GUARD();
2036
2037 if (!rbname) {
2038 /* Reuse last RAMBlock */
2039 ramblock = rs->last_req_rb;
2040
2041 if (!ramblock) {
2042 /*
2043 * Shouldn't happen, we can't reuse the last RAMBlock if
2044 * it's the 1st request.
2045 */
2046 error_report("ram_save_queue_pages no previous block");
2047 return -1;
2048 }
2049 } else {
2050 ramblock = qemu_ram_block_by_name(rbname);
2051
2052 if (!ramblock) {
2053 /* We shouldn't be asked for a non-existent RAMBlock */
2054 error_report("ram_save_queue_pages no block '%s'", rbname);
2055 return -1;
2056 }
2057 rs->last_req_rb = ramblock;
2058 }
2059 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2060 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2061 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2062 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2063 __func__, start, len, ramblock->used_length);
2064 return -1;
2065 }
2066
2067 struct RAMSrcPageRequest *new_entry =
2068 g_new0(struct RAMSrcPageRequest, 1);
2069 new_entry->rb = ramblock;
2070 new_entry->offset = start;
2071 new_entry->len = len;
2072
2073 memory_region_ref(ramblock->mr);
2074 qemu_mutex_lock(&rs->src_page_req_mutex);
2075 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2076 migration_make_urgent_request();
2077 qemu_mutex_unlock(&rs->src_page_req_mutex);
2078
2079 return 0;
2080 }
2081
2082 static bool save_page_use_compression(RAMState *rs)
2083 {
2084 if (!migrate_use_compression()) {
2085 return false;
2086 }
2087
2088 /*
2089 * If xbzrle is enabled (e.g., after the first round of migration), stop
2090 * using data compression. In theory, xbzrle can do better than
2091 * compression.
2092 */
2093 if (rs->xbzrle_enabled) {
2094 return false;
2095 }
2096
2097 return true;
2098 }
2099
2100 /*
2101 * try to compress the page before posting it out, return true if the page
2102 * has been properly handled by compression, otherwise needs other
2103 * paths to handle it
2104 */
2105 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2106 {
2107 if (!save_page_use_compression(rs)) {
2108 return false;
2109 }
2110
2111 /*
2112 * When starting the process of a new block, the first page of
2113 * the block should be sent out before other pages in the same
2114 * block, and all the pages in the last block should have been sent
2115 * out. Keeping this order is important, because the 'cont' flag
2116 * is used to avoid resending the block name.
2117 *
2118 * We post the first page as a normal page since compression will take
2119 * a lot of CPU resources.
2120 */
2121 if (block != rs->last_sent_block) {
2122 flush_compressed_data(rs);
2123 return false;
2124 }
2125
2126 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2127 return true;
2128 }
2129
2130 compression_counters.busy++;
2131 return false;
2132 }
2133
2134 /**
2135 * ram_save_target_page: save one target page
2136 *
2137 * Returns the number of pages written
2138 *
2139 * @rs: current RAM state
2140 * @pss: data about the page we want to send
2141 */
2142 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2143 {
2144 RAMBlock *block = pss->block;
2145 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2146 int res;
2147
2148 if (control_save_page(rs, block, offset, &res)) {
2149 return res;
2150 }
2151
2152 if (save_compress_page(rs, block, offset)) {
2153 return 1;
2154 }
2155
2156 res = save_zero_page(rs, block, offset);
2157 if (res > 0) {
2158 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2159 * page would be stale
2160 */
2161 if (!save_page_use_compression(rs)) {
2162 XBZRLE_cache_lock();
2163 xbzrle_cache_zero_page(rs, block->offset + offset);
2164 XBZRLE_cache_unlock();
2165 }
2166 return res;
2167 }
2168
2169 /*
2170 * Do not use multifd for:
2171 * 1. Compression, as the first page in a new block should be posted out
2172 * before sending the compressed page
2173 * 2. Postcopy, as one whole host page should be placed atomically
2174 */
2175 if (!save_page_use_compression(rs) && migrate_use_multifd()
2176 && !migration_in_postcopy()) {
2177 return ram_save_multifd_page(rs, block, offset);
2178 }
2179
2180 return ram_save_page(rs, pss);
2181 }
2182
2183 /**
2184 * ram_save_host_page: save a whole host page
2185 *
2186 * Starting at pss->page, send pages up to the end of the current host
2187 * page. It's valid for the initial offset to point into the middle of
2188 * a host page in which case the remainder of the hostpage is sent.
2189 * Only dirty target pages are sent. Note that the host page size may
2190 * be a huge page for this block.
2191 * The saving stops at the boundary of the used_length of the block
2192 * if the RAMBlock isn't a multiple of the host page size.
2193 *
2194 * Returns the number of pages written or negative on error
2195 *
2196 * @rs: current RAM state
2197 * @pss: data about the page we want to send
2198 */
2199 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2200 {
2201 int tmppages, pages = 0;
2202 size_t pagesize_bits =
2203 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2204 unsigned long hostpage_boundary =
2205 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2206 unsigned long start_page = pss->page;
2207 int res;
2208
2209 if (ramblock_is_ignored(pss->block)) {
2210 error_report("block %s should not be migrated !", pss->block->idstr);
2211 return 0;
2212 }
2213
2214 do {
2215 /* Check if the page is dirty and if it is send it */
2216 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2217 tmppages = ram_save_target_page(rs, pss);
2218 if (tmppages < 0) {
2219 return tmppages;
2220 }
2221
2222 pages += tmppages;
2223 /*
2224 * Allow rate limiting to happen in the middle of huge pages if
2225 * something is sent in the current iteration.
2226 */
2227 if (pagesize_bits > 1 && tmppages > 0) {
2228 migration_rate_limit();
2229 }
2230 }
2231 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2232 } while ((pss->page < hostpage_boundary) &&
2233 offset_in_ramblock(pss->block,
2234 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2235 /* The offset we leave with is the min boundary of host page and block */
2236 pss->page = MIN(pss->page, hostpage_boundary);
2237
2238 res = ram_save_release_protection(rs, pss, start_page);
2239 return (res < 0 ? res : pages);
2240 }
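/*
 * Worked example (hypothetical numbers) of the host-page boundary
 * arithmetic used above: with a 2 MiB host page and 4 KiB target pages
 * there are 512 target pages per host page, so starting from target
 * page 1000 the loop runs up to, but not including, target page 1024.
 */
#include <assert.h>
#include <stddef.h>

#define SKETCH_ALIGN_UP(n, m)   (((n) + (m) - 1) / (m) * (m))

static void host_page_boundary_example(void)
{
    size_t pagesize_bits = (2u << 20) / (4u << 10);              /* 512 */
    size_t hostpage_boundary = SKETCH_ALIGN_UP(1000 + 1, pagesize_bits);

    assert(pagesize_bits == 512);
    assert(hostpage_boundary == 1024);
}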
2241
2242 /**
2243 * ram_find_and_save_block: finds a dirty page and sends it to f
2244 *
2245 * Called within an RCU critical section.
2246 *
2247 * Returns the number of pages written where zero means no dirty pages,
2248 * or negative on error
2249 *
2250 * @rs: current RAM state
2251 *
2252 * On systems where host-page-size > target-page-size it will send all the
2253 * pages in a host page that are dirty.
2254 */
2255 static int ram_find_and_save_block(RAMState *rs)
2256 {
2257 PageSearchStatus pss;
2258 int pages = 0;
2259 bool again, found;
2260
2261 /* No dirty page as there is zero RAM */
2262 if (!ram_bytes_total()) {
2263 return pages;
2264 }
2265
2266 pss.block = rs->last_seen_block;
2267 pss.page = rs->last_page;
2268 pss.complete_round = false;
2269
2270 if (!pss.block) {
2271 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2272 }
2273
2274 do {
2275 again = true;
2276 found = get_queued_page(rs, &pss);
2277
2278 if (!found) {
2279 /* priority queue empty, so just search for something dirty */
2280 found = find_dirty_block(rs, &pss, &again);
2281 }
2282
2283 if (found) {
2284 pages = ram_save_host_page(rs, &pss);
2285 }
2286 } while (!pages && again);
2287
2288 rs->last_seen_block = pss.block;
2289 rs->last_page = pss.page;
2290
2291 return pages;
2292 }
2293
2294 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2295 {
2296 uint64_t pages = size / TARGET_PAGE_SIZE;
2297
2298 if (zero) {
2299 ram_counters.duplicate += pages;
2300 } else {
2301 ram_counters.normal += pages;
2302 ram_transferred_add(size);
2303 qemu_file_credit_transfer(f, size);
2304 }
2305 }
2306
2307 static uint64_t ram_bytes_total_common(bool count_ignored)
2308 {
2309 RAMBlock *block;
2310 uint64_t total = 0;
2311
2312 RCU_READ_LOCK_GUARD();
2313
2314 if (count_ignored) {
2315 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2316 total += block->used_length;
2317 }
2318 } else {
2319 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2320 total += block->used_length;
2321 }
2322 }
2323 return total;
2324 }
2325
2326 uint64_t ram_bytes_total(void)
2327 {
2328 return ram_bytes_total_common(false);
2329 }
2330
2331 static void xbzrle_load_setup(void)
2332 {
2333 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2334 }
2335
2336 static void xbzrle_load_cleanup(void)
2337 {
2338 g_free(XBZRLE.decoded_buf);
2339 XBZRLE.decoded_buf = NULL;
2340 }
2341
2342 static void ram_state_cleanup(RAMState **rsp)
2343 {
2344 if (*rsp) {
2345 migration_page_queue_free(*rsp);
2346 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2347 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2348 g_free(*rsp);
2349 *rsp = NULL;
2350 }
2351 }
2352
2353 static void xbzrle_cleanup(void)
2354 {
2355 XBZRLE_cache_lock();
2356 if (XBZRLE.cache) {
2357 cache_fini(XBZRLE.cache);
2358 g_free(XBZRLE.encoded_buf);
2359 g_free(XBZRLE.current_buf);
2360 g_free(XBZRLE.zero_target_page);
2361 XBZRLE.cache = NULL;
2362 XBZRLE.encoded_buf = NULL;
2363 XBZRLE.current_buf = NULL;
2364 XBZRLE.zero_target_page = NULL;
2365 }
2366 XBZRLE_cache_unlock();
2367 }
2368
2369 static void ram_save_cleanup(void *opaque)
2370 {
2371 RAMState **rsp = opaque;
2372 RAMBlock *block;
2373
2374 /* We don't use dirty log with background snapshots */
2375 if (!migrate_background_snapshot()) {
2376 /* the caller holds the iothread lock or is in a bh, so there is
2377 * no write race against the migration bitmap
2378 */
2379 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2380 /*
2381 * do not stop dirty logging without having started it, since
2382 * memory_global_dirty_log_stop will assert that
2383 * memory_global_dirty_log_start/stop are used in pairs
2384 */
2385 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2386 }
2387 }
2388
2389 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2390 g_free(block->clear_bmap);
2391 block->clear_bmap = NULL;
2392 g_free(block->bmap);
2393 block->bmap = NULL;
2394 }
2395
2396 xbzrle_cleanup();
2397 compress_threads_save_cleanup();
2398 ram_state_cleanup(rsp);
2399 }
2400
2401 static void ram_state_reset(RAMState *rs)
2402 {
2403 rs->last_seen_block = NULL;
2404 rs->last_sent_block = NULL;
2405 rs->last_page = 0;
2406 rs->last_version = ram_list.version;
2407 rs->xbzrle_enabled = false;
2408 }
2409
2410 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2411
2412 /* **** functions for postcopy ***** */
2413
2414 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2415 {
2416 struct RAMBlock *block;
2417
2418 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2419 unsigned long *bitmap = block->bmap;
2420 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2421 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2422
2423 while (run_start < range) {
2424 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2425 ram_discard_range(block->idstr,
2426 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2427 ((ram_addr_t)(run_end - run_start))
2428 << TARGET_PAGE_BITS);
2429 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2430 }
2431 }
2432 }
2433
2434 /**
2435 * postcopy_send_discard_bm_ram: discard a RAMBlock
2436 *
2437 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2438 *
2439 * @ms: current migration state
2440 * @block: RAMBlock to discard
2441 */
2442 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2443 {
2444 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2445 unsigned long current;
2446 unsigned long *bitmap = block->bmap;
2447
2448 for (current = 0; current < end; ) {
2449 unsigned long one = find_next_bit(bitmap, end, current);
2450 unsigned long zero, discard_length;
2451
2452 if (one >= end) {
2453 break;
2454 }
2455
2456 zero = find_next_zero_bit(bitmap, end, one + 1);
2457
2458 if (zero >= end) {
2459 discard_length = end - one;
2460 } else {
2461 discard_length = zero - one;
2462 }
2463 postcopy_discard_send_range(ms, one, discard_length);
2464 current = one + discard_length;
2465 }
2466 }
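/*
 * Standalone sketch (hypothetical helper over a byte-addressed bitmap)
 * of the run-finding loop above: walk the dirty bitmap and emit one
 * (start, length) pair per contiguous run of set bits, which is exactly
 * what find_next_bit()/find_next_zero_bit() are used for here.
 */
#include <stdbool.h>
#include <stdio.h>

static bool sketch_bit_is_set(const unsigned char *bm, unsigned long i)
{
    return bm[i / 8] & (1u << (i % 8));
}

static void sketch_emit_dirty_runs(const unsigned char *bm, unsigned long nbits)
{
    unsigned long i = 0;

    while (i < nbits) {
        unsigned long start, len;

        while (i < nbits && !sketch_bit_is_set(bm, i)) {
            i++;                            /* skip clean pages */
        }
        if (i == nbits) {
            break;
        }
        start = i;
        while (i < nbits && sketch_bit_is_set(bm, i)) {
            i++;                            /* extend the dirty run */
        }
        len = i - start;
        printf("discard: start=%lu len=%lu\n", start, len);
    }
}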
2467
2468 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2469
2470 /**
2471 * postcopy_each_ram_send_discard: discard all RAMBlocks
2472 *
2473 * Utility for the outgoing postcopy code.
2474 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2475 * passing it bitmap indexes and name.
2476 * (qemu_ram_foreach_block ends up passing unscaled lengths
2477 * which would mean postcopy code would have to deal with target page)
2478 *
2479 * @ms: current migration state
2480 */
2481 static void postcopy_each_ram_send_discard(MigrationState *ms)
2482 {
2483 struct RAMBlock *block;
2484
2485 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2486 postcopy_discard_send_init(ms, block->idstr);
2487
2488 /*
2489 * Deal with TPS != HPS and huge pages. It discards any partially sent
2490 * host-page size chunks and marks any partially dirty host-page size
2491 * chunks as all dirty. In this case the host-page is the host-page
2492 * for the particular RAMBlock, i.e. it might be a huge page.
2493 */
2494 postcopy_chunk_hostpages_pass(ms, block);
2495
2496 /*
2497 * Postcopy sends chunks of bitmap over the wire, but it
2498 * just needs indexes at this point, which avoids it having
2499 * target-page-specific code.
2500 */
2501 postcopy_send_discard_bm_ram(ms, block);
2502 postcopy_discard_send_finish(ms);
2503 }
2504 }
2505
2506 /**
2507 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2508 *
2509 * Helper for postcopy_each_ram_send_discard; it's called for each
2510 * RAMBlock to canonicalize its dirty bitmap at host-page
2511 * granularity.
2512 *
2513 * Postcopy requires that all target pages in a hostpage are dirty or
2514 * clean, not a mix. This function canonicalizes the bitmaps.
2515 *
2516 * @ms: current migration state
2517 * @block: block that contains the page we want to canonicalize
2518 */
2519 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2520 {
2521 RAMState *rs = ram_state;
2522 unsigned long *bitmap = block->bmap;
2523 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2524 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2525 unsigned long run_start;
2526
2527 if (block->page_size == TARGET_PAGE_SIZE) {
2528 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2529 return;
2530 }
2531
2532 /* Find a dirty page */
2533 run_start = find_next_bit(bitmap, pages, 0);
2534
2535 while (run_start < pages) {
2536
2537 /*
2538 * If the start of this run of pages is in the middle of a host
2539 * page, then we need to fixup this host page.
2540 */
2541 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2542 /* Find the end of this run */
2543 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2544 /*
2545 * If the end isn't at the start of a host page, then the
2546 * run doesn't finish at the end of a host page
2547 * and we need to discard.
2548 */
2549 }
2550
2551 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2552 unsigned long page;
2553 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2554 host_ratio);
2555 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2556
2557 /* Clean up the bitmap */
2558 for (page = fixup_start_addr;
2559 page < fixup_start_addr + host_ratio; page++) {
2560 /*
2561 * Remark them as dirty, updating the count for any pages
2562 * that weren't previously dirty.
2563 */
2564 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2565 }
2566 }
2567
2568 /* Find the next dirty page for the next iteration */
2569 run_start = find_next_bit(bitmap, pages, run_start);
2570 }
2571 }
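/*
 * Minimal sketch (hypothetical helper over a byte-per-page dirty array)
 * of the canonicalization rule implemented above: if any target page
 * inside a host page is dirty, mark every target page of that host page
 * dirty, so a host page is never left half-dirty for postcopy.
 * host_ratio is the host page size divided by the target page size.
 */
static void sketch_canonicalize_host_pages(unsigned char *dirty,
                                           unsigned long npages,
                                           unsigned long host_ratio)
{
    unsigned long hp, i;

    for (hp = 0; hp < npages; hp += host_ratio) {
        unsigned char any_dirty = 0;

        for (i = hp; i < hp + host_ratio && i < npages; i++) {
            any_dirty |= dirty[i];
        }
        if (any_dirty) {
            for (i = hp; i < hp + host_ratio && i < npages; i++) {
                dirty[i] = 1;
            }
        }
    }
}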
2572
2573 /**
2574 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2575 *
2576 * Transmit the set of pages to be discarded after precopy to the target;
2577 * these are pages that:
2578 * a) Have been previously transmitted but are now dirty again
2579 * b) Have never been transmitted; this ensures that
2580 * any pages on the destination that have been mapped by background
2581 * tasks get discarded (transparent huge pages are the specific concern)
2582 * Hopefully this is pretty sparse
2583 *
2584 * @ms: current migration state
2585 */
2586 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2587 {
2588 RAMState *rs = ram_state;
2589
2590 RCU_READ_LOCK_GUARD();
2591
2592 /* This should be our last sync, the src is now paused */
2593 migration_bitmap_sync(rs);
2594
2595 /* Easiest way to make sure we don't resume in the middle of a host-page */
2596 rs->last_seen_block = NULL;
2597 rs->last_sent_block = NULL;
2598 rs->last_page = 0;
2599
2600 postcopy_each_ram_send_discard(ms);
2601
2602 trace_ram_postcopy_send_discard_bitmap();
2603 }
2604
2605 /**
2606 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2607 *
2608 * Returns zero on success
2609 *
2610 * @rbname: name of the RAMBlock of the request. NULL means the
2611 * same as the last one.
2612 * @start: RAMBlock starting page
2613 * @length: RAMBlock size
2614 */
2615 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2616 {
2617 trace_ram_discard_range(rbname, start, length);
2618
2619 RCU_READ_LOCK_GUARD();
2620 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2621
2622 if (!rb) {
2623 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2624 return -1;
2625 }
2626
2627 /*
2628 * On source VM, we don't need to update the received bitmap since
2629 * we don't even have one.
2630 */
2631 if (rb->receivedmap) {
2632 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2633 length >> qemu_target_page_bits());
2634 }
2635
2636 return ram_block_discard_range(rb, start, length);
2637 }
2638
2639 /*
2640 * For every allocation, we will try not to crash the VM if the
2641 * allocation fails.
2642 */
2643 static int xbzrle_init(void)
2644 {
2645 Error *local_err = NULL;
2646
2647 if (!migrate_use_xbzrle()) {
2648 return 0;
2649 }
2650
2651 XBZRLE_cache_lock();
2652
2653 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2654 if (!XBZRLE.zero_target_page) {
2655 error_report("%s: Error allocating zero page", __func__);
2656 goto err_out;
2657 }
2658
2659 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2660 TARGET_PAGE_SIZE, &local_err);
2661 if (!XBZRLE.cache) {
2662 error_report_err(local_err);
2663 goto free_zero_page;
2664 }
2665
2666 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2667 if (!XBZRLE.encoded_buf) {
2668 error_report("%s: Error allocating encoded_buf", __func__);
2669 goto free_cache;
2670 }
2671
2672 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2673 if (!XBZRLE.current_buf) {
2674 error_report("%s: Error allocating current_buf", __func__);
2675 goto free_encoded_buf;
2676 }
2677
2678 /* We are all good */
2679 XBZRLE_cache_unlock();
2680 return 0;
2681
2682 free_encoded_buf:
2683 g_free(XBZRLE.encoded_buf);
2684 XBZRLE.encoded_buf = NULL;
2685 free_cache:
2686 cache_fini(XBZRLE.cache);
2687 XBZRLE.cache = NULL;
2688 free_zero_page:
2689 g_free(XBZRLE.zero_target_page);
2690 XBZRLE.zero_target_page = NULL;
2691 err_out:
2692 XBZRLE_cache_unlock();
2693 return -ENOMEM;
2694 }
2695
2696 static int ram_state_init(RAMState **rsp)
2697 {
2698 *rsp = g_try_new0(RAMState, 1);
2699
2700 if (!*rsp) {
2701 error_report("%s: Init ramstate fail", __func__);
2702 return -1;
2703 }
2704
2705 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2706 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2707 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2708
2709 /*
2710 * Count the total number of pages used by ram blocks not including any
2711 * gaps due to alignment or unplugs.
2712 * This must match the initial value of the dirty bitmap.
2713 */
2714 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2715 ram_state_reset(*rsp);
2716
2717 return 0;
2718 }
2719
2720 static void ram_list_init_bitmaps(void)
2721 {
2722 MigrationState *ms = migrate_get_current();
2723 RAMBlock *block;
2724 unsigned long pages;
2725 uint8_t shift;
2726
2727 /* Skip setting bitmap if there is no RAM */
2728 if (ram_bytes_total()) {
2729 shift = ms->clear_bitmap_shift;
2730 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2731 error_report("clear_bitmap_shift (%u) too big, using "
2732 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2733 shift = CLEAR_BITMAP_SHIFT_MAX;
2734 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2735 error_report("clear_bitmap_shift (%u) too small, using "
2736 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2737 shift = CLEAR_BITMAP_SHIFT_MIN;
2738 }
2739
2740 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2741 pages = block->max_length >> TARGET_PAGE_BITS;
2742 /*
2743 * The initial dirty bitmap for migration must be set with all
2744 * ones to make sure we'll migrate every guest RAM page to the
2745 * destination.
2746 * Here we set RAMBlock.bmap all to 1 because when we restart a
2747 * new migration after a failed one, ram_list.
2748 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2749 * guest memory.
2750 */
2751 block->bmap = bitmap_new(pages);
2752 bitmap_set(block->bmap, 0, pages);
2753 block->clear_bmap_shift = shift;
2754 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2755 }
2756 }
2757 }
2758
2759 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2760 {
2761 unsigned long pages;
2762 RAMBlock *rb;
2763
2764 RCU_READ_LOCK_GUARD();
2765
2766 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2767 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2768 rs->migration_dirty_pages -= pages;
2769 }
2770 }
2771
2772 static void ram_init_bitmaps(RAMState *rs)
2773 {
2774 /* For memory_global_dirty_log_start below. */
2775 qemu_mutex_lock_iothread();
2776 qemu_mutex_lock_ramlist();
2777
2778 WITH_RCU_READ_LOCK_GUARD() {
2779 ram_list_init_bitmaps();
2780 /* We don't use dirty log with background snapshots */
2781 if (!migrate_background_snapshot()) {
2782 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2783 migration_bitmap_sync_precopy(rs);
2784 }
2785 }
2786 qemu_mutex_unlock_ramlist();
2787 qemu_mutex_unlock_iothread();
2788
2789 /*
2790 * After an eventual first bitmap sync, fixup the initial bitmap
2791 * containing all 1s to exclude any discarded pages from migration.
2792 */
2793 migration_bitmap_clear_discarded_pages(rs);
2794 }
2795
2796 static int ram_init_all(RAMState **rsp)
2797 {
2798 if (ram_state_init(rsp)) {
2799 return -1;
2800 }
2801
2802 if (xbzrle_init()) {
2803 ram_state_cleanup(rsp);
2804 return -1;
2805 }
2806
2807 ram_init_bitmaps(*rsp);
2808
2809 return 0;
2810 }
2811
2812 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2813 {
2814 RAMBlock *block;
2815 uint64_t pages = 0;
2816
2817 /*
2818 * Postcopy is not using xbzrle/compression, so no need for that.
2819 * Also, since the source is already halted, we don't need to care
2820 * about dirty page logging either.
2821 */
2822
2823 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2824 pages += bitmap_count_one(block->bmap,
2825 block->used_length >> TARGET_PAGE_BITS);
2826 }
2827
2828 /* This may not be aligned with current bitmaps. Recalculate. */
2829 rs->migration_dirty_pages = pages;
2830
2831 ram_state_reset(rs);
2832
2833 /* Update RAMState cache of output QEMUFile */
2834 rs->f = out;
2835
2836 trace_ram_state_resume_prepare(pages);
2837 }
2838
2839 /*
2840 * This function clears bits of the free pages reported by the caller from the
2841 * migration dirty bitmap. @addr is the host address corresponding to the
2842 * start of the continuous guest free pages, and @len is the total bytes of
2843 * those pages.
2844 */
2845 void qemu_guest_free_page_hint(void *addr, size_t len)
2846 {
2847 RAMBlock *block;
2848 ram_addr_t offset;
2849 size_t used_len, start, npages;
2850 MigrationState *s = migrate_get_current();
2851
2852 /* This function is currently expected to be used during live migration */
2853 if (!migration_is_setup_or_active(s->state)) {
2854 return;
2855 }
2856
2857 for (; len > 0; len -= used_len, addr += used_len) {
2858 block = qemu_ram_block_from_host(addr, false, &offset);
2859 if (unlikely(!block || offset >= block->used_length)) {
2860 /*
2861 * The implementation might not support RAMBlock resize during
2862 * live migration, but it could happen in theory with future
2863 * updates. So we add a check here to capture that case.
2864 */
2865 error_report_once("%s unexpected error", __func__);
2866 return;
2867 }
2868
2869 if (len <= block->used_length - offset) {
2870 used_len = len;
2871 } else {
2872 used_len = block->used_length - offset;
2873 }
2874
2875 start = offset >> TARGET_PAGE_BITS;
2876 npages = used_len >> TARGET_PAGE_BITS;
2877
2878 qemu_mutex_lock(&ram_state->bitmap_mutex);
2879 /*
2880 * The skipped free pages are equivalent to having been sent from clear_bmap's
2881 * perspective, so clear the bits from the memory region bitmap which
2882 * are initially set. Otherwise those skipped pages will be sent in
2883 * the next round after syncing from the memory region bitmap.
2884 */
2885 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2886 ram_state->migration_dirty_pages -=
2887 bitmap_count_one_with_offset(block->bmap, start, npages);
2888 bitmap_clear(block->bmap, start, npages);
2889 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2890 }
2891 }
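/*
 * Sketch of the page-range conversion performed above (hypothetical
 * helper over a byte-per-page dirty array, assuming 4 KiB target
 * pages): turn a free-page hint given as (offset, len) in bytes into a
 * range of target pages, clear those pages from the dirty map and
 * return how many dirty pages were dropped.
 */
#include <stddef.h>

static size_t sketch_drop_free_pages(unsigned char *dirty, size_t block_pages,
                                     size_t offset, size_t len)
{
    size_t start = offset >> 12;      /* first target page of the hint */
    size_t npages = len >> 12;        /* whole target pages in the hint */
    size_t dropped = 0;
    size_t i;

    for (i = start; i < start + npages && i < block_pages; i++) {
        dropped += dirty[i];          /* count pages that were still dirty */
        dirty[i] = 0;                 /* and treat them as already sent */
    }
    return dropped;
}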
2892
2893 /*
2894 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2895 * a long-running RCU critical section. When RCU reclaims in the code
2896 * start to become numerous it will be necessary to reduce the
2897 * granularity of these critical sections.
2898 */
2899
2900 /**
2901 * ram_save_setup: Setup RAM for migration
2902 *
2903 * Returns zero to indicate success and negative for error
2904 *
2905 * @f: QEMUFile where to send the data
2906 * @opaque: RAMState pointer
2907 */
2908 static int ram_save_setup(QEMUFile *f, void *opaque)
2909 {
2910 RAMState **rsp = opaque;
2911 RAMBlock *block;
2912 int ret;
2913
2914 if (compress_threads_save_setup()) {
2915 return -1;
2916 }
2917
2918 /* migration has already setup the bitmap, reuse it. */
2919 if (!migration_in_colo_state()) {
2920 if (ram_init_all(rsp) != 0) {
2921 compress_threads_save_cleanup();
2922 return -1;
2923 }
2924 }
2925 (*rsp)->f = f;
2926
2927 WITH_RCU_READ_LOCK_GUARD() {
2928 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2929
2930 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2931 qemu_put_byte(f, strlen(block->idstr));
2932 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2933 qemu_put_be64(f, block->used_length);
2934 if (migrate_postcopy_ram() && block->page_size !=
2935 qemu_host_page_size) {
2936 qemu_put_be64(f, block->page_size);
2937 }
2938 if (migrate_ignore_shared()) {
2939 qemu_put_be64(f, block->mr->addr);
2940 }
2941 }
2942 }
2943
2944 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2945 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2946
2947 ret = multifd_send_sync_main(f);
2948 if (ret < 0) {
2949 return ret;
2950 }
2951
2952 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2953 qemu_fflush(f);
2954
2955 return 0;
2956 }
2957
2958 /**
2959 * ram_save_iterate: iterative stage for migration
2960 *
2961 * Returns zero to indicate success and negative for error
2962 *
2963 * @f: QEMUFile where to send the data
2964 * @opaque: RAMState pointer
2965 */
2966 static int ram_save_iterate(QEMUFile *f, void *opaque)
2967 {
2968 RAMState **temp = opaque;
2969 RAMState *rs = *temp;
2970 int ret = 0;
2971 int i;
2972 int64_t t0;
2973 int done = 0;
2974
2975 if (blk_mig_bulk_active()) {
2976 /* Avoid transferring ram during bulk phase of block migration as
2977 * the bulk phase will usually take a long time and transferring
2978 * ram updates during that time is pointless. */
2979 goto out;
2980 }
2981
2982 /*
2983 * We'll hold this lock a little bit long, but it's okay for two reasons.
2984 * Firstly, the only other thread that may take it is the one that calls
2985 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2986 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2987 * guarantees that we'll at least release it on a regular basis.
2988 */
2989 qemu_mutex_lock(&rs->bitmap_mutex);
2990 WITH_RCU_READ_LOCK_GUARD() {
2991 if (ram_list.version != rs->last_version) {
2992 ram_state_reset(rs);
2993 }
2994
2995 /* Read version before ram_list.blocks */
2996 smp_rmb();
2997
2998 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2999
3000 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3001 i = 0;
3002 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3003 postcopy_has_request(rs)) {
3004 int pages;
3005
3006 if (qemu_file_get_error(f)) {
3007 break;
3008 }
3009
3010 pages = ram_find_and_save_block(rs);
3011 /* no more pages to send */
3012 if (pages == 0) {
3013 done = 1;
3014 break;
3015 }
3016
3017 if (pages < 0) {
3018 qemu_file_set_error(f, pages);
3019 break;
3020 }
3021
3022 rs->target_page_count += pages;
3023
3024 /*
3025 * During postcopy, it is necessary to make sure one whole host
3026 * page is sent in one chunk.
3027 */
3028 if (migrate_postcopy_ram()) {
3029 flush_compressed_data(rs);
3030 }
3031
3032 /*
3033 * we want to check in the 1st loop, just in case it was the 1st
3034 * time and we had to sync the dirty bitmap.
3035 * qemu_clock_get_ns() is a bit expensive, so we only check every
3036 * few iterations
3037 */
3038 if ((i & 63) == 0) {
3039 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3040 1000000;
3041 if (t1 > MAX_WAIT) {
3042 trace_ram_save_iterate_big_wait(t1, i);
3043 break;
3044 }
3045 }
3046 i++;
3047 }
3048 }
3049 qemu_mutex_unlock(&rs->bitmap_mutex);
3050
3051 /*
3052 * Must occur before EOS (or any QEMUFile operation)
3053 * because of RDMA protocol.
3054 */
3055 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3056
3057 out:
3058 if (ret >= 0
3059 && migration_is_setup_or_active(migrate_get_current()->state)) {
3060 ret = multifd_send_sync_main(rs->f);
3061 if (ret < 0) {
3062 return ret;
3063 }
3064
3065 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3066 qemu_fflush(f);
3067 ram_transferred_add(8);
3068
3069 ret = qemu_file_get_error(f);
3070 }
3071 if (ret < 0) {
3072 return ret;
3073 }
3074
3075 return done;
3076 }
3077
3078 /**
3079 * ram_save_complete: function called to send the remaining amount of ram
3080 *
3081 * Returns zero to indicate success or negative on error
3082 *
3083 * Called with iothread lock
3084 *
3085 * @f: QEMUFile where to send the data
3086 * @opaque: RAMState pointer
3087 */
3088 static int ram_save_complete(QEMUFile *f, void *opaque)
3089 {
3090 RAMState **temp = opaque;
3091 RAMState *rs = *temp;
3092 int ret = 0;
3093
3094 rs->last_stage = !migration_in_colo_state();
3095
3096 WITH_RCU_READ_LOCK_GUARD() {
3097 if (!migration_in_postcopy()) {
3098 migration_bitmap_sync_precopy(rs);
3099 }
3100
3101 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3102
3103 /* try transferring iterative blocks of memory */
3104
3105 /* flush all remaining blocks regardless of rate limiting */
3106 while (true) {
3107 int pages;
3108
3109 pages = ram_find_and_save_block(rs);
3110 /* no more blocks to send */
3111 if (pages == 0) {
3112 break;
3113 }
3114 if (pages < 0) {
3115 ret = pages;
3116 break;
3117 }
3118 }
3119
3120 flush_compressed_data(rs);
3121 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3122 }
3123
3124 if (ret < 0) {
3125 return ret;
3126 }
3127
3128 ret = multifd_send_sync_main(rs->f);
3129 if (ret < 0) {
3130 return ret;
3131 }
3132
3133 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3134 qemu_fflush(f);
3135
3136 return 0;
3137 }
3138
3139 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3140 uint64_t *res_precopy_only,
3141 uint64_t *res_compatible,
3142 uint64_t *res_postcopy_only)
3143 {
3144 RAMState **temp = opaque;
3145 RAMState *rs = *temp;
3146 uint64_t remaining_size;
3147
3148 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3149
3150 if (!migration_in_postcopy() &&
3151 remaining_size < max_size) {
3152 qemu_mutex_lock_iothread();
3153 WITH_RCU_READ_LOCK_GUARD() {
3154 migration_bitmap_sync_precopy(rs);
3155 }
3156 qemu_mutex_unlock_iothread();
3157 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3158 }
3159
3160 if (migrate_postcopy_ram()) {
3161 /* We can do postcopy, and all the data is postcopiable */
3162 *res_compatible += remaining_size;
3163 } else {
3164 *res_precopy_only += remaining_size;
3165 }
3166 }
3167
3168 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3169 {
3170 unsigned int xh_len;
3171 int xh_flags;
3172 uint8_t *loaded_data;
3173
3174 /* extract RLE header */
3175 xh_flags = qemu_get_byte(f);
3176 xh_len = qemu_get_be16(f);
3177
3178 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3179 error_report("Failed to load XBZRLE page - wrong compression!");
3180 return -1;
3181 }
3182
3183 if (xh_len > TARGET_PAGE_SIZE) {
3184 error_report("Failed to load XBZRLE page - len overflow!");
3185 return -1;
3186 }
3187 loaded_data = XBZRLE.decoded_buf;
3188 /* load data and decode */
3189 /* it can change loaded_data to point to an internal buffer */
3190 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3191
3192 /* decode RLE */
3193 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3194 TARGET_PAGE_SIZE) == -1) {
3195 error_report("Failed to load XBZRLE page - decode error!");
3196 return -1;
3197 }
3198
3199 return 0;
3200 }
3201
3202 /**
3203 * ram_block_from_stream: read a RAMBlock id from the migration stream
3204 *
3205 * Must be called from within a rcu critical section.
3206 *
3207 * Returns a pointer from within the RCU-protected ram_list.
3208 *
3209 * @mis: the migration incoming state pointer
3210 * @f: QEMUFile where to read the data from
3211 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3212 */
3213 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3214 QEMUFile *f, int flags)
3215 {
3216 RAMBlock *block = mis->last_recv_block;
3217 char id[256];
3218 uint8_t len;
3219
3220 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3221 if (!block) {
3222 error_report("Ack, bad migration stream!");
3223 return NULL;
3224 }
3225 return block;
3226 }
3227
3228 len = qemu_get_byte(f);
3229 qemu_get_buffer(f, (uint8_t *)id, len);
3230 id[len] = 0;
3231
3232 block = qemu_ram_block_by_name(id);
3233 if (!block) {
3234 error_report("Can't find block %s", id);
3235 return NULL;
3236 }
3237
3238 if (ramblock_is_ignored(block)) {
3239 error_report("block %s should not be migrated !", id);
3240 return NULL;
3241 }
3242
3243 mis->last_recv_block = block;
3244
3245 return block;
3246 }
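/*
 * Sketch (hypothetical, assuming 4 KiB target pages) of how the callers
 * of this function split a stream word into page address and flags: the
 * flag bits, including RAM_SAVE_FLAG_CONTINUE checked above, live in
 * the low bits below the target page size.
 */
#include <stdint.h>

#define SKETCH_PAGE_MASK   (~(uint64_t)0xfff)

static void sketch_split_stream_word(uint64_t word, uint64_t *addr, int *flags)
{
    *flags = (int)(word & ~SKETCH_PAGE_MASK);   /* e.g. RAM_SAVE_FLAG_PAGE */
    *addr = word & SKETCH_PAGE_MASK;            /* page-aligned address */
}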
3247
3248 static inline void *host_from_ram_block_offset(RAMBlock *block,
3249 ram_addr_t offset)
3250 {
3251 if (!offset_in_ramblock(block, offset)) {
3252 return NULL;
3253 }
3254
3255 return block->host + offset;
3256 }
3257
3258 static void *host_page_from_ram_block_offset(RAMBlock *block,
3259 ram_addr_t offset)
3260 {
3261 /* Note: Explicitly no check against offset_in_ramblock(). */
3262 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3263 block->page_size);
3264 }
3265
3266 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3267 ram_addr_t offset)
3268 {
3269 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3270 }
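/*
 * Worked example (hypothetical 64-bit values) of the alignment math in
 * the two helpers above, with a 2 MiB host page size: a host pointer of
 * 0x7f0000201000 aligns down to 0x7f0000200000 and the in-page offset
 * is 0x1000. Both rely on the host page size being a power of two.
 */
#include <assert.h>
#include <stdint.h>

static void sketch_host_page_math(void)
{
    uint64_t page_size = 2u << 20;              /* 2 MiB */
    uint64_t p = 0x7f0000201000ull;

    assert((p & ~(page_size - 1)) == 0x7f0000200000ull);   /* align down */
    assert((p & (page_size - 1)) == 0x1000);               /* in-page offset */
}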
3271
3272 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3273 ram_addr_t offset, bool record_bitmap)
3274 {
3275 if (!offset_in_ramblock(block, offset)) {
3276 return NULL;
3277 }
3278 if (!block->colo_cache) {
3279 error_report("%s: colo_cache is NULL in block :%s",
3280 __func__, block->idstr);
3281 return NULL;
3282 }
3283
3284 /*
3285 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3286 * It helps us decide which pages in the RAM cache should be flushed
3287 * into the VM's RAM later.
3288 */
3289 if (record_bitmap &&
3290 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3291 ram_state->migration_dirty_pages++;
3292 }
3293 return block->colo_cache + offset;
3294 }
3295
3296 /**
3297 * ram_handle_compressed: handle the zero page case
3298 *
3299 * If a page (or a whole RDMA chunk) has been
3300 * determined to be zero, then zap it.
3301 *
3302 * @host: host address for the zero page
3303 * @ch: what the page is filled from. We only support zero
3304 * @size: size of the zero page
3305 */
3306 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3307 {
3308 if (ch != 0 || !buffer_is_zero(host, size)) {
3309 memset(host, ch, size);
3310 }
3311 }
3312
3313 /* return the size after decompression, or negative value on error */
3314 static int
3315 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3316 const uint8_t *source, size_t source_len)
3317 {
3318 int err;
3319
3320 err = inflateReset(stream);
3321 if (err != Z_OK) {
3322 return -1;
3323 }
3324
3325 stream->avail_in = source_len;
3326 stream->next_in = (uint8_t *)source;
3327 stream->avail_out = dest_len;
3328 stream->next_out = dest;
3329
3330 err = inflate(stream, Z_NO_FLUSH);
3331 if (err != Z_STREAM_END) {
3332 return -1;
3333 }
3334
3335 return stream->total_out;
3336 }
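/*
 * Round-trip sketch using zlib's one-shot helpers (hypothetical, with a
 * fixed 4 KiB page size): compress a page the way a sender might and
 * feed it back through decompression, mirroring what the streaming
 * inflate above does with a reusable per-thread z_stream.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <zlib.h>

#define SKETCH_PAGE_SIZE 4096

static void sketch_page_compress_roundtrip(const uint8_t *page)
{
    uint8_t comp[SKETCH_PAGE_SIZE + 1024];  /* comfortably above compressBound */
    uint8_t out[SKETCH_PAGE_SIZE];
    uLongf comp_len = sizeof(comp);
    uLongf out_len = sizeof(out);

    assert(compress2(comp, &comp_len, page, SKETCH_PAGE_SIZE,
                     Z_DEFAULT_COMPRESSION) == Z_OK);
    assert(uncompress(out, &out_len, comp, comp_len) == Z_OK);
    assert(out_len == SKETCH_PAGE_SIZE);
    assert(memcmp(out, page, SKETCH_PAGE_SIZE) == 0);
}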
3337
3338 static void *do_data_decompress(void *opaque)
3339 {
3340 DecompressParam *param = opaque;
3341 unsigned long pagesize;
3342 uint8_t *des;
3343 int len, ret;
3344
3345 qemu_mutex_lock(&param->mutex);
3346 while (!param->quit) {
3347 if (param->des) {
3348 des = param->des;
3349 len = param->len;
3350 param->des = 0;
3351 qemu_mutex_unlock(&param->mutex);
3352
3353 pagesize = TARGET_PAGE_SIZE;
3354
3355 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3356 param->compbuf, len);
3357 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3358 error_report("decompress data failed");
3359 qemu_file_set_error(decomp_file, ret);
3360 }
3361
3362 qemu_mutex_lock(&decomp_done_lock);
3363 param->done = true;
3364 qemu_cond_signal(&decomp_done_cond);
3365 qemu_mutex_unlock(&decomp_done_lock);
3366
3367 qemu_mutex_lock(&param->mutex);
3368 } else {
3369 qemu_cond_wait(&param->cond, &param->mutex);
3370 }
3371 }
3372 qemu_mutex_unlock(&param->mutex);
3373
3374 return NULL;
3375 }
3376
3377 static int wait_for_decompress_done(void)
3378 {
3379 int idx, thread_count;
3380
3381 if (!migrate_use_compression()) {
3382 return 0;
3383 }
3384
3385 thread_count = migrate_decompress_threads();
3386 qemu_mutex_lock(&decomp_done_lock);
3387 for (idx = 0; idx < thread_count; idx++) {
3388 while (!decomp_param[idx].done) {
3389 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3390 }
3391 }
3392 qemu_mutex_unlock(&decomp_done_lock);
3393 return qemu_file_get_error(decomp_file);
3394 }
3395
3396 static void compress_threads_load_cleanup(void)
3397 {
3398 int i, thread_count;
3399
3400 if (!migrate_use_compression()) {
3401 return;
3402 }
3403 thread_count = migrate_decompress_threads();
3404 for (i = 0; i < thread_count; i++) {
3405 /*
3406 * we use it as an indicator of whether the thread is
3407 * properly init'd or not
3408 */
3409 if (!decomp_param[i].compbuf) {
3410 break;
3411 }
3412
3413 qemu_mutex_lock(&decomp_param[i].mutex);
3414 decomp_param[i].quit = true;
3415 qemu_cond_signal(&decomp_param[i].cond);
3416 qemu_mutex_unlock(&decomp_param[i].mutex);
3417 }
3418 for (i = 0; i < thread_count; i++) {
3419 if (!decomp_param[i].compbuf) {
3420 break;
3421 }
3422
3423 qemu_thread_join(decompress_threads + i);
3424 qemu_mutex_destroy(&decomp_param[i].mutex);
3425 qemu_cond_destroy(&decomp_param[i].cond);
3426 inflateEnd(&decomp_param[i].stream);
3427 g_free(decomp_param[i].compbuf);
3428 decomp_param[i].compbuf = NULL;
3429 }
3430 g_free(decompress_threads);
3431 g_free(decomp_param);
3432 decompress_threads = NULL;
3433 decomp_param = NULL;
3434 decomp_file = NULL;
3435 }
3436
3437 static int compress_threads_load_setup(QEMUFile *f)
3438 {
3439 int i, thread_count;
3440
3441 if (!migrate_use_compression()) {
3442 return 0;
3443 }
3444
3445 thread_count = migrate_decompress_threads();
3446 decompress_threads = g_new0(QemuThread, thread_count);
3447 decomp_param = g_new0(DecompressParam, thread_count);
3448 qemu_mutex_init(&decomp_done_lock);
3449 qemu_cond_init(&decomp_done_cond);
3450 decomp_file = f;
3451 for (i = 0; i < thread_count; i++) {
3452 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3453 goto exit;
3454 }
3455
3456 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3457 qemu_mutex_init(&decomp_param[i].mutex);
3458 qemu_cond_init(&decomp_param[i].cond);
3459 decomp_param[i].done = true;
3460 decomp_param[i].quit = false;
3461 qemu_thread_create(decompress_threads + i, "decompress",
3462 do_data_decompress, decomp_param + i,
3463 QEMU_THREAD_JOINABLE);
3464 }
3465 return 0;
3466 exit:
3467 compress_threads_load_cleanup();
3468 return -1;
3469 }
3470
3471 static void decompress_data_with_multi_threads(QEMUFile *f,
3472 void *host, int len)
3473 {
3474 int idx, thread_count;
3475
3476 thread_count = migrate_decompress_threads();
3477 QEMU_LOCK_GUARD(&decomp_done_lock);
3478 while (true) {
3479 for (idx = 0; idx < thread_count; idx++) {
3480 if (decomp_param[idx].done) {
3481 decomp_param[idx].done = false;
3482 qemu_mutex_lock(&decomp_param[idx].mutex);
3483 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3484 decomp_param[idx].des = host;
3485 decomp_param[idx].len = len;
3486 qemu_cond_signal(&decomp_param[idx].cond);
3487 qemu_mutex_unlock(&decomp_param[idx].mutex);
3488 break;
3489 }
3490 }
3491 if (idx < thread_count) {
3492 break;
3493 } else {
3494 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3495 }
3496 }
3497 }
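/*
 * Compact pthread sketch (hypothetical types, not QEMU's qemu_mutex/
 * qemu_cond wrappers) of the hand-off protocol used above: the feeder
 * waits on a shared "done" condition, claims an idle worker under the
 * shared lock, then publishes the work item under the worker's own lock
 * and signals that worker's condition variable.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct SketchWorker {
    pthread_mutex_t lock;     /* protects 'work' */
    pthread_cond_t cond;      /* worker sleeps here waiting for work */
    bool done;                /* worker idle; protected by sketch_done_lock */
    void *work;
} SketchWorker;

static pthread_mutex_t sketch_done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sketch_done_cond = PTHREAD_COND_INITIALIZER;

static void sketch_dispatch(SketchWorker *w, int nworkers, void *item)
{
    int i;

    pthread_mutex_lock(&sketch_done_lock);
    for (;;) {
        for (i = 0; i < nworkers; i++) {
            if (w[i].done) {
                w[i].done = false;
                pthread_mutex_lock(&w[i].lock);
                w[i].work = item;
                pthread_cond_signal(&w[i].cond);
                pthread_mutex_unlock(&w[i].lock);
                pthread_mutex_unlock(&sketch_done_lock);
                return;
            }
        }
        /* every worker busy: sleep until one of them signals done */
        pthread_cond_wait(&sketch_done_cond, &sketch_done_lock);
    }
}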
3498
3499 static void colo_init_ram_state(void)
3500 {
3501 ram_state_init(&ram_state);
3502 }
3503
3504 /*
3505 * COLO cache: this is for the secondary VM; we cache the whole
3506 * memory of the secondary VM. The global lock needs to be held
3507 * to call this helper.
3508 */
3509 int colo_init_ram_cache(void)
3510 {
3511 RAMBlock *block;
3512
3513 WITH_RCU_READ_LOCK_GUARD() {
3514 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3515 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3516 NULL, false, false);
3517 if (!block->colo_cache) {
3518 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3519 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3520 block->used_length);
3521 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3522 if (block->colo_cache) {
3523 qemu_anon_ram_free(block->colo_cache, block->used_length);
3524 block->colo_cache = NULL;
3525 }
3526 }
3527 return -errno;
3528 }
3529 if (!machine_dump_guest_core(current_machine)) {
3530 qemu_madvise(block->colo_cache, block->used_length,
3531 QEMU_MADV_DONTDUMP);
3532 }
3533 }
3534 }
3535
3536 /*
3537 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3538 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3539 * we use the same name 'ram_bitmap' as for migration.
3540 */
3541 if (ram_bytes_total()) {
3542 RAMBlock *block;
3543
3544 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3545 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3546 block->bmap = bitmap_new(pages);
3547 }
3548 }
3549
3550 colo_init_ram_state();
3551 return 0;
3552 }
3553
3554 /* TODO: duplicated with ram_init_bitmaps */
3555 void colo_incoming_start_dirty_log(void)
3556 {
3557 RAMBlock *block = NULL;
3558 /* For memory_global_dirty_log_start below. */
3559 qemu_mutex_lock_iothread();
3560 qemu_mutex_lock_ramlist();
3561
3562 memory_global_dirty_log_sync();
3563 WITH_RCU_READ_LOCK_GUARD() {
3564 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3565 ramblock_sync_dirty_bitmap(ram_state, block);
3566 /* Discard this dirty bitmap record */
3567 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3568 }
3569 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3570 }
3571 ram_state->migration_dirty_pages = 0;
3572 qemu_mutex_unlock_ramlist();
3573 qemu_mutex_unlock_iothread();
3574 }
3575
3576 /* The global lock needs to be held to call this helper */
3577 void colo_release_ram_cache(void)
3578 {
3579 RAMBlock *block;
3580
3581 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3582 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3583 g_free(block->bmap);
3584 block->bmap = NULL;
3585 }
3586
3587 WITH_RCU_READ_LOCK_GUARD() {
3588 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3589 if (block->colo_cache) {
3590 qemu_anon_ram_free(block->colo_cache, block->used_length);
3591 block->colo_cache = NULL;
3592 }
3593 }
3594 }
3595 ram_state_cleanup(&ram_state);
3596 }
3597
3598 /**
3599 * ram_load_setup: Setup RAM for migration incoming side
3600 *
3601 * Returns zero to indicate success and negative for error
3602 *
3603 * @f: QEMUFile where to receive the data
3604 * @opaque: RAMState pointer
3605 */
3606 static int ram_load_setup(QEMUFile *f, void *opaque)
3607 {
3608 if (compress_threads_load_setup(f)) {
3609 return -1;
3610 }
3611
3612 xbzrle_load_setup();
3613 ramblock_recv_map_init();
3614
3615 return 0;
3616 }
3617
3618 static int ram_load_cleanup(void *opaque)
3619 {
3620 RAMBlock *rb;
3621
3622 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3623 qemu_ram_block_writeback(rb);
3624 }
3625
3626 xbzrle_load_cleanup();
3627 compress_threads_load_cleanup();
3628
3629 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3630 g_free(rb->receivedmap);
3631 rb->receivedmap = NULL;
3632 }
3633
3634 return 0;
3635 }
3636
3637 /**
3638 * ram_postcopy_incoming_init: allocate postcopy data structures
3639 *
3640 * Returns 0 for success and negative if there was one error
3641 *
3642 * @mis: current migration incoming state
3643 *
3644 * Allocate data structures etc needed by incoming migration with
3645 * postcopy-ram. postcopy-ram's similarly named
3646 * postcopy_ram_incoming_init does the work.
3647 */
3648 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3649 {
3650 return postcopy_ram_incoming_init(mis);
3651 }
3652
3653 /**
3654 * ram_load_postcopy: load a page in postcopy case
3655 *
3656 * Returns 0 for success or -errno in case of error
3657 *
3658 * Called in postcopy mode by ram_load().
3659 * rcu_read_lock is taken prior to this being called.
3660 *
3661 * @f: QEMUFile to receive the data from
3662 */
3663 int ram_load_postcopy(QEMUFile *f)
3664 {
3665 int flags = 0, ret = 0;
3666 bool place_needed = false;
3667 bool matches_target_page_size = false;
3668 MigrationIncomingState *mis = migration_incoming_get_current();
3669 /* Currently we only use channel 0. TODO: use all the channels */
3670 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0];
3671
3672 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3673 ram_addr_t addr;
3674 void *page_buffer = NULL;
3675 void *place_source = NULL;
3676 RAMBlock *block = NULL;
3677 uint8_t ch;
3678 int len;
3679
3680 addr = qemu_get_be64(f);
3681
3682 /*
3683 * If there is a qemu file error, we should stop here; "addr"
3684 * may be invalid
3685 */
3686 ret = qemu_file_get_error(f);
3687 if (ret) {
3688 break;
3689 }
3690
3691 flags = addr & ~TARGET_PAGE_MASK;
3692 addr &= TARGET_PAGE_MASK;
3693
3694 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3695 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3696 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3697 block = ram_block_from_stream(mis, f, flags);
3698 if (!block) {
3699 ret = -EINVAL;
3700 break;
3701 }
3702
3703 /*
3704 * Relying on used_length is racy and can result in false positives.
3705 * We might place pages beyond used_length in case RAM was shrunk
3706 * while in postcopy, which is fine - trying to place via
3707 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3708 */
3709 if (!block->host || addr >= block->postcopy_length) {
3710 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3711 ret = -EINVAL;
3712 break;
3713 }
3714 tmp_page->target_pages++;
3715 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3716 /*
3717 * Postcopy requires that we place whole host pages atomically;
3718 * these may be huge pages for RAMBlocks that are backed by
3719 * hugetlbfs.
3720 * To make it atomic, the data is read into a temporary page
3721 * that's moved into place later.
3722 * The migration protocol uses, possibly smaller, target-pages;
3723 * however, the source ensures it always sends all the components
3724 * of a host page in one chunk.
3725 */
3726 page_buffer = tmp_page->tmp_huge_page +
3727 host_page_offset_from_ram_block_offset(block, addr);
3728 /* If all TP are zero then we can optimise the place */
3729 if (tmp_page->target_pages == 1) {
3730 tmp_page->host_addr =
3731 host_page_from_ram_block_offset(block, addr);
3732 } else if (tmp_page->host_addr !=
3733 host_page_from_ram_block_offset(block, addr)) {
3734 /* not the 1st TP within the HP */
3735 error_report("Non-same host page detected. "
3736 "Target host page %p, received host page %p "
3737 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3738 tmp_page->host_addr,
3739 host_page_from_ram_block_offset(block, addr),
3740 block->idstr, addr, tmp_page->target_pages);
3741 ret = -EINVAL;
3742 break;
3743 }
3744
3745 /*
3746 * If it's the last part of a host page then we place the host
3747 * page
3748 */
3749 if (tmp_page->target_pages ==
3750 (block->page_size / TARGET_PAGE_SIZE)) {
3751 place_needed = true;
3752 }
3753 place_source = tmp_page->tmp_huge_page;
3754 }
3755
3756 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3757 case RAM_SAVE_FLAG_ZERO:
3758 ch = qemu_get_byte(f);
3759 /*
3760 * We can skip setting page_buffer when
3761 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3762 */
3763 if (ch || !matches_target_page_size) {
3764 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3765 }
3766 if (ch) {
3767 tmp_page->all_zero = false;
3768 }
3769 break;
3770
3771 case RAM_SAVE_FLAG_PAGE:
3772 tmp_page->all_zero = false;
3773 if (!matches_target_page_size) {
3774 /* For huge pages, we always use temporary buffer */
3775 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3776 } else {
3777 /*
3778 * For small pages that match the target page size, we
3779 * avoid the qemu_file copy. Instead we directly use
3780 * the buffer of QEMUFile to place the page. Note: we
3781 * cannot do any QEMUFile operation before using that
3782 * buffer to make sure the buffer is valid when
3783 * placing the page.
3784 */
3785 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3786 TARGET_PAGE_SIZE);
3787 }
3788 break;
3789 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3790 tmp_page->all_zero = false;
3791 len = qemu_get_be32(f);
3792 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3793 error_report("Invalid compressed data length: %d", len);
3794 ret = -EINVAL;
3795 break;
3796 }
3797 decompress_data_with_multi_threads(f, page_buffer, len);
3798 break;
3799
3800 case RAM_SAVE_FLAG_EOS:
3801 /* normal exit */
3802 multifd_recv_sync_main();
3803 break;
3804 default:
3805 error_report("Unknown combination of migration flags: 0x%x"
3806 " (postcopy mode)", flags);
3807 ret = -EINVAL;
3808 break;
3809 }
3810
3811 /* Got the whole host page, wait for decompress before placing. */
3812 if (place_needed) {
3813 ret |= wait_for_decompress_done();
3814 }
3815
3816 /* Check for any possible file errors */
3817 if (!ret && qemu_file_get_error(f)) {
3818 ret = qemu_file_get_error(f);
3819 }
3820
3821 if (!ret && place_needed) {
3822 if (tmp_page->all_zero) {
3823 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3824 } else {
3825 ret = postcopy_place_page(mis, tmp_page->host_addr,
3826 place_source, block);
3827 }
3828 place_needed = false;
3829 postcopy_temp_page_reset(tmp_page);
3830 }
3831 }
3832
3833 return ret;
3834 }
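/*
 * Rough sketch of the page records parsed by ram_load_postcopy() above and
 * ram_load_precopy() below (inferred from the parsing code, not
 * authoritative): each record starts with a be64 whose low bits carry
 * RAM_SAVE_FLAG_* flags and whose page-aligned part is the offset inside
 * the RAMBlock; unless RAM_SAVE_FLAG_CONTINUE is set, ram_block_from_stream()
 * additionally reads the RAMBlock id from the stream. The payload then
 * depends on the flag:
 *   RAM_SAVE_FLAG_ZERO          - one fill byte
 *   RAM_SAVE_FLAG_PAGE          - TARGET_PAGE_SIZE raw bytes
 *   RAM_SAVE_FLAG_COMPRESS_PAGE - be32 length followed by compressed data
 *   RAM_SAVE_FLAG_EOS           - no payload, ends the section
 */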
3835
3836 static bool postcopy_is_advised(void)
3837 {
3838 PostcopyState ps = postcopy_state_get();
3839 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3840 }
3841
3842 static bool postcopy_is_running(void)
3843 {
3844 PostcopyState ps = postcopy_state_get();
3845 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3846 }
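/*
 * Both helpers above rely on the numeric ordering of PostcopyState
 * (roughly NONE < ADVISE < DISCARD < LISTENING < RUNNING < END), so
 * "advised" covers every state from ADVISE up to but not including END,
 * and "running" covers LISTENING onwards.
 */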
3847
3848 /*
3849 * Flush the content of the RAM cache into the SVM's memory.
3850 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3851 */
3852 void colo_flush_ram_cache(void)
3853 {
3854 RAMBlock *block = NULL;
3855 void *dst_host;
3856 void *src_host;
3857 unsigned long offset = 0;
3858
3859 memory_global_dirty_log_sync();
3860 WITH_RCU_READ_LOCK_GUARD() {
3861 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3862 ramblock_sync_dirty_bitmap(ram_state, block);
3863 }
3864 }
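/*
 * Illustrative example for the flush loop below: if pages 8..11 of a
 * block are the next dirty run, colo_bitmap_find_dirty() is expected to
 * return offset 8 with num == 4; the loop then clears those four dirty
 * bits and a single memcpy() of 4 * TARGET_PAGE_SIZE bytes copies them
 * from colo_cache into the SVM's memory.
 */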
3865
3866 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3867 WITH_RCU_READ_LOCK_GUARD() {
3868 block = QLIST_FIRST_RCU(&ram_list.blocks);
3869
3870 while (block) {
3871 unsigned long num = 0;
3872
3873 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3874 if (!offset_in_ramblock(block,
3875 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3876 offset = 0;
3877 num = 0;
3878 block = QLIST_NEXT_RCU(block, next);
3879 } else {
3880 unsigned long i = 0;
3881
3882 for (i = 0; i < num; i++) {
3883 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3884 }
3885 dst_host = block->host
3886 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3887 src_host = block->colo_cache
3888 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3889 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3890 offset += num;
3891 }
3892 }
3893 }
3894 trace_colo_flush_ram_cache_end();
3895 }
3896
3897 /**
3898 * ram_load_precopy: load pages in precopy case
3899 *
3900 * Returns 0 for success or -errno in case of error
3901 *
3902 * Called in precopy mode by ram_load().
3903 * rcu_read_lock is taken prior to this being called.
3904 *
3905 * @f: QEMUFile to read the data from
3906 */
3907 static int ram_load_precopy(QEMUFile *f)
3908 {
3909 MigrationIncomingState *mis = migration_incoming_get_current();
3910 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3911 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3912 bool postcopy_advised = postcopy_is_advised();
3913 if (!migrate_use_compression()) {
3914 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3915 }
3916
3917 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3918 ram_addr_t addr, total_ram_bytes;
3919 void *host = NULL, *host_bak = NULL;
3920 uint8_t ch;
3921
3922 /*
3923 * Yield periodically to let the main loop run, but an iteration of
3924 * the main loop is expensive, so only yield once every 32768 iterations.
3925 */
3926 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3927 aio_co_schedule(qemu_get_current_aio_context(),
3928 qemu_coroutine_self());
3929 qemu_coroutine_yield();
3930 }
3931 i++;
3932
3933 addr = qemu_get_be64(f);
3934 flags = addr & ~TARGET_PAGE_MASK;
3935 addr &= TARGET_PAGE_MASK;
3936
3937 if (flags & invalid_flags) {
3938 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3939 error_report("Received an unexpected compressed page");
3940 }
3941
3942 ret = -EINVAL;
3943 break;
3944 }
3945
3946 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3947 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3948 RAMBlock *block = ram_block_from_stream(mis, f, flags);
3949
3950 host = host_from_ram_block_offset(block, addr);
3951 /*
3952 * After entering the COLO stage, we should not load pages into the
3953 * SVM's memory directly; we put them into colo_cache first.
3954 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3955 * Previously, we copied all this memory in the COLO preparation
3956 * stage, during which the VM had to be stopped, which was
3957 * time-consuming. Here we optimize it with a trick: back up every
3958 * page during the migration process while COLO is enabled. Although
3959 * this slows the migration down a bit, it clearly reduces the downtime
3960 * of backing up all the SVM's memory in the COLO preparation stage.
3961 */
3962 if (migration_incoming_colo_enabled()) {
3963 if (migration_incoming_in_colo_state()) {
3964 /* In COLO stage, put all pages into cache temporarily */
3965 host = colo_cache_from_block_offset(block, addr, true);
3966 } else {
3967 /*
3968 * In the migration stage but before the COLO stage,
3969 * put all pages into both the cache and the SVM's memory.
3970 */
3971 host_bak = colo_cache_from_block_offset(block, addr, false);
3972 }
3973 }
3974 if (!host) {
3975 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3976 ret = -EINVAL;
3977 break;
3978 }
3979 if (!migration_incoming_in_colo_state()) {
3980 ramblock_recv_bitmap_set(block, host);
3981 }
3982
3983 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3984 }
3985
3986 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3987 case RAM_SAVE_FLAG_MEM_SIZE:
3988 /* Synchronize RAM block list */
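/*
 * Sketch of the per-block records parsed below (inferred from this
 * parser, not authoritative): for each RAMBlock the stream carries
 *   1 byte  - length of the block id
 *   N bytes - the id string (not NUL-terminated on the wire)
 *   be64    - used_length of the block
 *   be64    - page size, only if postcopy was advised and the block's
 *             page size differs from the host page size
 *   be64    - the block's GPA, only if ignore-shared is enabled
 * The be64 header of this record (addr) carries the total RAM size.
 */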
3989 total_ram_bytes = addr;
3990 while (!ret && total_ram_bytes) {
3991 RAMBlock *block;
3992 char id[256];
3993 ram_addr_t length;
3994
3995 len = qemu_get_byte(f);
3996 qemu_get_buffer(f, (uint8_t *)id, len);
3997 id[len] = 0;
3998 length = qemu_get_be64(f);
3999
4000 block = qemu_ram_block_by_name(id);
4001 if (block && !qemu_ram_is_migratable(block)) {
4002 error_report("block %s should not be migrated !", id);
4003 ret = -EINVAL;
4004 } else if (block) {
4005 if (length != block->used_length) {
4006 Error *local_err = NULL;
4007
4008 ret = qemu_ram_resize(block, length,
4009 &local_err);
4010 if (local_err) {
4011 error_report_err(local_err);
4012 }
4013 }
4014 /* For postcopy we need to check hugepage sizes match */
4015 if (postcopy_advised && migrate_postcopy_ram() &&
4016 block->page_size != qemu_host_page_size) {
4017 uint64_t remote_page_size = qemu_get_be64(f);
4018 if (remote_page_size != block->page_size) {
4019 error_report("Mismatched RAM page size %s "
4020 "(local) %zd != %" PRId64,
4021 id, block->page_size,
4022 remote_page_size);
4023 ret = -EINVAL;
4024 }
4025 }
4026 if (migrate_ignore_shared()) {
4027 hwaddr addr = qemu_get_be64(f);
4028 if (ramblock_is_ignored(block) &&
4029 block->mr->addr != addr) {
4030 error_report("Mismatched GPAs for block %s "
4031 "%" PRId64 "!= %" PRId64,
4032 id, (uint64_t)addr,
4033 (uint64_t)block->mr->addr);
4034 ret = -EINVAL;
4035 }
4036 }
4037 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4038 block->idstr);
4039 } else {
4040 error_report("Unknown ramblock \"%s\", cannot "
4041 "accept migration", id);
4042 ret = -EINVAL;
4043 }
4044
4045 total_ram_bytes -= length;
4046 }
4047 break;
4048
4049 case RAM_SAVE_FLAG_ZERO:
4050 ch = qemu_get_byte(f);
4051 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4052 break;
4053
4054 case RAM_SAVE_FLAG_PAGE:
4055 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4056 break;
4057
4058 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4059 len = qemu_get_be32(f);
4060 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4061 error_report("Invalid compressed data length: %d", len);
4062 ret = -EINVAL;
4063 break;
4064 }
4065 decompress_data_with_multi_threads(f, host, len);
4066 break;
4067
4068 case RAM_SAVE_FLAG_XBZRLE:
4069 if (load_xbzrle(f, addr, host) < 0) {
4070 error_report("Failed to decompress XBZRLE page at "
4071 RAM_ADDR_FMT, addr);
4072 ret = -EINVAL;
4073 break;
4074 }
4075 break;
4076 case RAM_SAVE_FLAG_EOS:
4077 /* normal exit */
4078 multifd_recv_sync_main();
4079 break;
4080 default:
4081 if (flags & RAM_SAVE_FLAG_HOOK) {
4082 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4083 } else {
4084 error_report("Unknown combination of migration flags: 0x%x",
4085 flags);
4086 ret = -EINVAL;
4087 }
4088 }
4089 if (!ret) {
4090 ret = qemu_file_get_error(f);
4091 }
4092 if (!ret && host_bak) {
4093 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4094 }
4095 }
4096
4097 ret |= wait_for_decompress_done();
4098 return ret;
4099 }
4100
4101 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4102 {
4103 int ret = 0;
4104 static uint64_t seq_iter;
4105 /*
4106 * If the system is running in postcopy mode, page inserts into host
4107 * memory must be atomic.
4108 */
4109 bool postcopy_running = postcopy_is_running();
4110
4111 seq_iter++;
4112
4113 if (version_id != 4) {
4114 return -EINVAL;
4115 }
4116
4117 /*
4118 * This RCU critical section can be very long running.
4119 * When RCU reclaims in the code start to become numerous,
4120 * it will be necessary to reduce the granularity of this
4121 * critical section.
4122 */
4123 WITH_RCU_READ_LOCK_GUARD() {
4124 if (postcopy_running) {
4125 ret = ram_load_postcopy(f);
4126 } else {
4127 ret = ram_load_precopy(f);
4128 }
4129 }
4130 trace_ram_load_complete(ret, seq_iter);
4131
4132 return ret;
4133 }
4134
4135 static bool ram_has_postcopy(void *opaque)
4136 {
4137 RAMBlock *rb;
4138 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4139 if (ramblock_is_pmem(rb)) {
4140 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4141 "is not supported now!", rb->idstr, rb->host);
4142 return false;
4143 }
4144 }
4145
4146 return migrate_postcopy_ram();
4147 }
4148
4149 /* Sync all the dirty bitmap with destination VM. */
4150 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4151 {
4152 RAMBlock *block;
4153 QEMUFile *file = s->to_dst_file;
4154 int ramblock_count = 0;
4155
4156 trace_ram_dirty_bitmap_sync_start();
4157
4158 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4159 qemu_savevm_send_recv_bitmap(file, block->idstr);
4160 trace_ram_dirty_bitmap_request(block->idstr);
4161 ramblock_count++;
4162 }
4163
4164 trace_ram_dirty_bitmap_sync_wait();
4165
4166 /* Wait until all the ramblocks' dirty bitmaps are synced */
4167 while (ramblock_count--) {
4168 qemu_sem_wait(&s->rp_state.rp_sem);
4169 }
4170
4171 trace_ram_dirty_bitmap_sync_complete();
4172
4173 return 0;
4174 }
4175
4176 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4177 {
4178 qemu_sem_post(&s->rp_state.rp_sem);
4179 }
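/*
 * Sketch of the resume handshake (summary of the surrounding code, not
 * authoritative): ram_dirty_bitmap_sync_all() runs in the migration
 * thread and sends one recv-bitmap request per RAMBlock, then waits on
 * rp_state.rp_sem once per block. Each reply arrives on the return path
 * and is handled by ram_dirty_bitmap_reload() in the rp_thread, which
 * posts rp_sem via ram_dirty_bitmap_reload_notify() once the block's
 * bitmap has been reloaded.
 */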
4180
4181 /*
4182 * Read the received bitmap and invert it to form the initial dirty bitmap.
4183 * This is only used when a postcopy migration is paused and wants to
4184 * resume from a middle point.
4185 */
4186 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4187 {
4188 int ret = -EINVAL;
4189 /* from_dst_file is always valid because we're within rp_thread */
4190 QEMUFile *file = s->rp_state.from_dst_file;
4191 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4192 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4193 uint64_t size, end_mark;
4194
4195 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4196
4197 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4198 error_report("%s: incorrect state %s", __func__,
4199 MigrationStatus_str(s->state));
4200 return -EINVAL;
4201 }
4202
4203 /*
4204 * Note: see comments in ramblock_recv_bitmap_send() on why we
4205 * need the endianness conversion, and the padding.
4206 */
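/*
 * Worked example (illustrative, assuming a 1 GiB RAMBlock and a 4 KiB
 * target page size): nbits = 1 GiB >> 12 = 262144, so local_size is
 * DIV_ROUND_UP(262144, 8) = 32768 bytes, already a multiple of 8. The
 * le_bitmap allocation below adds BITS_PER_LONG bits of padding so that
 * reading local_size bytes can never overrun the buffer.
 */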
4207 local_size = ROUND_UP(local_size, 8);
4208
4209 /* Add paddings */
4210 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4211
4212 size = qemu_get_be64(file);
4213
4214 /* The size of the bitmap should match that of our ramblock */
4215 if (size != local_size) {
4216 error_report("%s: ramblock '%s' bitmap size mismatch "
4217 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4218 block->idstr, size, local_size);
4219 ret = -EINVAL;
4220 goto out;
4221 }
4222
4223 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4224 end_mark = qemu_get_be64(file);
4225
4226 ret = qemu_file_get_error(file);
4227 if (ret || size != local_size) {
4228 error_report("%s: read bitmap failed for ramblock '%s': %d"
4229 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4230 __func__, block->idstr, ret, local_size, size);
4231 ret = -EIO;
4232 goto out;
4233 }
4234
4235 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4236 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4237 __func__, block->idstr, end_mark);
4238 ret = -EINVAL;
4239 goto out;
4240 }
4241
4242 /*
4243 * Endianness conversion. We are in postcopy (though paused).
4244 * The dirty bitmap won't change. We can directly modify it.
4245 */
4246 bitmap_from_le(block->bmap, le_bitmap, nbits);
4247
4248 /*
4249 * What we received is the "received bitmap". Invert it to form
4250 * the initial dirty bitmap for this ramblock.
4251 */
4252 bitmap_complement(block->bmap, block->bmap, nbits);
4253
4254 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4255 ramblock_dirty_bitmap_clear_discarded_pages(block);
4256
4257 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4258 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4259
4260 /*
4261 * We succeeded in syncing the bitmap for the current ramblock. If this
4262 * is the last one to sync, we need to notify the main send thread.
4263 */
4264 ram_dirty_bitmap_reload_notify(s);
4265
4266 ret = 0;
4267 out:
4268 g_free(le_bitmap);
4269 return ret;
4270 }
4271
4272 static int ram_resume_prepare(MigrationState *s, void *opaque)
4273 {
4274 RAMState *rs = *(RAMState **)opaque;
4275 int ret;
4276
4277 ret = ram_dirty_bitmap_sync_all(s, rs);
4278 if (ret) {
4279 return ret;
4280 }
4281
4282 ram_state_resume_prepare(rs, s->to_dst_file);
4283
4284 return 0;
4285 }
4286
4287 static SaveVMHandlers savevm_ram_handlers = {
4288 .save_setup = ram_save_setup,
4289 .save_live_iterate = ram_save_iterate,
4290 .save_live_complete_postcopy = ram_save_complete,
4291 .save_live_complete_precopy = ram_save_complete,
4292 .has_postcopy = ram_has_postcopy,
4293 .save_live_pending = ram_save_pending,
4294 .load_state = ram_load,
4295 .save_cleanup = ram_save_cleanup,
4296 .load_setup = ram_load_setup,
4297 .load_cleanup = ram_load_cleanup,
4298 .resume_prepare = ram_resume_prepare,
4299 };
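/*
 * Rough mapping of the handlers above to migration phases (not
 * authoritative): save_setup runs once at the start of migration,
 * save_live_iterate during the iterative RAM phase, the two
 * save_live_complete_* hooks at the end of precopy or postcopy
 * respectively, load_state on the destination via ram_load(), and
 * resume_prepare when recovering a paused postcopy migration.
 */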
4300
4301 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4302 size_t old_size, size_t new_size)
4303 {
4304 PostcopyState ps = postcopy_state_get();
4305 ram_addr_t offset;
4306 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4307 Error *err = NULL;
4308
4309 if (ramblock_is_ignored(rb)) {
4310 return;
4311 }
4312
4313 if (!migration_is_idle()) {
4314 /*
4315 * Precopy code on the source cannot deal with the size of RAM blocks
4316 * changing at random points in time - especially after sending the
4317 * RAM block sizes in the migration stream, they must no longer change.
4318 * Abort and indicate a proper reason.
4319 */
4320 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4321 migration_cancel(err);
4322 error_free(err);
4323 }
4324
4325 switch (ps) {
4326 case POSTCOPY_INCOMING_ADVISE:
4327 /*
4328 * Update what ram_postcopy_incoming_init()->init_range() does at the
4329 * time postcopy was advised. Syncing RAM blocks with the source will
4330 * result in RAM resizes.
4331 */
4332 if (old_size < new_size) {
4333 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4334 error_report("RAM block '%s' discard of resized RAM failed",
4335 rb->idstr);
4336 }
4337 }
4338 rb->postcopy_length = new_size;
4339 break;
4340 case POSTCOPY_INCOMING_NONE:
4341 case POSTCOPY_INCOMING_RUNNING:
4342 case POSTCOPY_INCOMING_END:
4343 /*
4344 * Once our guest is running, postcopy no longer cares about
4345 * resizes. When growing, the new memory was not available on the
4346 * source, so no handler is needed.
4347 */
4348 break;
4349 default:
4350 error_report("RAM block '%s' resized during postcopy state: %d",
4351 rb->idstr, ps);
4352 exit(-1);
4353 }
4354 }
4355
4356 static RAMBlockNotifier ram_mig_ram_notifier = {
4357 .ram_block_resized = ram_mig_ram_block_resized,
4358 };
4359
4360 void ram_mig_init(void)
4361 {
4362 qemu_mutex_init(&XBZRLE.lock);
4363 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4364 ram_block_notifier_add(&ram_mig_ram_notifier);
4365 }