migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "xbzrle.h"
39 #include "migration/migration.h"
40 #include "migration/qemu-file.h"
41 #include "migration/vmstate.h"
42 #include "postcopy-ram.h"
43 #include "exec/address-spaces.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "trace.h"
47 #include "exec/ram_addr.h"
48 #include "qemu/rcu_queue.h"
49 #include "migration/colo.h"
50
51 /***********************************************************/
52 /* ram save/restore */
53
54 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS: it
55 * worked for pages that were filled with the same char. We switched
56 * it to only search for the zero value, and renamed it to avoid
57 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
58 */
59
60 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
61 #define RAM_SAVE_FLAG_ZERO 0x02
62 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
63 #define RAM_SAVE_FLAG_PAGE 0x08
64 #define RAM_SAVE_FLAG_EOS 0x10
65 #define RAM_SAVE_FLAG_CONTINUE 0x20
66 #define RAM_SAVE_FLAG_XBZRLE 0x40
67 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
68 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
69
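/*
 * A note on the flags above (illustration, hypothetical values):
 * save_page_header() ORs them into the low bits of the page offset on the
 * wire, which works because offsets are always target-page aligned.  A
 * normal page at offset 0x3000 of a block that is being continued would
 * therefore be announced as
 *
 *     qemu_put_be64(f, 0x3000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 *
 * i.e. the value 0x3028.
 */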
70 static uint8_t *ZERO_TARGET_PAGE;
71
72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
73 {
74 return buffer_is_zero(p, size);
75 }
76
77 /* This struct contains the XBZRLE cache and a static page
78    used by the compression */
79 static struct {
80 /* buffer used for XBZRLE encoding */
81 uint8_t *encoded_buf;
82 /* buffer for storing page content */
83 uint8_t *current_buf;
84 /* Cache for XBZRLE, Protected by lock. */
85 PageCache *cache;
86 QemuMutex lock;
87 } XBZRLE;
88
89 /* buffer used for XBZRLE decoding */
90 static uint8_t *xbzrle_decoded_buf;
91
92 static void XBZRLE_cache_lock(void)
93 {
94 if (migrate_use_xbzrle())
95 qemu_mutex_lock(&XBZRLE.lock);
96 }
97
98 static void XBZRLE_cache_unlock(void)
99 {
100 if (migrate_use_xbzrle())
101 qemu_mutex_unlock(&XBZRLE.lock);
102 }
103
104 /**
105 * xbzrle_cache_resize: resize the xbzrle cache
106 *
107 * This function is called from qmp_migrate_set_cache_size in the main
108 * thread, possibly while a migration is in progress. A running
109 * migration may be using the cache and might finish during this call,
110 * hence changes to the cache are protected by the XBZRLE.lock mutex.
111 *
112 * Returns the new cache size (rounded down to a power of two) or negative on error.
113 *
114 * @new_size: new cache size
115 */
116 int64_t xbzrle_cache_resize(int64_t new_size)
117 {
118 PageCache *new_cache;
119 int64_t ret;
120
121 if (new_size < TARGET_PAGE_SIZE) {
122 return -1;
123 }
124
125 XBZRLE_cache_lock();
126
127 if (XBZRLE.cache != NULL) {
128 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
129 goto out_new_size;
130 }
131 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
132 TARGET_PAGE_SIZE);
133 if (!new_cache) {
134 error_report("Error creating cache");
135 ret = -1;
136 goto out;
137 }
138
139 cache_fini(XBZRLE.cache);
140 XBZRLE.cache = new_cache;
141 }
142
143 out_new_size:
144 ret = pow2floor(new_size);
145 out:
146 XBZRLE_cache_unlock();
147 return ret;
148 }
149
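/*
 * Usage sketch for xbzrle_cache_resize() (hypothetical numbers): a
 * cache-size request of 70 MiB arriving via qmp_migrate_set_cache_size()
 * rebuilds the cache with cache_init(70 MiB / TARGET_PAGE_SIZE,
 * TARGET_PAGE_SIZE) and reports pow2floor(70 MiB) = 64 MiB back to the
 * caller.
 */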
150 /*
151 * An outstanding page request, on the source, having been received
152 * and queued
153 */
154 struct RAMSrcPageRequest {
155 RAMBlock *rb;
156 hwaddr offset;
157 hwaddr len;
158
159 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
160 };
161
162 /* State of RAM for migration */
163 struct RAMState {
164 /* QEMUFile used for this migration */
165 QEMUFile *f;
166 /* Last block that we have visited searching for dirty pages */
167 RAMBlock *last_seen_block;
168 /* Last block from where we have sent data */
169 RAMBlock *last_sent_block;
170 /* Last dirty target page we have sent */
171 ram_addr_t last_page;
172 /* last ram version we have seen */
173 uint32_t last_version;
174 /* We are in the first round */
175 bool ram_bulk_stage;
176 /* How many times we have dirty too many pages */
177 int dirty_rate_high_cnt;
178 /* How many times we have synchronized the bitmap */
179 uint64_t bitmap_sync_count;
180 /* these variables are used for bitmap sync */
181 /* last time we did a full bitmap_sync */
182 int64_t time_last_bitmap_sync;
183 /* bytes transferred at start_time */
184 uint64_t bytes_xfer_prev;
185 /* number of dirty pages since start_time */
186 uint64_t num_dirty_pages_period;
187 /* xbzrle misses since the beginning of the period */
188 uint64_t xbzrle_cache_miss_prev;
189 /* number of iterations at the beginning of period */
190 uint64_t iterations_prev;
191 /* Accounting fields */
192 /* number of zero pages. It used to be pages filled by the same char. */
193 uint64_t zero_pages;
194 /* number of normal transferred pages */
195 uint64_t norm_pages;
196 /* Iterations since start */
197 uint64_t iterations;
198 /* xbzrle transmitted bytes. Notice that this is with
199 * compression, they can't be calculated from the pages */
200 uint64_t xbzrle_bytes;
201 /* xbzrle transmmited pages */
202 uint64_t xbzrle_pages;
203 /* xbzrle number of cache miss */
204 uint64_t xbzrle_cache_miss;
205 /* xbzrle miss rate */
206 double xbzrle_cache_miss_rate;
207 /* xbzrle number of overflows */
208 uint64_t xbzrle_overflows;
209 /* number of dirty bits in the bitmap */
210 uint64_t migration_dirty_pages;
211 /* total number of bytes transferred */
212 uint64_t bytes_transferred;
213 /* number of dirtied pages in the last second */
214 uint64_t dirty_pages_rate;
215 /* Count of requests incoming from destination */
216 uint64_t postcopy_requests;
217 /* protects modification of the bitmap */
218 QemuMutex bitmap_mutex;
219 /* The RAMBlock used in the last src_page_requests */
220 RAMBlock *last_req_rb;
221 /* Queue of outstanding page requests from the destination */
222 QemuMutex src_page_req_mutex;
223 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
224 };
225 typedef struct RAMState RAMState;
226
227 static RAMState ram_state;
228
229 uint64_t dup_mig_pages_transferred(void)
230 {
231 return ram_state.zero_pages;
232 }
233
234 uint64_t norm_mig_pages_transferred(void)
235 {
236 return ram_state.norm_pages;
237 }
238
239 uint64_t xbzrle_mig_bytes_transferred(void)
240 {
241 return ram_state.xbzrle_bytes;
242 }
243
244 uint64_t xbzrle_mig_pages_transferred(void)
245 {
246 return ram_state.xbzrle_pages;
247 }
248
249 uint64_t xbzrle_mig_pages_cache_miss(void)
250 {
251 return ram_state.xbzrle_cache_miss;
252 }
253
254 double xbzrle_mig_cache_miss_rate(void)
255 {
256 return ram_state.xbzrle_cache_miss_rate;
257 }
258
259 uint64_t xbzrle_mig_pages_overflow(void)
260 {
261 return ram_state.xbzrle_overflows;
262 }
263
264 uint64_t ram_bytes_transferred(void)
265 {
266 return ram_state.bytes_transferred;
267 }
268
269 uint64_t ram_bytes_remaining(void)
270 {
271 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
272 }
273
274 uint64_t ram_dirty_sync_count(void)
275 {
276 return ram_state.bitmap_sync_count;
277 }
278
279 uint64_t ram_dirty_pages_rate(void)
280 {
281 return ram_state.dirty_pages_rate;
282 }
283
284 uint64_t ram_postcopy_requests(void)
285 {
286 return ram_state.postcopy_requests;
287 }
288
289 /* used by the search for pages to send */
290 struct PageSearchStatus {
291 /* Current block being searched */
292 RAMBlock *block;
293 /* Current page to search from */
294 unsigned long page;
295 /* Set once we wrap around */
296 bool complete_round;
297 };
298 typedef struct PageSearchStatus PageSearchStatus;
299
300 struct CompressParam {
301 bool done;
302 bool quit;
303 QEMUFile *file;
304 QemuMutex mutex;
305 QemuCond cond;
306 RAMBlock *block;
307 ram_addr_t offset;
308 };
309 typedef struct CompressParam CompressParam;
310
311 struct DecompressParam {
312 bool done;
313 bool quit;
314 QemuMutex mutex;
315 QemuCond cond;
316 void *des;
317 uint8_t *compbuf;
318 int len;
319 };
320 typedef struct DecompressParam DecompressParam;
321
322 static CompressParam *comp_param;
323 static QemuThread *compress_threads;
324 /* comp_done_cond is used to wake up the migration thread when
325 * one of the compression threads has finished the compression.
326 * comp_done_lock is used together with comp_done_cond.
327 */
328 static QemuMutex comp_done_lock;
329 static QemuCond comp_done_cond;
330 /* The empty QEMUFileOps will be used by file in CompressParam */
331 static const QEMUFileOps empty_ops = { };
332
333 static DecompressParam *decomp_param;
334 static QemuThread *decompress_threads;
335 static QemuMutex decomp_done_lock;
336 static QemuCond decomp_done_cond;
337
338 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
339 ram_addr_t offset);
340
341 static void *do_data_compress(void *opaque)
342 {
343 CompressParam *param = opaque;
344 RAMBlock *block;
345 ram_addr_t offset;
346
347 qemu_mutex_lock(&param->mutex);
348 while (!param->quit) {
349 if (param->block) {
350 block = param->block;
351 offset = param->offset;
352 param->block = NULL;
353 qemu_mutex_unlock(&param->mutex);
354
355 do_compress_ram_page(param->file, block, offset);
356
357 qemu_mutex_lock(&comp_done_lock);
358 param->done = true;
359 qemu_cond_signal(&comp_done_cond);
360 qemu_mutex_unlock(&comp_done_lock);
361
362 qemu_mutex_lock(&param->mutex);
363 } else {
364 qemu_cond_wait(&param->cond, &param->mutex);
365 }
366 }
367 qemu_mutex_unlock(&param->mutex);
368
369 return NULL;
370 }
371
372 static inline void terminate_compression_threads(void)
373 {
374 int idx, thread_count;
375
376 thread_count = migrate_compress_threads();
377
378 for (idx = 0; idx < thread_count; idx++) {
379 qemu_mutex_lock(&comp_param[idx].mutex);
380 comp_param[idx].quit = true;
381 qemu_cond_signal(&comp_param[idx].cond);
382 qemu_mutex_unlock(&comp_param[idx].mutex);
383 }
384 }
385
386 void migrate_compress_threads_join(void)
387 {
388 int i, thread_count;
389
390 if (!migrate_use_compression()) {
391 return;
392 }
393 terminate_compression_threads();
394 thread_count = migrate_compress_threads();
395 for (i = 0; i < thread_count; i++) {
396 qemu_thread_join(compress_threads + i);
397 qemu_fclose(comp_param[i].file);
398 qemu_mutex_destroy(&comp_param[i].mutex);
399 qemu_cond_destroy(&comp_param[i].cond);
400 }
401 qemu_mutex_destroy(&comp_done_lock);
402 qemu_cond_destroy(&comp_done_cond);
403 g_free(compress_threads);
404 g_free(comp_param);
405 compress_threads = NULL;
406 comp_param = NULL;
407 }
408
409 void migrate_compress_threads_create(void)
410 {
411 int i, thread_count;
412
413 if (!migrate_use_compression()) {
414 return;
415 }
416 thread_count = migrate_compress_threads();
417 compress_threads = g_new0(QemuThread, thread_count);
418 comp_param = g_new0(CompressParam, thread_count);
419 qemu_cond_init(&comp_done_cond);
420 qemu_mutex_init(&comp_done_lock);
421 for (i = 0; i < thread_count; i++) {
422 /* comp_param[i].file is just used as a dummy buffer to save data,
423 * set its ops to empty.
424 */
425 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
426 comp_param[i].done = true;
427 comp_param[i].quit = false;
428 qemu_mutex_init(&comp_param[i].mutex);
429 qemu_cond_init(&comp_param[i].cond);
430 qemu_thread_create(compress_threads + i, "compress",
431 do_data_compress, comp_param + i,
432 QEMU_THREAD_JOINABLE);
433 }
434 }
435
436 /**
437 * save_page_header: write page header to wire
438 *
439 * If this is the first page sent from a new block, it also writes the block identification string
440 *
441 * Returns the number of bytes written
442 *
443 * @f: QEMUFile where to send the data
444 * @block: block that contains the page we want to send
445 * @offset: offset inside the block for the page
446 * in the lower bits, it contains flags
447 */
448 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
449 ram_addr_t offset)
450 {
451 size_t size, len;
452
453 if (block == rs->last_sent_block) {
454 offset |= RAM_SAVE_FLAG_CONTINUE;
455 }
456 qemu_put_be64(f, offset);
457 size = 8;
458
459 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
460 len = strlen(block->idstr);
461 qemu_put_byte(f, len);
462 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
463 size += 1 + len;
464 rs->last_sent_block = block;
465 }
466 return size;
467 }
468
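/*
 * Wire layout produced by save_page_header() (illustration, assuming a
 * hypothetical block named "pc.ram"):
 *
 *     be64   offset | flags            always present
 *     u8     strlen("pc.ram") = 6      only without RAM_SAVE_FLAG_CONTINUE
 *     bytes  "pc.ram"                  only without RAM_SAVE_FLAG_CONTINUE
 *
 * so the size returned is either 8 or 8 + 1 + strlen(idstr).
 */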
469 /**
470 * mig_throttle_guest_down: throttle down the guest
471 *
472 * Reduce amount of guest cpu execution to hopefully slow down memory
473 * writes. If guest dirty memory rate is reduced below the rate at
474 * which we can transfer pages to the destination then we should be
475 * able to complete migration. Some workloads dirty memory way too
476 * fast and will not effectively converge, even with auto-converge.
477 */
478 static void mig_throttle_guest_down(void)
479 {
480 MigrationState *s = migrate_get_current();
481 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
482 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
483
484 /* We have not started throttling yet. Let's start it. */
485 if (!cpu_throttle_active()) {
486 cpu_throttle_set(pct_initial);
487 } else {
488 /* Throttling already on, just increase the rate */
489 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
490 }
491 }
492
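/*
 * Illustration for mig_throttle_guest_down(), assuming the default
 * parameters of 20% initial throttle and 10% increment: successive calls
 * throttle the guest at 20%, 30%, 40%, ... of its CPU time until the
 * migration manages to converge.
 */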
493 /**
494 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
495 *
496 * @rs: current RAM state
497 * @current_addr: address for the zero page
498 *
499 * Update the xbzrle cache to reflect a page that's been sent as all 0.
500 * The important thing is that a stale (not-yet-0'd) page be replaced
501 * by the new data.
502 * As a bonus, if the page wasn't in the cache it gets added so that
503 * when a small write is made into the 0'd page it gets XBZRLE sent.
504 */
505 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
506 {
507 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
508 return;
509 }
510
511 /* We don't care if this fails to allocate a new cache page
512 * as long as it updated an old one */
513 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
514 rs->bitmap_sync_count);
515 }
516
517 #define ENCODING_FLAG_XBZRLE 0x1
518
519 /**
520 * save_xbzrle_page: compress and send current page
521 *
522 * Returns: 1 means that we wrote the page
523 * 0 means that the page is identical to the one already sent
524 * -1 means that the page was not sent by XBZRLE (cache miss,
525 * insertion failure, or the encoding would be longer than a raw page)
525 *
526 * @rs: current RAM state
527 * @current_data: pointer to the address of the page contents
528 * @current_addr: addr of the page
529 * @block: block that contains the page we want to send
530 * @offset: offset inside the block for the page
531 * @last_stage: if we are at the completion stage
532 */
533 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
534 ram_addr_t current_addr, RAMBlock *block,
535 ram_addr_t offset, bool last_stage)
536 {
537 int encoded_len = 0, bytes_xbzrle;
538 uint8_t *prev_cached_page;
539
540 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
541 rs->xbzrle_cache_miss++;
542 if (!last_stage) {
543 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
544 rs->bitmap_sync_count) == -1) {
545 return -1;
546 } else {
547 /* update *current_data when the page has been
548 inserted into cache */
549 *current_data = get_cached_data(XBZRLE.cache, current_addr);
550 }
551 }
552 return -1;
553 }
554
555 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
556
557 /* save current buffer into memory */
558 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
559
560 /* XBZRLE encoding (if there is no overflow) */
561 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
562 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
563 TARGET_PAGE_SIZE);
564 if (encoded_len == 0) {
565 trace_save_xbzrle_page_skipping();
566 return 0;
567 } else if (encoded_len == -1) {
568 trace_save_xbzrle_page_overflow();
569 rs->xbzrle_overflows++;
570 /* update data in the cache */
571 if (!last_stage) {
572 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
573 *current_data = prev_cached_page;
574 }
575 return -1;
576 }
577
578 /* Update the cache so it stays in sync with what the destination now holds */
579 if (!last_stage) {
580 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
581 }
582
583 /* Send XBZRLE based compressed page */
584 bytes_xbzrle = save_page_header(rs, rs->f, block,
585 offset | RAM_SAVE_FLAG_XBZRLE);
586 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
587 qemu_put_be16(rs->f, encoded_len);
588 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
589 bytes_xbzrle += encoded_len + 1 + 2;
590 rs->xbzrle_pages++;
591 rs->xbzrle_bytes += bytes_xbzrle;
592 rs->bytes_transferred += bytes_xbzrle;
593
594 return 1;
595 }
596
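/*
 * On a cache hit that produces a usable delta, save_xbzrle_page() puts
 * the following on the wire (illustration):
 *
 *     save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *     u8     ENCODING_FLAG_XBZRLE
 *     be16   encoded_len
 *     bytes  encoded_buf[encoded_len]
 *
 * and accounts for header + 1 + 2 + encoded_len bytes in bytes_xbzrle.
 */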
597 /**
598 * migration_bitmap_find_dirty: find the next dirty page from start
599 *
600 * Called with rcu_read_lock() to protect migration_bitmap
601 *
602 * Returns the page index within the RAMBlock of the next dirty page
603 *
604 * @rs: current RAM state
605 * @rb: RAMBlock where to search for dirty pages
606 * @start: page where we start the search
607 */
608 static inline
609 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
610 unsigned long start)
611 {
612 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
613 unsigned long *bitmap = rb->bmap;
614 unsigned long next;
615
616 if (rs->ram_bulk_stage && start > 0) {
617 next = start + 1;
618 } else {
619 next = find_next_bit(bitmap, size, start);
620 }
621
622 return next;
623 }
624
625 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
626 RAMBlock *rb,
627 unsigned long page)
628 {
629 bool ret;
630
631 ret = test_and_clear_bit(page, rb->bmap);
632
633 if (ret) {
634 rs->migration_dirty_pages--;
635 }
636 return ret;
637 }
638
639 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
640 ram_addr_t start, ram_addr_t length)
641 {
642 rs->migration_dirty_pages +=
643 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
644 &rs->num_dirty_pages_period);
645 }
646
647 /**
648 * ram_pagesize_summary: calculate all the pagesizes of a VM
649 *
650 * Returns a summary bitmap of the page sizes of all RAMBlocks
651 *
652 * For VMs with just normal pages this is equivalent to the host page
653 * size. If it's got some huge pages then it's the OR of all the
654 * different page sizes.
655 */
656 uint64_t ram_pagesize_summary(void)
657 {
658 RAMBlock *block;
659 uint64_t summary = 0;
660
661 RAMBLOCK_FOREACH(block) {
662 summary |= block->page_size;
663 }
664
665 return summary;
666 }
667
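/*
 * Example (hypothetical layout): a guest with ordinary 4 KiB RAM plus one
 * RAMBlock backed by 2 MiB hugepages makes ram_pagesize_summary() return
 * 0x1000 | 0x200000 = 0x201000, from which a mix of page sizes can be
 * detected.
 */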
668 static void migration_bitmap_sync(RAMState *rs)
669 {
670 RAMBlock *block;
671 int64_t end_time;
672 uint64_t bytes_xfer_now;
673
674 rs->bitmap_sync_count++;
675
676 if (!rs->bytes_xfer_prev) {
677 rs->bytes_xfer_prev = ram_bytes_transferred();
678 }
679
680 if (!rs->time_last_bitmap_sync) {
681 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
682 }
683
684 trace_migration_bitmap_sync_start();
685 memory_global_dirty_log_sync();
686
687 qemu_mutex_lock(&rs->bitmap_mutex);
688 rcu_read_lock();
689 RAMBLOCK_FOREACH(block) {
690 migration_bitmap_sync_range(rs, block, 0, block->used_length);
691 }
692 rcu_read_unlock();
693 qemu_mutex_unlock(&rs->bitmap_mutex);
694
695 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
696
697 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
698
699 /* more than 1 second = 1000 milliseconds */
700 if (end_time > rs->time_last_bitmap_sync + 1000) {
701 if (migrate_auto_converge()) {
702 /* The following detection logic can be refined later. For now:
703 check whether the bytes dirtied in this period exceed 50% of the
704 bytes that got transferred since the last time we were in this
705 routine. If that keeps happening, start or increase throttling. */
707 bytes_xfer_now = ram_bytes_transferred();
708
709 if (rs->dirty_pages_rate &&
710 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
711 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
712 (rs->dirty_rate_high_cnt++ >= 2)) {
713 trace_migration_throttle();
714 rs->dirty_rate_high_cnt = 0;
715 mig_throttle_guest_down();
716 }
717 rs->bytes_xfer_prev = bytes_xfer_now;
718 }
719
720 if (migrate_use_xbzrle()) {
721 if (rs->iterations_prev != rs->iterations) {
722 rs->xbzrle_cache_miss_rate =
723 (double)(rs->xbzrle_cache_miss -
724 rs->xbzrle_cache_miss_prev) /
725 (rs->iterations - rs->iterations_prev);
726 }
727 rs->iterations_prev = rs->iterations;
728 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
729 }
730 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
731 / (end_time - rs->time_last_bitmap_sync);
732 rs->time_last_bitmap_sync = end_time;
733 rs->num_dirty_pages_period = 0;
734 }
735 if (migrate_use_events()) {
736 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
737 }
738 }
739
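/*
 * Auto-converge illustration for the check in migration_bitmap_sync()
 * above (hypothetical numbers): if roughly 300 MB were transferred during
 * the last one-second period but the guest dirtied 200 MB of pages in the
 * same period, then 200 MB > 300 MB / 2, so the high-dirty-rate counter is
 * bumped; once this has happened repeatedly, mig_throttle_guest_down()
 * starts or increases CPU throttling.
 */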
740 /**
741 * save_zero_page: send the zero page to the stream
742 *
743 * Returns the number of pages written (-1 if the page was not a zero page)
744 *
745 * @rs: current RAM state
746 * @block: block that contains the page we want to send
747 * @offset: offset inside the block for the page
748 * @p: pointer to the page
749 */
750 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
751 uint8_t *p)
752 {
753 int pages = -1;
754
755 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
756 rs->zero_pages++;
757 rs->bytes_transferred +=
758 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
759 qemu_put_byte(rs->f, 0);
760 rs->bytes_transferred += 1;
761 pages = 1;
762 }
763
764 return pages;
765 }
766
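/*
 * With save_zero_page(), an all-zero page costs only the page header plus
 * one byte on the wire (RAM_SAVE_FLAG_ZERO followed by a single 0 byte)
 * instead of TARGET_PAGE_SIZE bytes of payload.
 */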
767 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
768 {
769 if (!migrate_release_ram() || !migration_in_postcopy()) {
770 return;
771 }
772
773 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
774 }
775
776 /**
777 * ram_save_page: send the given page to the stream
778 *
779 * Returns the number of pages written.
780 * < 0 - error
781 * >=0 - Number of pages written - this might legally be 0
782 * if xbzrle noticed the page was the same.
783 *
784 * @rs: current RAM state
785 * @block: block that contains the page we want to send
786 * @offset: offset inside the block for the page
787 * @last_stage: if we are at the completion stage
788 */
789 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
790 {
791 int pages = -1;
792 uint64_t bytes_xmit;
793 ram_addr_t current_addr;
794 uint8_t *p;
795 int ret;
796 bool send_async = true;
797 RAMBlock *block = pss->block;
798 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
799
800 p = block->host + offset;
801 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
802
803 /* When in doubt, send the page as a normal page */
804 bytes_xmit = 0;
805 ret = ram_control_save_page(rs->f, block->offset,
806 offset, TARGET_PAGE_SIZE, &bytes_xmit);
807 if (bytes_xmit) {
808 rs->bytes_transferred += bytes_xmit;
809 pages = 1;
810 }
811
812 XBZRLE_cache_lock();
813
814 current_addr = block->offset + offset;
815
816 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
817 if (ret != RAM_SAVE_CONTROL_DELAYED) {
818 if (bytes_xmit > 0) {
819 rs->norm_pages++;
820 } else if (bytes_xmit == 0) {
821 rs->zero_pages++;
822 }
823 }
824 } else {
825 pages = save_zero_page(rs, block, offset, p);
826 if (pages > 0) {
827 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
828 * page would be stale
829 */
830 xbzrle_cache_zero_page(rs, current_addr);
831 ram_release_pages(block->idstr, offset, pages);
832 } else if (!rs->ram_bulk_stage &&
833 !migration_in_postcopy() && migrate_use_xbzrle()) {
834 pages = save_xbzrle_page(rs, &p, current_addr, block,
835 offset, last_stage);
836 if (!last_stage) {
837 /* Can't send this cached data async, since the cache page
838 * might get updated before it gets to the wire
839 */
840 send_async = false;
841 }
842 }
843 }
844
845 /* XBZRLE overflow or normal page */
846 if (pages == -1) {
847 rs->bytes_transferred += save_page_header(rs, rs->f, block,
848 offset | RAM_SAVE_FLAG_PAGE);
849 if (send_async) {
850 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
851 migrate_release_ram() &
852 migration_in_postcopy());
853 } else {
854 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
855 }
856 rs->bytes_transferred += TARGET_PAGE_SIZE;
857 pages = 1;
858 rs->norm_pages++;
859 }
860
861 XBZRLE_cache_unlock();
862
863 return pages;
864 }
865
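/*
 * Order of attempts in ram_save_page(), in short:
 *   1. ram_control_save_page()  - a hook (e.g. RDMA) may take the page
 *   2. save_zero_page()         - header + one byte for all-zero pages
 *   3. save_xbzrle_page()       - delta against the cached copy (only
 *                                 outside the bulk stage and postcopy)
 *   4. full copy                - RAM_SAVE_FLAG_PAGE + TARGET_PAGE_SIZE bytes
 */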
866 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
867 ram_addr_t offset)
868 {
869 RAMState *rs = &ram_state;
870 int bytes_sent, blen;
871 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
872
873 bytes_sent = save_page_header(rs, f, block, offset |
874 RAM_SAVE_FLAG_COMPRESS_PAGE);
875 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
876 migrate_compress_level());
877 if (blen < 0) {
878 bytes_sent = 0;
879 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
880 error_report("compressed data failed!");
881 } else {
882 bytes_sent += blen;
883 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
884 }
885
886 return bytes_sent;
887 }
888
889 static void flush_compressed_data(RAMState *rs)
890 {
891 int idx, len, thread_count;
892
893 if (!migrate_use_compression()) {
894 return;
895 }
896 thread_count = migrate_compress_threads();
897
898 qemu_mutex_lock(&comp_done_lock);
899 for (idx = 0; idx < thread_count; idx++) {
900 while (!comp_param[idx].done) {
901 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
902 }
903 }
904 qemu_mutex_unlock(&comp_done_lock);
905
906 for (idx = 0; idx < thread_count; idx++) {
907 qemu_mutex_lock(&comp_param[idx].mutex);
908 if (!comp_param[idx].quit) {
909 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
910 rs->bytes_transferred += len;
911 }
912 qemu_mutex_unlock(&comp_param[idx].mutex);
913 }
914 }
915
916 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
917 ram_addr_t offset)
918 {
919 param->block = block;
920 param->offset = offset;
921 }
922
923 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
924 ram_addr_t offset)
925 {
926 int idx, thread_count, bytes_xmit = -1, pages = -1;
927
928 thread_count = migrate_compress_threads();
929 qemu_mutex_lock(&comp_done_lock);
930 while (true) {
931 for (idx = 0; idx < thread_count; idx++) {
932 if (comp_param[idx].done) {
933 comp_param[idx].done = false;
934 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
935 qemu_mutex_lock(&comp_param[idx].mutex);
936 set_compress_params(&comp_param[idx], block, offset);
937 qemu_cond_signal(&comp_param[idx].cond);
938 qemu_mutex_unlock(&comp_param[idx].mutex);
939 pages = 1;
940 rs->norm_pages++;
941 rs->bytes_transferred += bytes_xmit;
942 break;
943 }
944 }
945 if (pages > 0) {
946 break;
947 } else {
948 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
949 }
950 }
951 qemu_mutex_unlock(&comp_done_lock);
952
953 return pages;
954 }
955
956 /**
957 * ram_save_compressed_page: compress the given page and send it to the stream
958 *
959 * Returns the number of pages written.
960 *
961 * @rs: current RAM state
962 * @block: block that contains the page we want to send
963 * @offset: offset inside the block for the page
964 * @last_stage: if we are at the completion stage
965 */
966 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
967 bool last_stage)
968 {
969 int pages = -1;
970 uint64_t bytes_xmit = 0;
971 uint8_t *p;
972 int ret, blen;
973 RAMBlock *block = pss->block;
974 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
975
976 p = block->host + offset;
977
978 ret = ram_control_save_page(rs->f, block->offset,
979 offset, TARGET_PAGE_SIZE, &bytes_xmit);
980 if (bytes_xmit) {
981 rs->bytes_transferred += bytes_xmit;
982 pages = 1;
983 }
984 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
985 if (ret != RAM_SAVE_CONTROL_DELAYED) {
986 if (bytes_xmit > 0) {
987 rs->norm_pages++;
988 } else if (bytes_xmit == 0) {
989 rs->zero_pages++;
990 }
991 }
992 } else {
993 /* When starting to process a new block, the first page of
994 * the block should be sent out before other pages in the same
995 * block, and all the pages in the previous block should have been
996 * sent out. Keeping this order is important, because the 'cont'
997 * flag is used to avoid resending the block name.
998 */
999 if (block != rs->last_sent_block) {
1000 flush_compressed_data(rs);
1001 pages = save_zero_page(rs, block, offset, p);
1002 if (pages == -1) {
1003 /* Make sure the first page is sent out before other pages */
1004 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1005 RAM_SAVE_FLAG_COMPRESS_PAGE);
1006 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1007 migrate_compress_level());
1008 if (blen > 0) {
1009 rs->bytes_transferred += bytes_xmit + blen;
1010 rs->norm_pages++;
1011 pages = 1;
1012 } else {
1013 qemu_file_set_error(rs->f, blen);
1014 error_report("compressed data failed!");
1015 }
1016 }
1017 if (pages > 0) {
1018 ram_release_pages(block->idstr, offset, pages);
1019 }
1020 } else {
1021 pages = save_zero_page(rs, block, offset, p);
1022 if (pages == -1) {
1023 pages = compress_page_with_multi_thread(rs, block, offset);
1024 } else {
1025 ram_release_pages(block->idstr, offset, pages);
1026 }
1027 }
1028 }
1029
1030 return pages;
1031 }
1032
1033 /**
1034 * find_dirty_block: find the next dirty page and update any state
1035 * associated with the search process.
1036 *
1037 * Returns whether a page was found
1038 *
1039 * @rs: current RAM state
1040 * @pss: data about the state of the current dirty page scan
1041 * @again: set to false if the search has scanned the whole of RAM
1042 */
1043 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1044 {
1045 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1046 if (pss->complete_round && pss->block == rs->last_seen_block &&
1047 pss->page >= rs->last_page) {
1048 /*
1049 * We've been once around the RAM and haven't found anything.
1050 * Give up.
1051 */
1052 *again = false;
1053 return false;
1054 }
1055 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1056 /* Didn't find anything in this RAM Block */
1057 pss->page = 0;
1058 pss->block = QLIST_NEXT_RCU(pss->block, next);
1059 if (!pss->block) {
1060 /* Hit the end of the list */
1061 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1062 /* Flag that we've looped */
1063 pss->complete_round = true;
1064 rs->ram_bulk_stage = false;
1065 if (migrate_use_xbzrle()) {
1066 /* If xbzrle is on, stop using the data compression at this
1067 * point. In theory, xbzrle can do better than compression.
1068 */
1069 flush_compressed_data(rs);
1070 }
1071 }
1072 /* Didn't find anything this time, but try again on the new block */
1073 *again = true;
1074 return false;
1075 } else {
1076 /* Can go around again, but... */
1077 *again = true;
1078 /* We've found something so probably don't need to */
1079 return true;
1080 }
1081 }
1082
1083 /**
1084 * unqueue_page: gets a page off the queue
1085 *
1086 * Helper for 'get_queued_page' - gets a page off the queue
1087 *
1088 * Returns the block of the page (or NULL if none available)
1089 *
1090 * @rs: current RAM state
1091 * @offset: used to return the offset within the RAMBlock
1092 */
1093 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1094 {
1095 RAMBlock *block = NULL;
1096
1097 qemu_mutex_lock(&rs->src_page_req_mutex);
1098 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1099 struct RAMSrcPageRequest *entry =
1100 QSIMPLEQ_FIRST(&rs->src_page_requests);
1101 block = entry->rb;
1102 *offset = entry->offset;
1103
1104 if (entry->len > TARGET_PAGE_SIZE) {
1105 entry->len -= TARGET_PAGE_SIZE;
1106 entry->offset += TARGET_PAGE_SIZE;
1107 } else {
1108 memory_region_unref(block->mr);
1109 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1110 g_free(entry);
1111 }
1112 }
1113 qemu_mutex_unlock(&rs->src_page_req_mutex);
1114
1115 return block;
1116 }
1117
1118 /**
1119 * get_queued_page: unqueue a page from the postcopy requests
1120 *
1121 * Skips pages that are already sent (!dirty)
1122 *
1123 * Returns whether a queued page was found
1124 *
1125 * @rs: current RAM state
1126 * @pss: data about the state of the current dirty page scan
1127 */
1128 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1129 {
1130 RAMBlock *block;
1131 ram_addr_t offset;
1132 bool dirty;
1133
1134 do {
1135 block = unqueue_page(rs, &offset);
1136 /*
1137 * We're sending this page, and since it's postcopy nothing else
1138 * will dirty it, and we must make sure it doesn't get sent again
1139 * even if this queue request was received after the background
1140 * search already sent it.
1141 */
1142 if (block) {
1143 unsigned long page;
1144
1145 page = offset >> TARGET_PAGE_BITS;
1146 dirty = test_bit(page, block->bmap);
1147 if (!dirty) {
1148 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1149 page, test_bit(page, block->unsentmap));
1150 } else {
1151 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1152 }
1153 }
1154
1155 } while (block && !dirty);
1156
1157 if (block) {
1158 /*
1159 * As soon as we start servicing pages out of order, then we have
1160 * to kill the bulk stage, since the bulk stage assumes
1161 * in (migration_bitmap_find_and_reset_dirty) that every page is
1162 * dirty, that's no longer true.
1163 */
1164 rs->ram_bulk_stage = false;
1165
1166 /*
1167 * We want the background search to continue from the queued page
1168 * since the guest is likely to want other pages near to the page
1169 * it just requested.
1170 */
1171 pss->block = block;
1172 pss->page = offset >> TARGET_PAGE_BITS;
1173 }
1174
1175 return !!block;
1176 }
1177
1178 /**
1179 * migration_page_queue_free: drop any remaining pages in the ram
1180 * request queue
1181 *
1182 * It should be empty at the end anyway, but in error cases there may
1183 * be some pages left over; if so, we drop them.
1184 *
1185 */
1186 void migration_page_queue_free(void)
1187 {
1188 struct RAMSrcPageRequest *mspr, *next_mspr;
1189 RAMState *rs = &ram_state;
1190 /* This queue generally should be empty - but in the case of a failed
1191 * migration might have some droppings in.
1192 */
1193 rcu_read_lock();
1194 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1195 memory_region_unref(mspr->rb->mr);
1196 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1197 g_free(mspr);
1198 }
1199 rcu_read_unlock();
1200 }
1201
1202 /**
1203 * ram_save_queue_pages: queue the page for transmission
1204 *
1205 * A request from postcopy destination for example.
1206 *
1207 * Returns zero on success or negative on error
1208 *
1209 * @rbname: Name of the RAMBlock of the request. NULL means the
1210 * same as the last one.
1211 * @start: starting address from the start of the RAMBlock
1212 * @len: length (in bytes) to send
1213 */
1214 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1215 {
1216 RAMBlock *ramblock;
1217 RAMState *rs = &ram_state;
1218
1219 rs->postcopy_requests++;
1220 rcu_read_lock();
1221 if (!rbname) {
1222 /* Reuse last RAMBlock */
1223 ramblock = rs->last_req_rb;
1224
1225 if (!ramblock) {
1226 /*
1227 * Shouldn't happen, we can't reuse the last RAMBlock if
1228 * it's the 1st request.
1229 */
1230 error_report("ram_save_queue_pages no previous block");
1231 goto err;
1232 }
1233 } else {
1234 ramblock = qemu_ram_block_by_name(rbname);
1235
1236 if (!ramblock) {
1237 /* We shouldn't be asked for a non-existent RAMBlock */
1238 error_report("ram_save_queue_pages no block '%s'", rbname);
1239 goto err;
1240 }
1241 rs->last_req_rb = ramblock;
1242 }
1243 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1244 if (start+len > ramblock->used_length) {
1245 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1246 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1247 __func__, start, len, ramblock->used_length);
1248 goto err;
1249 }
1250
1251 struct RAMSrcPageRequest *new_entry =
1252 g_malloc0(sizeof(struct RAMSrcPageRequest));
1253 new_entry->rb = ramblock;
1254 new_entry->offset = start;
1255 new_entry->len = len;
1256
1257 memory_region_ref(ramblock->mr);
1258 qemu_mutex_lock(&rs->src_page_req_mutex);
1259 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1260 qemu_mutex_unlock(&rs->src_page_req_mutex);
1261 rcu_read_unlock();
1262
1263 return 0;
1264
1265 err:
1266 rcu_read_unlock();
1267 return -1;
1268 }
1269
1270 /**
1271 * ram_save_target_page: save one target page
1272 *
1273 * Returns the number of pages written
1274 *
1275 * @rs: current RAM state
1277 * @pss: data about the page we want to send
1278 * @last_stage: if we are at the completion stage
1279 */
1280 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1281 bool last_stage)
1282 {
1283 int res = 0;
1284
1285 /* Check whether the page is dirty and, if so, send it */
1286 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1287 /*
1288 * If xbzrle is on, stop using the data compression after the first
1289 * round of migration even if compression is enabled. In theory,
1290 * xbzrle can do better than compression.
1291 */
1292 if (migrate_use_compression() &&
1293 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1294 res = ram_save_compressed_page(rs, pss, last_stage);
1295 } else {
1296 res = ram_save_page(rs, pss, last_stage);
1297 }
1298
1299 if (res < 0) {
1300 return res;
1301 }
1302 if (pss->block->unsentmap) {
1303 clear_bit(pss->page, pss->block->unsentmap);
1304 }
1305 }
1306
1307 return res;
1308 }
1309
1310 /**
1311 * ram_save_host_page: save a whole host page
1312 *
1313 * Starting at pss->page, send pages up to the end of the current host
1314 * page. It's valid for the initial offset to point into the middle of
1315 * a host page in which case the remainder of the hostpage is sent.
1316 * Only dirty target pages are sent. Note that the host page size may
1317 * be a huge page for this block.
1318 * The saving stops at the boundary of the used_length of the block
1319 * if the RAMBlock isn't a multiple of the host page size.
1320 *
1321 * Returns the number of pages written or negative on error
1322 *
1323 * @rs: current RAM state
1325 * @pss: data about the page we want to send
1326 * @last_stage: if we are at the completion stage
1327 */
1328 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1329 bool last_stage)
1330 {
1331 int tmppages, pages = 0;
1332 size_t pagesize_bits =
1333 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1334
1335 do {
1336 tmppages = ram_save_target_page(rs, pss, last_stage);
1337 if (tmppages < 0) {
1338 return tmppages;
1339 }
1340
1341 pages += tmppages;
1342 pss->page++;
1343 } while ((pss->page & (pagesize_bits - 1)) &&
1344 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1345
1346 /* The offset we leave with is the last one we looked at */
1347 pss->page--;
1348 return pages;
1349 }
1350
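/*
 * Illustration for ram_save_host_page(): with a RAMBlock backed by 2 MiB
 * hugepages and 4 KiB target pages, pagesize_bits is 512, so a single call
 * walks up to 512 consecutive target pages. Keeping whole host pages
 * together is what lets postcopy place a huge page atomically on the
 * destination.
 */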
1351 /**
1352 * ram_find_and_save_block: finds a dirty page and sends it to f
1353 *
1354 * Called within an RCU critical section.
1355 *
1356 * Returns the number of pages written where zero means no dirty pages
1357 *
1358 * @rs: current RAM state
1359 * @last_stage: if we are at the completion stage
1360 *
1361 * On systems where host-page-size > target-page-size it will send all the
1362 * pages in a host page that are dirty.
1363 */
1364
1365 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1366 {
1367 PageSearchStatus pss;
1368 int pages = 0;
1369 bool again, found;
1370
1371 /* No dirty page as there is zero RAM */
1372 if (!ram_bytes_total()) {
1373 return pages;
1374 }
1375
1376 pss.block = rs->last_seen_block;
1377 pss.page = rs->last_page;
1378 pss.complete_round = false;
1379
1380 if (!pss.block) {
1381 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1382 }
1383
1384 do {
1385 again = true;
1386 found = get_queued_page(rs, &pss);
1387
1388 if (!found) {
1389 /* priority queue empty, so just search for something dirty */
1390 found = find_dirty_block(rs, &pss, &again);
1391 }
1392
1393 if (found) {
1394 pages = ram_save_host_page(rs, &pss, last_stage);
1395 }
1396 } while (!pages && again);
1397
1398 rs->last_seen_block = pss.block;
1399 rs->last_page = pss.page;
1400
1401 return pages;
1402 }
1403
1404 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1405 {
1406 uint64_t pages = size / TARGET_PAGE_SIZE;
1407 RAMState *rs = &ram_state;
1408
1409 if (zero) {
1410 rs->zero_pages += pages;
1411 } else {
1412 rs->norm_pages += pages;
1413 rs->bytes_transferred += size;
1414 qemu_update_position(f, size);
1415 }
1416 }
1417
1418 uint64_t ram_bytes_total(void)
1419 {
1420 RAMBlock *block;
1421 uint64_t total = 0;
1422
1423 rcu_read_lock();
1424 RAMBLOCK_FOREACH(block) {
1425 total += block->used_length;
1426 }
1427 rcu_read_unlock();
1428 return total;
1429 }
1430
1431 void free_xbzrle_decoded_buf(void)
1432 {
1433 g_free(xbzrle_decoded_buf);
1434 xbzrle_decoded_buf = NULL;
1435 }
1436
1437 static void ram_migration_cleanup(void *opaque)
1438 {
1439 RAMBlock *block;
1440
1441 /* The caller must hold the iothread lock or be in a bottom half, so
1442 * there is no write race against this migration bitmap
1443 */
1444 memory_global_dirty_log_stop();
1445
1446 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1447 g_free(block->bmap);
1448 block->bmap = NULL;
1449 g_free(block->unsentmap);
1450 block->unsentmap = NULL;
1451 }
1452
1453 XBZRLE_cache_lock();
1454 if (XBZRLE.cache) {
1455 cache_fini(XBZRLE.cache);
1456 g_free(XBZRLE.encoded_buf);
1457 g_free(XBZRLE.current_buf);
1458 g_free(ZERO_TARGET_PAGE);
1459 XBZRLE.cache = NULL;
1460 XBZRLE.encoded_buf = NULL;
1461 XBZRLE.current_buf = NULL;
1462 }
1463 XBZRLE_cache_unlock();
1464 }
1465
1466 static void ram_state_reset(RAMState *rs)
1467 {
1468 rs->last_seen_block = NULL;
1469 rs->last_sent_block = NULL;
1470 rs->last_page = 0;
1471 rs->last_version = ram_list.version;
1472 rs->ram_bulk_stage = true;
1473 }
1474
1475 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1476
1477 /*
1478 * 'expected' is the value you expect the bitmap mostly to be full
1479 * of; it won't bother printing lines that are all this value.
1481 */
1482 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1483 unsigned long pages)
1484 {
1485 int64_t cur;
1486 int64_t linelen = 128;
1487 char linebuf[129];
1488
1489 for (cur = 0; cur < pages; cur += linelen) {
1490 int64_t curb;
1491 bool found = false;
1492 /*
1493 * Last line; catch the case where the line length
1494 * is longer than remaining ram
1495 */
1496 if (cur + linelen > pages) {
1497 linelen = pages - cur;
1498 }
1499 for (curb = 0; curb < linelen; curb++) {
1500 bool thisbit = test_bit(cur + curb, todump);
1501 linebuf[curb] = thisbit ? '1' : '.';
1502 found = found || (thisbit != expected);
1503 }
1504 if (found) {
1505 linebuf[curb] = '\0';
1506 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1507 }
1508 }
1509 }
1510
1511 /* **** functions for postcopy ***** */
1512
1513 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1514 {
1515 struct RAMBlock *block;
1516
1517 RAMBLOCK_FOREACH(block) {
1518 unsigned long *bitmap = block->bmap;
1519 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1520 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1521
1522 while (run_start < range) {
1523 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1524 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1525 (run_end - run_start) << TARGET_PAGE_BITS);
1526 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1527 }
1528 }
1529 }
1530
1531 /**
1532 * postcopy_send_discard_bm_ram: discard a RAMBlock
1533 *
1534 * Returns zero on success
1535 *
1536 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1537 * Note: At this point the 'unsentmap' is the processed bitmap combined
1538 * with the dirtymap; so a '1' means it's either dirty or unsent.
1539 *
1540 * @ms: current migration state
1541 * @pds: state for postcopy
1542 * @block: RAMBlock to discard
1544 */
1545 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1546 PostcopyDiscardState *pds,
1547 RAMBlock *block)
1548 {
1549 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1550 unsigned long current;
1551 unsigned long *unsentmap = block->unsentmap;
1552
1553 for (current = 0; current < end; ) {
1554 unsigned long one = find_next_bit(unsentmap, end, current);
1555
1556 if (one <= end) {
1557 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1558 unsigned long discard_length;
1559
1560 if (zero >= end) {
1561 discard_length = end - one;
1562 } else {
1563 discard_length = zero - one;
1564 }
1565 if (discard_length) {
1566 postcopy_discard_send_range(ms, pds, one, discard_length);
1567 }
1568 current = one + discard_length;
1569 } else {
1570 current = one;
1571 }
1572 }
1573
1574 return 0;
1575 }
1576
1577 /**
1578 * postcopy_each_ram_send_discard: discard all RAMBlocks
1579 *
1580 * Returns 0 for success or negative for error
1581 *
1582 * Utility for the outgoing postcopy code.
1583 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1584 * passing it bitmap indexes and name.
1585 * (qemu_ram_foreach_block ends up passing unscaled lengths
1586 * which would mean postcopy code would have to deal with target page)
1587 *
1588 * @ms: current migration state
1589 */
1590 static int postcopy_each_ram_send_discard(MigrationState *ms)
1591 {
1592 struct RAMBlock *block;
1593 int ret;
1594
1595 RAMBLOCK_FOREACH(block) {
1596 PostcopyDiscardState *pds =
1597 postcopy_discard_send_init(ms, block->idstr);
1598
1599 /*
1600 * Postcopy sends chunks of bitmap over the wire, but at this
1601 * point it just needs page indexes; this avoids having
1602 * target-page-specific code here.
1603 */
1604 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1605 postcopy_discard_send_finish(ms, pds);
1606 if (ret) {
1607 return ret;
1608 }
1609 }
1610
1611 return 0;
1612 }
1613
1614 /**
1615 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1616 *
1617 * Helper for postcopy_chunk_hostpages; it's called twice to
1618 * canonicalize the two bitmaps, that are similar, but one is
1619 * inverted.
1620 *
1621 * Postcopy requires that all target pages in a hostpage are dirty or
1622 * clean, not a mix. This function canonicalizes the bitmaps.
1623 *
1624 * @ms: current migration state
1625 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1626 * otherwise we need to canonicalize partially dirty host pages
1627 * @block: block that contains the page we want to canonicalize
1628 * @pds: state for postcopy
1629 */
1630 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1631 RAMBlock *block,
1632 PostcopyDiscardState *pds)
1633 {
1634 RAMState *rs = &ram_state;
1635 unsigned long *bitmap = block->bmap;
1636 unsigned long *unsentmap = block->unsentmap;
1637 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1638 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1639 unsigned long run_start;
1640
1641 if (block->page_size == TARGET_PAGE_SIZE) {
1642 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1643 return;
1644 }
1645
1646 if (unsent_pass) {
1647 /* Find a sent page */
1648 run_start = find_next_zero_bit(unsentmap, pages, 0);
1649 } else {
1650 /* Find a dirty page */
1651 run_start = find_next_bit(bitmap, pages, 0);
1652 }
1653
1654 while (run_start < pages) {
1655 bool do_fixup = false;
1656 unsigned long fixup_start_addr;
1657 unsigned long host_offset;
1658
1659 /*
1660 * If the start of this run of pages is in the middle of a host
1661 * page, then we need to fixup this host page.
1662 */
1663 host_offset = run_start % host_ratio;
1664 if (host_offset) {
1665 do_fixup = true;
1666 run_start -= host_offset;
1667 fixup_start_addr = run_start;
1668 /* For the next pass */
1669 run_start = run_start + host_ratio;
1670 } else {
1671 /* Find the end of this run */
1672 unsigned long run_end;
1673 if (unsent_pass) {
1674 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1675 } else {
1676 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1677 }
1678 /*
1679 * If the end isn't at the start of a host page, then the
1680 * run doesn't finish at the end of a host page
1681 * and we need to discard.
1682 */
1683 host_offset = run_end % host_ratio;
1684 if (host_offset) {
1685 do_fixup = true;
1686 fixup_start_addr = run_end - host_offset;
1687 /*
1688 * This host page has gone, the next loop iteration starts
1689 * from after the fixup
1690 */
1691 run_start = fixup_start_addr + host_ratio;
1692 } else {
1693 /*
1694 * No discards on this iteration, next loop starts from
1695 * next sent/dirty page
1696 */
1697 run_start = run_end + 1;
1698 }
1699 }
1700
1701 if (do_fixup) {
1702 unsigned long page;
1703
1704 /* Tell the destination to discard this page */
1705 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1706 /* For the unsent_pass we:
1707 * discard partially sent pages
1708 * For the !unsent_pass (dirty) we:
1709 * discard partially dirty pages that were sent
1710 * (any partially sent pages were already discarded
1711 * by the previous unsent_pass)
1712 */
1713 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1714 host_ratio);
1715 }
1716
1717 /* Clean up the bitmap */
1718 for (page = fixup_start_addr;
1719 page < fixup_start_addr + host_ratio; page++) {
1720 /* All pages in this host page are now not sent */
1721 set_bit(page, unsentmap);
1722
1723 /*
1724 * Remark them as dirty, updating the count for any pages
1725 * that weren't previously dirty.
1726 */
1727 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1728 }
1729 }
1730
1731 if (unsent_pass) {
1732 /* Find the next sent page for the next iteration */
1733 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1734 } else {
1735 /* Find the next dirty page for the next iteration */
1736 run_start = find_next_bit(bitmap, pages, run_start);
1737 }
1738 }
1739 }
1740
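/*
 * Fixup illustration for postcopy_chunk_hostpages_pass() (hypothetical
 * numbers): with 2 MiB host pages and 4 KiB target pages, host_ratio is
 * 512. A dirty run starting at target page 700 gives
 * host_offset = 700 % 512 = 188, so the run is pulled back to page 512 and
 * the whole host page [512, 1023] is discarded and re-marked as
 * dirty/unsent.
 */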
1741 /**
1742 * postcopy_chunk_hostpages: discard any partially sent host page
1743 *
1744 * Utility for the outgoing postcopy code.
1745 *
1746 * Discard any partially sent host-page size chunks, mark any partially
1747 * dirty host-page size chunks as all dirty. In this case the host-page
1748 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1749 *
1750 * Returns zero on success
1751 *
1752 * @ms: current migration state
1753 * @block: block we want to work with
1754 */
1755 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1756 {
1757 PostcopyDiscardState *pds =
1758 postcopy_discard_send_init(ms, block->idstr);
1759
1760 /* First pass: Discard all partially sent host pages */
1761 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1762 /*
1763 * Second pass: Ensure that all partially dirty host pages are made
1764 * fully dirty.
1765 */
1766 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1767
1768 postcopy_discard_send_finish(ms, pds);
1769 return 0;
1770 }
1771
1772 /**
1773 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1774 *
1775 * Returns zero on success
1776 *
1777 * Transmit the set of pages to be discarded after precopy to the
1778 * target. These are pages that:
1779 * a) have been previously transmitted but are now dirty again
1780 * b) have never been transmitted; this ensures that any pages on the
1781 * destination that have been mapped by background tasks get
1782 * discarded (transparent huge pages are the specific concern)
1783 * Hopefully this is pretty sparse.
1784 *
1785 * @ms: current migration state
1786 */
1787 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1788 {
1789 RAMState *rs = &ram_state;
1790 RAMBlock *block;
1791 int ret;
1792
1793 rcu_read_lock();
1794
1795 /* This should be our last sync, the src is now paused */
1796 migration_bitmap_sync(rs);
1797
1798 /* Easiest way to make sure we don't resume in the middle of a host-page */
1799 rs->last_seen_block = NULL;
1800 rs->last_sent_block = NULL;
1801 rs->last_page = 0;
1802
1803 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1804 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1805 unsigned long *bitmap = block->bmap;
1806 unsigned long *unsentmap = block->unsentmap;
1807
1808 if (!unsentmap) {
1809 /* We don't have a safe way to resize the sentmap, so
1810 * if the bitmap was resized it will be NULL at this
1811 * point.
1812 */
1813 error_report("migration ram resized during precopy phase");
1814 rcu_read_unlock();
1815 return -EINVAL;
1816 }
1817 /* Deal with TPS != HPS and huge pages */
1818 ret = postcopy_chunk_hostpages(ms, block);
1819 if (ret) {
1820 rcu_read_unlock();
1821 return ret;
1822 }
1823
1824 /*
1825 * Update the unsentmap to be unsentmap = unsentmap | dirty
1826 */
1827 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1828 #ifdef DEBUG_POSTCOPY
1829 ram_debug_dump_bitmap(unsentmap, true, pages);
1830 #endif
1831 }
1832 trace_ram_postcopy_send_discard_bitmap();
1833
1834 ret = postcopy_each_ram_send_discard(ms);
1835 rcu_read_unlock();
1836
1837 return ret;
1838 }
1839
1840 /**
1841 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1842 *
1843 * Returns zero on success
1844 *
1845 * @rbname: name of the RAMBlock of the request
1846 * @start: byte offset within the RAMBlock of the first byte to discard
1847 * @length: number of bytes to discard
1849 */
1850 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1851 {
1852 int ret = -1;
1853
1854 trace_ram_discard_range(rbname, start, length);
1855
1856 rcu_read_lock();
1857 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1858
1859 if (!rb) {
1860 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1861 goto err;
1862 }
1863
1864 ret = ram_block_discard_range(rb, start, length);
1865
1866 err:
1867 rcu_read_unlock();
1868
1869 return ret;
1870 }
1871
1872 static int ram_state_init(RAMState *rs)
1873 {
1874 memset(rs, 0, sizeof(*rs));
1875 qemu_mutex_init(&rs->bitmap_mutex);
1876 qemu_mutex_init(&rs->src_page_req_mutex);
1877 QSIMPLEQ_INIT(&rs->src_page_requests);
1878
1879 if (migrate_use_xbzrle()) {
1880 XBZRLE_cache_lock();
1881 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1882 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1883 TARGET_PAGE_SIZE,
1884 TARGET_PAGE_SIZE);
1885 if (!XBZRLE.cache) {
1886 XBZRLE_cache_unlock();
1887 error_report("Error creating cache");
1888 return -1;
1889 }
1890 XBZRLE_cache_unlock();
1891
1892 /* We prefer not to abort if there is no memory */
1893 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1894 if (!XBZRLE.encoded_buf) {
1895 error_report("Error allocating encoded_buf");
1896 return -1;
1897 }
1898
1899 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1900 if (!XBZRLE.current_buf) {
1901 error_report("Error allocating current_buf");
1902 g_free(XBZRLE.encoded_buf);
1903 XBZRLE.encoded_buf = NULL;
1904 return -1;
1905 }
1906 }
1907
1908 /* For memory_global_dirty_log_start below. */
1909 qemu_mutex_lock_iothread();
1910
1911 qemu_mutex_lock_ramlist();
1912 rcu_read_lock();
1913 ram_state_reset(rs);
1914
1915 /* Skip setting bitmap if there is no RAM */
1916 if (ram_bytes_total()) {
1917 RAMBlock *block;
1918
1919 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1920 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1921
1922 block->bmap = bitmap_new(pages);
1923 bitmap_set(block->bmap, 0, pages);
1924 if (migrate_postcopy_ram()) {
1925 block->unsentmap = bitmap_new(pages);
1926 bitmap_set(block->unsentmap, 0, pages);
1927 }
1928 }
1929 }
1930
1931 /*
1932 * Count the total number of pages used by ram blocks not including any
1933 * gaps due to alignment or unplugs.
1934 */
1935 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1936
1937 memory_global_dirty_log_start();
1938 migration_bitmap_sync(rs);
1939 qemu_mutex_unlock_ramlist();
1940 qemu_mutex_unlock_iothread();
1941 rcu_read_unlock();
1942
1943 return 0;
1944 }
1945
1946 /*
1947 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1948 * long-running RCU critical section. When RCU reclaims in the code
1949 * start to become numerous it will be necessary to reduce the
1950 * granularity of these critical sections.
1951 */
1952
1953 /**
1954 * ram_save_setup: Setup RAM for migration
1955 *
1956 * Returns zero to indicate success and negative for error
1957 *
1958 * @f: QEMUFile where to send the data
1959 * @opaque: RAMState pointer
1960 */
1961 static int ram_save_setup(QEMUFile *f, void *opaque)
1962 {
1963 RAMState *rs = opaque;
1964 RAMBlock *block;
1965
1966 /* In the COLO state, migration has already set up the bitmap; reuse it. */
1967 if (!migration_in_colo_state()) {
1968 if (ram_state_init(rs) < 0) {
1969 return -1;
1970 }
1971 }
1972 rs->f = f;
1973
1974 rcu_read_lock();
1975
1976 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1977
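/*
 * Per-block header records written below; ram_load() parses the same
 * layout under RAM_SAVE_FLAG_MEM_SIZE:
 *   byte   strlen(idstr)
 *   bytes  idstr (not NUL-terminated)
 *   be64   used_length
 *   be64   page_size (only when postcopy is enabled and the block's
 *          page size differs from qemu_host_page_size)
 */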
1978 RAMBLOCK_FOREACH(block) {
1979 qemu_put_byte(f, strlen(block->idstr));
1980 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1981 qemu_put_be64(f, block->used_length);
1982 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1983 qemu_put_be64(f, block->page_size);
1984 }
1985 }
1986
1987 rcu_read_unlock();
1988
1989 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1990 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1991
1992 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1993
1994 return 0;
1995 }
1996
1997 /**
1998 * ram_save_iterate: iterative stage for migration
1999 *
2000 * Returns zero to indicate success and negative for error
2001 *
2002 * @f: QEMUFile where to send the data
2003 * @opaque: RAMState pointer
2004 */
2005 static int ram_save_iterate(QEMUFile *f, void *opaque)
2006 {
2007 RAMState *rs = opaque;
2008 int ret;
2009 int i;
2010 int64_t t0;
2011 int done = 0;
2012
2013 rcu_read_lock();
2014 if (ram_list.version != rs->last_version) {
2015 ram_state_reset(rs);
2016 }
2017
2018 /* Read version before ram_list.blocks */
2019 smp_rmb();
2020
2021 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2022
2023 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2024 i = 0;
2025 while ((ret = qemu_file_rate_limit(f)) == 0) {
2026 int pages;
2027
2028 pages = ram_find_and_save_block(rs, false);
2029 /* no more pages to send */
2030 if (pages == 0) {
2031 done = 1;
2032 break;
2033 }
2034 rs->iterations++;
2035
2036 /* we want to check in the 1st loop, just in case it was the 1st time
2037 and we had to sync the dirty bitmap.
2038 qemu_clock_get_ns() is a bit expensive, so we only check every few
2039 iterations
2040 */
2041 if ((i & 63) == 0) {
2042 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2043 if (t1 > MAX_WAIT) {
2044 trace_ram_save_iterate_big_wait(t1, i);
2045 break;
2046 }
2047 }
2048 i++;
2049 }
2050 flush_compressed_data(rs);
2051 rcu_read_unlock();
2052
2053 /*
2054 * Must occur before EOS (or any QEMUFile operation)
2055 * because of the RDMA protocol.
2056 */
2057 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2058
2059 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2060 rs->bytes_transferred += 8;
2061
2062 ret = qemu_file_get_error(f);
2063 if (ret < 0) {
2064 return ret;
2065 }
2066
2067 return done;
2068 }
2069
2070 /**
2071 * ram_save_complete: function called to send the remaining amount of RAM
2072 *
2073 * Returns zero to indicate success
2074 *
2075 * Called with the iothread lock held
2076 *
2077 * @f: QEMUFile where to send the data
2078 * @opaque: RAMState pointer
2079 */
2080 static int ram_save_complete(QEMUFile *f, void *opaque)
2081 {
2082 RAMState *rs = opaque;
2083
2084 rcu_read_lock();
2085
2086 if (!migration_in_postcopy()) {
2087 migration_bitmap_sync(rs);
2088 }
2089
2090 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2091
2092 /* try transferring iterative blocks of memory */
2093
2094 /* flush all remaining blocks regardless of rate limiting */
2095 while (true) {
2096 int pages;
2097
2098 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2099 /* no more blocks to send */
2100 if (pages == 0) {
2101 break;
2102 }
2103 }
2104
2105 flush_compressed_data(rs);
2106 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2107
2108 rcu_read_unlock();
2109
2110 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2111
2112 return 0;
2113 }
2114
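/*
 * ram_save_pending: report how much data is still to be sent
 *
 * If the remaining dirty data is below @max_size and we are not in
 * postcopy, the dirty bitmap is re-synced (under the iothread lock)
 * before recomputing the estimate. All remaining RAM is reported as
 * postcopiable.
 *
 * @f: QEMUFile where the data is being sent
 * @opaque: RAMState pointer
 * @max_size: threshold below which the bitmap is re-synced
 * @non_postcopiable_pending: amount that cannot be sent during postcopy
 * @postcopiable_pending: amount that can be sent during postcopy
 */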
2115 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2116 uint64_t *non_postcopiable_pending,
2117 uint64_t *postcopiable_pending)
2118 {
2119 RAMState *rs = opaque;
2120 uint64_t remaining_size;
2121
2122 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2123
2124 if (!migration_in_postcopy() &&
2125 remaining_size < max_size) {
2126 qemu_mutex_lock_iothread();
2127 rcu_read_lock();
2128 migration_bitmap_sync(rs);
2129 rcu_read_unlock();
2130 qemu_mutex_unlock_iothread();
2131 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2132 }
2133
2134 /* We can do postcopy, and all the data is postcopiable */
2135 *postcopiable_pending += remaining_size;
2136 }
2137
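/*
 * load_xbzrle: load and decode one XBZRLE-compressed page
 *
 * Returns 0 on success and -1 on error
 *
 * Stream layout read here: a one-byte header that must be
 * ENCODING_FLAG_XBZRLE, a be16 encoded length (at most TARGET_PAGE_SIZE),
 * then the encoded data, which is decoded against the current contents
 * of the page at @host.
 *
 * @f: QEMUFile to read the page from
 * @addr: guest address of the page
 * @host: host address of the page being loaded
 */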
2138 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2139 {
2140 unsigned int xh_len;
2141 int xh_flags;
2142 uint8_t *loaded_data;
2143
2144 if (!xbzrle_decoded_buf) {
2145 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 }
2147 loaded_data = xbzrle_decoded_buf;
2148
2149 /* extract RLE header */
2150 xh_flags = qemu_get_byte(f);
2151 xh_len = qemu_get_be16(f);
2152
2153 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2154 error_report("Failed to load XBZRLE page - wrong compression!");
2155 return -1;
2156 }
2157
2158 if (xh_len > TARGET_PAGE_SIZE) {
2159 error_report("Failed to load XBZRLE page - len overflow!");
2160 return -1;
2161 }
2162 /* load data and decode */
2163 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2164
2165 /* decode RLE */
2166 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2167 TARGET_PAGE_SIZE) == -1) {
2168 error_report("Failed to load XBZRLE page - decode error!");
2169 return -1;
2170 }
2171
2172 return 0;
2173 }
2174
2175 /**
2176 * ram_block_from_stream: read a RAMBlock id from the migration stream
2177 *
2178 * Must be called from within an RCU critical section.
2179 *
2180 * Returns a pointer from within the RCU-protected ram_list.
2181 *
2182 * @f: QEMUFile where to read the data from
2183 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2184 */
2185 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2186 {
2187 static RAMBlock *block = NULL;
2188 char id[256];
2189 uint8_t len;
2190
2191 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2192 if (!block) {
2193 error_report("Ack, bad migration stream!");
2194 return NULL;
2195 }
2196 return block;
2197 }
2198
2199 len = qemu_get_byte(f);
2200 qemu_get_buffer(f, (uint8_t *)id, len);
2201 id[len] = 0;
2202
2203 block = qemu_ram_block_by_name(id);
2204 if (!block) {
2205 error_report("Can't find block %s", id);
2206 return NULL;
2207 }
2208
2209 return block;
2210 }
2211
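/*
 * host_from_ram_block_offset: return the host address for @offset within
 * @block, or NULL if the offset does not lie within the block.
 */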
2212 static inline void *host_from_ram_block_offset(RAMBlock *block,
2213 ram_addr_t offset)
2214 {
2215 if (!offset_in_ramblock(block, offset)) {
2216 return NULL;
2217 }
2218
2219 return block->host + offset;
2220 }
2221
2222 /**
2223 * ram_handle_compressed: handle the zero page case
2224 *
2225 * If a page (or a whole RDMA chunk) has been
2226 * determined to be zero, then zap it.
2227 *
2228 * @host: host address for the zero page
2229 * @ch: the byte the page is filled with; we only support zero
2230 * @size: size of the zero page
2231 */
2232 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2233 {
2234 if (ch != 0 || !is_zero_range(host, size)) {
2235 memset(host, ch, size);
2236 }
2237 }
2238
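/*
 * do_data_decompress: body of a decompression worker thread
 *
 * Waits on the per-thread condition for a compressed buffer, zlib
 * uncompress()es it straight into the destination page, then marks
 * itself done and signals decomp_done_cond. Exits when ->quit is set.
 */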
2239 static void *do_data_decompress(void *opaque)
2240 {
2241 DecompressParam *param = opaque;
2242 unsigned long pagesize;
2243 uint8_t *des;
2244 int len;
2245
2246 qemu_mutex_lock(&param->mutex);
2247 while (!param->quit) {
2248 if (param->des) {
2249 des = param->des;
2250 len = param->len;
2251 param->des = 0;
2252 qemu_mutex_unlock(&param->mutex);
2253
2254 pagesize = TARGET_PAGE_SIZE;
2255 /* uncompress() may fail in some cases, especially
2256 * when the page was dirtied while being compressed. That is
2257 * not a problem, because the dirty page will be retransferred
2258 * and uncompress() won't corrupt the data in other pages.
2259 */
2260 uncompress((Bytef *)des, &pagesize,
2261 (const Bytef *)param->compbuf, len);
2262
2263 qemu_mutex_lock(&decomp_done_lock);
2264 param->done = true;
2265 qemu_cond_signal(&decomp_done_cond);
2266 qemu_mutex_unlock(&decomp_done_lock);
2267
2268 qemu_mutex_lock(&param->mutex);
2269 } else {
2270 qemu_cond_wait(&param->cond, &param->mutex);
2271 }
2272 }
2273 qemu_mutex_unlock(&param->mutex);
2274
2275 return NULL;
2276 }
2277
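/*
 * wait_for_decompress_done: wait until every decompression thread has
 * finished its current page. No-op when compression is not in use.
 */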
2278 static void wait_for_decompress_done(void)
2279 {
2280 int idx, thread_count;
2281
2282 if (!migrate_use_compression()) {
2283 return;
2284 }
2285
2286 thread_count = migrate_decompress_threads();
2287 qemu_mutex_lock(&decomp_done_lock);
2288 for (idx = 0; idx < thread_count; idx++) {
2289 while (!decomp_param[idx].done) {
2290 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2291 }
2292 }
2293 qemu_mutex_unlock(&decomp_done_lock);
2294 }
2295
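/*
 * migrate_decompress_threads_create: allocate the per-thread state and
 * spawn migrate_decompress_threads() joinable "decompress" worker threads.
 */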
2296 void migrate_decompress_threads_create(void)
2297 {
2298 int i, thread_count;
2299
2300 thread_count = migrate_decompress_threads();
2301 decompress_threads = g_new0(QemuThread, thread_count);
2302 decomp_param = g_new0(DecompressParam, thread_count);
2303 qemu_mutex_init(&decomp_done_lock);
2304 qemu_cond_init(&decomp_done_cond);
2305 for (i = 0; i < thread_count; i++) {
2306 qemu_mutex_init(&decomp_param[i].mutex);
2307 qemu_cond_init(&decomp_param[i].cond);
2308 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2309 decomp_param[i].done = true;
2310 decomp_param[i].quit = false;
2311 qemu_thread_create(decompress_threads + i, "decompress",
2312 do_data_decompress, decomp_param + i,
2313 QEMU_THREAD_JOINABLE);
2314 }
2315 }
2316
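/*
 * migrate_decompress_threads_join: ask every worker to quit, join the
 * threads and free the per-thread state.
 */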
2317 void migrate_decompress_threads_join(void)
2318 {
2319 int i, thread_count;
2320
2321 thread_count = migrate_decompress_threads();
2322 for (i = 0; i < thread_count; i++) {
2323 qemu_mutex_lock(&decomp_param[i].mutex);
2324 decomp_param[i].quit = true;
2325 qemu_cond_signal(&decomp_param[i].cond);
2326 qemu_mutex_unlock(&decomp_param[i].mutex);
2327 }
2328 for (i = 0; i < thread_count; i++) {
2329 qemu_thread_join(decompress_threads + i);
2330 qemu_mutex_destroy(&decomp_param[i].mutex);
2331 qemu_cond_destroy(&decomp_param[i].cond);
2332 g_free(decomp_param[i].compbuf);
2333 }
2334 g_free(decompress_threads);
2335 g_free(decomp_param);
2336 decompress_threads = NULL;
2337 decomp_param = NULL;
2338 }
2339
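/*
 * decompress_data_with_multi_threads: hand one compressed page to an idle
 * decompression thread
 *
 * Reads @len compressed bytes from @f into the chosen thread's buffer and
 * wakes that thread; if no thread is idle, waits on decomp_done_cond.
 */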
2340 static void decompress_data_with_multi_threads(QEMUFile *f,
2341 void *host, int len)
2342 {
2343 int idx, thread_count;
2344
2345 thread_count = migrate_decompress_threads();
2346 qemu_mutex_lock(&decomp_done_lock);
2347 while (true) {
2348 for (idx = 0; idx < thread_count; idx++) {
2349 if (decomp_param[idx].done) {
2350 decomp_param[idx].done = false;
2351 qemu_mutex_lock(&decomp_param[idx].mutex);
2352 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2353 decomp_param[idx].des = host;
2354 decomp_param[idx].len = len;
2355 qemu_cond_signal(&decomp_param[idx].cond);
2356 qemu_mutex_unlock(&decomp_param[idx].mutex);
2357 break;
2358 }
2359 }
2360 if (idx < thread_count) {
2361 break;
2362 } else {
2363 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2364 }
2365 }
2366 qemu_mutex_unlock(&decomp_done_lock);
2367 }
2368
2369 /**
2370 * ram_postcopy_incoming_init: allocate postcopy data structures
2371 *
2372 * Returns 0 for success and negative on error
2373 *
2374 * @mis: current migration incoming state
2375 *
2376 * Allocate the data structures etc. needed by incoming migration with
2377 * postcopy-ram. postcopy-ram's similarly named
2378 * postcopy_ram_incoming_init() does the work.
2379 */
2380 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2381 {
2382 unsigned long ram_pages = last_ram_page();
2383
2384 return postcopy_ram_incoming_init(mis, ram_pages);
2385 }
2386
2387 /**
2388 * ram_load_postcopy: load a page in postcopy case
2389 *
2390 * Returns 0 for success or -errno in case of error
2391 *
2392 * Called in postcopy mode by ram_load().
2393 * rcu_read_lock is taken prior to this being called.
2394 *
2395 * @f: QEMUFile to receive the data from
2396 */
2397 static int ram_load_postcopy(QEMUFile *f)
2398 {
2399 int flags = 0, ret = 0;
2400 bool place_needed = false;
2401 bool matching_page_sizes = false;
2402 MigrationIncomingState *mis = migration_incoming_get_current();
2403 /* Temporary page that is later 'placed' */
2404 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2405 void *last_host = NULL;
2406 bool all_zero = false;
2407
2408 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2409 ram_addr_t addr;
2410 void *host = NULL;
2411 void *page_buffer = NULL;
2412 void *place_source = NULL;
2413 RAMBlock *block = NULL;
2414 uint8_t ch;
2415
2416 addr = qemu_get_be64(f);
2417 flags = addr & ~TARGET_PAGE_MASK;
2418 addr &= TARGET_PAGE_MASK;
2419
2420 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2421 place_needed = false;
2422 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2423 block = ram_block_from_stream(f, flags);
2424
2425 host = host_from_ram_block_offset(block, addr);
2426 if (!host) {
2427 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2428 ret = -EINVAL;
2429 break;
2430 }
2431 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2432 /*
2433 * Postcopy requires that we place whole host pages atomically;
2434 * these may be huge pages for RAMBlocks that are backed by
2435 * hugetlbfs.
2436 * To make it atomic, the data is read into a temporary page
2437 * that's moved into place later.
2438 * The migration protocol uses (possibly smaller) target pages;
2439 * however, the source ensures it always sends all the components
2440 * of a host page in order.
2441 */
2442 page_buffer = postcopy_host_page +
2443 ((uintptr_t)host & (block->page_size - 1));
2444 /* If every target page in this host page is zero we can optimise the placement; assume so at the first target page */
2445 if (!((uintptr_t)host & (block->page_size - 1))) {
2446 all_zero = true;
2447 } else {
2448 /* not the first target page within the host page */
2449 if (host != (last_host + TARGET_PAGE_SIZE)) {
2450 error_report("Non-sequential target page %p/%p",
2451 host, last_host);
2452 ret = -EINVAL;
2453 break;
2454 }
2455 }
2456
2457
2458 /*
2459 * If it's the last part of a host page then we place the host
2460 * page
2461 */
2462 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2463 (block->page_size - 1)) == 0;
2464 place_source = postcopy_host_page;
2465 }
2466 last_host = host;
2467
2468 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2469 case RAM_SAVE_FLAG_ZERO:
2470 ch = qemu_get_byte(f);
2471 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2472 if (ch) {
2473 all_zero = false;
2474 }
2475 break;
2476
2477 case RAM_SAVE_FLAG_PAGE:
2478 all_zero = false;
2479 if (!place_needed || !matching_page_sizes) {
2480 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2481 } else {
2482 /* Avoids the extra copy out of the qemu_file buffer, since
2483 * postcopy is going to copy the page into place later anyway;
2484 * can only do it when the read is done in one go (matching page sizes)
2485 */
2486 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2487 TARGET_PAGE_SIZE);
2488 }
2489 break;
2490 case RAM_SAVE_FLAG_EOS:
2491 /* normal exit */
2492 break;
2493 default:
2494 error_report("Unknown combination of migration flags: %#x"
2495 " (postcopy mode)", flags);
2496 ret = -EINVAL;
2497 }
2498
2499 if (place_needed) {
2500 /* This gets called at the last target page in the host page */
2501 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2502
2503 if (all_zero) {
2504 ret = postcopy_place_page_zero(mis, place_dest,
2505 block->page_size);
2506 } else {
2507 ret = postcopy_place_page(mis, place_dest,
2508 place_source, block->page_size);
2509 }
2510 }
2511 if (!ret) {
2512 ret = qemu_file_get_error(f);
2513 }
2514 }
2515
2516 return ret;
2517 }
2518
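/*
 * ram_load: load RAM pages sent by ram_save_*
 *
 * Returns 0 on success or a negative errno on error
 *
 * Once the destination has reached the postcopy listening state the work
 * is delegated to ram_load_postcopy(). Otherwise each chunk starts with a
 * be64 of guest address plus flags, followed by flag-dependent data:
 * the RAMBlock list (MEM_SIZE), a zero byte (ZERO), a full page (PAGE),
 * a compressed page (COMPRESS_PAGE) or an XBZRLE-encoded page (XBZRLE),
 * until RAM_SAVE_FLAG_EOS is seen.
 *
 * @f: QEMUFile to read the pages from
 * @opaque: RAMState pointer
 * @version_id: stream version, must be 4
 */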
2519 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2520 {
2521 int flags = 0, ret = 0;
2522 static uint64_t seq_iter;
2523 int len = 0;
2524 /*
2525 * If the system is running in postcopy mode, page inserts into host memory
2526 * must be atomic
2527 */
2528 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2529 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2530 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2531
2532 seq_iter++;
2533
2534 if (version_id != 4) {
2535 ret = -EINVAL;
2536 }
2537
2538 /* This RCU critical section can be very long running.
2539 * When RCU reclaims in the code start to become numerous,
2540 * it will be necessary to reduce the granularity of this
2541 * critical section.
2542 */
2543 rcu_read_lock();
2544
2545 if (postcopy_running) {
2546 ret = ram_load_postcopy(f);
2547 }
2548
2549 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2550 ram_addr_t addr, total_ram_bytes;
2551 void *host = NULL;
2552 uint8_t ch;
2553
2554 addr = qemu_get_be64(f);
2555 flags = addr & ~TARGET_PAGE_MASK;
2556 addr &= TARGET_PAGE_MASK;
2557
2558 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2559 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2560 RAMBlock *block = ram_block_from_stream(f, flags);
2561
2562 host = host_from_ram_block_offset(block, addr);
2563 if (!host) {
2564 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2565 ret = -EINVAL;
2566 break;
2567 }
2568 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2569 }
2570
2571 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2572 case RAM_SAVE_FLAG_MEM_SIZE:
2573 /* Synchronize RAM block list */
2574 total_ram_bytes = addr;
2575 while (!ret && total_ram_bytes) {
2576 RAMBlock *block;
2577 char id[256];
2578 ram_addr_t length;
2579
2580 len = qemu_get_byte(f);
2581 qemu_get_buffer(f, (uint8_t *)id, len);
2582 id[len] = 0;
2583 length = qemu_get_be64(f);
2584
2585 block = qemu_ram_block_by_name(id);
2586 if (block) {
2587 if (length != block->used_length) {
2588 Error *local_err = NULL;
2589
2590 ret = qemu_ram_resize(block, length,
2591 &local_err);
2592 if (local_err) {
2593 error_report_err(local_err);
2594 }
2595 }
2596 /* For postcopy we need to check that hugepage sizes match */
2597 if (postcopy_advised &&
2598 block->page_size != qemu_host_page_size) {
2599 uint64_t remote_page_size = qemu_get_be64(f);
2600 if (remote_page_size != block->page_size) {
2601 error_report("Mismatched RAM page size %s "
2602 "(local) %zd != %" PRId64,
2603 id, block->page_size,
2604 remote_page_size);
2605 ret = -EINVAL;
2606 }
2607 }
2608 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2609 block->idstr);
2610 } else {
2611 error_report("Unknown ramblock \"%s\", cannot "
2612 "accept migration", id);
2613 ret = -EINVAL;
2614 }
2615
2616 total_ram_bytes -= length;
2617 }
2618 break;
2619
2620 case RAM_SAVE_FLAG_ZERO:
2621 ch = qemu_get_byte(f);
2622 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2623 break;
2624
2625 case RAM_SAVE_FLAG_PAGE:
2626 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2627 break;
2628
2629 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2630 len = qemu_get_be32(f);
2631 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2632 error_report("Invalid compressed data length: %d", len);
2633 ret = -EINVAL;
2634 break;
2635 }
2636 decompress_data_with_multi_threads(f, host, len);
2637 break;
2638
2639 case RAM_SAVE_FLAG_XBZRLE:
2640 if (load_xbzrle(f, addr, host) < 0) {
2641 error_report("Failed to decompress XBZRLE page at "
2642 RAM_ADDR_FMT, addr);
2643 ret = -EINVAL;
2644 break;
2645 }
2646 break;
2647 case RAM_SAVE_FLAG_EOS:
2648 /* normal exit */
2649 break;
2650 default:
2651 if (flags & RAM_SAVE_FLAG_HOOK) {
2652 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2653 } else {
2654 error_report("Unknown combination of migration flags: %#x",
2655 flags);
2656 ret = -EINVAL;
2657 }
2658 }
2659 if (!ret) {
2660 ret = qemu_file_get_error(f);
2661 }
2662 }
2663
2664 wait_for_decompress_done();
2665 rcu_read_unlock();
2666 trace_ram_load_complete(ret, seq_iter);
2667 return ret;
2668 }
2669
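/*
 * RAM's SaveVMHandlers: ram_mig_init() below registers these under the
 * "ram" section with stream version 4, passing &ram_state as opaque.
 */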
2670 static SaveVMHandlers savevm_ram_handlers = {
2671 .save_live_setup = ram_save_setup,
2672 .save_live_iterate = ram_save_iterate,
2673 .save_live_complete_postcopy = ram_save_complete,
2674 .save_live_complete_precopy = ram_save_complete,
2675 .save_live_pending = ram_save_pending,
2676 .load_state = ram_load,
2677 .cleanup = ram_migration_cleanup,
2678 };
2679
2680 void ram_mig_init(void)
2681 {
2682 qemu_mutex_init(&XBZRLE.lock);
2683 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2684 }