mirror_qemu.git / migration / ram.c
migration: add MigrationState arg for ram_save_/compressed_/page()
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47
48 static int dirty_rate_high_cnt;
49
50 static uint64_t bitmap_sync_count;
51
52 /***********************************************************/
53 /* ram save/restore */
54
55 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
56 #define RAM_SAVE_FLAG_COMPRESS 0x02
57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
58 #define RAM_SAVE_FLAG_PAGE 0x08
59 #define RAM_SAVE_FLAG_EOS 0x10
60 #define RAM_SAVE_FLAG_CONTINUE 0x20
61 #define RAM_SAVE_FLAG_XBZRLE 0x40
62 /* 0x80 is reserved in migration.h start with 0x100 next */
63 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
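/* These flags are OR'd into the low bits of the page offset that
 * save_page_header() writes on the wire; offsets are TARGET_PAGE_SIZE
 * aligned, so the low bits are always free. Illustrative example
 * (assuming 4 KiB target pages): a normal page at offset 0x2000 of the
 * block we are already sending goes out with the header
 *
 *     qemu_put_be64(f, 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 */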
64
65 static uint8_t *ZERO_TARGET_PAGE;
66
67 static inline bool is_zero_range(uint8_t *p, uint64_t size)
68 {
69 return buffer_is_zero(p, size);
70 }
71
 72 /* This struct contains the XBZRLE cache and a static page
 73 used by the compression */
74 static struct {
75 /* buffer used for XBZRLE encoding */
76 uint8_t *encoded_buf;
77 /* buffer for storing page content */
78 uint8_t *current_buf;
79 /* Cache for XBZRLE, Protected by lock. */
80 PageCache *cache;
81 QemuMutex lock;
82 } XBZRLE;
83
84 /* buffer used for XBZRLE decoding */
85 static uint8_t *xbzrle_decoded_buf;
86
87 static void XBZRLE_cache_lock(void)
88 {
89 if (migrate_use_xbzrle())
90 qemu_mutex_lock(&XBZRLE.lock);
91 }
92
93 static void XBZRLE_cache_unlock(void)
94 {
95 if (migrate_use_xbzrle())
96 qemu_mutex_unlock(&XBZRLE.lock);
97 }
98
99 /*
100 * called from qmp_migrate_set_cache_size in main thread, possibly while
101 * a migration is in progress.
 102 * A running migration may be using the cache and might finish during this
 103 * call, hence changes to the cache are protected by the XBZRLE.lock mutex.
104 */
105 int64_t xbzrle_cache_resize(int64_t new_size)
106 {
107 PageCache *new_cache;
108 int64_t ret;
109
110 if (new_size < TARGET_PAGE_SIZE) {
111 return -1;
112 }
113
114 XBZRLE_cache_lock();
115
116 if (XBZRLE.cache != NULL) {
117 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
118 goto out_new_size;
119 }
120 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
121 TARGET_PAGE_SIZE);
122 if (!new_cache) {
123 error_report("Error creating cache");
124 ret = -1;
125 goto out;
126 }
127
128 cache_fini(XBZRLE.cache);
129 XBZRLE.cache = new_cache;
130 }
131
132 out_new_size:
133 ret = pow2floor(new_size);
134 out:
135 XBZRLE_cache_unlock();
136 return ret;
137 }
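/* Illustrative example (not from the original source): on success the value
 * reported back to the caller is pow2floor() of the requested size, so a
 * request for 300 MiB reports an effective cache size of 256 MiB.
 */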
138
139 /* accounting for migration statistics */
140 typedef struct AccountingInfo {
141 uint64_t dup_pages;
142 uint64_t skipped_pages;
143 uint64_t norm_pages;
144 uint64_t iterations;
145 uint64_t xbzrle_bytes;
146 uint64_t xbzrle_pages;
147 uint64_t xbzrle_cache_miss;
148 double xbzrle_cache_miss_rate;
149 uint64_t xbzrle_overflows;
150 } AccountingInfo;
151
152 static AccountingInfo acct_info;
153
154 static void acct_clear(void)
155 {
156 memset(&acct_info, 0, sizeof(acct_info));
157 }
158
159 uint64_t dup_mig_bytes_transferred(void)
160 {
161 return acct_info.dup_pages * TARGET_PAGE_SIZE;
162 }
163
164 uint64_t dup_mig_pages_transferred(void)
165 {
166 return acct_info.dup_pages;
167 }
168
169 uint64_t skipped_mig_bytes_transferred(void)
170 {
171 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
172 }
173
174 uint64_t skipped_mig_pages_transferred(void)
175 {
176 return acct_info.skipped_pages;
177 }
178
179 uint64_t norm_mig_bytes_transferred(void)
180 {
181 return acct_info.norm_pages * TARGET_PAGE_SIZE;
182 }
183
184 uint64_t norm_mig_pages_transferred(void)
185 {
186 return acct_info.norm_pages;
187 }
188
189 uint64_t xbzrle_mig_bytes_transferred(void)
190 {
191 return acct_info.xbzrle_bytes;
192 }
193
194 uint64_t xbzrle_mig_pages_transferred(void)
195 {
196 return acct_info.xbzrle_pages;
197 }
198
199 uint64_t xbzrle_mig_pages_cache_miss(void)
200 {
201 return acct_info.xbzrle_cache_miss;
202 }
203
204 double xbzrle_mig_cache_miss_rate(void)
205 {
206 return acct_info.xbzrle_cache_miss_rate;
207 }
208
209 uint64_t xbzrle_mig_pages_overflow(void)
210 {
211 return acct_info.xbzrle_overflows;
212 }
213
 214 /* This is the last block that we have visited searching for dirty pages
215 */
216 static RAMBlock *last_seen_block;
217 /* This is the last block from where we have sent data */
218 static RAMBlock *last_sent_block;
219 static ram_addr_t last_offset;
220 static QemuMutex migration_bitmap_mutex;
221 static uint64_t migration_dirty_pages;
222 static uint32_t last_version;
223 static bool ram_bulk_stage;
224
225 /* used by the search for pages to send */
226 struct PageSearchStatus {
227 /* Current block being searched */
228 RAMBlock *block;
229 /* Current offset to search from */
230 ram_addr_t offset;
231 /* Set once we wrap around */
232 bool complete_round;
233 };
234 typedef struct PageSearchStatus PageSearchStatus;
235
236 static struct BitmapRcu {
237 struct rcu_head rcu;
238 /* Main migration bitmap */
239 unsigned long *bmap;
240 /* bitmap of pages that haven't been sent even once
241 * only maintained and used in postcopy at the moment
242 * where it's used to send the dirtymap at the start
243 * of the postcopy phase
244 */
245 unsigned long *unsentmap;
246 } *migration_bitmap_rcu;
247
248 struct CompressParam {
249 bool done;
250 bool quit;
251 QEMUFile *file;
252 QemuMutex mutex;
253 QemuCond cond;
254 RAMBlock *block;
255 ram_addr_t offset;
256 };
257 typedef struct CompressParam CompressParam;
258
259 struct DecompressParam {
260 bool done;
261 bool quit;
262 QemuMutex mutex;
263 QemuCond cond;
264 void *des;
265 uint8_t *compbuf;
266 int len;
267 };
268 typedef struct DecompressParam DecompressParam;
269
270 static CompressParam *comp_param;
271 static QemuThread *compress_threads;
272 /* comp_done_cond is used to wake up the migration thread when
273 * one of the compression threads has finished the compression.
 274 * comp_done_lock is used together with comp_done_cond.
275 */
276 static QemuMutex comp_done_lock;
277 static QemuCond comp_done_cond;
278 /* The empty QEMUFileOps will be used by file in CompressParam */
279 static const QEMUFileOps empty_ops = { };
280
281 static bool compression_switch;
282 static DecompressParam *decomp_param;
283 static QemuThread *decompress_threads;
284 static QemuMutex decomp_done_lock;
285 static QemuCond decomp_done_cond;
286
287 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
288 ram_addr_t offset);
289
290 static void *do_data_compress(void *opaque)
291 {
292 CompressParam *param = opaque;
293 RAMBlock *block;
294 ram_addr_t offset;
295
296 qemu_mutex_lock(&param->mutex);
297 while (!param->quit) {
298 if (param->block) {
299 block = param->block;
300 offset = param->offset;
301 param->block = NULL;
302 qemu_mutex_unlock(&param->mutex);
303
304 do_compress_ram_page(param->file, block, offset);
305
306 qemu_mutex_lock(&comp_done_lock);
307 param->done = true;
308 qemu_cond_signal(&comp_done_cond);
309 qemu_mutex_unlock(&comp_done_lock);
310
311 qemu_mutex_lock(&param->mutex);
312 } else {
313 qemu_cond_wait(&param->cond, &param->mutex);
314 }
315 }
316 qemu_mutex_unlock(&param->mutex);
317
318 return NULL;
319 }
320
321 static inline void terminate_compression_threads(void)
322 {
323 int idx, thread_count;
324
325 thread_count = migrate_compress_threads();
326 for (idx = 0; idx < thread_count; idx++) {
327 qemu_mutex_lock(&comp_param[idx].mutex);
328 comp_param[idx].quit = true;
329 qemu_cond_signal(&comp_param[idx].cond);
330 qemu_mutex_unlock(&comp_param[idx].mutex);
331 }
332 }
333
334 void migrate_compress_threads_join(void)
335 {
336 int i, thread_count;
337
338 if (!migrate_use_compression()) {
339 return;
340 }
341 terminate_compression_threads();
342 thread_count = migrate_compress_threads();
343 for (i = 0; i < thread_count; i++) {
344 qemu_thread_join(compress_threads + i);
345 qemu_fclose(comp_param[i].file);
346 qemu_mutex_destroy(&comp_param[i].mutex);
347 qemu_cond_destroy(&comp_param[i].cond);
348 }
349 qemu_mutex_destroy(&comp_done_lock);
350 qemu_cond_destroy(&comp_done_cond);
351 g_free(compress_threads);
352 g_free(comp_param);
353 compress_threads = NULL;
354 comp_param = NULL;
355 }
356
357 void migrate_compress_threads_create(void)
358 {
359 int i, thread_count;
360
361 if (!migrate_use_compression()) {
362 return;
363 }
364 compression_switch = true;
365 thread_count = migrate_compress_threads();
366 compress_threads = g_new0(QemuThread, thread_count);
367 comp_param = g_new0(CompressParam, thread_count);
368 qemu_cond_init(&comp_done_cond);
369 qemu_mutex_init(&comp_done_lock);
370 for (i = 0; i < thread_count; i++) {
371 /* comp_param[i].file is just used as a dummy buffer to save data,
372 * set its ops to empty.
373 */
374 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375 comp_param[i].done = true;
376 comp_param[i].quit = false;
377 qemu_mutex_init(&comp_param[i].mutex);
378 qemu_cond_init(&comp_param[i].cond);
379 qemu_thread_create(compress_threads + i, "compress",
380 do_data_compress, comp_param + i,
381 QEMU_THREAD_JOINABLE);
382 }
383 }
384
385 /**
386 * save_page_header: Write page header to wire
387 *
 388 * If RAM_SAVE_FLAG_CONTINUE is not set in the offset, it also writes the block identification
389 *
390 * Returns: Number of bytes written
391 *
392 * @f: QEMUFile where to send the data
393 * @block: block that contains the page we want to send
394 * @offset: offset inside the block for the page
395 * in the lower bits, it contains flags
396 */
397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
398 {
399 size_t size, len;
400
401 qemu_put_be64(f, offset);
402 size = 8;
403
404 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
405 len = strlen(block->idstr);
406 qemu_put_byte(f, len);
407 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
408 size += 1 + len;
409 }
410 return size;
411 }
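/* Resulting wire format (derived from the code above):
 *
 *   [ be64: page offset | flags ]                       always
 *   [ u8: strlen(idstr) ][ idstr bytes, no NUL ]        only when
 *                                                       RAM_SAVE_FLAG_CONTINUE
 *                                                       is not set
 */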
412
413 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
414 * If guest dirty memory rate is reduced below the rate at which we can
415 * transfer pages to the destination then we should be able to complete
416 * migration. Some workloads dirty memory way too fast and will not effectively
417 * converge, even with auto-converge.
418 */
419 static void mig_throttle_guest_down(void)
420 {
421 MigrationState *s = migrate_get_current();
422 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 423 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
424
425 /* We have not started throttling yet. Let's start it. */
426 if (!cpu_throttle_active()) {
427 cpu_throttle_set(pct_initial);
428 } else {
429 /* Throttling already on, just increase the rate */
 430 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
431 }
432 }
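/* Illustrative example (the parameter values here are an assumption, not
 * taken from this file): with cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10, successive calls throttle the guest at
 * 20%, 30%, 40%, ... of its CPU time.
 */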
433
434 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
435 * The important thing is that a stale (not-yet-0'd) page be replaced
436 * by the new data.
437 * As a bonus, if the page wasn't in the cache it gets added so that
 438 * a later small write into the zeroed page can be sent with XBZRLE
439 */
440 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
441 {
442 if (ram_bulk_stage || !migrate_use_xbzrle()) {
443 return;
444 }
445
446 /* We don't care if this fails to allocate a new cache page
447 * as long as it updated an old one */
448 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
449 bitmap_sync_count);
450 }
451
452 #define ENCODING_FLAG_XBZRLE 0x1
453
454 /**
455 * save_xbzrle_page: compress and send current page
456 *
457 * Returns: 1 means that we wrote the page
458 * 0 means that page is identical to the one already sent
459 * -1 means that xbzrle would be longer than normal
460 *
461 * @f: QEMUFile where to send the data
 462 * @current_data: pointer to the page's host address; may be repointed at the cached copy
 463 * @current_addr: RAM address of the page, used as the cache key
464 * @block: block that contains the page we want to send
465 * @offset: offset inside the block for the page
466 * @last_stage: if we are at the completion stage
467 * @bytes_transferred: increase it with the number of transferred bytes
468 */
469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
470 ram_addr_t current_addr, RAMBlock *block,
471 ram_addr_t offset, bool last_stage,
472 uint64_t *bytes_transferred)
473 {
474 int encoded_len = 0, bytes_xbzrle;
475 uint8_t *prev_cached_page;
476
477 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
478 acct_info.xbzrle_cache_miss++;
479 if (!last_stage) {
480 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
481 bitmap_sync_count) == -1) {
482 return -1;
483 } else {
484 /* update *current_data when the page has been
485 inserted into cache */
486 *current_data = get_cached_data(XBZRLE.cache, current_addr);
487 }
488 }
489 return -1;
490 }
491
492 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
493
494 /* save current buffer into memory */
495 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
496
497 /* XBZRLE encoding (if there is no overflow) */
498 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
499 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
500 TARGET_PAGE_SIZE);
501 if (encoded_len == 0) {
502 trace_save_xbzrle_page_skipping();
503 return 0;
504 } else if (encoded_len == -1) {
505 trace_save_xbzrle_page_overflow();
506 acct_info.xbzrle_overflows++;
507 /* update data in the cache */
508 if (!last_stage) {
509 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
510 *current_data = prev_cached_page;
511 }
512 return -1;
513 }
514
 515 /* update the cached copy so that future deltas are computed against the data we just sent */
516 if (!last_stage) {
517 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
518 }
519
520 /* Send XBZRLE based compressed page */
521 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
522 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
523 qemu_put_be16(f, encoded_len);
524 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
525 bytes_xbzrle += encoded_len + 1 + 2;
526 acct_info.xbzrle_pages++;
527 acct_info.xbzrle_bytes += bytes_xbzrle;
528 *bytes_transferred += bytes_xbzrle;
529
530 return 1;
531 }
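/* What save_xbzrle_page() puts on the wire for an encoded page:
 *
 *   save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 *   qemu_put_byte(f, ENCODING_FLAG_XBZRLE);                    1 byte
 *   qemu_put_be16(f, encoded_len);                             2 bytes
 *   qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 *
 * hence bytes_xbzrle = header size + 1 + 2 + encoded_len.
 */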
532
533 /* Called with rcu_read_lock() to protect migration_bitmap
534 * rb: The RAMBlock to search for dirty pages in
535 * start: Start address (typically so we can continue from previous page)
536 * ram_addr_abs: Pointer into which to store the address of the dirty page
537 * within the global ram_addr space
538 *
539 * Returns: byte offset within memory region of the start of a dirty page
540 */
541 static inline
542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
543 ram_addr_t start,
544 ram_addr_t *ram_addr_abs)
545 {
546 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
547 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
548 uint64_t rb_size = rb->used_length;
549 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
550 unsigned long *bitmap;
551
552 unsigned long next;
553
554 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
555 if (ram_bulk_stage && nr > base) {
556 next = nr + 1;
557 } else {
558 next = find_next_bit(bitmap, size, nr);
559 }
560
561 *ram_addr_abs = next << TARGET_PAGE_BITS;
562 return (next - base) << TARGET_PAGE_BITS;
563 }
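/* Worked example (assuming 4 KiB target pages): for a block at
 * rb->offset = 0x40000000 with start = 0x3000, base is 0x40000 and
 * nr is 0x40003. During the bulk stage every page after the first is
 * still assumed dirty, so the bitmap search is skipped and the next
 * page (nr + 1) is returned directly.
 */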
564
565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
566 {
567 bool ret;
568 int nr = addr >> TARGET_PAGE_BITS;
569 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
570
571 ret = test_and_clear_bit(nr, bitmap);
572
573 if (ret) {
574 migration_dirty_pages--;
575 }
576 return ret;
577 }
578
579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
580 {
581 unsigned long *bitmap;
582 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
583 migration_dirty_pages +=
584 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
585 }
586
587 /* Fix me: there are too many global variables used in migration process. */
588 static int64_t start_time;
589 static int64_t bytes_xfer_prev;
590 static int64_t num_dirty_pages_period;
591 static uint64_t xbzrle_cache_miss_prev;
592 static uint64_t iterations_prev;
593
594 static void migration_bitmap_sync_init(void)
595 {
596 start_time = 0;
597 bytes_xfer_prev = 0;
598 num_dirty_pages_period = 0;
599 xbzrle_cache_miss_prev = 0;
600 iterations_prev = 0;
601 }
602
603 static void migration_bitmap_sync(void)
604 {
605 RAMBlock *block;
606 uint64_t num_dirty_pages_init = migration_dirty_pages;
607 MigrationState *s = migrate_get_current();
608 int64_t end_time;
609 int64_t bytes_xfer_now;
610
611 bitmap_sync_count++;
612
613 if (!bytes_xfer_prev) {
614 bytes_xfer_prev = ram_bytes_transferred();
615 }
616
617 if (!start_time) {
618 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
619 }
620
621 trace_migration_bitmap_sync_start();
622 memory_global_dirty_log_sync();
623
624 qemu_mutex_lock(&migration_bitmap_mutex);
625 rcu_read_lock();
626 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
627 migration_bitmap_sync_range(block->offset, block->used_length);
628 }
629 rcu_read_unlock();
630 qemu_mutex_unlock(&migration_bitmap_mutex);
631
632 trace_migration_bitmap_sync_end(migration_dirty_pages
633 - num_dirty_pages_init);
634 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
635 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
636
 637 /* more than 1 second = 1000 milliseconds */
638 if (end_time > start_time + 1000) {
639 if (migrate_auto_converge()) {
 640 /* The following detection logic can be refined later. For now:
 641 check whether the bytes dirtied in this sync period exceed half of
 642 the bytes that were transferred in the same period. If that keeps
 643 happening, start or increase
 644 throttling */
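/* Walk-through of the trigger: dirty_rate_high_cnt is bumped in every sync
 * period in which the guest dirtied more than half of what was transferred,
 * and it is only reset once throttling is applied, so the guest gets
 * throttled (or throttled harder) on the third such period.
 */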
645 bytes_xfer_now = ram_bytes_transferred();
646
647 if (s->dirty_pages_rate &&
648 (num_dirty_pages_period * TARGET_PAGE_SIZE >
649 (bytes_xfer_now - bytes_xfer_prev)/2) &&
650 (dirty_rate_high_cnt++ >= 2)) {
651 trace_migration_throttle();
652 dirty_rate_high_cnt = 0;
653 mig_throttle_guest_down();
654 }
655 bytes_xfer_prev = bytes_xfer_now;
656 }
657
658 if (migrate_use_xbzrle()) {
659 if (iterations_prev != acct_info.iterations) {
660 acct_info.xbzrle_cache_miss_rate =
661 (double)(acct_info.xbzrle_cache_miss -
662 xbzrle_cache_miss_prev) /
663 (acct_info.iterations - iterations_prev);
664 }
665 iterations_prev = acct_info.iterations;
666 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
667 }
668 s->dirty_pages_rate = num_dirty_pages_period * 1000
669 / (end_time - start_time);
670 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
671 start_time = end_time;
672 num_dirty_pages_period = 0;
673 }
674 s->dirty_sync_count = bitmap_sync_count;
675 if (migrate_use_events()) {
676 qapi_event_send_migration_pass(bitmap_sync_count, NULL);
677 }
678 }
679
680 /**
681 * save_zero_page: Send the zero page to the stream
682 *
683 * Returns: Number of pages written.
684 *
685 * @f: QEMUFile where to send the data
686 * @block: block that contains the page we want to send
687 * @offset: offset inside the block for the page
688 * @p: pointer to the page
689 * @bytes_transferred: increase it with the number of transferred bytes
690 */
691 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
692 uint8_t *p, uint64_t *bytes_transferred)
693 {
694 int pages = -1;
695
696 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
697 acct_info.dup_pages++;
698 *bytes_transferred += save_page_header(f, block,
699 offset | RAM_SAVE_FLAG_COMPRESS);
700 qemu_put_byte(f, 0);
701 *bytes_transferred += 1;
702 pages = 1;
703 }
704
705 return pages;
706 }
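/* A zero page therefore costs only the page header plus a single literal
 * 0x00 byte on the wire (RAM_SAVE_FLAG_COMPRESS followed by the fill byte),
 * instead of a full TARGET_PAGE_SIZE payload.
 */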
707
708 /**
709 * ram_save_page: Send the given page to the stream
710 *
711 * Returns: Number of pages written.
712 * < 0 - error
713 * >=0 - Number of pages written - this might legally be 0
714 * if xbzrle noticed the page was the same.
715 *
716 * @ms: The current migration state.
717 * @f: QEMUFile where to send the data
 718 * @pss: data about the page we want to send (the block and the offset
 719 * inside the block)
720 * @last_stage: if we are at the completion stage
721 * @bytes_transferred: increase it with the number of transferred bytes
722 */
723 static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
724 bool last_stage, uint64_t *bytes_transferred)
725 {
726 int pages = -1;
727 uint64_t bytes_xmit;
728 ram_addr_t current_addr;
729 uint8_t *p;
730 int ret;
731 bool send_async = true;
732 RAMBlock *block = pss->block;
733 ram_addr_t offset = pss->offset;
734
735 p = block->host + offset;
736
 737 /* When in doubt, send the page as a normal page */
738 bytes_xmit = 0;
739 ret = ram_control_save_page(f, block->offset,
740 offset, TARGET_PAGE_SIZE, &bytes_xmit);
741 if (bytes_xmit) {
742 *bytes_transferred += bytes_xmit;
743 pages = 1;
744 }
745
746 XBZRLE_cache_lock();
747
748 current_addr = block->offset + offset;
749
750 if (block == last_sent_block) {
751 offset |= RAM_SAVE_FLAG_CONTINUE;
752 }
753 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
754 if (ret != RAM_SAVE_CONTROL_DELAYED) {
755 if (bytes_xmit > 0) {
756 acct_info.norm_pages++;
757 } else if (bytes_xmit == 0) {
758 acct_info.dup_pages++;
759 }
760 }
761 } else {
762 pages = save_zero_page(f, block, offset, p, bytes_transferred);
763 if (pages > 0) {
764 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
765 * page would be stale
766 */
767 xbzrle_cache_zero_page(current_addr);
768 } else if (!ram_bulk_stage &&
769 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
770 pages = save_xbzrle_page(f, &p, current_addr, block,
771 offset, last_stage, bytes_transferred);
772 if (!last_stage) {
773 /* Can't send this cached data async, since the cache page
774 * might get updated before it gets to the wire
775 */
776 send_async = false;
777 }
778 }
779 }
780
781 /* XBZRLE overflow or normal page */
782 if (pages == -1) {
783 *bytes_transferred += save_page_header(f, block,
784 offset | RAM_SAVE_FLAG_PAGE);
785 if (send_async) {
786 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
787 } else {
788 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
789 }
790 *bytes_transferred += TARGET_PAGE_SIZE;
791 pages = 1;
792 acct_info.norm_pages++;
793 }
794
795 XBZRLE_cache_unlock();
796
797 return pages;
798 }
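/* Summary of the decision order above: a hook (e.g. RDMA) gets the first
 * chance via ram_control_save_page(); if it doesn't handle the page, it is
 * sent as a zero page when possible, then via XBZRLE if enabled and past the
 * bulk stage, and finally as a full copy of TARGET_PAGE_SIZE bytes.
 */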
799
800 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
801 ram_addr_t offset)
802 {
803 int bytes_sent, blen;
804 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
805
806 bytes_sent = save_page_header(f, block, offset |
807 RAM_SAVE_FLAG_COMPRESS_PAGE);
808 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
809 migrate_compress_level());
810 if (blen < 0) {
811 bytes_sent = 0;
812 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
813 error_report("compressed data failed!");
814 } else {
815 bytes_sent += blen;
816 }
817
818 return bytes_sent;
819 }
820
821 static uint64_t bytes_transferred;
822
823 static void flush_compressed_data(QEMUFile *f)
824 {
825 int idx, len, thread_count;
826
827 if (!migrate_use_compression()) {
828 return;
829 }
830 thread_count = migrate_compress_threads();
831
832 qemu_mutex_lock(&comp_done_lock);
833 for (idx = 0; idx < thread_count; idx++) {
834 while (!comp_param[idx].done) {
835 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
836 }
837 }
838 qemu_mutex_unlock(&comp_done_lock);
839
840 for (idx = 0; idx < thread_count; idx++) {
841 qemu_mutex_lock(&comp_param[idx].mutex);
842 if (!comp_param[idx].quit) {
843 len = qemu_put_qemu_file(f, comp_param[idx].file);
844 bytes_transferred += len;
845 }
846 qemu_mutex_unlock(&comp_param[idx].mutex);
847 }
848 }
849
850 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
851 ram_addr_t offset)
852 {
853 param->block = block;
854 param->offset = offset;
855 }
856
857 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
858 ram_addr_t offset,
859 uint64_t *bytes_transferred)
860 {
861 int idx, thread_count, bytes_xmit = -1, pages = -1;
862
863 thread_count = migrate_compress_threads();
864 qemu_mutex_lock(&comp_done_lock);
865 while (true) {
866 for (idx = 0; idx < thread_count; idx++) {
867 if (comp_param[idx].done) {
868 comp_param[idx].done = false;
869 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
870 qemu_mutex_lock(&comp_param[idx].mutex);
871 set_compress_params(&comp_param[idx], block, offset);
872 qemu_cond_signal(&comp_param[idx].cond);
873 qemu_mutex_unlock(&comp_param[idx].mutex);
874 pages = 1;
875 acct_info.norm_pages++;
876 *bytes_transferred += bytes_xmit;
877 break;
878 }
879 }
880 if (pages > 0) {
881 break;
882 } else {
883 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
884 }
885 }
886 qemu_mutex_unlock(&comp_done_lock);
887
888 return pages;
889 }
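/* Hand-off scheme: the first idle worker (done == true) has its buffered
 * output flushed into the migration stream, is handed the new block/offset
 * and signalled; if no worker is idle we wait on comp_done_cond until one
 * finishes.
 */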
890
891 /**
892 * ram_save_compressed_page: compress the given page and send it to the stream
893 *
894 * Returns: Number of pages written.
895 *
896 * @ms: The current migration state.
897 * @f: QEMUFile where to send the data
 898 * @pss: data about the page we want to send (the block and the offset
 899 * inside the block)
900 * @last_stage: if we are at the completion stage
901 * @bytes_transferred: increase it with the number of transferred bytes
902 */
903 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
904 PageSearchStatus *pss, bool last_stage,
905 uint64_t *bytes_transferred)
906 {
907 int pages = -1;
908 uint64_t bytes_xmit = 0;
909 uint8_t *p;
910 int ret, blen;
911 RAMBlock *block = pss->block;
912 ram_addr_t offset = pss->offset;
913
914 p = block->host + offset;
915
916 ret = ram_control_save_page(f, block->offset,
917 offset, TARGET_PAGE_SIZE, &bytes_xmit);
918 if (bytes_xmit) {
919 *bytes_transferred += bytes_xmit;
920 pages = 1;
921 }
922 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
923 if (ret != RAM_SAVE_CONTROL_DELAYED) {
924 if (bytes_xmit > 0) {
925 acct_info.norm_pages++;
926 } else if (bytes_xmit == 0) {
927 acct_info.dup_pages++;
928 }
929 }
930 } else {
931 /* When starting the process of a new block, the first page of
932 * the block should be sent out before other pages in the same
 933 * block, and all the pages in the previous block should have been sent
 934 * out. Keeping this order is important, because the 'cont' flag
935 * is used to avoid resending the block name.
936 */
937 if (block != last_sent_block) {
938 flush_compressed_data(f);
939 pages = save_zero_page(f, block, offset, p, bytes_transferred);
940 if (pages == -1) {
941 /* Make sure the first page is sent out before other pages */
942 bytes_xmit = save_page_header(f, block, offset |
943 RAM_SAVE_FLAG_COMPRESS_PAGE);
944 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
945 migrate_compress_level());
946 if (blen > 0) {
947 *bytes_transferred += bytes_xmit + blen;
948 acct_info.norm_pages++;
949 pages = 1;
950 } else {
951 qemu_file_set_error(f, blen);
952 error_report("compressed data failed!");
953 }
954 }
955 } else {
956 offset |= RAM_SAVE_FLAG_CONTINUE;
957 pages = save_zero_page(f, block, offset, p, bytes_transferred);
958 if (pages == -1) {
959 pages = compress_page_with_multi_thread(f, block, offset,
960 bytes_transferred);
961 }
962 }
963 }
964
965 return pages;
966 }
967
968 /*
969 * Find the next dirty page and update any state associated with
970 * the search process.
971 *
972 * Returns: True if a page is found
973 *
974 * @f: Current migration stream.
975 * @pss: Data about the state of the current dirty page scan.
976 * @*again: Set to false if the search has scanned the whole of RAM
977 * *ram_addr_abs: Pointer into which to store the address of the dirty page
978 * within the global ram_addr space
979 */
980 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
981 bool *again, ram_addr_t *ram_addr_abs)
982 {
983 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
984 ram_addr_abs);
985 if (pss->complete_round && pss->block == last_seen_block &&
986 pss->offset >= last_offset) {
987 /*
988 * We've been once around the RAM and haven't found anything.
989 * Give up.
990 */
991 *again = false;
992 return false;
993 }
994 if (pss->offset >= pss->block->used_length) {
995 /* Didn't find anything in this RAM Block */
996 pss->offset = 0;
997 pss->block = QLIST_NEXT_RCU(pss->block, next);
998 if (!pss->block) {
999 /* Hit the end of the list */
1000 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1001 /* Flag that we've looped */
1002 pss->complete_round = true;
1003 ram_bulk_stage = false;
1004 if (migrate_use_xbzrle()) {
1005 /* If xbzrle is on, stop using the data compression at this
1006 * point. In theory, xbzrle can do better than compression.
1007 */
1008 flush_compressed_data(f);
1009 compression_switch = false;
1010 }
1011 }
1012 /* Didn't find anything this time, but try again on the new block */
1013 *again = true;
1014 return false;
1015 } else {
1016 /* Can go around again, but... */
1017 *again = true;
1018 /* We've found something so probably don't need to */
1019 return true;
1020 }
1021 }
1022
1023 /*
1024 * Helper for 'get_queued_page' - gets a page off the queue
1025 * ms: MigrationState in
1026 * *offset: Used to return the offset within the RAMBlock
1027 * ram_addr_abs: global offset in the dirty/sent bitmaps
1028 *
1029 * Returns: block (or NULL if none available)
1030 */
1031 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1032 ram_addr_t *ram_addr_abs)
1033 {
1034 RAMBlock *block = NULL;
1035
1036 qemu_mutex_lock(&ms->src_page_req_mutex);
1037 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1038 struct MigrationSrcPageRequest *entry =
1039 QSIMPLEQ_FIRST(&ms->src_page_requests);
1040 block = entry->rb;
1041 *offset = entry->offset;
1042 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1043 TARGET_PAGE_MASK;
1044
1045 if (entry->len > TARGET_PAGE_SIZE) {
1046 entry->len -= TARGET_PAGE_SIZE;
1047 entry->offset += TARGET_PAGE_SIZE;
1048 } else {
1049 memory_region_unref(block->mr);
1050 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1051 g_free(entry);
1052 }
1053 }
1054 qemu_mutex_unlock(&ms->src_page_req_mutex);
1055
1056 return block;
1057 }
1058
1059 /*
1060 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1061 * that are already sent (!dirty)
1062 *
1063 * ms: MigrationState in
1064 * pss: PageSearchStatus structure updated with found block/offset
1065 * ram_addr_abs: global offset in the dirty/sent bitmaps
1066 *
1067 * Returns: true if a queued page is found
1068 */
1069 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1070 ram_addr_t *ram_addr_abs)
1071 {
1072 RAMBlock *block;
1073 ram_addr_t offset;
1074 bool dirty;
1075
1076 do {
1077 block = unqueue_page(ms, &offset, ram_addr_abs);
1078 /*
1079 * We're sending this page, and since it's postcopy nothing else
1080 * will dirty it, and we must make sure it doesn't get sent again
1081 * even if this queue request was received after the background
1082 * search already sent it.
1083 */
1084 if (block) {
1085 unsigned long *bitmap;
1086 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1087 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1088 if (!dirty) {
1089 trace_get_queued_page_not_dirty(
1090 block->idstr, (uint64_t)offset,
1091 (uint64_t)*ram_addr_abs,
1092 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1093 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1094 } else {
1095 trace_get_queued_page(block->idstr,
1096 (uint64_t)offset,
1097 (uint64_t)*ram_addr_abs);
1098 }
1099 }
1100
1101 } while (block && !dirty);
1102
1103 if (block) {
1104 /*
1105 * As soon as we start servicing pages out of order, then we have
1106 * to kill the bulk stage, since the bulk stage assumes
 1107 * in migration_bitmap_find_dirty() that every page is
1108 * dirty, that's no longer true.
1109 */
1110 ram_bulk_stage = false;
1111
1112 /*
1113 * We want the background search to continue from the queued page
1114 * since the guest is likely to want other pages near to the page
1115 * it just requested.
1116 */
1117 pss->block = block;
1118 pss->offset = offset;
1119 }
1120
1121 return !!block;
1122 }
1123
1124 /**
 1125 * flush_page_queue: Flush any remaining pages in the ram request queue.
 1126 * It should be empty at the end anyway, but in error cases there may be
1127 * some left.
1128 *
1129 * ms: MigrationState
1130 */
1131 void flush_page_queue(MigrationState *ms)
1132 {
1133 struct MigrationSrcPageRequest *mspr, *next_mspr;
1134 /* This queue generally should be empty - but in the case of a failed
1135 * migration might have some droppings in.
1136 */
1137 rcu_read_lock();
1138 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1139 memory_region_unref(mspr->rb->mr);
1140 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1141 g_free(mspr);
1142 }
1143 rcu_read_unlock();
1144 }
1145
1146 /**
1147 * Queue the pages for transmission, e.g. a request from postcopy destination
 1148 * ms: MigrationState in which the queue is held
1149 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1150 * start: Offset from the start of the RAMBlock
1151 * len: Length (in bytes) to send
1152 * Return: 0 on success
1153 */
1154 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1155 ram_addr_t start, ram_addr_t len)
1156 {
1157 RAMBlock *ramblock;
1158
1159 ms->postcopy_requests++;
1160 rcu_read_lock();
1161 if (!rbname) {
1162 /* Reuse last RAMBlock */
1163 ramblock = ms->last_req_rb;
1164
1165 if (!ramblock) {
1166 /*
1167 * Shouldn't happen, we can't reuse the last RAMBlock if
1168 * it's the 1st request.
1169 */
1170 error_report("ram_save_queue_pages no previous block");
1171 goto err;
1172 }
1173 } else {
1174 ramblock = qemu_ram_block_by_name(rbname);
1175
1176 if (!ramblock) {
1177 /* We shouldn't be asked for a non-existent RAMBlock */
1178 error_report("ram_save_queue_pages no block '%s'", rbname);
1179 goto err;
1180 }
1181 ms->last_req_rb = ramblock;
1182 }
1183 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1184 if (start+len > ramblock->used_length) {
1185 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1186 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1187 __func__, start, len, ramblock->used_length);
1188 goto err;
1189 }
1190
1191 struct MigrationSrcPageRequest *new_entry =
1192 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1193 new_entry->rb = ramblock;
1194 new_entry->offset = start;
1195 new_entry->len = len;
1196
1197 memory_region_ref(ramblock->mr);
1198 qemu_mutex_lock(&ms->src_page_req_mutex);
1199 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1200 qemu_mutex_unlock(&ms->src_page_req_mutex);
1201 rcu_read_unlock();
1202
1203 return 0;
1204
1205 err:
1206 rcu_read_unlock();
1207 return -1;
1208 }
1209
1210 /**
1211 * ram_save_target_page: Save one target page
1212 *
1213 *
1214 * @f: QEMUFile where to send the data
 1215 * @pss: data about the page we want to send (the block and the offset
 1216 * inside the block)
1217 * @last_stage: if we are at the completion stage
1218 * @bytes_transferred: increase it with the number of transferred bytes
1219 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1220 *
1221 * Returns: Number of pages written.
1222 */
1223 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1224 PageSearchStatus *pss,
1225 bool last_stage,
1226 uint64_t *bytes_transferred,
1227 ram_addr_t dirty_ram_abs)
1228 {
1229 int res = 0;
1230
 1231 /* Check whether the page is dirty and, if so, send it */
1232 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1233 unsigned long *unsentmap;
1234 if (compression_switch && migrate_use_compression()) {
1235 res = ram_save_compressed_page(ms, f, pss,
1236 last_stage,
1237 bytes_transferred);
1238 } else {
1239 res = ram_save_page(ms, f, pss, last_stage,
1240 bytes_transferred);
1241 }
1242
1243 if (res < 0) {
1244 return res;
1245 }
1246 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1247 if (unsentmap) {
1248 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1249 }
1250 /* Only update last_sent_block if a block was actually sent; xbzrle
1251 * might have decided the page was identical so didn't bother writing
1252 * to the stream.
1253 */
1254 if (res > 0) {
1255 last_sent_block = pss->block;
1256 }
1257 }
1258
1259 return res;
1260 }
1261
1262 /**
1263 * ram_save_host_page: Starting at *offset send pages up to the end
1264 * of the current host page. It's valid for the initial
1265 * offset to point into the middle of a host page
1266 * in which case the remainder of the hostpage is sent.
1267 * Only dirty target pages are sent.
1268 *
1269 * Returns: Number of pages written.
1270 *
1271 * @f: QEMUFile where to send the data
 1272 * @pss: data about the page we want to send (the block and the offset
 1273 * inside the block); pss->offset is updated to the last target page
 1274 * sent
1275 * @last_stage: if we are at the completion stage
1276 * @bytes_transferred: increase it with the number of transferred bytes
1277 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1278 */
1279 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1280 PageSearchStatus *pss,
1281 bool last_stage,
1282 uint64_t *bytes_transferred,
1283 ram_addr_t dirty_ram_abs)
1284 {
1285 int tmppages, pages = 0;
1286 do {
1287 tmppages = ram_save_target_page(ms, f, pss, last_stage,
1288 bytes_transferred, dirty_ram_abs);
1289 if (tmppages < 0) {
1290 return tmppages;
1291 }
1292
1293 pages += tmppages;
1294 pss->offset += TARGET_PAGE_SIZE;
1295 dirty_ram_abs += TARGET_PAGE_SIZE;
1296 } while (pss->offset & (qemu_host_page_size - 1));
1297
1298 /* The offset we leave with is the last one we looked at */
1299 pss->offset -= TARGET_PAGE_SIZE;
1300 return pages;
1301 }
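/* Illustrative example (the page sizes are an assumption, not from this
 * file): with 64 KiB host pages and 4 KiB target pages one call covers up
 * to 16 target pages, sending only those whose dirty bit is still set.
 */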
1302
1303 /**
1304 * ram_find_and_save_block: Finds a dirty page and sends it to f
1305 *
1306 * Called within an RCU critical section.
1307 *
1308 * Returns: The number of pages written
1309 * 0 means no dirty pages
1310 *
1311 * @f: QEMUFile where to send the data
1312 * @last_stage: if we are at the completion stage
1313 * @bytes_transferred: increase it with the number of transferred bytes
1314 *
1315 * On systems where host-page-size > target-page-size it will send all the
1316 * pages in a host page that are dirty.
1317 */
1318
1319 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1320 uint64_t *bytes_transferred)
1321 {
1322 PageSearchStatus pss;
1323 MigrationState *ms = migrate_get_current();
1324 int pages = 0;
1325 bool again, found;
1326 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1327 ram_addr_t space */
1328
1329 pss.block = last_seen_block;
1330 pss.offset = last_offset;
1331 pss.complete_round = false;
1332
1333 if (!pss.block) {
1334 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1335 }
1336
1337 do {
1338 again = true;
1339 found = get_queued_page(ms, &pss, &dirty_ram_abs);
1340
1341 if (!found) {
1342 /* priority queue empty, so just search for something dirty */
1343 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1344 }
1345
1346 if (found) {
1347 pages = ram_save_host_page(ms, f, &pss,
1348 last_stage, bytes_transferred,
1349 dirty_ram_abs);
1350 }
1351 } while (!pages && again);
1352
1353 last_seen_block = pss.block;
1354 last_offset = pss.offset;
1355
1356 return pages;
1357 }
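/* Note on ordering: queued postcopy page requests are serviced before the
 * linear dirty-bitmap scan, and servicing one also drops us out of the bulk
 * stage, since pages are then sent out of order.
 */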
1358
1359 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1360 {
1361 uint64_t pages = size / TARGET_PAGE_SIZE;
1362 if (zero) {
1363 acct_info.dup_pages += pages;
1364 } else {
1365 acct_info.norm_pages += pages;
1366 bytes_transferred += size;
1367 qemu_update_position(f, size);
1368 }
1369 }
1370
1371 static ram_addr_t ram_save_remaining(void)
1372 {
1373 return migration_dirty_pages;
1374 }
1375
1376 uint64_t ram_bytes_remaining(void)
1377 {
1378 return ram_save_remaining() * TARGET_PAGE_SIZE;
1379 }
1380
1381 uint64_t ram_bytes_transferred(void)
1382 {
1383 return bytes_transferred;
1384 }
1385
1386 uint64_t ram_bytes_total(void)
1387 {
1388 RAMBlock *block;
1389 uint64_t total = 0;
1390
1391 rcu_read_lock();
1392 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1393 total += block->used_length;
1394 rcu_read_unlock();
1395 return total;
1396 }
1397
1398 void free_xbzrle_decoded_buf(void)
1399 {
1400 g_free(xbzrle_decoded_buf);
1401 xbzrle_decoded_buf = NULL;
1402 }
1403
1404 static void migration_bitmap_free(struct BitmapRcu *bmap)
1405 {
1406 g_free(bmap->bmap);
1407 g_free(bmap->unsentmap);
1408 g_free(bmap);
1409 }
1410
1411 static void ram_migration_cleanup(void *opaque)
1412 {
 1413 /* The caller must hold the iothread lock or be in a bottom half, so there is
1414 * no writing race against this migration_bitmap
1415 */
1416 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1417 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1418 if (bitmap) {
1419 memory_global_dirty_log_stop();
1420 call_rcu(bitmap, migration_bitmap_free, rcu);
1421 }
1422
1423 XBZRLE_cache_lock();
1424 if (XBZRLE.cache) {
1425 cache_fini(XBZRLE.cache);
1426 g_free(XBZRLE.encoded_buf);
1427 g_free(XBZRLE.current_buf);
1428 g_free(ZERO_TARGET_PAGE);
1429 XBZRLE.cache = NULL;
1430 XBZRLE.encoded_buf = NULL;
1431 XBZRLE.current_buf = NULL;
1432 }
1433 XBZRLE_cache_unlock();
1434 }
1435
1436 static void reset_ram_globals(void)
1437 {
1438 last_seen_block = NULL;
1439 last_sent_block = NULL;
1440 last_offset = 0;
1441 last_version = ram_list.version;
1442 ram_bulk_stage = true;
1443 }
1444
1445 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1446
1447 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1448 {
1449 /* called in qemu main thread, so there is
1450 * no writing race against this migration_bitmap
1451 */
1452 if (migration_bitmap_rcu) {
1453 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1454 bitmap = g_new(struct BitmapRcu, 1);
1455 bitmap->bmap = bitmap_new(new);
1456
 1457 /* Prevent bits in the migration bitmap from being set
 1458 * by migration_bitmap_sync_range() while we swap bitmaps;
 1459 * it is safe for migration if bits are cleared
 1460 * at the same time.
1461 */
1462 qemu_mutex_lock(&migration_bitmap_mutex);
1463 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1464 bitmap_set(bitmap->bmap, old, new - old);
1465
 1466 /* We don't have a way to safely extend the unsentmap
 1467 * with RCU, so mark it as missing; entry to postcopy
1468 * will fail.
1469 */
1470 bitmap->unsentmap = NULL;
1471
1472 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1473 qemu_mutex_unlock(&migration_bitmap_mutex);
1474 migration_dirty_pages += new - old;
1475 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1476 }
1477 }
1478
1479 /*
1480 * 'expected' is the value you expect the bitmap mostly to be full
1481 * of; it won't bother printing lines that are all this value.
1482 * If 'todump' is null the migration bitmap is dumped.
1483 */
1484 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1485 {
1486 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1487
1488 int64_t cur;
1489 int64_t linelen = 128;
1490 char linebuf[129];
1491
1492 if (!todump) {
1493 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1494 }
1495
1496 for (cur = 0; cur < ram_pages; cur += linelen) {
1497 int64_t curb;
1498 bool found = false;
1499 /*
1500 * Last line; catch the case where the line length
1501 * is longer than remaining ram
1502 */
1503 if (cur + linelen > ram_pages) {
1504 linelen = ram_pages - cur;
1505 }
1506 for (curb = 0; curb < linelen; curb++) {
1507 bool thisbit = test_bit(cur + curb, todump);
1508 linebuf[curb] = thisbit ? '1' : '.';
1509 found = found || (thisbit != expected);
1510 }
1511 if (found) {
1512 linebuf[curb] = '\0';
1513 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1514 }
1515 }
1516 }
1517
1518 /* **** functions for postcopy ***** */
1519
1520 /*
1521 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1522 * Note: At this point the 'unsentmap' is the processed bitmap combined
1523 * with the dirtymap; so a '1' means it's either dirty or unsent.
1524 * start,length: Indexes into the bitmap for the first bit
1525 * representing the named block and length in target-pages
1526 */
1527 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1528 PostcopyDiscardState *pds,
1529 unsigned long start,
1530 unsigned long length)
1531 {
1532 unsigned long end = start + length; /* one after the end */
1533 unsigned long current;
1534 unsigned long *unsentmap;
1535
1536 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1537 for (current = start; current < end; ) {
1538 unsigned long one = find_next_bit(unsentmap, end, current);
1539
1540 if (one <= end) {
1541 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1542 unsigned long discard_length;
1543
1544 if (zero >= end) {
1545 discard_length = end - one;
1546 } else {
1547 discard_length = zero - one;
1548 }
1549 if (discard_length) {
1550 postcopy_discard_send_range(ms, pds, one, discard_length);
1551 }
1552 current = one + discard_length;
1553 } else {
1554 current = one;
1555 }
1556 }
1557
1558 return 0;
1559 }
1560
1561 /*
1562 * Utility for the outgoing postcopy code.
1563 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1564 * passing it bitmap indexes and name.
1565 * Returns: 0 on success
1566 * (qemu_ram_foreach_block ends up passing unscaled lengths
1567 * which would mean postcopy code would have to deal with target page)
1568 */
1569 static int postcopy_each_ram_send_discard(MigrationState *ms)
1570 {
1571 struct RAMBlock *block;
1572 int ret;
1573
1574 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1575 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1576 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1577 first,
1578 block->idstr);
1579
1580 /*
1581 * Postcopy sends chunks of bitmap over the wire, but it
1582 * just needs indexes at this point, avoids it having
1583 * target page specific code.
1584 */
1585 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1586 block->used_length >> TARGET_PAGE_BITS);
1587 postcopy_discard_send_finish(ms, pds);
1588 if (ret) {
1589 return ret;
1590 }
1591 }
1592
1593 return 0;
1594 }
1595
1596 /*
1597 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
1598 * the two bitmaps, that are similar, but one is inverted.
1599 *
1600 * We search for runs of target-pages that don't start or end on a
1601 * host page boundary;
1602 * unsent_pass=true: Cleans up partially unsent host pages by searching
1603 * the unsentmap
1604 * unsent_pass=false: Cleans up partially dirty host pages by searching
1605 * the main migration bitmap
1606 *
1607 */
1608 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1609 RAMBlock *block,
1610 PostcopyDiscardState *pds)
1611 {
1612 unsigned long *bitmap;
1613 unsigned long *unsentmap;
1614 unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1615 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1616 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1617 unsigned long last = first + (len - 1);
1618 unsigned long run_start;
1619
1620 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1621 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1622
1623 if (unsent_pass) {
1624 /* Find a sent page */
1625 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1626 } else {
1627 /* Find a dirty page */
1628 run_start = find_next_bit(bitmap, last + 1, first);
1629 }
1630
1631 while (run_start <= last) {
1632 bool do_fixup = false;
1633 unsigned long fixup_start_addr;
1634 unsigned long host_offset;
1635
1636 /*
1637 * If the start of this run of pages is in the middle of a host
1638 * page, then we need to fixup this host page.
1639 */
1640 host_offset = run_start % host_ratio;
1641 if (host_offset) {
1642 do_fixup = true;
1643 run_start -= host_offset;
1644 fixup_start_addr = run_start;
1645 /* For the next pass */
1646 run_start = run_start + host_ratio;
1647 } else {
1648 /* Find the end of this run */
1649 unsigned long run_end;
1650 if (unsent_pass) {
1651 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1652 } else {
1653 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1654 }
1655 /*
1656 * If the end isn't at the start of a host page, then the
1657 * run doesn't finish at the end of a host page
1658 * and we need to discard.
1659 */
1660 host_offset = run_end % host_ratio;
1661 if (host_offset) {
1662 do_fixup = true;
1663 fixup_start_addr = run_end - host_offset;
1664 /*
1665 * This host page has gone, the next loop iteration starts
1666 * from after the fixup
1667 */
1668 run_start = fixup_start_addr + host_ratio;
1669 } else {
1670 /*
1671 * No discards on this iteration, next loop starts from
1672 * next sent/dirty page
1673 */
1674 run_start = run_end + 1;
1675 }
1676 }
1677
1678 if (do_fixup) {
1679 unsigned long page;
1680
1681 /* Tell the destination to discard this page */
1682 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1683 /* For the unsent_pass we:
1684 * discard partially sent pages
1685 * For the !unsent_pass (dirty) we:
1686 * discard partially dirty pages that were sent
1687 * (any partially sent pages were already discarded
1688 * by the previous unsent_pass)
1689 */
1690 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1691 host_ratio);
1692 }
1693
1694 /* Clean up the bitmap */
1695 for (page = fixup_start_addr;
1696 page < fixup_start_addr + host_ratio; page++) {
1697 /* All pages in this host page are now not sent */
1698 set_bit(page, unsentmap);
1699
1700 /*
1701 * Remark them as dirty, updating the count for any pages
1702 * that weren't previously dirty.
1703 */
1704 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1705 }
1706 }
1707
1708 if (unsent_pass) {
1709 /* Find the next sent page for the next iteration */
1710 run_start = find_next_zero_bit(unsentmap, last + 1,
1711 run_start);
1712 } else {
1713 /* Find the next dirty page for the next iteration */
1714 run_start = find_next_bit(bitmap, last + 1, run_start);
1715 }
1716 }
1717 }
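/* Illustrative example (the page sizes are an assumption): with a host_ratio
 * of 16 (64 KiB host pages, 4 KiB target pages), a run starting at target
 * page 18 begins in the middle of a host page (18 % 16 == 2), so a discard
 * is sent for the host page covering target pages 16..31 (unless it was
 * never sent at all) and all 16 pages are marked unsent and dirty again on
 * the source.
 */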
1718
1719 /*
1720 * Utility for the outgoing postcopy code.
1721 *
1722 * Discard any partially sent host-page size chunks, mark any partially
1723 * dirty host-page size chunks as all dirty.
1724 *
1725 * Returns: 0 on success
1726 */
1727 static int postcopy_chunk_hostpages(MigrationState *ms)
1728 {
1729 struct RAMBlock *block;
1730
1731 if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1732 /* Easy case - TPS==HPS - nothing to be done */
1733 return 0;
1734 }
1735
1736 /* Easiest way to make sure we don't resume in the middle of a host-page */
1737 last_seen_block = NULL;
1738 last_sent_block = NULL;
1739 last_offset = 0;
1740
1741 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1742 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1743
1744 PostcopyDiscardState *pds =
1745 postcopy_discard_send_init(ms, first, block->idstr);
1746
1747 /* First pass: Discard all partially sent host pages */
1748 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1749 /*
1750 * Second pass: Ensure that all partially dirty host pages are made
1751 * fully dirty.
1752 */
1753 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1754
1755 postcopy_discard_send_finish(ms, pds);
1756 } /* ram_list loop */
1757
1758 return 0;
1759 }
1760
1761 /*
 1762 * Transmit the set of pages to be discarded after precopy to the target;
 1763 * these are pages that:
1764 * a) Have been previously transmitted but are now dirty again
1765 * b) Pages that have never been transmitted, this ensures that
1766 * any pages on the destination that have been mapped by background
1767 * tasks get discarded (transparent huge pages is the specific concern)
1768 * Hopefully this is pretty sparse
1769 */
1770 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1771 {
1772 int ret;
1773 unsigned long *bitmap, *unsentmap;
1774
1775 rcu_read_lock();
1776
1777 /* This should be our last sync, the src is now paused */
1778 migration_bitmap_sync();
1779
1780 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1781 if (!unsentmap) {
 1782 /* We don't have a safe way to resize the unsentmap, so
1783 * if the bitmap was resized it will be NULL at this
1784 * point.
1785 */
1786 error_report("migration ram resized during precopy phase");
1787 rcu_read_unlock();
1788 return -EINVAL;
1789 }
1790
1791 /* Deal with TPS != HPS */
1792 ret = postcopy_chunk_hostpages(ms);
1793 if (ret) {
1794 rcu_read_unlock();
1795 return ret;
1796 }
1797
1798 /*
1799 * Update the unsentmap to be unsentmap = unsentmap | dirty
1800 */
1801 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1802 bitmap_or(unsentmap, unsentmap, bitmap,
1803 last_ram_offset() >> TARGET_PAGE_BITS);
1804
1805
1806 trace_ram_postcopy_send_discard_bitmap();
1807 #ifdef DEBUG_POSTCOPY
1808 ram_debug_dump_bitmap(unsentmap, true);
1809 #endif
1810
1811 ret = postcopy_each_ram_send_discard(ms);
1812 rcu_read_unlock();
1813
1814 return ret;
1815 }
1816
1817 /*
1818 * At the start of the postcopy phase of migration, any now-dirty
1819 * precopied pages are discarded.
1820 *
1821 * start, length describe a byte address range within the RAMBlock
1822 *
1823 * Returns 0 on success.
1824 */
1825 int ram_discard_range(MigrationIncomingState *mis,
1826 const char *block_name,
1827 uint64_t start, size_t length)
1828 {
1829 int ret = -1;
1830
1831 rcu_read_lock();
1832 RAMBlock *rb = qemu_ram_block_by_name(block_name);
1833
1834 if (!rb) {
1835 error_report("ram_discard_range: Failed to find block '%s'",
1836 block_name);
1837 goto err;
1838 }
1839
1840 uint8_t *host_startaddr = rb->host + start;
1841
1842 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1843 error_report("ram_discard_range: Unaligned start address: %p",
1844 host_startaddr);
1845 goto err;
1846 }
1847
1848 if ((start + length) <= rb->used_length) {
1849 uint8_t *host_endaddr = host_startaddr + length;
1850 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1851 error_report("ram_discard_range: Unaligned end address: %p",
1852 host_endaddr);
1853 goto err;
1854 }
1855 ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1856 } else {
1857 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1858 "/%zx/" RAM_ADDR_FMT")",
1859 block_name, start, length, rb->used_length);
1860 }
1861
1862 err:
1863 rcu_read_unlock();
1864
1865 return ret;
1866 }
1867
1868 static int ram_save_init_globals(void)
1869 {
1870 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1871
1872 dirty_rate_high_cnt = 0;
1873 bitmap_sync_count = 0;
1874 migration_bitmap_sync_init();
1875 qemu_mutex_init(&migration_bitmap_mutex);
1876
1877 if (migrate_use_xbzrle()) {
1878 XBZRLE_cache_lock();
1879 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1880 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1881 TARGET_PAGE_SIZE,
1882 TARGET_PAGE_SIZE);
1883 if (!XBZRLE.cache) {
1884 XBZRLE_cache_unlock();
1885 error_report("Error creating cache");
1886 return -1;
1887 }
1888 XBZRLE_cache_unlock();
1889
1890 /* We prefer not to abort if there is no memory */
1891 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1892 if (!XBZRLE.encoded_buf) {
1893 error_report("Error allocating encoded_buf");
1894 return -1;
1895 }
1896
1897 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1898 if (!XBZRLE.current_buf) {
1899 error_report("Error allocating current_buf");
1900 g_free(XBZRLE.encoded_buf);
1901 XBZRLE.encoded_buf = NULL;
1902 return -1;
1903 }
1904
1905 acct_clear();
1906 }
1907
1908 /* For memory_global_dirty_log_start below. */
1909 qemu_mutex_lock_iothread();
1910
1911 qemu_mutex_lock_ramlist();
1912 rcu_read_lock();
1913 bytes_transferred = 0;
1914 reset_ram_globals();
1915
1916 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1917 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1918 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1919 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1920
1921 if (migrate_postcopy_ram()) {
1922 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1923 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1924 }
1925
1926 /*
1927 * Count the total number of pages used by ram blocks not including any
1928 * gaps due to alignment or unplugs.
1929 */
1930 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1931
1932 memory_global_dirty_log_start();
1933 migration_bitmap_sync();
1934 qemu_mutex_unlock_ramlist();
1935 qemu_mutex_unlock_iothread();
1936 rcu_read_unlock();
1937
1938 return 0;
1939 }
1940
1941 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1942 * long-running RCU critical section. When RCU reclaims in the code
1943 * start to become numerous, it will be necessary to reduce the
1944 * granularity of these critical sections.
1945 */
1946
1947 static int ram_save_setup(QEMUFile *f, void *opaque)
1948 {
1949 RAMBlock *block;
1950
1951 /* In COLO state, migration has already set up the bitmap; reuse it. */
1952 if (!migration_in_colo_state()) {
1953 if (ram_save_init_globals() < 0) {
1954 return -1;
1955 }
1956 }
1957
1958 rcu_read_lock();
1959
1960 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
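/*
 * Setup-stage stream layout: a be64 carrying the total RAM size with
 * RAM_SAVE_FLAG_MEM_SIZE set, then (idstr length, idstr, used_length)
 * for every RAMBlock, and finally a RAM_SAVE_FLAG_EOS marker.
 */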
1961
1962 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1963 qemu_put_byte(f, strlen(block->idstr));
1964 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1965 qemu_put_be64(f, block->used_length);
1966 }
1967
1968 rcu_read_unlock();
1969
1970 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1971 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1972
1973 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1974
1975 return 0;
1976 }
1977
1978 static int ram_save_iterate(QEMUFile *f, void *opaque)
1979 {
1980 int ret;
1981 int i;
1982 int64_t t0;
1983 int done = 0;
1984
1985 rcu_read_lock();
1986 if (ram_list.version != last_version) {
1987 reset_ram_globals();
1988 }
1989
1990 /* Read version before ram_list.blocks */
1991 smp_rmb();
1992
1993 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1994
1995 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1996 i = 0;
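/*
 * Send dirty pages until the rate limit for this cycle is reached, no more
 * dirty pages are found, or the loop has run for more than MAX_WAIT ms.
 */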
1997 while ((ret = qemu_file_rate_limit(f)) == 0) {
1998 int pages;
1999
2000 pages = ram_find_and_save_block(f, false, &bytes_transferred);
2001 /* no more pages to send */
2002 if (pages == 0) {
2003 done = 1;
2004 break;
2005 }
2006 acct_info.iterations++;
2007
2008 /* We want to check in the first loop iteration, just in case it was
2009 the first time and we had to sync the dirty bitmap.
2010 qemu_clock_get_ns() is a bit expensive, so we only check once every
2011 few iterations.
2012 */
2013 if ((i & 63) == 0) {
2014 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2015 if (t1 > MAX_WAIT) {
2016 trace_ram_save_iterate_big_wait(t1, i);
2017 break;
2018 }
2019 }
2020 i++;
2021 }
2022 flush_compressed_data(f);
2023 rcu_read_unlock();
2024
2025 /*
2026 * Must occur before EOS (or any QEMUFile operation)
2027 * because of RDMA protocol.
2028 */
2029 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2030
2031 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2032 bytes_transferred += 8;
2033
2034 ret = qemu_file_get_error(f);
2035 if (ret < 0) {
2036 return ret;
2037 }
2038
2039 return done;
2040 }
2041
2042 /* Called with iothread lock */
2043 static int ram_save_complete(QEMUFile *f, void *opaque)
2044 {
2045 rcu_read_lock();
2046
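/*
 * In postcopy the final sync already happened in
 * ram_postcopy_send_discard_bitmap() while the source was paused, so only
 * the precopy completion path needs one more sync here.
 */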
2047 if (!migration_in_postcopy(migrate_get_current())) {
2048 migration_bitmap_sync();
2049 }
2050
2051 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2052
2053 /* try transferring iterative blocks of memory */
2054
2055 /* flush all remaining blocks regardless of rate limiting */
2056 while (true) {
2057 int pages;
2058
2059 pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2060 &bytes_transferred);
2061 /* no more blocks to send */
2062 if (pages == 0) {
2063 break;
2064 }
2065 }
2066
2067 flush_compressed_data(f);
2068 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2069
2070 rcu_read_unlock();
2071
2072 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2073
2074 return 0;
2075 }
2076
2077 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2078 uint64_t *non_postcopiable_pending,
2079 uint64_t *postcopiable_pending)
2080 {
2081 uint64_t remaining_size;
2082
2083 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2084
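/*
 * If what we think remains already fits under max_size, re-sync the dirty
 * bitmap (under the iothread lock) so the caller decides on an up-to-date
 * estimate rather than a stale one.
 */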
2085 if (!migration_in_postcopy(migrate_get_current()) &&
2086 remaining_size < max_size) {
2087 qemu_mutex_lock_iothread();
2088 rcu_read_lock();
2089 migration_bitmap_sync();
2090 rcu_read_unlock();
2091 qemu_mutex_unlock_iothread();
2092 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2093 }
2094
2095 /* We can do postcopy, and all the data is postcopiable */
2096 *postcopiable_pending += remaining_size;
2097 }
2098
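/*
 * load_xbzrle: read and apply one XBZRLE-encoded page.
 *
 * Wire format: a one-byte ENCODING_FLAG_XBZRLE marker, a be16 length of the
 * encoded data (at most TARGET_PAGE_SIZE), and then the XBZRLE delta, which
 * is decoded against the current contents of @host.
 *
 * Returns 0 on success, -1 on a malformed page.
 */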
2099 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2100 {
2101 unsigned int xh_len;
2102 int xh_flags;
2103 uint8_t *loaded_data;
2104
2105 if (!xbzrle_decoded_buf) {
2106 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2107 }
2108 loaded_data = xbzrle_decoded_buf;
2109
2110 /* extract RLE header */
2111 xh_flags = qemu_get_byte(f);
2112 xh_len = qemu_get_be16(f);
2113
2114 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2115 error_report("Failed to load XBZRLE page - wrong compression!");
2116 return -1;
2117 }
2118
2119 if (xh_len > TARGET_PAGE_SIZE) {
2120 error_report("Failed to load XBZRLE page - len overflow!");
2121 return -1;
2122 }
2123 /* load data and decode */
2124 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2125
2126 /* decode RLE */
2127 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2128 TARGET_PAGE_SIZE) == -1) {
2129 error_report("Failed to load XBZRLE page - decode error!");
2130 return -1;
2131 }
2132
2133 return 0;
2134 }
2135
2136 /*
2137 * Read a RAMBlock ID from the stream f.
2138 *
2139 * Must be called from within an RCU critical section.
2140 * Returns a pointer from within the RCU-protected ram_list.
2141 *
2142 * f: Stream to read from
2143 * flags: Page flags (mostly to see if it's a continuation of a previous block)
2144 */
2145 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2146 int flags)
2147 {
2148 static RAMBlock *block = NULL;
2149 char id[256];
2150 uint8_t len;
2151
2152 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2153 if (!block) {
2154 error_report("Ack, bad migration stream!");
2155 return NULL;
2156 }
2157 return block;
2158 }
2159
2160 len = qemu_get_byte(f);
2161 qemu_get_buffer(f, (uint8_t *)id, len);
2162 id[len] = 0;
2163
2164 block = qemu_ram_block_by_name(id);
2165 if (!block) {
2166 error_report("Can't find block %s", id);
2167 return NULL;
2168 }
2169
2170 return block;
2171 }
2172
2173 static inline void *host_from_ram_block_offset(RAMBlock *block,
2174 ram_addr_t offset)
2175 {
2176 if (!offset_in_ramblock(block, offset)) {
2177 return NULL;
2178 }
2179
2180 return block->host + offset;
2181 }
2182
2183 /*
2184 * If a page (or a whole RDMA chunk) has been
2185 * determined to be zero, then zap it.
2186 */
2187 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2188 {
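/* Only write when something actually changes: memset()ing an already-zero
 * page would needlessly dirty it (and force allocation on the destination). */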
2189 if (ch != 0 || !is_zero_range(host, size)) {
2190 memset(host, ch, size);
2191 }
2192 }
2193
2194 static void *do_data_decompress(void *opaque)
2195 {
2196 DecompressParam *param = opaque;
2197 unsigned long pagesize;
2198 uint8_t *des;
2199 int len;
2200
2201 qemu_mutex_lock(&param->mutex);
2202 while (!param->quit) {
2203 if (param->des) {
2204 des = param->des;
2205 len = param->len;
2206 param->des = 0;
2207 qemu_mutex_unlock(&param->mutex);
2208
2209 pagesize = TARGET_PAGE_SIZE;
2210 /* uncompress() can fail in some cases, especially when the
2211 * page was dirtied while it was being compressed. That is not
2212 * a problem, because the dirty page will be retransferred and
2213 * uncompress() won't corrupt the data in other pages.
2214 */
2215 uncompress((Bytef *)des, &pagesize,
2216 (const Bytef *)param->compbuf, len);
2217
2218 qemu_mutex_lock(&decomp_done_lock);
2219 param->done = true;
2220 qemu_cond_signal(&decomp_done_cond);
2221 qemu_mutex_unlock(&decomp_done_lock);
2222
2223 qemu_mutex_lock(&param->mutex);
2224 } else {
2225 qemu_cond_wait(&param->cond, &param->mutex);
2226 }
2227 }
2228 qemu_mutex_unlock(&param->mutex);
2229
2230 return NULL;
2231 }
2232
2233 static void wait_for_decompress_done(void)
2234 {
2235 int idx, thread_count;
2236
2237 if (!migrate_use_compression()) {
2238 return;
2239 }
2240
2241 thread_count = migrate_decompress_threads();
2242 qemu_mutex_lock(&decomp_done_lock);
2243 for (idx = 0; idx < thread_count; idx++) {
2244 while (!decomp_param[idx].done) {
2245 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2246 }
2247 }
2248 qemu_mutex_unlock(&decomp_done_lock);
2249 }
2250
2251 void migrate_decompress_threads_create(void)
2252 {
2253 int i, thread_count;
2254
2255 thread_count = migrate_decompress_threads();
2256 decompress_threads = g_new0(QemuThread, thread_count);
2257 decomp_param = g_new0(DecompressParam, thread_count);
2258 qemu_mutex_init(&decomp_done_lock);
2259 qemu_cond_init(&decomp_done_cond);
2260 for (i = 0; i < thread_count; i++) {
2261 qemu_mutex_init(&decomp_param[i].mutex);
2262 qemu_cond_init(&decomp_param[i].cond);
2263 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2264 decomp_param[i].done = true;
2265 decomp_param[i].quit = false;
2266 qemu_thread_create(decompress_threads + i, "decompress",
2267 do_data_decompress, decomp_param + i,
2268 QEMU_THREAD_JOINABLE);
2269 }
2270 }
2271
2272 void migrate_decompress_threads_join(void)
2273 {
2274 int i, thread_count;
2275
2276 thread_count = migrate_decompress_threads();
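/* Two phases: first wake every worker with quit set, then join each
 * thread and free its resources. */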
2277 for (i = 0; i < thread_count; i++) {
2278 qemu_mutex_lock(&decomp_param[i].mutex);
2279 decomp_param[i].quit = true;
2280 qemu_cond_signal(&decomp_param[i].cond);
2281 qemu_mutex_unlock(&decomp_param[i].mutex);
2282 }
2283 for (i = 0; i < thread_count; i++) {
2284 qemu_thread_join(decompress_threads + i);
2285 qemu_mutex_destroy(&decomp_param[i].mutex);
2286 qemu_cond_destroy(&decomp_param[i].cond);
2287 g_free(decomp_param[i].compbuf);
2288 }
2289 g_free(decompress_threads);
2290 g_free(decomp_param);
2291 decompress_threads = NULL;
2292 decomp_param = NULL;
2293 }
2294
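/*
 * Hand one compressed page to an idle decompression thread: find a worker
 * with done == true, copy the compressed data into its compbuf, point it at
 * the destination page and signal its condition variable. If every worker is
 * busy, wait on decomp_done_cond until one finishes.
 */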
2295 static void decompress_data_with_multi_threads(QEMUFile *f,
2296 void *host, int len)
2297 {
2298 int idx, thread_count;
2299
2300 thread_count = migrate_decompress_threads();
2301 qemu_mutex_lock(&decomp_done_lock);
2302 while (true) {
2303 for (idx = 0; idx < thread_count; idx++) {
2304 if (decomp_param[idx].done) {
2305 decomp_param[idx].done = false;
2306 qemu_mutex_lock(&decomp_param[idx].mutex);
2307 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2308 decomp_param[idx].des = host;
2309 decomp_param[idx].len = len;
2310 qemu_cond_signal(&decomp_param[idx].cond);
2311 qemu_mutex_unlock(&decomp_param[idx].mutex);
2312 break;
2313 }
2314 }
2315 if (idx < thread_count) {
2316 break;
2317 } else {
2318 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2319 }
2320 }
2321 qemu_mutex_unlock(&decomp_done_lock);
2322 }
2323
2324 /*
2325 * Allocate data structures etc needed by incoming migration with postcopy-ram.
2326 * postcopy-ram's similarly named postcopy_ram_incoming_init() does the work.
2327 */
2328 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2329 {
2330 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2331
2332 return postcopy_ram_incoming_init(mis, ram_pages);
2333 }
2334
2335 /*
2336 * Called in postcopy mode by ram_load().
2337 * rcu_read_lock is taken prior to this being called.
2338 */
2339 static int ram_load_postcopy(QEMUFile *f)
2340 {
2341 int flags = 0, ret = 0;
2342 bool place_needed = false;
2343 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2344 MigrationIncomingState *mis = migration_incoming_get_current();
2345 /* Temporary page that is later 'placed' */
2346 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2347 void *last_host = NULL;
2348 bool all_zero = false;
2349
2350 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2351 ram_addr_t addr;
2352 void *host = NULL;
2353 void *page_buffer = NULL;
2354 void *place_source = NULL;
2355 uint8_t ch;
2356
2357 addr = qemu_get_be64(f);
2358 flags = addr & ~TARGET_PAGE_MASK;
2359 addr &= TARGET_PAGE_MASK;
2360
2361 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2362 place_needed = false;
2363 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2364 RAMBlock *block = ram_block_from_stream(f, flags);
2365
2366 host = host_from_ram_block_offset(block, addr);
2367 if (!host) {
2368 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2369 ret = -EINVAL;
2370 break;
2371 }
2372 /*
2373 * Postcopy requires that we place whole host pages atomically.
2374 * To make it atomic, the data is read into a temporary page
2375 * that's moved into place later.
2376 * The migration protocol uses (possibly smaller) target pages;
2377 * however, the source ensures it always sends all the components
2378 * of a host page in order.
2379 */
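/* For example, a 4 KiB target page size with a 64 KiB host page size means
 * 16 consecutive target pages are gathered into postcopy_host_page before
 * the host page is placed in one go. */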
2380 page_buffer = postcopy_host_page +
2381 ((uintptr_t)host & ~qemu_host_page_mask);
2382 /* If all TP are zero then we can optimise the place */
2383 if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2384 all_zero = true;
2385 } else {
2386 /* not the 1st TP within the HP */
2387 if (host != (last_host + TARGET_PAGE_SIZE)) {
2388 error_report("Non-sequential target page %p/%p",
2389 host, last_host);
2390 ret = -EINVAL;
2391 break;
2392 }
2393 }
2394
2395
2396 /*
2397 * If it's the last part of a host page then we place the host
2398 * page
2399 */
2400 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2401 ~qemu_host_page_mask) == 0;
2402 place_source = postcopy_host_page;
2403 }
2404 last_host = host;
2405
2406 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2407 case RAM_SAVE_FLAG_COMPRESS:
2408 ch = qemu_get_byte(f);
2409 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2410 if (ch) {
2411 all_zero = false;
2412 }
2413 break;
2414
2415 case RAM_SAVE_FLAG_PAGE:
2416 all_zero = false;
2417 if (!place_needed || !matching_page_sizes) {
2418 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2419 } else {
2420 /* Avoid an extra copy out of the qemu_file buffer; postcopy
2421 * is going to copy the page into place later anyway. This only
2422 * works when the read can be done in one go (matching page sizes).
2423 */
2424 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2425 TARGET_PAGE_SIZE);
2426 }
2427 break;
2428 case RAM_SAVE_FLAG_EOS:
2429 /* normal exit */
2430 break;
2431 default:
2432 error_report("Unknown combination of migration flags: %#x"
2433 " (postcopy mode)", flags);
2434 ret = -EINVAL;
2435 }
2436
2437 if (place_needed) {
2438 /* This gets called at the last target page in the host page */
2439 if (all_zero) {
2440 ret = postcopy_place_page_zero(mis,
2441 host + TARGET_PAGE_SIZE -
2442 qemu_host_page_size);
2443 } else {
2444 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2445 qemu_host_page_size,
2446 place_source);
2447 }
2448 }
2449 if (!ret) {
2450 ret = qemu_file_get_error(f);
2451 }
2452 }
2453
2454 return ret;
2455 }
2456
2457 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2458 {
2459 int flags = 0, ret = 0;
2460 static uint64_t seq_iter;
2461 int len = 0;
2462 /*
2463 * If the system is running in postcopy mode, page inserts into host
2464 * memory must be atomic.
2465 */
2466 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2467
2468 seq_iter++;
2469
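/* The only stream format produced and accepted by this code is version 4
 * (see register_savevm_live() in ram_mig_init()). */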
2470 if (version_id != 4) {
2471 ret = -EINVAL;
2472 }
2473
2474 /* This RCU critical section can be very long running.
2475 * When RCU reclaims in the code start to become numerous,
2476 * it will be necessary to reduce the granularity of this
2477 * critical section.
2478 */
2479 rcu_read_lock();
2480
2481 if (postcopy_running) {
2482 ret = ram_load_postcopy(f);
2483 }
2484
2485 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2486 ram_addr_t addr, total_ram_bytes;
2487 void *host = NULL;
2488 uint8_t ch;
2489
2490 addr = qemu_get_be64(f);
2491 flags = addr & ~TARGET_PAGE_MASK;
2492 addr &= TARGET_PAGE_MASK;
2493
2494 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2495 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2496 RAMBlock *block = ram_block_from_stream(f, flags);
2497
2498 host = host_from_ram_block_offset(block, addr);
2499 if (!host) {
2500 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2501 ret = -EINVAL;
2502 break;
2503 }
2504 }
2505
2506 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2507 case RAM_SAVE_FLAG_MEM_SIZE:
2508 /* Synchronize RAM block list */
2509 total_ram_bytes = addr;
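/* Walk the per-block records written by ram_save_setup(): a length
 * byte, the block idstr, then a be64 used_length to check (and, if
 * needed, resize) against the local RAMBlock. */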
2510 while (!ret && total_ram_bytes) {
2511 RAMBlock *block;
2512 char id[256];
2513 ram_addr_t length;
2514
2515 len = qemu_get_byte(f);
2516 qemu_get_buffer(f, (uint8_t *)id, len);
2517 id[len] = 0;
2518 length = qemu_get_be64(f);
2519
2520 block = qemu_ram_block_by_name(id);
2521 if (block) {
2522 if (length != block->used_length) {
2523 Error *local_err = NULL;
2524
2525 ret = qemu_ram_resize(block, length,
2526 &local_err);
2527 if (local_err) {
2528 error_report_err(local_err);
2529 }
2530 }
2531 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2532 block->idstr);
2533 } else {
2534 error_report("Unknown ramblock \"%s\", cannot "
2535 "accept migration", id);
2536 ret = -EINVAL;
2537 }
2538
2539 total_ram_bytes -= length;
2540 }
2541 break;
2542
2543 case RAM_SAVE_FLAG_COMPRESS:
2544 ch = qemu_get_byte(f);
2545 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2546 break;
2547
2548 case RAM_SAVE_FLAG_PAGE:
2549 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2550 break;
2551
2552 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2553 len = qemu_get_be32(f);
2554 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2555 error_report("Invalid compressed data length: %d", len);
2556 ret = -EINVAL;
2557 break;
2558 }
2559 decompress_data_with_multi_threads(f, host, len);
2560 break;
2561
2562 case RAM_SAVE_FLAG_XBZRLE:
2563 if (load_xbzrle(f, addr, host) < 0) {
2564 error_report("Failed to decompress XBZRLE page at "
2565 RAM_ADDR_FMT, addr);
2566 ret = -EINVAL;
2567 break;
2568 }
2569 break;
2570 case RAM_SAVE_FLAG_EOS:
2571 /* normal exit */
2572 break;
2573 default:
2574 if (flags & RAM_SAVE_FLAG_HOOK) {
2575 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2576 } else {
2577 error_report("Unknown combination of migration flags: %#x",
2578 flags);
2579 ret = -EINVAL;
2580 }
2581 }
2582 if (!ret) {
2583 ret = qemu_file_get_error(f);
2584 }
2585 }
2586
2587 wait_for_decompress_done();
2588 rcu_read_unlock();
2589 trace_ram_load_complete(ret, seq_iter);
2590 return ret;
2591 }
2592
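/*
 * RAM is registered as a live-savevm section: setup writes the block list,
 * iterate runs while the guest keeps executing, complete drains whatever is
 * left (both the precopy and postcopy completion paths end up here), and
 * ram_load handles the destination side.
 */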
2593 static SaveVMHandlers savevm_ram_handlers = {
2594 .save_live_setup = ram_save_setup,
2595 .save_live_iterate = ram_save_iterate,
2596 .save_live_complete_postcopy = ram_save_complete,
2597 .save_live_complete_precopy = ram_save_complete,
2598 .save_live_pending = ram_save_pending,
2599 .load_state = ram_load,
2600 .cleanup = ram_migration_cleanup,
2601 };
2602
2603 void ram_mig_init(void)
2604 {
2605 qemu_mutex_init(&XBZRLE.lock);
2606 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2607 }