git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
multifd: Rename "size" member to pages_alloc
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60
61 /***********************************************************/
62 /* ram save/restore */
63
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
65 * worked for pages that were filled with the same char. We switched
66 * it to only search for the zero value. And to avoid confusion with
67 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
68 */
69
70 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO 0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE 0x08
74 #define RAM_SAVE_FLAG_EOS 0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE 0x40
77 /* 0x80 is reserved in migration.h start with 0x100 next */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
79
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82 return buffer_is_zero(p, size);
83 }
84
85 XBZRLECacheStats xbzrle_counters;
86
87 /* struct contains XBZRLE cache and a static page
88 used by the compression */
89 static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
101 } XBZRLE;
102
103 static void XBZRLE_cache_lock(void)
104 {
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114
115 /**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in the main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by XBZRLE.lock().
122 *
123 * Returns 0 for success or -1 for error
124 *
125 * @new_size: new cache size
126 * @errp: set *errp if the check failed, with reason
127 */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130 PageCache *new_cache;
131 int64_t ret = 0;
132
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
142 return 0;
143 }
144
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149 if (!new_cache) {
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
157 out:
158 XBZRLE_cache_unlock();
159 return ret;
160 }
161
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164 return !qemu_ram_is_migratable(block) ||
165 (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
170 INTERNAL_RAMBLOCK_FOREACH(block) \
171 if (ramblock_is_ignored(block)) {} else
172
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
174 INTERNAL_RAMBLOCK_FOREACH(block) \
175 if (!qemu_ram_is_migratable(block)) {} else
176
177 #undef RAMBLOCK_FOREACH
178
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181 RAMBlock *block;
182 int ret = 0;
183
184 rcu_read_lock();
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
191 rcu_read_unlock();
192 return ret;
193 }
194
195 static void ramblock_recv_map_init(void)
196 {
197 RAMBlock *rb;
198
199 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200 assert(!rb->receivedmap);
201 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 }
203 }
204
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 rb->receivedmap);
209 }
210
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222 size_t nr)
223 {
224 bitmap_set_atomic(rb->receivedmap,
225 ramblock_recv_bitmap_offset(host_addr, rb),
226 nr);
227 }
228
229 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231 /*
232 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233 *
234 * Returns >0 if success with sent bytes, or <0 if error.
235 */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237 const char *block_name)
238 {
239 RAMBlock *block = qemu_ram_block_by_name(block_name);
240 unsigned long *le_bitmap, nbits;
241 uint64_t size;
242
243 if (!block) {
244 error_report("%s: invalid block name: %s", __func__, block_name);
245 return -1;
246 }
247
248 nbits = block->used_length >> TARGET_PAGE_BITS;
249
250 /*
251 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252 * machines we may need 4 more bytes for padding (see below
253 * comment), so extend it a bit beforehand.
254 */
255 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256
257 /*
258 * Always use little endian when sending the bitmap. This is
259 * required when the source and destination VMs are not using the
260 * same endianness. (Note: big endian won't work.)
261 */
262 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264 /* Size of the bitmap, in bytes */
265 size = DIV_ROUND_UP(nbits, 8);
266
267 /*
268 * size is always aligned to 8 bytes for 64bit machines, but it
269 * may not be true for 32bit machines. We need this padding to
270 * make sure the migration can survive even between 32bit and
271 * 64bit machines.
272 */
273 size = ROUND_UP(size, 8);
274
275 qemu_put_be64(file, size);
276 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277 /*
278 * Mark the end of the stream, in case the middle part is corrupted
279 * due to some mysterious reason.
280 */
281 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282 qemu_fflush(file);
283
284 g_free(le_bitmap);
285
286 if (qemu_file_get_error(file)) {
287 return qemu_file_get_error(file);
288 }
289
290 return size + sizeof(size);
291 }
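
/*
 * For reference, the stream produced by ramblock_recv_bitmap_send() above
 * looks like this on the wire (a sketch derived from the code, not a
 * separate format definition):
 *
 *   be64  size                      bitmap length in bytes, rounded up to 8
 *   size  bytes of le_bitmap        receivedmap in little-endian bit order
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * The returned byte count (size + sizeof(size)) does not include the
 * trailing ending marker.
 */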
292
293 /*
294 * An outstanding page request, on the source, having been received
295 * and queued
296 */
297 struct RAMSrcPageRequest {
298 RAMBlock *rb;
299 hwaddr offset;
300 hwaddr len;
301
302 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304
305 /* State of RAM for migration */
306 struct RAMState {
307 /* QEMUFile used for this migration */
308 QEMUFile *f;
309 /* Last block that we have visited searching for dirty pages */
310 RAMBlock *last_seen_block;
311 /* Last block from where we have sent data */
312 RAMBlock *last_sent_block;
313 /* Last dirty target page we have sent */
314 ram_addr_t last_page;
315 /* last ram version we have seen */
316 uint32_t last_version;
317 /* We are in the first round */
318 bool ram_bulk_stage;
319 /* The free page optimization is enabled */
320 bool fpo_enabled;
321 /* How many times we have dirty too many pages */
322 int dirty_rate_high_cnt;
323 /* these variables are used for bitmap sync */
324 /* last time we did a full bitmap_sync */
325 int64_t time_last_bitmap_sync;
326 /* bytes transferred at start_time */
327 uint64_t bytes_xfer_prev;
328 /* number of dirty pages since start_time */
329 uint64_t num_dirty_pages_period;
330 /* xbzrle misses since the beginning of the period */
331 uint64_t xbzrle_cache_miss_prev;
332
333 /* compression statistics since the beginning of the period */
334 /* number of times there was no free thread to compress data */
335 uint64_t compress_thread_busy_prev;
336 /* number of bytes after compression */
337 uint64_t compressed_size_prev;
338 /* amount of compressed pages */
339 uint64_t compress_pages_prev;
340
341 /* total handled target pages at the beginning of period */
342 uint64_t target_page_count_prev;
343 /* total handled target pages since start */
344 uint64_t target_page_count;
345 /* number of dirty bits in the bitmap */
346 uint64_t migration_dirty_pages;
347 /* Protects modification of the bitmap and migration dirty pages */
348 QemuMutex bitmap_mutex;
349 /* The RAMBlock used in the last src_page_requests */
350 RAMBlock *last_req_rb;
351 /* Queue of outstanding page requests from the destination */
352 QemuMutex src_page_req_mutex;
353 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356
357 static RAMState *ram_state;
358
359 static NotifierWithReturnList precopy_notifier_list;
360
361 void precopy_infrastructure_init(void)
362 {
363 notifier_with_return_list_init(&precopy_notifier_list);
364 }
365
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368 notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373 notifier_with_return_remove(n);
374 }
375
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378 PrecopyNotifyData pnd;
379 pnd.reason = reason;
380 pnd.errp = errp;
381
382 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384
385 void precopy_enable_free_page_optimization(void)
386 {
387 if (!ram_state) {
388 return;
389 }
390
391 ram_state->fpo_enabled = true;
392 }
393
394 uint64_t ram_bytes_remaining(void)
395 {
396 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397 0;
398 }
399
400 MigrationStats ram_counters;
401
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404 /* Current block being searched */
405 RAMBlock *block;
406 /* Current page to search from */
407 unsigned long page;
408 /* Set once we wrap around */
409 bool complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412
413 CompressionStats compression_counters;
414
415 struct CompressParam {
416 bool done;
417 bool quit;
418 bool zero_page;
419 QEMUFile *file;
420 QemuMutex mutex;
421 QemuCond cond;
422 RAMBlock *block;
423 ram_addr_t offset;
424
425 /* internally used fields */
426 z_stream stream;
427 uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430
431 struct DecompressParam {
432 bool done;
433 bool quit;
434 QemuMutex mutex;
435 QemuCond cond;
436 void *des;
437 uint8_t *compbuf;
438 int len;
439 z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446 * one of the compression threads has finished the compression.
447 * comp_done_lock is used together with comp_done_cond.
448 */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461 ram_addr_t offset, uint8_t *source_buf);
462
463 static void *do_data_compress(void *opaque)
464 {
465 CompressParam *param = opaque;
466 RAMBlock *block;
467 ram_addr_t offset;
468 bool zero_page;
469
470 qemu_mutex_lock(&param->mutex);
471 while (!param->quit) {
472 if (param->block) {
473 block = param->block;
474 offset = param->offset;
475 param->block = NULL;
476 qemu_mutex_unlock(&param->mutex);
477
478 zero_page = do_compress_ram_page(param->file, &param->stream,
479 block, offset, param->originbuf);
480
481 qemu_mutex_lock(&comp_done_lock);
482 param->done = true;
483 param->zero_page = zero_page;
484 qemu_cond_signal(&comp_done_cond);
485 qemu_mutex_unlock(&comp_done_lock);
486
487 qemu_mutex_lock(&param->mutex);
488 } else {
489 qemu_cond_wait(&param->cond, &param->mutex);
490 }
491 }
492 qemu_mutex_unlock(&param->mutex);
493
494 return NULL;
495 }
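
/*
 * Rough picture of the handshake between the migration thread and one
 * compression worker, as implemented by do_data_compress() above and
 * flush_compressed_data() further down; the actual dispatch of pages to
 * idle workers happens later in this file:
 *
 *   migration thread                      worker thread
 *   ----------------                      -------------
 *   lock param->mutex                     wait on param->cond
 *   set param->block / param->offset
 *   signal param->cond, unlock            compress the page into param->file
 *                                         lock comp_done_lock
 *                                         param->done = true
 *                                         signal comp_done_cond, unlock
 *   wait on comp_done_cond (e.g. in
 *   flush_compressed_data())
 */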
496
497 static void compress_threads_save_cleanup(void)
498 {
499 int i, thread_count;
500
501 if (!migrate_use_compression() || !comp_param) {
502 return;
503 }
504
505 thread_count = migrate_compress_threads();
506 for (i = 0; i < thread_count; i++) {
507 /*
508 * we use it as an indicator of whether the thread is
509 * properly initialized or not
510 */
511 if (!comp_param[i].file) {
512 break;
513 }
514
515 qemu_mutex_lock(&comp_param[i].mutex);
516 comp_param[i].quit = true;
517 qemu_cond_signal(&comp_param[i].cond);
518 qemu_mutex_unlock(&comp_param[i].mutex);
519
520 qemu_thread_join(compress_threads + i);
521 qemu_mutex_destroy(&comp_param[i].mutex);
522 qemu_cond_destroy(&comp_param[i].cond);
523 deflateEnd(&comp_param[i].stream);
524 g_free(comp_param[i].originbuf);
525 qemu_fclose(comp_param[i].file);
526 comp_param[i].file = NULL;
527 }
528 qemu_mutex_destroy(&comp_done_lock);
529 qemu_cond_destroy(&comp_done_cond);
530 g_free(compress_threads);
531 g_free(comp_param);
532 compress_threads = NULL;
533 comp_param = NULL;
534 }
535
536 static int compress_threads_save_setup(void)
537 {
538 int i, thread_count;
539
540 if (!migrate_use_compression()) {
541 return 0;
542 }
543 thread_count = migrate_compress_threads();
544 compress_threads = g_new0(QemuThread, thread_count);
545 comp_param = g_new0(CompressParam, thread_count);
546 qemu_cond_init(&comp_done_cond);
547 qemu_mutex_init(&comp_done_lock);
548 for (i = 0; i < thread_count; i++) {
549 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550 if (!comp_param[i].originbuf) {
551 goto exit;
552 }
553
554 if (deflateInit(&comp_param[i].stream,
555 migrate_compress_level()) != Z_OK) {
556 g_free(comp_param[i].originbuf);
557 goto exit;
558 }
559
560 /* comp_param[i].file is just used as a dummy buffer to save data,
561 * set its ops to empty.
562 */
563 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564 comp_param[i].done = true;
565 comp_param[i].quit = false;
566 qemu_mutex_init(&comp_param[i].mutex);
567 qemu_cond_init(&comp_param[i].cond);
568 qemu_thread_create(compress_threads + i, "compress",
569 do_data_compress, comp_param + i,
570 QEMU_THREAD_JOINABLE);
571 }
572 return 0;
573
574 exit:
575 compress_threads_save_cleanup();
576 return -1;
577 }
578
579 /* Multiple fd's */
580
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585
586 typedef struct {
587 uint32_t magic;
588 uint32_t version;
589 unsigned char uuid[16]; /* QemuUUID */
590 uint8_t id;
591 } __attribute__((packed)) MultiFDInit_t;
592
593 typedef struct {
594 uint32_t magic;
595 uint32_t version;
596 uint32_t flags;
597 /* maximum number of allocated pages */
598 uint32_t pages_alloc;
599 uint32_t pages_used;
600 uint64_t packet_num;
601 char ramblock[256];
602 uint64_t offset[];
603 } __attribute__((packed)) MultiFDPacket_t;
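
/*
 * On the wire a MultiFDPacket_t is sent exactly as laid out above, with
 * every multi-byte integer converted to big endian by
 * multifd_send_fill_packet() below: magic, version, flags, pages_alloc and
 * pages_used as be32, packet_num and each offset[] entry as be64.  The
 * ramblock name travels as a fixed 256-byte string (the receiver forces
 * NUL termination).  offset[] carries pages_used entries, while the packet
 * buffer itself is always sized for migrate_multifd_page_count() entries.
 */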
604
605 typedef struct {
606 /* number of used pages */
607 uint32_t used;
608 /* number of allocated pages */
609 uint32_t allocated;
610 /* global number of generated multifd packets */
611 uint64_t packet_num;
612 /* offset of each page */
613 ram_addr_t *offset;
614 /* pointer to each page */
615 struct iovec *iov;
616 RAMBlock *block;
617 } MultiFDPages_t;
618
619 typedef struct {
620 /* these fields are not changed once the thread is created */
621 /* channel number */
622 uint8_t id;
623 /* channel thread name */
624 char *name;
625 /* channel thread id */
626 QemuThread thread;
627 /* communication channel */
628 QIOChannel *c;
629 /* sem where to wait for more work */
630 QemuSemaphore sem;
631 /* this mutex protects the following parameters */
632 QemuMutex mutex;
633 /* is this channel thread running */
634 bool running;
635 /* should this thread finish */
636 bool quit;
637 /* thread has work to do */
638 int pending_job;
639 /* array of pages to send */
640 MultiFDPages_t *pages;
641 /* packet allocated len */
642 uint32_t packet_len;
643 /* pointer to the packet */
644 MultiFDPacket_t *packet;
645 /* multifd flags for each packet */
646 uint32_t flags;
647 /* global number of generated multifd packets */
648 uint64_t packet_num;
649 /* thread local variables */
650 /* packets sent through this channel */
651 uint64_t num_packets;
652 /* pages sent through this channel */
653 uint64_t num_pages;
654 /* syncs main thread and channels */
655 QemuSemaphore sem_sync;
656 } MultiFDSendParams;
657
658 typedef struct {
659 /* these fields are not changed once the thread is created */
660 /* channel number */
661 uint8_t id;
662 /* channel thread name */
663 char *name;
664 /* channel thread id */
665 QemuThread thread;
666 /* communication channel */
667 QIOChannel *c;
668 /* this mutex protects the following parameters */
669 QemuMutex mutex;
670 /* is this channel thread running */
671 bool running;
672 /* array of pages to receive */
673 MultiFDPages_t *pages;
674 /* packet allocated len */
675 uint32_t packet_len;
676 /* pointer to the packet */
677 MultiFDPacket_t *packet;
678 /* multifd flags for each packet */
679 uint32_t flags;
680 /* global number of generated multifd packets */
681 uint64_t packet_num;
682 /* thread local variables */
683 /* packets received through this channel */
684 uint64_t num_packets;
685 /* pages received through this channel */
686 uint64_t num_pages;
687 /* syncs main thread and channels */
688 QemuSemaphore sem_sync;
689 } MultiFDRecvParams;
690
691 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
692 {
693 MultiFDInit_t msg;
694 int ret;
695
696 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
697 msg.version = cpu_to_be32(MULTIFD_VERSION);
698 msg.id = p->id;
699 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
700
701 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
702 if (ret != 0) {
703 return -1;
704 }
705 return 0;
706 }
707
708 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
709 {
710 MultiFDInit_t msg;
711 int ret;
712
713 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
714 if (ret != 0) {
715 return -1;
716 }
717
718 msg.magic = be32_to_cpu(msg.magic);
719 msg.version = be32_to_cpu(msg.version);
720
721 if (msg.magic != MULTIFD_MAGIC) {
722 error_setg(errp, "multifd: received packet magic %x "
723 "expected %x", msg.magic, MULTIFD_MAGIC);
724 return -1;
725 }
726
727 if (msg.version != MULTIFD_VERSION) {
728 error_setg(errp, "multifd: received packet version %d "
729 "expected %d", msg.version, MULTIFD_VERSION);
730 return -1;
731 }
732
733 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
734 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
735 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
736
737 error_setg(errp, "multifd: received uuid '%s' and expected "
738 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
739 g_free(uuid);
740 g_free(msg_uuid);
741 return -1;
742 }
743
744 if (msg.id > migrate_multifd_channels()) {
745 error_setg(errp, "multifd: received channel version %d "
746 "expected %d", msg.version, MULTIFD_VERSION);
747 return -1;
748 }
749
750 return msg.id;
751 }
752
753 static MultiFDPages_t *multifd_pages_init(size_t size)
754 {
755 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
756
757 pages->allocated = size;
758 pages->iov = g_new0(struct iovec, size);
759 pages->offset = g_new0(ram_addr_t, size);
760
761 return pages;
762 }
763
764 static void multifd_pages_clear(MultiFDPages_t *pages)
765 {
766 pages->used = 0;
767 pages->allocated = 0;
768 pages->packet_num = 0;
769 pages->block = NULL;
770 g_free(pages->iov);
771 pages->iov = NULL;
772 g_free(pages->offset);
773 pages->offset = NULL;
774 g_free(pages);
775 }
776
777 static void multifd_send_fill_packet(MultiFDSendParams *p)
778 {
779 MultiFDPacket_t *packet = p->packet;
780 int i;
781
782 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
783 packet->version = cpu_to_be32(MULTIFD_VERSION);
784 packet->flags = cpu_to_be32(p->flags);
785 packet->pages_alloc = cpu_to_be32(migrate_multifd_page_count());
786 packet->pages_used = cpu_to_be32(p->pages->used);
787 packet->packet_num = cpu_to_be64(p->packet_num);
788
789 if (p->pages->block) {
790 strncpy(packet->ramblock, p->pages->block->idstr, 256);
791 }
792
793 for (i = 0; i < p->pages->used; i++) {
794 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
795 }
796 }
797
798 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
799 {
800 MultiFDPacket_t *packet = p->packet;
801 RAMBlock *block;
802 int i;
803
804 packet->magic = be32_to_cpu(packet->magic);
805 if (packet->magic != MULTIFD_MAGIC) {
806 error_setg(errp, "multifd: received packet "
807 "magic %x and expected magic %x",
808 packet->magic, MULTIFD_MAGIC);
809 return -1;
810 }
811
812 packet->version = be32_to_cpu(packet->version);
813 if (packet->version != MULTIFD_VERSION) {
814 error_setg(errp, "multifd: received packet "
815 "version %d and expected version %d",
816 packet->version, MULTIFD_VERSION);
817 return -1;
818 }
819
820 p->flags = be32_to_cpu(packet->flags);
821
822 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
823 if (packet->pages_alloc > migrate_multifd_page_count()) {
824 error_setg(errp, "multifd: received packet "
825 "with size %d and expected maximum size %d",
826 packet->pages_alloc, migrate_multifd_page_count());
827 return -1;
828 }
829
830 p->pages->used = be32_to_cpu(packet->pages_used);
831 if (p->pages->used > packet->pages_alloc) {
832 error_setg(errp, "multifd: received packet "
833 "with %d pages and expected maximum pages are %d",
834 p->pages->used, packet->pages_alloc) ;
835 return -1;
836 }
837
838 p->packet_num = be64_to_cpu(packet->packet_num);
839
840 if (p->pages->used) {
841 /* make sure that ramblock is 0 terminated */
842 packet->ramblock[255] = 0;
843 block = qemu_ram_block_by_name(packet->ramblock);
844 if (!block) {
845 error_setg(errp, "multifd: unknown ram block %s",
846 packet->ramblock);
847 return -1;
848 }
849 }
850
851 for (i = 0; i < p->pages->used; i++) {
852 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
853
854 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
855 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
856 " (max " RAM_ADDR_FMT ")",
857 offset, block->used_length);
858 return -1;
859 }
860 p->pages->iov[i].iov_base = block->host + offset;
861 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
862 }
863
864 return 0;
865 }
866
867 struct {
868 MultiFDSendParams *params;
869 /* number of created threads */
870 int count;
871 /* array of pages to send */
872 MultiFDPages_t *pages;
873 /* syncs main thread and channels */
874 QemuSemaphore sem_sync;
875 /* global number of generated multifd packets */
876 uint64_t packet_num;
877 /* send channels ready */
878 QemuSemaphore channels_ready;
879 } *multifd_send_state;
880
881 /*
882 * How do we use multifd_send_state->pages and channel->pages?
883 *
884 * We create a "pages" struct for each channel, plus a main one. Each
885 * time we need to send a batch of pages we swap the one in
886 * multifd_send_state with the one of the channel that is sending it.
887 * There are two reasons for that:
888 * - to avoid doing so many mallocs during migration
889 * - to make it easier to know what to free at the end of migration
890 *
891 * This way we always know who the owner of each "pages" struct is,
892 * and we don't need any locking. It belongs to either the migration
893 * thread or the channel thread. Switching is safe because the
894 * migration thread holds the channel mutex when changing it, and the
895 * channel thread must have finished with its own, otherwise
896 * pending_job couldn't be false.
897 */
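
/*
 * Concretely (see multifd_send_pages() below): the migration thread fills
 * multifd_send_state->pages, picks a channel with no pending_job, hands
 * the full "pages" struct to that channel through p->pages, and takes the
 * channel's now-unused struct back as the new multifd_send_state->pages.
 * A "pages" struct is therefore never touched by two threads at once.
 */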
898
899 static void multifd_send_pages(void)
900 {
901 int i;
902 static int next_channel;
903 MultiFDSendParams *p = NULL; /* make gcc happy */
904 MultiFDPages_t *pages = multifd_send_state->pages;
905 uint64_t transferred;
906
907 qemu_sem_wait(&multifd_send_state->channels_ready);
908 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
909 p = &multifd_send_state->params[i];
910
911 qemu_mutex_lock(&p->mutex);
912 if (!p->pending_job) {
913 p->pending_job++;
914 next_channel = (i + 1) % migrate_multifd_channels();
915 break;
916 }
917 qemu_mutex_unlock(&p->mutex);
918 }
919 p->pages->used = 0;
920
921 p->packet_num = multifd_send_state->packet_num++;
922 p->pages->block = NULL;
923 multifd_send_state->pages = p->pages;
924 p->pages = pages;
925 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
926 ram_counters.multifd_bytes += transferred;
927 ram_counters.transferred += transferred;
928 qemu_mutex_unlock(&p->mutex);
929 qemu_sem_post(&p->sem);
930 }
931
932 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
933 {
934 MultiFDPages_t *pages = multifd_send_state->pages;
935
936 if (!pages->block) {
937 pages->block = block;
938 }
939
940 if (pages->block == block) {
941 pages->offset[pages->used] = offset;
942 pages->iov[pages->used].iov_base = block->host + offset;
943 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
944 pages->used++;
945
946 if (pages->used < pages->allocated) {
947 return;
948 }
949 }
950
951 multifd_send_pages();
952
953 if (pages->block != block) {
954 multifd_queue_page(block, offset);
955 }
956 }
957
958 static void multifd_send_terminate_threads(Error *err)
959 {
960 int i;
961
962 if (err) {
963 MigrationState *s = migrate_get_current();
964 migrate_set_error(s, err);
965 if (s->state == MIGRATION_STATUS_SETUP ||
966 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
967 s->state == MIGRATION_STATUS_DEVICE ||
968 s->state == MIGRATION_STATUS_ACTIVE) {
969 migrate_set_state(&s->state, s->state,
970 MIGRATION_STATUS_FAILED);
971 }
972 }
973
974 for (i = 0; i < migrate_multifd_channels(); i++) {
975 MultiFDSendParams *p = &multifd_send_state->params[i];
976
977 qemu_mutex_lock(&p->mutex);
978 p->quit = true;
979 qemu_sem_post(&p->sem);
980 qemu_mutex_unlock(&p->mutex);
981 }
982 }
983
984 void multifd_save_cleanup(void)
985 {
986 int i;
987
988 if (!migrate_use_multifd()) {
989 return;
990 }
991 multifd_send_terminate_threads(NULL);
992 for (i = 0; i < migrate_multifd_channels(); i++) {
993 MultiFDSendParams *p = &multifd_send_state->params[i];
994
995 if (p->running) {
996 qemu_thread_join(&p->thread);
997 }
998 socket_send_channel_destroy(p->c);
999 p->c = NULL;
1000 qemu_mutex_destroy(&p->mutex);
1001 qemu_sem_destroy(&p->sem);
1002 qemu_sem_destroy(&p->sem_sync);
1003 g_free(p->name);
1004 p->name = NULL;
1005 multifd_pages_clear(p->pages);
1006 p->pages = NULL;
1007 p->packet_len = 0;
1008 g_free(p->packet);
1009 p->packet = NULL;
1010 }
1011 qemu_sem_destroy(&multifd_send_state->channels_ready);
1012 qemu_sem_destroy(&multifd_send_state->sem_sync);
1013 g_free(multifd_send_state->params);
1014 multifd_send_state->params = NULL;
1015 multifd_pages_clear(multifd_send_state->pages);
1016 multifd_send_state->pages = NULL;
1017 g_free(multifd_send_state);
1018 multifd_send_state = NULL;
1019 }
1020
1021 static void multifd_send_sync_main(void)
1022 {
1023 int i;
1024
1025 if (!migrate_use_multifd()) {
1026 return;
1027 }
1028 if (multifd_send_state->pages->used) {
1029 multifd_send_pages();
1030 }
1031 for (i = 0; i < migrate_multifd_channels(); i++) {
1032 MultiFDSendParams *p = &multifd_send_state->params[i];
1033
1034 trace_multifd_send_sync_main_signal(p->id);
1035
1036 qemu_mutex_lock(&p->mutex);
1037
1038 p->packet_num = multifd_send_state->packet_num++;
1039 p->flags |= MULTIFD_FLAG_SYNC;
1040 p->pending_job++;
1041 qemu_mutex_unlock(&p->mutex);
1042 qemu_sem_post(&p->sem);
1043 }
1044 for (i = 0; i < migrate_multifd_channels(); i++) {
1045 MultiFDSendParams *p = &multifd_send_state->params[i];
1046
1047 trace_multifd_send_sync_main_wait(p->id);
1048 qemu_sem_wait(&multifd_send_state->sem_sync);
1049 }
1050 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1051 }
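
/*
 * Sketch of the synchronisation driven by MULTIFD_FLAG_SYNC, derived from
 * multifd_send_sync_main() above and the send/recv threads below:
 *
 *   - the main (send) thread flushes any pending pages, then for every
 *     channel bumps pending_job with MULTIFD_FLAG_SYNC set and posts p->sem;
 *   - each send thread emits the (possibly empty) packet and posts
 *     multifd_send_state->sem_sync;
 *   - the main (send) thread waits on sem_sync once per channel.
 *
 * On the destination, each recv thread posts multifd_recv_state->sem_sync
 * when it sees the SYNC flag and then blocks on p->sem_sync until
 * multifd_recv_sync_main() has collected all channels.
 */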
1052
1053 static void *multifd_send_thread(void *opaque)
1054 {
1055 MultiFDSendParams *p = opaque;
1056 Error *local_err = NULL;
1057 int ret;
1058
1059 trace_multifd_send_thread_start(p->id);
1060 rcu_register_thread();
1061
1062 if (multifd_send_initial_packet(p, &local_err) < 0) {
1063 goto out;
1064 }
1065 /* initial packet */
1066 p->num_packets = 1;
1067
1068 while (true) {
1069 qemu_sem_wait(&p->sem);
1070 qemu_mutex_lock(&p->mutex);
1071
1072 if (p->pending_job) {
1073 uint32_t used = p->pages->used;
1074 uint64_t packet_num = p->packet_num;
1075 uint32_t flags = p->flags;
1076
1077 multifd_send_fill_packet(p);
1078 p->flags = 0;
1079 p->num_packets++;
1080 p->num_pages += used;
1081 p->pages->used = 0;
1082 qemu_mutex_unlock(&p->mutex);
1083
1084 trace_multifd_send(p->id, packet_num, used, flags);
1085
1086 ret = qio_channel_write_all(p->c, (void *)p->packet,
1087 p->packet_len, &local_err);
1088 if (ret != 0) {
1089 break;
1090 }
1091
1092 if (used) {
1093 ret = qio_channel_writev_all(p->c, p->pages->iov,
1094 used, &local_err);
1095 if (ret != 0) {
1096 break;
1097 }
1098 }
1099
1100 qemu_mutex_lock(&p->mutex);
1101 p->pending_job--;
1102 qemu_mutex_unlock(&p->mutex);
1103
1104 if (flags & MULTIFD_FLAG_SYNC) {
1105 qemu_sem_post(&multifd_send_state->sem_sync);
1106 }
1107 qemu_sem_post(&multifd_send_state->channels_ready);
1108 } else if (p->quit) {
1109 qemu_mutex_unlock(&p->mutex);
1110 break;
1111 } else {
1112 qemu_mutex_unlock(&p->mutex);
1113 /* sometimes there are spurious wakeups */
1114 }
1115 }
1116
1117 out:
1118 if (local_err) {
1119 multifd_send_terminate_threads(local_err);
1120 }
1121
1122 qemu_mutex_lock(&p->mutex);
1123 p->running = false;
1124 qemu_mutex_unlock(&p->mutex);
1125
1126 rcu_unregister_thread();
1127 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1128
1129 return NULL;
1130 }
1131
1132 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1133 {
1134 MultiFDSendParams *p = opaque;
1135 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1136 Error *local_err = NULL;
1137
1138 if (qio_task_propagate_error(task, &local_err)) {
1139 migrate_set_error(migrate_get_current(), local_err);
1140 multifd_save_cleanup();
1141 } else {
1142 p->c = QIO_CHANNEL(sioc);
1143 qio_channel_set_delay(p->c, false);
1144 p->running = true;
1145 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1146 QEMU_THREAD_JOINABLE);
1147
1148 atomic_inc(&multifd_send_state->count);
1149 }
1150 }
1151
1152 int multifd_save_setup(void)
1153 {
1154 int thread_count;
1155 uint32_t page_count = migrate_multifd_page_count();
1156 uint8_t i;
1157
1158 if (!migrate_use_multifd()) {
1159 return 0;
1160 }
1161 thread_count = migrate_multifd_channels();
1162 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1163 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1164 atomic_set(&multifd_send_state->count, 0);
1165 multifd_send_state->pages = multifd_pages_init(page_count);
1166 qemu_sem_init(&multifd_send_state->sem_sync, 0);
1167 qemu_sem_init(&multifd_send_state->channels_ready, 0);
1168
1169 for (i = 0; i < thread_count; i++) {
1170 MultiFDSendParams *p = &multifd_send_state->params[i];
1171
1172 qemu_mutex_init(&p->mutex);
1173 qemu_sem_init(&p->sem, 0);
1174 qemu_sem_init(&p->sem_sync, 0);
1175 p->quit = false;
1176 p->pending_job = 0;
1177 p->id = i;
1178 p->pages = multifd_pages_init(page_count);
1179 p->packet_len = sizeof(MultiFDPacket_t)
1180 + sizeof(ram_addr_t) * page_count;
1181 p->packet = g_malloc0(p->packet_len);
1182 p->name = g_strdup_printf("multifdsend_%d", i);
1183 socket_send_channel_create(multifd_new_send_channel_async, p);
1184 }
1185 return 0;
1186 }
1187
1188 struct {
1189 MultiFDRecvParams *params;
1190 /* number of created threads */
1191 int count;
1192 /* syncs main thread and channels */
1193 QemuSemaphore sem_sync;
1194 /* global number of generated multifd packets */
1195 uint64_t packet_num;
1196 } *multifd_recv_state;
1197
1198 static void multifd_recv_terminate_threads(Error *err)
1199 {
1200 int i;
1201
1202 if (err) {
1203 MigrationState *s = migrate_get_current();
1204 migrate_set_error(s, err);
1205 if (s->state == MIGRATION_STATUS_SETUP ||
1206 s->state == MIGRATION_STATUS_ACTIVE) {
1207 migrate_set_state(&s->state, s->state,
1208 MIGRATION_STATUS_FAILED);
1209 }
1210 }
1211
1212 for (i = 0; i < migrate_multifd_channels(); i++) {
1213 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1214
1215 qemu_mutex_lock(&p->mutex);
1216 /* We could arrive here for two reasons:
1217 - normal quit, i.e. everything went fine, just finished
1218 - error quit: We close the channels so the channel threads
1219 finish the qio_channel_read_all_eof() */
1220 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1221 qemu_mutex_unlock(&p->mutex);
1222 }
1223 }
1224
1225 int multifd_load_cleanup(Error **errp)
1226 {
1227 int i;
1228 int ret = 0;
1229
1230 if (!migrate_use_multifd()) {
1231 return 0;
1232 }
1233 multifd_recv_terminate_threads(NULL);
1234 for (i = 0; i < migrate_multifd_channels(); i++) {
1235 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1236
1237 if (p->running) {
1238 qemu_thread_join(&p->thread);
1239 }
1240 object_unref(OBJECT(p->c));
1241 p->c = NULL;
1242 qemu_mutex_destroy(&p->mutex);
1243 qemu_sem_destroy(&p->sem_sync);
1244 g_free(p->name);
1245 p->name = NULL;
1246 multifd_pages_clear(p->pages);
1247 p->pages = NULL;
1248 p->packet_len = 0;
1249 g_free(p->packet);
1250 p->packet = NULL;
1251 }
1252 qemu_sem_destroy(&multifd_recv_state->sem_sync);
1253 g_free(multifd_recv_state->params);
1254 multifd_recv_state->params = NULL;
1255 g_free(multifd_recv_state);
1256 multifd_recv_state = NULL;
1257
1258 return ret;
1259 }
1260
1261 static void multifd_recv_sync_main(void)
1262 {
1263 int i;
1264
1265 if (!migrate_use_multifd()) {
1266 return;
1267 }
1268 for (i = 0; i < migrate_multifd_channels(); i++) {
1269 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1270
1271 trace_multifd_recv_sync_main_wait(p->id);
1272 qemu_sem_wait(&multifd_recv_state->sem_sync);
1273 qemu_mutex_lock(&p->mutex);
1274 if (multifd_recv_state->packet_num < p->packet_num) {
1275 multifd_recv_state->packet_num = p->packet_num;
1276 }
1277 qemu_mutex_unlock(&p->mutex);
1278 }
1279 for (i = 0; i < migrate_multifd_channels(); i++) {
1280 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1281
1282 trace_multifd_recv_sync_main_signal(p->id);
1283 qemu_sem_post(&p->sem_sync);
1284 }
1285 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1286 }
1287
1288 static void *multifd_recv_thread(void *opaque)
1289 {
1290 MultiFDRecvParams *p = opaque;
1291 Error *local_err = NULL;
1292 int ret;
1293
1294 trace_multifd_recv_thread_start(p->id);
1295 rcu_register_thread();
1296
1297 while (true) {
1298 uint32_t used;
1299 uint32_t flags;
1300
1301 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1302 p->packet_len, &local_err);
1303 if (ret == 0) { /* EOF */
1304 break;
1305 }
1306 if (ret == -1) { /* Error */
1307 break;
1308 }
1309
1310 qemu_mutex_lock(&p->mutex);
1311 ret = multifd_recv_unfill_packet(p, &local_err);
1312 if (ret) {
1313 qemu_mutex_unlock(&p->mutex);
1314 break;
1315 }
1316
1317 used = p->pages->used;
1318 flags = p->flags;
1319 trace_multifd_recv(p->id, p->packet_num, used, flags);
1320 p->num_packets++;
1321 p->num_pages += used;
1322 qemu_mutex_unlock(&p->mutex);
1323
1324 if (used) {
1325 ret = qio_channel_readv_all(p->c, p->pages->iov,
1326 used, &local_err);
1327 if (ret != 0) {
1328 break;
1329 }
1330 }
1331
1332 if (flags & MULTIFD_FLAG_SYNC) {
1333 qemu_sem_post(&multifd_recv_state->sem_sync);
1334 qemu_sem_wait(&p->sem_sync);
1335 }
1336 }
1337
1338 if (local_err) {
1339 multifd_recv_terminate_threads(local_err);
1340 }
1341 qemu_mutex_lock(&p->mutex);
1342 p->running = false;
1343 qemu_mutex_unlock(&p->mutex);
1344
1345 rcu_unregister_thread();
1346 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1347
1348 return NULL;
1349 }
1350
1351 int multifd_load_setup(void)
1352 {
1353 int thread_count;
1354 uint32_t page_count = migrate_multifd_page_count();
1355 uint8_t i;
1356
1357 if (!migrate_use_multifd()) {
1358 return 0;
1359 }
1360 thread_count = migrate_multifd_channels();
1361 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1362 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1363 atomic_set(&multifd_recv_state->count, 0);
1364 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1365
1366 for (i = 0; i < thread_count; i++) {
1367 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1368
1369 qemu_mutex_init(&p->mutex);
1370 qemu_sem_init(&p->sem_sync, 0);
1371 p->id = i;
1372 p->pages = multifd_pages_init(page_count);
1373 p->packet_len = sizeof(MultiFDPacket_t)
1374 + sizeof(ram_addr_t) * page_count;
1375 p->packet = g_malloc0(p->packet_len);
1376 p->name = g_strdup_printf("multifdrecv_%d", i);
1377 }
1378 return 0;
1379 }
1380
1381 bool multifd_recv_all_channels_created(void)
1382 {
1383 int thread_count = migrate_multifd_channels();
1384
1385 if (!migrate_use_multifd()) {
1386 return true;
1387 }
1388
1389 return thread_count == atomic_read(&multifd_recv_state->count);
1390 }
1391
1392 /*
1393 * Try to receive all multifd channels to get ready for the migration.
1394 * - Return true and do not set @errp when correctly receiving all channels;
1395 * - Return false and do not set @errp when correctly receiving the current one;
1396 * - Return false and set @errp when failing to receive the current channel.
1397 */
1398 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1399 {
1400 MultiFDRecvParams *p;
1401 Error *local_err = NULL;
1402 int id;
1403
1404 id = multifd_recv_initial_packet(ioc, &local_err);
1405 if (id < 0) {
1406 multifd_recv_terminate_threads(local_err);
1407 error_propagate_prepend(errp, local_err,
1408 "failed to receive packet"
1409 " via multifd channel %d: ",
1410 atomic_read(&multifd_recv_state->count));
1411 return false;
1412 }
1413
1414 p = &multifd_recv_state->params[id];
1415 if (p->c != NULL) {
1416 error_setg(&local_err, "multifd: received id '%d' already setup",
1417 id);
1418 multifd_recv_terminate_threads(local_err);
1419 error_propagate(errp, local_err);
1420 return false;
1421 }
1422 p->c = ioc;
1423 object_ref(OBJECT(ioc));
1424 /* initial packet */
1425 p->num_packets = 1;
1426
1427 p->running = true;
1428 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1429 QEMU_THREAD_JOINABLE);
1430 atomic_inc(&multifd_recv_state->count);
1431 return atomic_read(&multifd_recv_state->count) ==
1432 migrate_multifd_channels();
1433 }
1434
1435 /**
1436 * save_page_header: write page header to wire
1437 *
1438 * If this is the 1st block, it also writes the block identification
1439 *
1440 * Returns the number of bytes written
1441 *
1442 * @f: QEMUFile where to send the data
1443 * @block: block that contains the page we want to send
1444 * @offset: offset inside the block for the page
1445 * in the lower bits, it contains flags
1446 */
1447 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1448 ram_addr_t offset)
1449 {
1450 size_t size, len;
1451
1452 if (block == rs->last_sent_block) {
1453 offset |= RAM_SAVE_FLAG_CONTINUE;
1454 }
1455 qemu_put_be64(f, offset);
1456 size = 8;
1457
1458 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1459 len = strlen(block->idstr);
1460 qemu_put_byte(f, len);
1461 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1462 size += 1 + len;
1463 rs->last_sent_block = block;
1464 }
1465 return size;
1466 }
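
/*
 * Illustration of what save_page_header() emits, derived from the code
 * above: a be64 word holding the page offset within its RAMBlock with the
 * RAM_SAVE_FLAG_* bits OR-ed into the low bits, followed - only when
 * RAM_SAVE_FLAG_CONTINUE is clear - by a one-byte length and the RAMBlock
 * idstr:
 *
 *   be64 (offset | flags) [ u8 len, len bytes of idstr ]
 *
 * So a header costs 8 bytes for a page in the same block as the previous
 * one, and 8 + 1 + strlen(idstr) bytes otherwise.
 */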
1467
1468 /**
1469 * mig_throttle_guest_down: throttle down the guest
1470 *
1471 * Reduce amount of guest cpu execution to hopefully slow down memory
1472 * writes. If guest dirty memory rate is reduced below the rate at
1473 * which we can transfer pages to the destination then we should be
1474 * able to complete migration. Some workloads dirty memory way too
1475 * fast and will not effectively converge, even with auto-converge.
1476 */
1477 static void mig_throttle_guest_down(void)
1478 {
1479 MigrationState *s = migrate_get_current();
1480 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1481 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
1482 int pct_max = s->parameters.max_cpu_throttle;
1483
1484 /* We have not started throttling yet. Let's start it. */
1485 if (!cpu_throttle_active()) {
1486 cpu_throttle_set(pct_initial);
1487 } else {
1488 /* Throttling already on, just increase the rate */
1489 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
1490 pct_max));
1491 }
1492 }
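
/*
 * Example with made-up parameter values: if cpu_throttle_initial is 20,
 * cpu_throttle_increment is 10 and max_cpu_throttle is 99, successive
 * calls to mig_throttle_guest_down() throttle the guest at 20%, 30%,
 * 40%, ... and never above 99%.
 */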
1493
1494 /**
1495 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1496 *
1497 * @rs: current RAM state
1498 * @current_addr: address for the zero page
1499 *
1500 * Update the xbzrle cache to reflect a page that's been sent as all 0.
1501 * The important thing is that a stale (not-yet-0'd) page be replaced
1502 * by the new data.
1503 * As a bonus, if the page wasn't in the cache it gets added so that
1504 * when a small write is made into the 0'd page it gets XBZRLE sent.
1505 */
1506 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1507 {
1508 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1509 return;
1510 }
1511
1512 /* We don't care if this fails to allocate a new cache page
1513 * as long as it updated an old one */
1514 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1515 ram_counters.dirty_sync_count);
1516 }
1517
1518 #define ENCODING_FLAG_XBZRLE 0x1
1519
1520 /**
1521 * save_xbzrle_page: compress and send current page
1522 *
1523 * Returns: 1 means that we wrote the page
1524 * 0 means that page is identical to the one already sent
1525 * -1 means that xbzrle would be longer than normal
1526 *
1527 * @rs: current RAM state
1528 * @current_data: pointer to the address of the page contents
1529 * @current_addr: addr of the page
1530 * @block: block that contains the page we want to send
1531 * @offset: offset inside the block for the page
1532 * @last_stage: if we are at the completion stage
1533 */
1534 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1535 ram_addr_t current_addr, RAMBlock *block,
1536 ram_addr_t offset, bool last_stage)
1537 {
1538 int encoded_len = 0, bytes_xbzrle;
1539 uint8_t *prev_cached_page;
1540
1541 if (!cache_is_cached(XBZRLE.cache, current_addr,
1542 ram_counters.dirty_sync_count)) {
1543 xbzrle_counters.cache_miss++;
1544 if (!last_stage) {
1545 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1546 ram_counters.dirty_sync_count) == -1) {
1547 return -1;
1548 } else {
1549 /* update *current_data when the page has been
1550 inserted into cache */
1551 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1552 }
1553 }
1554 return -1;
1555 }
1556
1557 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1558
1559 /* save current buffer into memory */
1560 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1561
1562 /* XBZRLE encoding (if there is no overflow) */
1563 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1564 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1565 TARGET_PAGE_SIZE);
1566 if (encoded_len == 0) {
1567 trace_save_xbzrle_page_skipping();
1568 return 0;
1569 } else if (encoded_len == -1) {
1570 trace_save_xbzrle_page_overflow();
1571 xbzrle_counters.overflow++;
1572 /* update data in the cache */
1573 if (!last_stage) {
1574 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1575 *current_data = prev_cached_page;
1576 }
1577 return -1;
1578 }
1579
1580 /* we need to update the data in the cache, in order to get the same data */
1581 if (!last_stage) {
1582 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1583 }
1584
1585 /* Send XBZRLE based compressed page */
1586 bytes_xbzrle = save_page_header(rs, rs->f, block,
1587 offset | RAM_SAVE_FLAG_XBZRLE);
1588 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1589 qemu_put_be16(rs->f, encoded_len);
1590 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1591 bytes_xbzrle += encoded_len + 1 + 2;
1592 xbzrle_counters.pages++;
1593 xbzrle_counters.bytes += bytes_xbzrle;
1594 ram_counters.transferred += bytes_xbzrle;
1595
1596 return 1;
1597 }
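
/*
 * Wire format of an XBZRLE page, as produced at the end of
 * save_xbzrle_page() above:
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *   u8   ENCODING_FLAG_XBZRLE
 *   be16 encoded_len
 *   encoded_len bytes of XBZRLE-encoded data
 *
 * which is why bytes_xbzrle adds encoded_len + 1 + 2 on top of the header.
 */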
1598
1599 /**
1600 * migration_bitmap_find_dirty: find the next dirty page from start
1601 *
1602 * Called with rcu_read_lock() to protect migration_bitmap
1603 *
1604 * Returns the byte offset within memory region of the start of a dirty page
1605 *
1606 * @rs: current RAM state
1607 * @rb: RAMBlock where to search for dirty pages
1608 * @start: page where we start the search
1609 */
1610 static inline
1611 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1612 unsigned long start)
1613 {
1614 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1615 unsigned long *bitmap = rb->bmap;
1616 unsigned long next;
1617
1618 if (ramblock_is_ignored(rb)) {
1619 return size;
1620 }
1621
1622 /*
1623 * When the free page optimization is enabled, we need to check the bitmap
1624 * to send the non-free pages rather than all the pages in the bulk stage.
1625 */
1626 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1627 next = start + 1;
1628 } else {
1629 next = find_next_bit(bitmap, size, start);
1630 }
1631
1632 return next;
1633 }
1634
1635 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1636 RAMBlock *rb,
1637 unsigned long page)
1638 {
1639 bool ret;
1640
1641 qemu_mutex_lock(&rs->bitmap_mutex);
1642 ret = test_and_clear_bit(page, rb->bmap);
1643
1644 if (ret) {
1645 rs->migration_dirty_pages--;
1646 }
1647 qemu_mutex_unlock(&rs->bitmap_mutex);
1648
1649 return ret;
1650 }
1651
1652 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1653 ram_addr_t start, ram_addr_t length)
1654 {
1655 rs->migration_dirty_pages +=
1656 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1657 &rs->num_dirty_pages_period);
1658 }
1659
1660 /**
1661 * ram_pagesize_summary: calculate all the pagesizes of a VM
1662 *
1663 * Returns a summary bitmap of the page sizes of all RAMBlocks
1664 *
1665 * For VMs with just normal pages this is equivalent to the host page
1666 * size. If it's got some huge pages then it's the OR of all the
1667 * different page sizes.
1668 */
1669 uint64_t ram_pagesize_summary(void)
1670 {
1671 RAMBlock *block;
1672 uint64_t summary = 0;
1673
1674 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1675 summary |= block->page_size;
1676 }
1677
1678 return summary;
1679 }
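
/*
 * Example with made-up sizes: a guest backed by ordinary 4 KiB pages plus
 * one 2 MiB hugepage RAMBlock would report 0x1000 | 0x200000 = 0x201000.
 */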
1680
1681 uint64_t ram_get_total_transferred_pages(void)
1682 {
1683 return ram_counters.normal + ram_counters.duplicate +
1684 compression_counters.pages + xbzrle_counters.pages;
1685 }
1686
1687 static void migration_update_rates(RAMState *rs, int64_t end_time)
1688 {
1689 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1690 double compressed_size;
1691
1692 /* calculate period counters */
1693 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1694 / (end_time - rs->time_last_bitmap_sync);
1695
1696 if (!page_count) {
1697 return;
1698 }
1699
1700 if (migrate_use_xbzrle()) {
1701 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1702 rs->xbzrle_cache_miss_prev) / page_count;
1703 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1704 }
1705
1706 if (migrate_use_compression()) {
1707 compression_counters.busy_rate = (double)(compression_counters.busy -
1708 rs->compress_thread_busy_prev) / page_count;
1709 rs->compress_thread_busy_prev = compression_counters.busy;
1710
1711 compressed_size = compression_counters.compressed_size -
1712 rs->compressed_size_prev;
1713 if (compressed_size) {
1714 double uncompressed_size = (compression_counters.pages -
1715 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1716
1717 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1718 compression_counters.compression_rate =
1719 uncompressed_size / compressed_size;
1720
1721 rs->compress_pages_prev = compression_counters.pages;
1722 rs->compressed_size_prev = compression_counters.compressed_size;
1723 }
1724 }
1725 }
1726
1727 static void migration_bitmap_sync(RAMState *rs)
1728 {
1729 RAMBlock *block;
1730 int64_t end_time;
1731 uint64_t bytes_xfer_now;
1732
1733 ram_counters.dirty_sync_count++;
1734
1735 if (!rs->time_last_bitmap_sync) {
1736 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1737 }
1738
1739 trace_migration_bitmap_sync_start();
1740 memory_global_dirty_log_sync();
1741
1742 qemu_mutex_lock(&rs->bitmap_mutex);
1743 rcu_read_lock();
1744 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1745 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1746 }
1747 ram_counters.remaining = ram_bytes_remaining();
1748 rcu_read_unlock();
1749 qemu_mutex_unlock(&rs->bitmap_mutex);
1750
1751 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1752
1753 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1754
1755 /* more than 1 second = 1000 milliseconds */
1756 if (end_time > rs->time_last_bitmap_sync + 1000) {
1757 bytes_xfer_now = ram_counters.transferred;
1758
1759 /* During block migration the auto-converge logic incorrectly detects
1760 * that ram migration makes no progress. Avoid this by disabling the
1761 * throttling logic during the bulk phase of block migration. */
1762 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1763 /* The following detection logic can be refined later. For now:
1764 Check to see if the dirtied bytes is 50% more than the approx.
1765 amount of bytes that just got transferred since the last time we
1766 were in this routine. If that happens twice, start or increase
1767 throttling */
1768
1769 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1770 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1771 (++rs->dirty_rate_high_cnt >= 2)) {
1772 trace_migration_throttle();
1773 rs->dirty_rate_high_cnt = 0;
1774 mig_throttle_guest_down();
1775 }
1776 }
1777
1778 migration_update_rates(rs, end_time);
1779
1780 rs->target_page_count_prev = rs->target_page_count;
1781
1782 /* reset period counters */
1783 rs->time_last_bitmap_sync = end_time;
1784 rs->num_dirty_pages_period = 0;
1785 rs->bytes_xfer_prev = bytes_xfer_now;
1786 }
1787 if (migrate_use_events()) {
1788 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1789 }
1790 }
1791
1792 static void migration_bitmap_sync_precopy(RAMState *rs)
1793 {
1794 Error *local_err = NULL;
1795
1796 /*
1797 * The current notifier usage is just an optimization to migration, so we
1798 * don't stop the normal migration process in the error case.
1799 */
1800 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1801 error_report_err(local_err);
1802 }
1803
1804 migration_bitmap_sync(rs);
1805
1806 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1807 error_report_err(local_err);
1808 }
1809 }
1810
1811 /**
1812 * save_zero_page_to_file: send the zero page to the file
1813 *
1814 * Returns the size of data written to the file, 0 means the page is not
1815 * a zero page
1816 *
1817 * @rs: current RAM state
1818 * @file: the file where the data is saved
1819 * @block: block that contains the page we want to send
1820 * @offset: offset inside the block for the page
1821 */
1822 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1823 RAMBlock *block, ram_addr_t offset)
1824 {
1825 uint8_t *p = block->host + offset;
1826 int len = 0;
1827
1828 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1829 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1830 qemu_put_byte(file, 0);
1831 len += 1;
1832 }
1833 return len;
1834 }
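
/*
 * A zero page therefore costs only a page header with RAM_SAVE_FLAG_ZERO
 * set plus a single zero byte on the wire; for a non-zero page this
 * function writes nothing and returns 0.
 */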
1835
1836 /**
1837 * save_zero_page: send the zero page to the stream
1838 *
1839 * Returns the number of pages written.
1840 *
1841 * @rs: current RAM state
1842 * @block: block that contains the page we want to send
1843 * @offset: offset inside the block for the page
1844 */
1845 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1846 {
1847 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1848
1849 if (len) {
1850 ram_counters.duplicate++;
1851 ram_counters.transferred += len;
1852 return 1;
1853 }
1854 return -1;
1855 }
1856
1857 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1858 {
1859 if (!migrate_release_ram() || !migration_in_postcopy()) {
1860 return;
1861 }
1862
1863 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1864 }
1865
1866 /*
1867 * @pages: the number of pages written by the control path,
1868 * < 0 - error
1869 * > 0 - number of pages written
1870 *
1871 * Return true if the page has been saved, otherwise return false.
1872 */
1873 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1874 int *pages)
1875 {
1876 uint64_t bytes_xmit = 0;
1877 int ret;
1878
1879 *pages = -1;
1880 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1881 &bytes_xmit);
1882 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1883 return false;
1884 }
1885
1886 if (bytes_xmit) {
1887 ram_counters.transferred += bytes_xmit;
1888 *pages = 1;
1889 }
1890
1891 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1892 return true;
1893 }
1894
1895 if (bytes_xmit > 0) {
1896 ram_counters.normal++;
1897 } else if (bytes_xmit == 0) {
1898 ram_counters.duplicate++;
1899 }
1900
1901 return true;
1902 }
1903
1904 /*
1905 * directly send the page to the stream
1906 *
1907 * Returns the number of pages written.
1908 *
1909 * @rs: current RAM state
1910 * @block: block that contains the page we want to send
1911 * @offset: offset inside the block for the page
1912 * @buf: the page to be sent
1913 * @async: send the page asynchronously
1914 */
1915 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1916 uint8_t *buf, bool async)
1917 {
1918 ram_counters.transferred += save_page_header(rs, rs->f, block,
1919 offset | RAM_SAVE_FLAG_PAGE);
1920 if (async) {
1921 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1922 migrate_release_ram() &&
1923 migration_in_postcopy());
1924 } else {
1925 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1926 }
1927 ram_counters.transferred += TARGET_PAGE_SIZE;
1928 ram_counters.normal++;
1929 return 1;
1930 }
1931
1932 /**
1933 * ram_save_page: send the given page to the stream
1934 *
1935 * Returns the number of pages written.
1936 * < 0 - error
1937 * >=0 - Number of pages written - this might legally be 0
1938 * if xbzrle noticed the page was the same.
1939 *
1940 * @rs: current RAM state
1941 * @pss: data about the page we want to send
1942 * (the block and the offset inside the block)
1943 * @last_stage: if we are at the completion stage
1944 */
1945 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1946 {
1947 int pages = -1;
1948 uint8_t *p;
1949 bool send_async = true;
1950 RAMBlock *block = pss->block;
1951 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1952 ram_addr_t current_addr = block->offset + offset;
1953
1954 p = block->host + offset;
1955 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1956
1957 XBZRLE_cache_lock();
1958 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1959 migrate_use_xbzrle()) {
1960 pages = save_xbzrle_page(rs, &p, current_addr, block,
1961 offset, last_stage);
1962 if (!last_stage) {
1963 /* Can't send this cached data async, since the cache page
1964 * might get updated before it gets to the wire
1965 */
1966 send_async = false;
1967 }
1968 }
1969
1970 /* XBZRLE overflow or normal page */
1971 if (pages == -1) {
1972 pages = save_normal_page(rs, block, offset, p, send_async);
1973 }
1974
1975 XBZRLE_cache_unlock();
1976
1977 return pages;
1978 }
1979
1980 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1981 ram_addr_t offset)
1982 {
1983 multifd_queue_page(block, offset);
1984 ram_counters.normal++;
1985
1986 return 1;
1987 }
1988
1989 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1990 ram_addr_t offset, uint8_t *source_buf)
1991 {
1992 RAMState *rs = ram_state;
1993 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1994 bool zero_page = false;
1995 int ret;
1996
1997 if (save_zero_page_to_file(rs, f, block, offset)) {
1998 zero_page = true;
1999 goto exit;
2000 }
2001
2002 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2003
2004 /*
2005 * copy it to an internal buffer to avoid it being modified by the VM,
2006 * so that we can catch errors during compression and
2007 * decompression
2008 */
2009 memcpy(source_buf, p, TARGET_PAGE_SIZE);
2010 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2011 if (ret < 0) {
2012 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2013 error_report("compressed data failed!");
2014 return false;
2015 }
2016
2017 exit:
2018 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2019 return zero_page;
2020 }
2021
2022 static void
2023 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2024 {
2025 ram_counters.transferred += bytes_xmit;
2026
2027 if (param->zero_page) {
2028 ram_counters.duplicate++;
2029 return;
2030 }
2031
2032 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2033 compression_counters.compressed_size += bytes_xmit - 8;
2034 compression_counters.pages++;
2035 }
2036
2037 static bool save_page_use_compression(RAMState *rs);
2038
2039 static void flush_compressed_data(RAMState *rs)
2040 {
2041 int idx, len, thread_count;
2042
2043 if (!save_page_use_compression(rs)) {
2044 return;
2045 }
2046 thread_count = migrate_compress_threads();
2047
2048 qemu_mutex_lock(&comp_done_lock);
2049 for (idx = 0; idx < thread_count; idx++) {
2050 while (!comp_param[idx].done) {
2051 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2052 }
2053 }
2054 qemu_mutex_unlock(&comp_done_lock);
2055
2056 for (idx = 0; idx < thread_count; idx++) {
2057 qemu_mutex_lock(&comp_param[idx].mutex);
2058 if (!comp_param[idx].quit) {
2059 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2060 /*
2061 * it's safe to fetch zero_page without holding comp_done_lock
2062 * as there is no further request submitted to the thread,
2063 * i.e., the thread should be waiting for a request at this point.
2064 */
2065 update_compress_thread_counts(&comp_param[idx], len);
2066 }
2067 qemu_mutex_unlock(&comp_param[idx].mutex);
2068 }
2069 }
2070
2071 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2072 ram_addr_t offset)
2073 {
2074 param->block = block;
2075 param->offset = offset;
2076 }
2077
2078 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2079 ram_addr_t offset)
2080 {
2081 int idx, thread_count, bytes_xmit = -1, pages = -1;
2082 bool wait = migrate_compress_wait_thread();
2083
2084 thread_count = migrate_compress_threads();
2085 qemu_mutex_lock(&comp_done_lock);
2086 retry:
2087 for (idx = 0; idx < thread_count; idx++) {
2088 if (comp_param[idx].done) {
2089 comp_param[idx].done = false;
2090 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2091 qemu_mutex_lock(&comp_param[idx].mutex);
2092 set_compress_params(&comp_param[idx], block, offset);
2093 qemu_cond_signal(&comp_param[idx].cond);
2094 qemu_mutex_unlock(&comp_param[idx].mutex);
2095 pages = 1;
2096 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2097 break;
2098 }
2099 }
2100
2101 /*
2102 * wait for the free thread if the user specifies 'compress-wait-thread',
2103 * otherwise we will post the page out in the main thread as a normal page.
2104 */
2105 if (pages < 0 && wait) {
2106 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2107 goto retry;
2108 }
2109 qemu_mutex_unlock(&comp_done_lock);
2110
2111 return pages;
2112 }
2113
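/*
 * Editor's illustration (not part of QEMU's ram.c): the hand-off above is a
 * "one slot per worker" pattern. Below is a stripped-down, single-slot
 * sketch of the producer side with hypothetical names (ExampleSlot,
 * example_post_work); it assumes the worker sets slot->done and signals
 * @done_cond when it finishes a page:
 */
#if 0
typedef struct {
    QemuMutex mutex;      /* protects the request fields below       */
    QemuCond cond;        /* worker waits on this for new work       */
    bool done;            /* true while the worker is idle           */
    RAMBlock *block;
    ram_addr_t offset;
} ExampleSlot;

static bool example_post_work(ExampleSlot *slot, QemuMutex *done_lock,
                              QemuCond *done_cond, RAMBlock *block,
                              ram_addr_t offset, bool wait)
{
    bool posted = false;

    qemu_mutex_lock(done_lock);
    while (!posted) {
        if (slot->done) {                 /* worker idle: hand it the page   */
            slot->done = false;
            qemu_mutex_lock(&slot->mutex);
            slot->block = block;
            slot->offset = offset;
            qemu_cond_signal(&slot->cond);
            qemu_mutex_unlock(&slot->mutex);
            posted = true;
        } else if (wait) {                /* block until some worker is idle */
            qemu_cond_wait(done_cond, done_lock);
        } else {
            break;                        /* caller falls back to normal path */
        }
    }
    qemu_mutex_unlock(done_lock);
    return posted;
}
#endif
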
2114 /**
2115 * find_dirty_block: find the next dirty page and update any state
2116 * associated with the search process.
2117 *
2118 * Returns true if a page is found
2119 *
2120 * @rs: current RAM state
2121 * @pss: data about the state of the current dirty page scan
2122 * @again: set to false if the search has scanned the whole of RAM
2123 */
2124 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2125 {
2126 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2127 if (pss->complete_round && pss->block == rs->last_seen_block &&
2128 pss->page >= rs->last_page) {
2129 /*
2130 * We've been once around the RAM and haven't found anything.
2131 * Give up.
2132 */
2133 *again = false;
2134 return false;
2135 }
2136 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2137 /* Didn't find anything in this RAM Block */
2138 pss->page = 0;
2139 pss->block = QLIST_NEXT_RCU(pss->block, next);
2140 if (!pss->block) {
2141 /*
2142 * If memory migration starts over, we will meet a dirtied page
2143 * which may still exist in the compression threads' ring, so we
2144 * should flush the compressed data to make sure the new page
2145 * is not overwritten by the old one in the destination.
2146 *
2147 * Also, if xbzrle is on, stop using the data compression at this
2148 * point. In theory, xbzrle can do better than compression.
2149 */
2150 flush_compressed_data(rs);
2151
2152 /* Hit the end of the list */
2153 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2154 /* Flag that we've looped */
2155 pss->complete_round = true;
2156 rs->ram_bulk_stage = false;
2157 }
2158 /* Didn't find anything this time, but try again on the new block */
2159 *again = true;
2160 return false;
2161 } else {
2162 /* Can go around again, but... */
2163 *again = true;
2164 /* We've found something so probably don't need to */
2165 return true;
2166 }
2167 }
2168
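/*
 * Editor's illustration (not part of QEMU's ram.c): the scan above walks
 * block by block and wraps to the head of the list once, using
 * complete_round to know when a full pass has happened. A stripped-down
 * sketch of that control flow, with a hypothetical fixed-size array
 * standing in for the RCU block list:
 */
#if 0
static int example_round_robin_scan(const bool *dirty, int nblocks,
                                    int start, bool *looped)
{
    int idx = start;

    do {
        if (dirty[idx]) {
            return idx;                  /* found something to send         */
        }
        idx++;
        if (idx == nblocks) {            /* hit the end of the list         */
            idx = 0;
            *looped = true;              /* equivalent of complete_round    */
        }
    } while (!(*looped && idx == start));

    return -1;                           /* whole pass done, nothing dirty  */
}
#endif
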
2169 /**
2170 * unqueue_page: gets a page off the queue
2171 *
2172 * Helper for 'get_queued_page' - gets a page off the queue
2173 *
2174 * Returns the block of the page (or NULL if none available)
2175 *
2176 * @rs: current RAM state
2177 * @offset: used to return the offset within the RAMBlock
2178 */
2179 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2180 {
2181 RAMBlock *block = NULL;
2182
2183 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2184 return NULL;
2185 }
2186
2187 qemu_mutex_lock(&rs->src_page_req_mutex);
2188 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2189 struct RAMSrcPageRequest *entry =
2190 QSIMPLEQ_FIRST(&rs->src_page_requests);
2191 block = entry->rb;
2192 *offset = entry->offset;
2193
2194 if (entry->len > TARGET_PAGE_SIZE) {
2195 entry->len -= TARGET_PAGE_SIZE;
2196 entry->offset += TARGET_PAGE_SIZE;
2197 } else {
2198 memory_region_unref(block->mr);
2199 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2200 g_free(entry);
2201 migration_consume_urgent_request();
2202 }
2203 }
2204 qemu_mutex_unlock(&rs->src_page_req_mutex);
2205
2206 return block;
2207 }
2208
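/*
 * Editor's illustration (not part of QEMU's ram.c): a request queued by
 * ram_save_queue_pages() can cover several target pages; unqueue_page()
 * above peels one page off the front of the request per call. A minimal
 * sketch of that arithmetic, with hypothetical names:
 */
#if 0
static void example_consume_request(ram_addr_t *req_offset,
                                    ram_addr_t *req_len,
                                    ram_addr_t *page_offset)
{
    *page_offset = *req_offset;           /* page handed to the caller      */

    if (*req_len > TARGET_PAGE_SIZE) {    /* more pages remain queued       */
        *req_len -= TARGET_PAGE_SIZE;
        *req_offset += TARGET_PAGE_SIZE;
    } else {
        *req_len = 0;                     /* request fully consumed         */
    }
}
#endif
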
2209 /**
2210 * get_queued_page: unqueue a page from the postcopy requests
2211 *
2212 * Skips pages that are already sent (!dirty)
2213 *
2214 * Returns true if a queued page is found
2215 *
2216 * @rs: current RAM state
2217 * @pss: data about the state of the current dirty page scan
2218 */
2219 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2220 {
2221 RAMBlock *block;
2222 ram_addr_t offset;
2223 bool dirty;
2224
2225 do {
2226 block = unqueue_page(rs, &offset);
2227 /*
2228 * We're sending this page, and since it's postcopy nothing else
2229 * will dirty it, and we must make sure it doesn't get sent again
2230 * even if this queue request was received after the background
2231 * search already sent it.
2232 */
2233 if (block) {
2234 unsigned long page;
2235
2236 page = offset >> TARGET_PAGE_BITS;
2237 dirty = test_bit(page, block->bmap);
2238 if (!dirty) {
2239 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2240 page, test_bit(page, block->unsentmap));
2241 } else {
2242 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2243 }
2244 }
2245
2246 } while (block && !dirty);
2247
2248 if (block) {
2249 /*
2250 * As soon as we start servicing pages out of order, we have
2251 * to kill the bulk stage, since the bulk stage assumes
2252 * (in migration_bitmap_find_and_reset_dirty) that every page is
2253 * dirty, which is no longer true.
2254 */
2255 rs->ram_bulk_stage = false;
2256
2257 /*
2258 * We want the background search to continue from the queued page
2259 * since the guest is likely to want other pages near to the page
2260 * it just requested.
2261 */
2262 pss->block = block;
2263 pss->page = offset >> TARGET_PAGE_BITS;
2264 }
2265
2266 return !!block;
2267 }
2268
2269 /**
2270 * migration_page_queue_free: drop any remaining pages in the ram
2271 * request queue
2272 *
2273 * It should be empty at the end anyway, but in error cases there may
2274 * be some left. If any pages are left, we drop them.
2275 *
2276 */
2277 static void migration_page_queue_free(RAMState *rs)
2278 {
2279 struct RAMSrcPageRequest *mspr, *next_mspr;
2280 /* This queue should generally be empty - but in the case of a failed
2281 * migration it might have some entries left over.
2282 */
2283 rcu_read_lock();
2284 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2285 memory_region_unref(mspr->rb->mr);
2286 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2287 g_free(mspr);
2288 }
2289 rcu_read_unlock();
2290 }
2291
2292 /**
2293 * ram_save_queue_pages: queue the page for transmission
2294 *
2295 * A request from postcopy destination for example.
2296 *
2297 * Returns zero on success or negative on error
2298 *
2299 * @rbname: Name of the RAMBlock of the request. NULL means the
2300 * same as the last one.
2301 * @start: starting address from the start of the RAMBlock
2302 * @len: length (in bytes) to send
2303 */
2304 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2305 {
2306 RAMBlock *ramblock;
2307 RAMState *rs = ram_state;
2308
2309 ram_counters.postcopy_requests++;
2310 rcu_read_lock();
2311 if (!rbname) {
2312 /* Reuse last RAMBlock */
2313 ramblock = rs->last_req_rb;
2314
2315 if (!ramblock) {
2316 /*
2317 * Shouldn't happen, we can't reuse the last RAMBlock if
2318 * it's the 1st request.
2319 */
2320 error_report("ram_save_queue_pages no previous block");
2321 goto err;
2322 }
2323 } else {
2324 ramblock = qemu_ram_block_by_name(rbname);
2325
2326 if (!ramblock) {
2327 /* We shouldn't be asked for a non-existent RAMBlock */
2328 error_report("ram_save_queue_pages no block '%s'", rbname);
2329 goto err;
2330 }
2331 rs->last_req_rb = ramblock;
2332 }
2333 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2334 if (start+len > ramblock->used_length) {
2335 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2336 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2337 __func__, start, len, ramblock->used_length);
2338 goto err;
2339 }
2340
2341 struct RAMSrcPageRequest *new_entry =
2342 g_malloc0(sizeof(struct RAMSrcPageRequest));
2343 new_entry->rb = ramblock;
2344 new_entry->offset = start;
2345 new_entry->len = len;
2346
2347 memory_region_ref(ramblock->mr);
2348 qemu_mutex_lock(&rs->src_page_req_mutex);
2349 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2350 migration_make_urgent_request();
2351 qemu_mutex_unlock(&rs->src_page_req_mutex);
2352 rcu_read_unlock();
2353
2354 return 0;
2355
2356 err:
2357 rcu_read_unlock();
2358 return -1;
2359 }
2360
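/*
 * Editor's note (not part of QEMU's ram.c): ram_save_queue_pages() is the
 * entry point used when the postcopy destination asks the source for a
 * specific range. A hedged usage sketch; the caller and the block name
 * "pc.ram" are hypothetical, only the queueing call itself is real:
 */
#if 0
static void example_service_page_request(void)
{
    /* destination faulted on 3 target pages starting at 0x200000 */
    if (ram_save_queue_pages("pc.ram", 0x200000, 3 * TARGET_PAGE_SIZE) < 0) {
        error_report("failed to queue urgent page request");
    }
}
#endif
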
2361 static bool save_page_use_compression(RAMState *rs)
2362 {
2363 if (!migrate_use_compression()) {
2364 return false;
2365 }
2366
2367 /*
2368 * If xbzrle is on, stop using the data compression after first
2369 * round of migration even if compression is enabled. In theory,
2370 * xbzrle can do better than compression.
2371 */
2372 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2373 return true;
2374 }
2375
2376 return false;
2377 }
2378
2379 /*
2380 * try to compress the page before posting it out, return true if the page
2381 * has been properly handled by compression, otherwise needs other
2382 * paths to handle it
2383 */
2384 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2385 {
2386 if (!save_page_use_compression(rs)) {
2387 return false;
2388 }
2389
2390 /*
2391 * When starting the process of a new block, the first page of
2392 * the block should be sent out before other pages in the same
2393 * block, and all the pages in the last block should have been sent
2394 * out. Keeping this order is important, because the 'cont' flag
2395 * is used to avoid resending the block name.
2396 *
2397 * We post the first page as a normal page because compression
2398 * takes a lot of CPU.
2399 */
2400 if (block != rs->last_sent_block) {
2401 flush_compressed_data(rs);
2402 return false;
2403 }
2404
2405 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2406 return true;
2407 }
2408
2409 compression_counters.busy++;
2410 return false;
2411 }
2412
2413 /**
2414 * ram_save_target_page: save one target page
2415 *
2416 * Returns the number of pages written
2417 *
2418 * @rs: current RAM state
2419 * @pss: data about the page we want to send
2420 * @last_stage: if we are at the completion stage
2421 */
2422 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2423 bool last_stage)
2424 {
2425 RAMBlock *block = pss->block;
2426 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2427 int res;
2428
2429 if (control_save_page(rs, block, offset, &res)) {
2430 return res;
2431 }
2432
2433 if (save_compress_page(rs, block, offset)) {
2434 return 1;
2435 }
2436
2437 res = save_zero_page(rs, block, offset);
2438 if (res > 0) {
2439 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2440 * page would be stale
2441 */
2442 if (!save_page_use_compression(rs)) {
2443 XBZRLE_cache_lock();
2444 xbzrle_cache_zero_page(rs, block->offset + offset);
2445 XBZRLE_cache_unlock();
2446 }
2447 ram_release_pages(block->idstr, offset, res);
2448 return res;
2449 }
2450
2451 /*
2452 * do not use multifd for compression as the first page in the new
2453 * block should be posted out before sending the compressed page
2454 */
2455 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2456 return ram_save_multifd_page(rs, block, offset);
2457 }
2458
2459 return ram_save_page(rs, pss, last_stage);
2460 }
2461
2462 /**
2463 * ram_save_host_page: save a whole host page
2464 *
2465 * Starting at the page indicated by @pss, send pages up to the end of
2466 * the current host page. It's valid for the starting page to be in the
2467 * middle of a host page, in which case the remainder of the host page is sent.
2468 * Only dirty target pages are sent. Note that the host page size may
2469 * be a huge page for this block.
2470 * The saving stops at the boundary of the used_length of the block
2471 * if the RAMBlock isn't a multiple of the host page size.
2472 *
2473 * Returns the number of pages written or negative on error
2474 *
2475 * @rs: current RAM state
2477 * @pss: data about the page we want to send
2478 * @last_stage: if we are at the completion stage
2479 */
2480 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2481 bool last_stage)
2482 {
2483 int tmppages, pages = 0;
2484 size_t pagesize_bits =
2485 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2486
2487 if (ramblock_is_ignored(pss->block)) {
2488 error_report("block %s should not be migrated !", pss->block->idstr);
2489 return 0;
2490 }
2491
2492 do {
2493 /* Check if the page is dirty and, if it is, send it */
2494 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2495 pss->page++;
2496 continue;
2497 }
2498
2499 tmppages = ram_save_target_page(rs, pss, last_stage);
2500 if (tmppages < 0) {
2501 return tmppages;
2502 }
2503
2504 pages += tmppages;
2505 if (pss->block->unsentmap) {
2506 clear_bit(pss->page, pss->block->unsentmap);
2507 }
2508
2509 pss->page++;
2510 } while ((pss->page & (pagesize_bits - 1)) &&
2511 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2512
2513 /* The offset we leave with is the last one we looked at */
2514 pss->page--;
2515 return pages;
2516 }
2517
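/*
 * Editor's illustration (not part of QEMU's ram.c): the loop above relies
 * on pagesize_bits being a power of two, so "pss->page & (pagesize_bits - 1)"
 * is zero exactly at a host-page boundary. A minimal sketch, assuming 4 KiB
 * target pages inside a 2 MiB hugetlbfs host page; the function name is
 * hypothetical:
 */
#if 0
static void example_host_page_walk(void)
{
    const unsigned long pagesize_bits = (2 * 1024 * 1024) >> 12;  /* 512 */
    unsigned long page = 5;          /* start in the middle of a host page */

    do {
        /* ...send target page 'page' if it is dirty... */
        page++;
    } while (page & (pagesize_bits - 1));

    g_assert(page == 512);           /* stopped at the host-page boundary  */
}
#endif
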
2518 /**
2519 * ram_find_and_save_block: finds a dirty page and sends it to f
2520 *
2521 * Called within an RCU critical section.
2522 *
2523 * Returns the number of pages written where zero means no dirty pages,
2524 * or negative on error
2525 *
2526 * @rs: current RAM state
2527 * @last_stage: if we are at the completion stage
2528 *
2529 * On systems where host-page-size > target-page-size it will send all the
2530 * pages in a host page that are dirty.
2531 */
2532
2533 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2534 {
2535 PageSearchStatus pss;
2536 int pages = 0;
2537 bool again, found;
2538
2539 /* No dirty page as there is zero RAM */
2540 if (!ram_bytes_total()) {
2541 return pages;
2542 }
2543
2544 pss.block = rs->last_seen_block;
2545 pss.page = rs->last_page;
2546 pss.complete_round = false;
2547
2548 if (!pss.block) {
2549 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2550 }
2551
2552 do {
2553 again = true;
2554 found = get_queued_page(rs, &pss);
2555
2556 if (!found) {
2557 /* priority queue empty, so just search for something dirty */
2558 found = find_dirty_block(rs, &pss, &again);
2559 }
2560
2561 if (found) {
2562 pages = ram_save_host_page(rs, &pss, last_stage);
2563 }
2564 } while (!pages && again);
2565
2566 rs->last_seen_block = pss.block;
2567 rs->last_page = pss.page;
2568
2569 return pages;
2570 }
2571
2572 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2573 {
2574 uint64_t pages = size / TARGET_PAGE_SIZE;
2575
2576 if (zero) {
2577 ram_counters.duplicate += pages;
2578 } else {
2579 ram_counters.normal += pages;
2580 ram_counters.transferred += size;
2581 qemu_update_position(f, size);
2582 }
2583 }
2584
2585 static uint64_t ram_bytes_total_common(bool count_ignored)
2586 {
2587 RAMBlock *block;
2588 uint64_t total = 0;
2589
2590 rcu_read_lock();
2591 if (count_ignored) {
2592 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2593 total += block->used_length;
2594 }
2595 } else {
2596 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2597 total += block->used_length;
2598 }
2599 }
2600 rcu_read_unlock();
2601 return total;
2602 }
2603
2604 uint64_t ram_bytes_total(void)
2605 {
2606 return ram_bytes_total_common(false);
2607 }
2608
2609 static void xbzrle_load_setup(void)
2610 {
2611 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2612 }
2613
2614 static void xbzrle_load_cleanup(void)
2615 {
2616 g_free(XBZRLE.decoded_buf);
2617 XBZRLE.decoded_buf = NULL;
2618 }
2619
2620 static void ram_state_cleanup(RAMState **rsp)
2621 {
2622 if (*rsp) {
2623 migration_page_queue_free(*rsp);
2624 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2625 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2626 g_free(*rsp);
2627 *rsp = NULL;
2628 }
2629 }
2630
2631 static void xbzrle_cleanup(void)
2632 {
2633 XBZRLE_cache_lock();
2634 if (XBZRLE.cache) {
2635 cache_fini(XBZRLE.cache);
2636 g_free(XBZRLE.encoded_buf);
2637 g_free(XBZRLE.current_buf);
2638 g_free(XBZRLE.zero_target_page);
2639 XBZRLE.cache = NULL;
2640 XBZRLE.encoded_buf = NULL;
2641 XBZRLE.current_buf = NULL;
2642 XBZRLE.zero_target_page = NULL;
2643 }
2644 XBZRLE_cache_unlock();
2645 }
2646
2647 static void ram_save_cleanup(void *opaque)
2648 {
2649 RAMState **rsp = opaque;
2650 RAMBlock *block;
2651
2652 /* The caller must hold the iothread lock or be in a bh, so there is
2653 * no write race against this migration_bitmap
2654 */
2655 memory_global_dirty_log_stop();
2656
2657 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2658 g_free(block->bmap);
2659 block->bmap = NULL;
2660 g_free(block->unsentmap);
2661 block->unsentmap = NULL;
2662 }
2663
2664 xbzrle_cleanup();
2665 compress_threads_save_cleanup();
2666 ram_state_cleanup(rsp);
2667 }
2668
2669 static void ram_state_reset(RAMState *rs)
2670 {
2671 rs->last_seen_block = NULL;
2672 rs->last_sent_block = NULL;
2673 rs->last_page = 0;
2674 rs->last_version = ram_list.version;
2675 rs->ram_bulk_stage = true;
2676 rs->fpo_enabled = false;
2677 }
2678
2679 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2680
2681 /*
2682 * 'expected' is the value you expect the bitmap mostly to be full
2683 * of; it won't bother printing lines that are all this value.
2685 */
2686 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2687 unsigned long pages)
2688 {
2689 int64_t cur;
2690 int64_t linelen = 128;
2691 char linebuf[129];
2692
2693 for (cur = 0; cur < pages; cur += linelen) {
2694 int64_t curb;
2695 bool found = false;
2696 /*
2697 * Last line; catch the case where the line length
2698 * is longer than remaining ram
2699 */
2700 if (cur + linelen > pages) {
2701 linelen = pages - cur;
2702 }
2703 for (curb = 0; curb < linelen; curb++) {
2704 bool thisbit = test_bit(cur + curb, todump);
2705 linebuf[curb] = thisbit ? '1' : '.';
2706 found = found || (thisbit != expected);
2707 }
2708 if (found) {
2709 linebuf[curb] = '\0';
2710 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2711 }
2712 }
2713 }
2714
2715 /* **** functions for postcopy ***** */
2716
2717 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2718 {
2719 struct RAMBlock *block;
2720
2721 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2722 unsigned long *bitmap = block->bmap;
2723 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2724 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2725
2726 while (run_start < range) {
2727 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2728 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2729 (run_end - run_start) << TARGET_PAGE_BITS);
2730 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2731 }
2732 }
2733 }
2734
2735 /**
2736 * postcopy_send_discard_bm_ram: discard a RAMBlock
2737 *
2738 * Returns zero on success
2739 *
2740 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2741 * Note: At this point the 'unsentmap' is the processed bitmap combined
2742 * with the dirtymap; so a '1' means it's either dirty or unsent.
2743 *
2744 * @ms: current migration state
2745 * @pds: state for postcopy
2746 * @block: RAMBlock to send the discards for
2748 */
2749 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2750 PostcopyDiscardState *pds,
2751 RAMBlock *block)
2752 {
2753 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2754 unsigned long current;
2755 unsigned long *unsentmap = block->unsentmap;
2756
2757 for (current = 0; current < end; ) {
2758 unsigned long one = find_next_bit(unsentmap, end, current);
2759
2760 if (one <= end) {
2761 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2762 unsigned long discard_length;
2763
2764 if (zero >= end) {
2765 discard_length = end - one;
2766 } else {
2767 discard_length = zero - one;
2768 }
2769 if (discard_length) {
2770 postcopy_discard_send_range(ms, pds, one, discard_length);
2771 }
2772 current = one + discard_length;
2773 } else {
2774 current = one;
2775 }
2776 }
2777
2778 return 0;
2779 }
2780
2781 /**
2782 * postcopy_each_ram_send_discard: discard all RAMBlocks
2783 *
2784 * Returns 0 for success or negative for error
2785 *
2786 * Utility for the outgoing postcopy code.
2787 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2788 * passing it bitmap indexes and name.
2789 * (qemu_ram_foreach_block ends up passing unscaled lengths
2790 * which would mean postcopy code would have to deal with target page)
2791 *
2792 * @ms: current migration state
2793 */
2794 static int postcopy_each_ram_send_discard(MigrationState *ms)
2795 {
2796 struct RAMBlock *block;
2797 int ret;
2798
2799 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2800 PostcopyDiscardState *pds =
2801 postcopy_discard_send_init(ms, block->idstr);
2802
2803 /*
2804 * Postcopy sends chunks of bitmap over the wire, but it
2805 * just needs indexes at this point, avoids it having
2806 * target page specific code.
2807 */
2808 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2809 postcopy_discard_send_finish(ms, pds);
2810 if (ret) {
2811 return ret;
2812 }
2813 }
2814
2815 return 0;
2816 }
2817
2818 /**
2819 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2820 *
2821 * Helper for postcopy_chunk_hostpages; it's called twice to
2822 * canonicalize the two bitmaps, which are similar, but one is
2823 * inverted.
2824 *
2825 * Postcopy requires that all target pages in a hostpage are dirty or
2826 * clean, not a mix. This function canonicalizes the bitmaps.
2827 *
2828 * @ms: current migration state
2829 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2830 * otherwise we need to canonicalize partially dirty host pages
2831 * @block: block that contains the page we want to canonicalize
2832 * @pds: state for postcopy
2833 */
2834 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2835 RAMBlock *block,
2836 PostcopyDiscardState *pds)
2837 {
2838 RAMState *rs = ram_state;
2839 unsigned long *bitmap = block->bmap;
2840 unsigned long *unsentmap = block->unsentmap;
2841 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2842 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2843 unsigned long run_start;
2844
2845 if (block->page_size == TARGET_PAGE_SIZE) {
2846 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2847 return;
2848 }
2849
2850 if (unsent_pass) {
2851 /* Find a sent page */
2852 run_start = find_next_zero_bit(unsentmap, pages, 0);
2853 } else {
2854 /* Find a dirty page */
2855 run_start = find_next_bit(bitmap, pages, 0);
2856 }
2857
2858 while (run_start < pages) {
2859 bool do_fixup = false;
2860 unsigned long fixup_start_addr;
2861 unsigned long host_offset;
2862
2863 /*
2864 * If the start of this run of pages is in the middle of a host
2865 * page, then we need to fixup this host page.
2866 */
2867 host_offset = run_start % host_ratio;
2868 if (host_offset) {
2869 do_fixup = true;
2870 run_start -= host_offset;
2871 fixup_start_addr = run_start;
2872 /* For the next pass */
2873 run_start = run_start + host_ratio;
2874 } else {
2875 /* Find the end of this run */
2876 unsigned long run_end;
2877 if (unsent_pass) {
2878 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2879 } else {
2880 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2881 }
2882 /*
2883 * If the end isn't at the start of a host page, then the
2884 * run doesn't finish at the end of a host page
2885 * and we need to discard.
2886 */
2887 host_offset = run_end % host_ratio;
2888 if (host_offset) {
2889 do_fixup = true;
2890 fixup_start_addr = run_end - host_offset;
2891 /*
2892 * This host page has gone, the next loop iteration starts
2893 * from after the fixup
2894 */
2895 run_start = fixup_start_addr + host_ratio;
2896 } else {
2897 /*
2898 * No discards on this iteration, next loop starts from
2899 * next sent/dirty page
2900 */
2901 run_start = run_end + 1;
2902 }
2903 }
2904
2905 if (do_fixup) {
2906 unsigned long page;
2907
2908 /* Tell the destination to discard this page */
2909 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2910 /* For the unsent_pass we:
2911 * discard partially sent pages
2912 * For the !unsent_pass (dirty) we:
2913 * discard partially dirty pages that were sent
2914 * (any partially sent pages were already discarded
2915 * by the previous unsent_pass)
2916 */
2917 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2918 host_ratio);
2919 }
2920
2921 /* Clean up the bitmap */
2922 for (page = fixup_start_addr;
2923 page < fixup_start_addr + host_ratio; page++) {
2924 /* All pages in this host page are now not sent */
2925 set_bit(page, unsentmap);
2926
2927 /*
2928 * Remark them as dirty, updating the count for any pages
2929 * that weren't previously dirty.
2930 */
2931 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2932 }
2933 }
2934
2935 if (unsent_pass) {
2936 /* Find the next sent page for the next iteration */
2937 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2938 } else {
2939 /* Find the next dirty page for the next iteration */
2940 run_start = find_next_bit(bitmap, pages, run_start);
2941 }
2942 }
2943 }
2944
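/*
 * Editor's illustration (not part of QEMU's ram.c): the fixups above are
 * plain round-down-to-host-page arithmetic. A minimal sketch, assuming
 * host_ratio == 512 (2 MiB host pages, 4 KiB target pages); the function
 * name is hypothetical:
 */
#if 0
static void example_fixup_alignment(void)
{
    const unsigned long host_ratio = 512;
    unsigned long run_start = 1000;                    /* mid host page    */
    unsigned long host_offset = run_start % host_ratio;

    if (host_offset) {
        unsigned long fixup_start_addr = run_start - host_offset;

        /* the whole host page [512, 1024) is discarded and re-marked dirty */
        g_assert(fixup_start_addr == 512);
        g_assert(fixup_start_addr + host_ratio == 1024);
    }
}
#endif
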
2945 /**
2946 * postcopy_chunk_hostpages: discard any partially sent host page
2947 *
2948 * Utility for the outgoing postcopy code.
2949 *
2950 * Discard any partially sent host-page size chunks, mark any partially
2951 * dirty host-page size chunks as all dirty. In this case the host-page
2952 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2953 *
2954 * Returns zero on success
2955 *
2956 * @ms: current migration state
2957 * @block: block we want to work with
2958 */
2959 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2960 {
2961 PostcopyDiscardState *pds =
2962 postcopy_discard_send_init(ms, block->idstr);
2963
2964 /* First pass: Discard all partially sent host pages */
2965 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2966 /*
2967 * Second pass: Ensure that all partially dirty host pages are made
2968 * fully dirty.
2969 */
2970 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2971
2972 postcopy_discard_send_finish(ms, pds);
2973 return 0;
2974 }
2975
2976 /**
2977 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2978 *
2979 * Returns zero on success
2980 *
2981 * Transmit the set of pages to be discarded after precopy to the target;
2982 * these are pages that:
2983 * a) have been previously transmitted but are now dirty again
2984 * b) have never been transmitted; this ensures that any pages on the
2985 * destination that have been mapped by background tasks get
2986 * discarded (transparent huge pages are the specific concern)
2987 * Hopefully this is pretty sparse
2988 *
2989 * @ms: current migration state
2990 */
2991 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2992 {
2993 RAMState *rs = ram_state;
2994 RAMBlock *block;
2995 int ret;
2996
2997 rcu_read_lock();
2998
2999 /* This should be our last sync, the src is now paused */
3000 migration_bitmap_sync(rs);
3001
3002 /* Easiest way to make sure we don't resume in the middle of a host-page */
3003 rs->last_seen_block = NULL;
3004 rs->last_sent_block = NULL;
3005 rs->last_page = 0;
3006
3007 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3008 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3009 unsigned long *bitmap = block->bmap;
3010 unsigned long *unsentmap = block->unsentmap;
3011
3012 if (!unsentmap) {
3013 /* We don't have a safe way to resize the unsentmap, so
3014 * if the bitmap was resized it will be NULL at this
3015 * point.
3016 */
3017 error_report("migration ram resized during precopy phase");
3018 rcu_read_unlock();
3019 return -EINVAL;
3020 }
3021 /* Deal with TPS != HPS and huge pages */
3022 ret = postcopy_chunk_hostpages(ms, block);
3023 if (ret) {
3024 rcu_read_unlock();
3025 return ret;
3026 }
3027
3028 /*
3029 * Update the unsentmap to be unsentmap = unsentmap | dirty
3030 */
3031 bitmap_or(unsentmap, unsentmap, bitmap, pages);
3032 #ifdef DEBUG_POSTCOPY
3033 ram_debug_dump_bitmap(unsentmap, true, pages);
3034 #endif
3035 }
3036 trace_ram_postcopy_send_discard_bitmap();
3037
3038 ret = postcopy_each_ram_send_discard(ms);
3039 rcu_read_unlock();
3040
3041 return ret;
3042 }
3043
3044 /**
3045 * ram_discard_range: discard dirtied pages at the beginning of postcopy
3046 *
3047 * Returns zero on success
3048 *
3049 * @rbname: name of the RAMBlock of the request. NULL means the
3050 * same as the last one.
3051 * @start: start address within the RAMBlock
3052 * @length: length (in bytes) to discard
3053 */
3054 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3055 {
3056 int ret = -1;
3057
3058 trace_ram_discard_range(rbname, start, length);
3059
3060 rcu_read_lock();
3061 RAMBlock *rb = qemu_ram_block_by_name(rbname);
3062
3063 if (!rb) {
3064 error_report("ram_discard_range: Failed to find block '%s'", rbname);
3065 goto err;
3066 }
3067
3068 /*
3069 * On the source VM, we don't need to update the received bitmap since
3070 * we don't even have one.
3071 */
3072 if (rb->receivedmap) {
3073 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3074 length >> qemu_target_page_bits());
3075 }
3076
3077 ret = ram_block_discard_range(rb, start, length);
3078
3079 err:
3080 rcu_read_unlock();
3081
3082 return ret;
3083 }
3084
3085 /*
3086 * For every allocation, we try not to crash the VM if the
3087 * allocation fails.
3088 */
3089 static int xbzrle_init(void)
3090 {
3091 Error *local_err = NULL;
3092
3093 if (!migrate_use_xbzrle()) {
3094 return 0;
3095 }
3096
3097 XBZRLE_cache_lock();
3098
3099 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3100 if (!XBZRLE.zero_target_page) {
3101 error_report("%s: Error allocating zero page", __func__);
3102 goto err_out;
3103 }
3104
3105 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3106 TARGET_PAGE_SIZE, &local_err);
3107 if (!XBZRLE.cache) {
3108 error_report_err(local_err);
3109 goto free_zero_page;
3110 }
3111
3112 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3113 if (!XBZRLE.encoded_buf) {
3114 error_report("%s: Error allocating encoded_buf", __func__);
3115 goto free_cache;
3116 }
3117
3118 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3119 if (!XBZRLE.current_buf) {
3120 error_report("%s: Error allocating current_buf", __func__);
3121 goto free_encoded_buf;
3122 }
3123
3124 /* We are all good */
3125 XBZRLE_cache_unlock();
3126 return 0;
3127
3128 free_encoded_buf:
3129 g_free(XBZRLE.encoded_buf);
3130 XBZRLE.encoded_buf = NULL;
3131 free_cache:
3132 cache_fini(XBZRLE.cache);
3133 XBZRLE.cache = NULL;
3134 free_zero_page:
3135 g_free(XBZRLE.zero_target_page);
3136 XBZRLE.zero_target_page = NULL;
3137 err_out:
3138 XBZRLE_cache_unlock();
3139 return -ENOMEM;
3140 }
3141
3142 static int ram_state_init(RAMState **rsp)
3143 {
3144 *rsp = g_try_new0(RAMState, 1);
3145
3146 if (!*rsp) {
3147 error_report("%s: Init ramstate fail", __func__);
3148 return -1;
3149 }
3150
3151 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3152 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3153 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3154
3155 /*
3156 * Count the total number of pages used by ram blocks not including any
3157 * gaps due to alignment or unplugs.
3158 */
3159 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3160
3161 ram_state_reset(*rsp);
3162
3163 return 0;
3164 }
3165
3166 static void ram_list_init_bitmaps(void)
3167 {
3168 RAMBlock *block;
3169 unsigned long pages;
3170
3171 /* Skip setting bitmap if there is no RAM */
3172 if (ram_bytes_total()) {
3173 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3174 pages = block->max_length >> TARGET_PAGE_BITS;
3175 block->bmap = bitmap_new(pages);
3176 bitmap_set(block->bmap, 0, pages);
3177 if (migrate_postcopy_ram()) {
3178 block->unsentmap = bitmap_new(pages);
3179 bitmap_set(block->unsentmap, 0, pages);
3180 }
3181 }
3182 }
3183 }
3184
3185 static void ram_init_bitmaps(RAMState *rs)
3186 {
3187 /* For memory_global_dirty_log_start below. */
3188 qemu_mutex_lock_iothread();
3189 qemu_mutex_lock_ramlist();
3190 rcu_read_lock();
3191
3192 ram_list_init_bitmaps();
3193 memory_global_dirty_log_start();
3194 migration_bitmap_sync_precopy(rs);
3195
3196 rcu_read_unlock();
3197 qemu_mutex_unlock_ramlist();
3198 qemu_mutex_unlock_iothread();
3199 }
3200
3201 static int ram_init_all(RAMState **rsp)
3202 {
3203 if (ram_state_init(rsp)) {
3204 return -1;
3205 }
3206
3207 if (xbzrle_init()) {
3208 ram_state_cleanup(rsp);
3209 return -1;
3210 }
3211
3212 ram_init_bitmaps(*rsp);
3213
3214 return 0;
3215 }
3216
3217 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3218 {
3219 RAMBlock *block;
3220 uint64_t pages = 0;
3221
3222 /*
3223 * Postcopy is not using xbzrle/compression, so no need for that.
3224 * Also, since the source is already halted, we don't need to care
3225 * about dirty page logging either.
3226 */
3227
3228 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3229 pages += bitmap_count_one(block->bmap,
3230 block->used_length >> TARGET_PAGE_BITS);
3231 }
3232
3233 /* This may not be aligned with current bitmaps. Recalculate. */
3234 rs->migration_dirty_pages = pages;
3235
3236 rs->last_seen_block = NULL;
3237 rs->last_sent_block = NULL;
3238 rs->last_page = 0;
3239 rs->last_version = ram_list.version;
3240 /*
3241 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3242 * matter what we have sent.
3243 */
3244 rs->ram_bulk_stage = false;
3245
3246 /* Update RAMState cache of output QEMUFile */
3247 rs->f = out;
3248
3249 trace_ram_state_resume_prepare(pages);
3250 }
3251
3252 /*
3253 * This function clears bits of the free pages reported by the caller from the
3254 * migration dirty bitmap. @addr is the host address corresponding to the
3255 * start of the contiguous guest free pages, and @len is the total size in
3256 * bytes of those pages.
3257 */
3258 void qemu_guest_free_page_hint(void *addr, size_t len)
3259 {
3260 RAMBlock *block;
3261 ram_addr_t offset;
3262 size_t used_len, start, npages;
3263 MigrationState *s = migrate_get_current();
3264
3265 /* This function is currently expected to be used during live migration */
3266 if (!migration_is_setup_or_active(s->state)) {
3267 return;
3268 }
3269
3270 for (; len > 0; len -= used_len, addr += used_len) {
3271 block = qemu_ram_block_from_host(addr, false, &offset);
3272 if (unlikely(!block || offset >= block->used_length)) {
3273 /*
3274 * The implementation might not support RAMBlock resize during
3275 * live migration, but it could happen in theory with future
3276 * updates. So we add a check here to capture that case.
3277 */
3278 error_report_once("%s unexpected error", __func__);
3279 return;
3280 }
3281
3282 if (len <= block->used_length - offset) {
3283 used_len = len;
3284 } else {
3285 used_len = block->used_length - offset;
3286 }
3287
3288 start = offset >> TARGET_PAGE_BITS;
3289 npages = used_len >> TARGET_PAGE_BITS;
3290
3291 qemu_mutex_lock(&ram_state->bitmap_mutex);
3292 ram_state->migration_dirty_pages -=
3293 bitmap_count_one_with_offset(block->bmap, start, npages);
3294 bitmap_clear(block->bmap, start, npages);
3295 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3296 }
3297 }
3298
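/*
 * Editor's illustration (not part of QEMU's ram.c): the accounting above
 * must only subtract bits that were actually set, which is why it counts
 * before clearing. A minimal sketch with a single unsigned long standing in
 * for the bitmap (hypothetical helper name, 64 pages only):
 */
#if 0
static void example_free_page_hint_accounting(void)
{
    unsigned long bmap = 0xffUL;           /* pages 0..7 dirty              */
    unsigned long dirty_pages = 8;
    unsigned long start = 4, npages = 8;   /* hint covers pages 4..11       */
    unsigned long mask = ((1UL << npages) - 1) << start;

    /* only pages 4..7 are both dirty and hinted */
    dirty_pages -= __builtin_popcountl(bmap & mask);    /* 8 - 4 == 4       */
    bmap &= ~mask;

    g_assert(dirty_pages == 4);
    g_assert(bmap == 0x0fUL);
}
#endif
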
3299 /*
3300 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3301 * long-running RCU critical section. When RCU reclaims in the code
3302 * start to become numerous it will be necessary to reduce the
3303 * granularity of these critical sections.
3304 */
3305
3306 /**
3307 * ram_save_setup: Setup RAM for migration
3308 *
3309 * Returns zero to indicate success and negative for error
3310 *
3311 * @f: QEMUFile where to send the data
3312 * @opaque: RAMState pointer
3313 */
3314 static int ram_save_setup(QEMUFile *f, void *opaque)
3315 {
3316 RAMState **rsp = opaque;
3317 RAMBlock *block;
3318
3319 if (compress_threads_save_setup()) {
3320 return -1;
3321 }
3322
3323 /* migration has already set up the bitmap, reuse it. */
3324 if (!migration_in_colo_state()) {
3325 if (ram_init_all(rsp) != 0) {
3326 compress_threads_save_cleanup();
3327 return -1;
3328 }
3329 }
3330 (*rsp)->f = f;
3331
3332 rcu_read_lock();
3333
3334 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3335
3336 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3337 qemu_put_byte(f, strlen(block->idstr));
3338 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3339 qemu_put_be64(f, block->used_length);
3340 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3341 qemu_put_be64(f, block->page_size);
3342 }
3343 if (migrate_ignore_shared()) {
3344 qemu_put_be64(f, block->mr->addr);
3345 qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3346 }
3347 }
3348
3349 rcu_read_unlock();
3350
3351 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3352 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3353
3354 multifd_send_sync_main();
3355 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3356 qemu_fflush(f);
3357
3358 return 0;
3359 }
3360
3361 /**
3362 * ram_save_iterate: iterative stage for migration
3363 *
3364 * Returns zero to indicate success and negative for error
3365 *
3366 * @f: QEMUFile where to send the data
3367 * @opaque: RAMState pointer
3368 */
3369 static int ram_save_iterate(QEMUFile *f, void *opaque)
3370 {
3371 RAMState **temp = opaque;
3372 RAMState *rs = *temp;
3373 int ret;
3374 int i;
3375 int64_t t0;
3376 int done = 0;
3377
3378 if (blk_mig_bulk_active()) {
3379 /* Avoid transferring ram during bulk phase of block migration as
3380 * the bulk phase will usually take a long time and transferring
3381 * ram updates during that time is pointless. */
3382 goto out;
3383 }
3384
3385 rcu_read_lock();
3386 if (ram_list.version != rs->last_version) {
3387 ram_state_reset(rs);
3388 }
3389
3390 /* Read version before ram_list.blocks */
3391 smp_rmb();
3392
3393 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3394
3395 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3396 i = 0;
3397 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3398 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3399 int pages;
3400
3401 if (qemu_file_get_error(f)) {
3402 break;
3403 }
3404
3405 pages = ram_find_and_save_block(rs, false);
3406 /* no more pages to send */
3407 if (pages == 0) {
3408 done = 1;
3409 break;
3410 }
3411
3412 if (pages < 0) {
3413 qemu_file_set_error(f, pages);
3414 break;
3415 }
3416
3417 rs->target_page_count += pages;
3418
3419 /* We want to check in the first loop iteration, just in case it was
3420 the first time and we had to sync the dirty bitmap.
3421 qemu_clock_get_ns() is a bit expensive, so we only check every few
3422 iterations
3423 */
3424 if ((i & 63) == 0) {
3425 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3426 if (t1 > MAX_WAIT) {
3427 trace_ram_save_iterate_big_wait(t1, i);
3428 break;
3429 }
3430 }
3431 i++;
3432 }
3433 rcu_read_unlock();
3434
3435 /*
3436 * Must occur before EOS (or any QEMUFile operation)
3437 * because of RDMA protocol.
3438 */
3439 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3440
3441 multifd_send_sync_main();
3442 out:
3443 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3444 qemu_fflush(f);
3445 ram_counters.transferred += 8;
3446
3447 ret = qemu_file_get_error(f);
3448 if (ret < 0) {
3449 return ret;
3450 }
3451
3452 return done;
3453 }
3454
3455 /**
3456 * ram_save_complete: function called to send the remaining amount of ram
3457 *
3458 * Returns zero to indicate success or negative on error
3459 *
3460 * Called with iothread lock
3461 *
3462 * @f: QEMUFile where to send the data
3463 * @opaque: RAMState pointer
3464 */
3465 static int ram_save_complete(QEMUFile *f, void *opaque)
3466 {
3467 RAMState **temp = opaque;
3468 RAMState *rs = *temp;
3469 int ret = 0;
3470
3471 rcu_read_lock();
3472
3473 if (!migration_in_postcopy()) {
3474 migration_bitmap_sync_precopy(rs);
3475 }
3476
3477 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3478
3479 /* try transferring iterative blocks of memory */
3480
3481 /* flush all remaining blocks regardless of rate limiting */
3482 while (true) {
3483 int pages;
3484
3485 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3486 /* no more blocks to send */
3487 if (pages == 0) {
3488 break;
3489 }
3490 if (pages < 0) {
3491 ret = pages;
3492 break;
3493 }
3494 }
3495
3496 flush_compressed_data(rs);
3497 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3498
3499 rcu_read_unlock();
3500
3501 multifd_send_sync_main();
3502 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3503 qemu_fflush(f);
3504
3505 return ret;
3506 }
3507
3508 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3509 uint64_t *res_precopy_only,
3510 uint64_t *res_compatible,
3511 uint64_t *res_postcopy_only)
3512 {
3513 RAMState **temp = opaque;
3514 RAMState *rs = *temp;
3515 uint64_t remaining_size;
3516
3517 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3518
3519 if (!migration_in_postcopy() &&
3520 remaining_size < max_size) {
3521 qemu_mutex_lock_iothread();
3522 rcu_read_lock();
3523 migration_bitmap_sync_precopy(rs);
3524 rcu_read_unlock();
3525 qemu_mutex_unlock_iothread();
3526 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3527 }
3528
3529 if (migrate_postcopy_ram()) {
3530 /* We can do postcopy, and all the data is postcopiable */
3531 *res_compatible += remaining_size;
3532 } else {
3533 *res_precopy_only += remaining_size;
3534 }
3535 }
3536
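/*
 * Editor's illustration (not part of QEMU's ram.c): the pending estimate
 * above is simply "dirty pages times target page size", and it is reported
 * either as precopy-only work or as postcopy-compatible work. A minimal
 * sketch with hypothetical names, assuming 4 KiB target pages:
 */
#if 0
static uint64_t example_pending_bytes(uint64_t dirty_pages, bool postcopy_ok,
                                      uint64_t *precopy, uint64_t *compatible)
{
    uint64_t remaining = dirty_pages * 4096;

    if (postcopy_ok) {
        *compatible += remaining;   /* can still be sent after the switchover */
    } else {
        *precopy += remaining;      /* must be sent before the switchover     */
    }
    return remaining;
}
#endif
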
3537 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3538 {
3539 unsigned int xh_len;
3540 int xh_flags;
3541 uint8_t *loaded_data;
3542
3543 /* extract RLE header */
3544 xh_flags = qemu_get_byte(f);
3545 xh_len = qemu_get_be16(f);
3546
3547 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3548 error_report("Failed to load XBZRLE page - wrong compression!");
3549 return -1;
3550 }
3551
3552 if (xh_len > TARGET_PAGE_SIZE) {
3553 error_report("Failed to load XBZRLE page - len overflow!");
3554 return -1;
3555 }
3556 loaded_data = XBZRLE.decoded_buf;
3557 /* load data and decode */
3558 /* it can change loaded_data to point to an internal buffer */
3559 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3560
3561 /* decode RLE */
3562 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3563 TARGET_PAGE_SIZE) == -1) {
3564 error_report("Failed to load XBZRLE page - decode error!");
3565 return -1;
3566 }
3567
3568 return 0;
3569 }
3570
3571 /**
3572 * ram_block_from_stream: read a RAMBlock id from the migration stream
3573 *
3574 * Must be called from within a rcu critical section.
3575 *
3576 * Returns a pointer from within the RCU-protected ram_list.
3577 *
3578 * @f: QEMUFile where to read the data from
3579 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3580 */
3581 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3582 {
3583 static RAMBlock *block = NULL;
3584 char id[256];
3585 uint8_t len;
3586
3587 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3588 if (!block) {
3589 error_report("Ack, bad migration stream!");
3590 return NULL;
3591 }
3592 return block;
3593 }
3594
3595 len = qemu_get_byte(f);
3596 qemu_get_buffer(f, (uint8_t *)id, len);
3597 id[len] = 0;
3598
3599 block = qemu_ram_block_by_name(id);
3600 if (!block) {
3601 error_report("Can't find block %s", id);
3602 return NULL;
3603 }
3604
3605 if (ramblock_is_ignored(block)) {
3606 error_report("block %s should not be migrated !", id);
3607 return NULL;
3608 }
3609
3610 return block;
3611 }
3612
3613 static inline void *host_from_ram_block_offset(RAMBlock *block,
3614 ram_addr_t offset)
3615 {
3616 if (!offset_in_ramblock(block, offset)) {
3617 return NULL;
3618 }
3619
3620 return block->host + offset;
3621 }
3622
3623 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3624 ram_addr_t offset)
3625 {
3626 if (!offset_in_ramblock(block, offset)) {
3627 return NULL;
3628 }
3629 if (!block->colo_cache) {
3630 error_report("%s: colo_cache is NULL in block :%s",
3631 __func__, block->idstr);
3632 return NULL;
3633 }
3634
3635 /*
3636 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3637 * It helps us decide which pages in the RAM cache should be flushed
3638 * into VM's RAM later.
3639 */
3640 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3641 ram_state->migration_dirty_pages++;
3642 }
3643 return block->colo_cache + offset;
3644 }
3645
3646 /**
3647 * ram_handle_compressed: handle the zero page case
3648 *
3649 * If a page (or a whole RDMA chunk) has been
3650 * determined to be zero, then zap it.
3651 *
3652 * @host: host address for the zero page
3653 * @ch: what the page is filled from. We only support zero
3654 * @size: size of the zero page
3655 */
3656 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3657 {
3658 if (ch != 0 || !is_zero_range(host, size)) {
3659 memset(host, ch, size);
3660 }
3661 }
3662
3663 /* return the size after decompression, or negative value on error */
3664 static int
3665 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3666 const uint8_t *source, size_t source_len)
3667 {
3668 int err;
3669
3670 err = inflateReset(stream);
3671 if (err != Z_OK) {
3672 return -1;
3673 }
3674
3675 stream->avail_in = source_len;
3676 stream->next_in = (uint8_t *)source;
3677 stream->avail_out = dest_len;
3678 stream->next_out = dest;
3679
3680 err = inflate(stream, Z_NO_FLUSH);
3681 if (err != Z_STREAM_END) {
3682 return -1;
3683 }
3684
3685 return stream->total_out;
3686 }
3687
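/*
 * Editor's illustration (not part of QEMU's ram.c): the helper above reuses
 * one z_stream per thread and only calls inflateReset() between pages,
 * instead of paying for inflateInit()/inflateEnd() every time. A hedged,
 * standalone sketch of that zlib pattern for two compressed buffers; the
 * function name is hypothetical:
 */
#if 0
static int example_reuse_inflate_stream(const uint8_t *in1, size_t len1,
                                        const uint8_t *in2, size_t len2,
                                        uint8_t *out, size_t out_len)
{
    z_stream stream = { 0 };
    const uint8_t *inputs[2] = { in1, in2 };
    size_t lens[2] = { len1, len2 };
    int ret = -1;

    if (inflateInit(&stream) != Z_OK) {       /* expensive, done once       */
        return -1;
    }

    for (int i = 0; i < 2; i++) {
        if (inflateReset(&stream) != Z_OK) {  /* cheap per-buffer reset     */
            goto done;
        }
        stream.next_in = (uint8_t *)inputs[i];
        stream.avail_in = lens[i];
        stream.next_out = out;
        stream.avail_out = out_len;
        if (inflate(&stream, Z_NO_FLUSH) != Z_STREAM_END) {
            goto done;                        /* truncated or corrupt data  */
        }
    }
    ret = 0;
done:
    inflateEnd(&stream);
    return ret;
}
#endif
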
3688 static void *do_data_decompress(void *opaque)
3689 {
3690 DecompressParam *param = opaque;
3691 unsigned long pagesize;
3692 uint8_t *des;
3693 int len, ret;
3694
3695 qemu_mutex_lock(&param->mutex);
3696 while (!param->quit) {
3697 if (param->des) {
3698 des = param->des;
3699 len = param->len;
3700 param->des = 0;
3701 qemu_mutex_unlock(&param->mutex);
3702
3703 pagesize = TARGET_PAGE_SIZE;
3704
3705 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3706 param->compbuf, len);
3707 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3708 error_report("decompress data failed");
3709 qemu_file_set_error(decomp_file, ret);
3710 }
3711
3712 qemu_mutex_lock(&decomp_done_lock);
3713 param->done = true;
3714 qemu_cond_signal(&decomp_done_cond);
3715 qemu_mutex_unlock(&decomp_done_lock);
3716
3717 qemu_mutex_lock(&param->mutex);
3718 } else {
3719 qemu_cond_wait(&param->cond, &param->mutex);
3720 }
3721 }
3722 qemu_mutex_unlock(&param->mutex);
3723
3724 return NULL;
3725 }
3726
3727 static int wait_for_decompress_done(void)
3728 {
3729 int idx, thread_count;
3730
3731 if (!migrate_use_compression()) {
3732 return 0;
3733 }
3734
3735 thread_count = migrate_decompress_threads();
3736 qemu_mutex_lock(&decomp_done_lock);
3737 for (idx = 0; idx < thread_count; idx++) {
3738 while (!decomp_param[idx].done) {
3739 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3740 }
3741 }
3742 qemu_mutex_unlock(&decomp_done_lock);
3743 return qemu_file_get_error(decomp_file);
3744 }
3745
3746 static void compress_threads_load_cleanup(void)
3747 {
3748 int i, thread_count;
3749
3750 if (!migrate_use_compression()) {
3751 return;
3752 }
3753 thread_count = migrate_decompress_threads();
3754 for (i = 0; i < thread_count; i++) {
3755 /*
3756 * we use it as an indicator of whether the thread was
3757 * properly initialized
3758 */
3759 if (!decomp_param[i].compbuf) {
3760 break;
3761 }
3762
3763 qemu_mutex_lock(&decomp_param[i].mutex);
3764 decomp_param[i].quit = true;
3765 qemu_cond_signal(&decomp_param[i].cond);
3766 qemu_mutex_unlock(&decomp_param[i].mutex);
3767 }
3768 for (i = 0; i < thread_count; i++) {
3769 if (!decomp_param[i].compbuf) {
3770 break;
3771 }
3772
3773 qemu_thread_join(decompress_threads + i);
3774 qemu_mutex_destroy(&decomp_param[i].mutex);
3775 qemu_cond_destroy(&decomp_param[i].cond);
3776 inflateEnd(&decomp_param[i].stream);
3777 g_free(decomp_param[i].compbuf);
3778 decomp_param[i].compbuf = NULL;
3779 }
3780 g_free(decompress_threads);
3781 g_free(decomp_param);
3782 decompress_threads = NULL;
3783 decomp_param = NULL;
3784 decomp_file = NULL;
3785 }
3786
3787 static int compress_threads_load_setup(QEMUFile *f)
3788 {
3789 int i, thread_count;
3790
3791 if (!migrate_use_compression()) {
3792 return 0;
3793 }
3794
3795 thread_count = migrate_decompress_threads();
3796 decompress_threads = g_new0(QemuThread, thread_count);
3797 decomp_param = g_new0(DecompressParam, thread_count);
3798 qemu_mutex_init(&decomp_done_lock);
3799 qemu_cond_init(&decomp_done_cond);
3800 decomp_file = f;
3801 for (i = 0; i < thread_count; i++) {
3802 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3803 goto exit;
3804 }
3805
3806 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3807 qemu_mutex_init(&decomp_param[i].mutex);
3808 qemu_cond_init(&decomp_param[i].cond);
3809 decomp_param[i].done = true;
3810 decomp_param[i].quit = false;
3811 qemu_thread_create(decompress_threads + i, "decompress",
3812 do_data_decompress, decomp_param + i,
3813 QEMU_THREAD_JOINABLE);
3814 }
3815 return 0;
3816 exit:
3817 compress_threads_load_cleanup();
3818 return -1;
3819 }
3820
3821 static void decompress_data_with_multi_threads(QEMUFile *f,
3822 void *host, int len)
3823 {
3824 int idx, thread_count;
3825
3826 thread_count = migrate_decompress_threads();
3827 qemu_mutex_lock(&decomp_done_lock);
3828 while (true) {
3829 for (idx = 0; idx < thread_count; idx++) {
3830 if (decomp_param[idx].done) {
3831 decomp_param[idx].done = false;
3832 qemu_mutex_lock(&decomp_param[idx].mutex);
3833 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3834 decomp_param[idx].des = host;
3835 decomp_param[idx].len = len;
3836 qemu_cond_signal(&decomp_param[idx].cond);
3837 qemu_mutex_unlock(&decomp_param[idx].mutex);
3838 break;
3839 }
3840 }
3841 if (idx < thread_count) {
3842 break;
3843 } else {
3844 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3845 }
3846 }
3847 qemu_mutex_unlock(&decomp_done_lock);
3848 }
3849
3850 /*
3851 * colo cache: this is for the secondary VM. We cache the whole
3852 * memory of the secondary VM; the global lock must be held
3853 * to call this helper.
3854 */
3855 int colo_init_ram_cache(void)
3856 {
3857 RAMBlock *block;
3858
3859 rcu_read_lock();
3860 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3861 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3862 NULL,
3863 false);
3864 if (!block->colo_cache) {
3865 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3866 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3867 block->used_length);
3868 goto out_locked;
3869 }
3870 memcpy(block->colo_cache, block->host, block->used_length);
3871 }
3872 rcu_read_unlock();
3873 /*
3874 * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3875 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3876 * we use the same name 'ram_bitmap' as for migration.
3877 */
3878 if (ram_bytes_total()) {
3879 RAMBlock *block;
3880
3881 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3882 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3883
3884 block->bmap = bitmap_new(pages);
3885 bitmap_set(block->bmap, 0, pages);
3886 }
3887 }
3888 ram_state = g_new0(RAMState, 1);
3889 ram_state->migration_dirty_pages = 0;
3890 memory_global_dirty_log_start();
3891
3892 return 0;
3893
3894 out_locked:
3895
3896 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3897 if (block->colo_cache) {
3898 qemu_anon_ram_free(block->colo_cache, block->used_length);
3899 block->colo_cache = NULL;
3900 }
3901 }
3902
3903 rcu_read_unlock();
3904 return -errno;
3905 }
3906
3907 /* The global lock must be held to call this helper */
3908 void colo_release_ram_cache(void)
3909 {
3910 RAMBlock *block;
3911
3912 memory_global_dirty_log_stop();
3913 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3914 g_free(block->bmap);
3915 block->bmap = NULL;
3916 }
3917
3918 rcu_read_lock();
3919
3920 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3921 if (block->colo_cache) {
3922 qemu_anon_ram_free(block->colo_cache, block->used_length);
3923 block->colo_cache = NULL;
3924 }
3925 }
3926
3927 rcu_read_unlock();
3928 g_free(ram_state);
3929 ram_state = NULL;
3930 }
3931
3932 /**
3933 * ram_load_setup: Setup RAM for migration incoming side
3934 *
3935 * Returns zero to indicate success and negative for error
3936 *
3937 * @f: QEMUFile where to receive the data
3938 * @opaque: RAMState pointer
3939 */
3940 static int ram_load_setup(QEMUFile *f, void *opaque)
3941 {
3942 if (compress_threads_load_setup(f)) {
3943 return -1;
3944 }
3945
3946 xbzrle_load_setup();
3947 ramblock_recv_map_init();
3948
3949 return 0;
3950 }
3951
3952 static int ram_load_cleanup(void *opaque)
3953 {
3954 RAMBlock *rb;
3955
3956 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3957 if (ramblock_is_pmem(rb)) {
3958 pmem_persist(rb->host, rb->used_length);
3959 }
3960 }
3961
3962 xbzrle_load_cleanup();
3963 compress_threads_load_cleanup();
3964
3965 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3966 g_free(rb->receivedmap);
3967 rb->receivedmap = NULL;
3968 }
3969
3970 return 0;
3971 }
3972
3973 /**
3974 * ram_postcopy_incoming_init: allocate postcopy data structures
3975 *
3976 * Returns 0 for success and negative if there was an error
3977 *
3978 * @mis: current migration incoming state
3979 *
3980 * Allocate data structures etc. needed by incoming migration with
3981 * postcopy-ram. postcopy-ram's similarly named
3982 * postcopy_ram_incoming_init does the work.
3983 */
3984 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3985 {
3986 return postcopy_ram_incoming_init(mis);
3987 }
3988
3989 /**
3990 * ram_load_postcopy: load a page in postcopy case
3991 *
3992 * Returns 0 for success or -errno in case of error
3993 *
3994 * Called in postcopy mode by ram_load().
3995 * rcu_read_lock is taken prior to this being called.
3996 *
3997 * @f: QEMUFile where to receive the data
3998 */
3999 static int ram_load_postcopy(QEMUFile *f)
4000 {
4001 int flags = 0, ret = 0;
4002 bool place_needed = false;
4003 bool matches_target_page_size = false;
4004 MigrationIncomingState *mis = migration_incoming_get_current();
4005 /* Temporary page that is later 'placed' */
4006 void *postcopy_host_page = postcopy_get_tmp_page(mis);
4007 void *last_host = NULL;
4008 bool all_zero = false;
4009
4010 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4011 ram_addr_t addr;
4012 void *host = NULL;
4013 void *page_buffer = NULL;
4014 void *place_source = NULL;
4015 RAMBlock *block = NULL;
4016 uint8_t ch;
4017
4018 addr = qemu_get_be64(f);
4019
4020 /*
4021 * If the qemu file is in an error state we should stop here;
4022 * "addr" may be invalid in that case
4023 */
4024 ret = qemu_file_get_error(f);
4025 if (ret) {
4026 break;
4027 }
4028
4029 flags = addr & ~TARGET_PAGE_MASK;
4030 addr &= TARGET_PAGE_MASK;
4031
4032 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4033 place_needed = false;
4034 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4035 block = ram_block_from_stream(f, flags);
4036
4037 host = host_from_ram_block_offset(block, addr);
4038 if (!host) {
4039 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4040 ret = -EINVAL;
4041 break;
4042 }
4043 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4044 /*
4045 * Postcopy requires that we place whole host pages atomically;
4046 * these may be huge pages for RAMBlocks that are backed by
4047 * hugetlbfs.
4048 * To make it atomic, the data is read into a temporary page
4049 * that's moved into place later.
4050 * The migration protocol uses, possibly smaller, target pages;
4051 * however, the source ensures it always sends all the components
4052 * of a host page in order.
4053 */
4054 page_buffer = postcopy_host_page +
4055 ((uintptr_t)host & (block->page_size - 1));
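/*
 * Illustrative numbers (hypothetical): with a 2 MiB hugetlbfs host page
 * and 4 KiB target pages, a host address whose low bits are 0x3000 gives
 * page_buffer = postcopy_host_page + 0x3000.  place_needed becomes true
 * for the last 4 KiB chunk (low bits 0x1ff000), and place_dest below is
 * then host + 0x1000 - 0x200000, i.e. the start of the 2 MiB host page.
 */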
4056 /* If all TPs are zero then we can optimise the placement */
4057 if (!((uintptr_t)host & (block->page_size - 1))) {
4058 all_zero = true;
4059 } else {
4060 /* not the 1st TP within the HP */
4061 if (host != (last_host + TARGET_PAGE_SIZE)) {
4062 error_report("Non-sequential target page %p/%p",
4063 host, last_host);
4064 ret = -EINVAL;
4065 break;
4066 }
4067 }
4068
4069
4070 /*
4071 * If it's the last part of a host page then we place the host
4072 * page
4073 */
4074 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4075 (block->page_size - 1)) == 0;
4076 place_source = postcopy_host_page;
4077 }
4078 last_host = host;
4079
4080 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4081 case RAM_SAVE_FLAG_ZERO:
4082 ch = qemu_get_byte(f);
4083 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4084 if (ch) {
4085 all_zero = false;
4086 }
4087 break;
4088
4089 case RAM_SAVE_FLAG_PAGE:
4090 all_zero = false;
4091 if (!matches_target_page_size) {
4092 /* For huge pages, we always use the temporary buffer */
4093 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4094 } else {
4095 /*
4096 * For small pages that match the target page size, we
4097 * avoid the qemu_file copy. Instead we directly use
4098 * the buffer of QEMUFile to place the page. Note: we
4099 * cannot do any QEMUFile operation before using that
4100 * buffer to make sure the buffer is valid when
4101 * placing the page.
4102 */
4103 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4104 TARGET_PAGE_SIZE);
4105 }
4106 break;
4107 case RAM_SAVE_FLAG_EOS:
4108 /* normal exit */
4109 multifd_recv_sync_main();
4110 break;
4111 default:
4112 error_report("Unknown combination of migration flags: %#x"
4113 " (postcopy mode)", flags);
4114 ret = -EINVAL;
4115 break;
4116 }
4117
4118 /* Detect any possible file errors */
4119 if (!ret && qemu_file_get_error(f)) {
4120 ret = qemu_file_get_error(f);
4121 }
4122
4123 if (!ret && place_needed) {
4124 /* This gets called at the last target page in the host page */
4125 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4126
4127 if (all_zero) {
4128 ret = postcopy_place_page_zero(mis, place_dest,
4129 block);
4130 } else {
4131 ret = postcopy_place_page(mis, place_dest,
4132 place_source, block);
4133 }
4134 }
4135 }
4136
4137 return ret;
4138 }
4139
4140 static bool postcopy_is_advised(void)
4141 {
4142 PostcopyState ps = postcopy_state_get();
4143 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4144 }
4145
4146 static bool postcopy_is_running(void)
4147 {
4148 PostcopyState ps = postcopy_state_get();
4149 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4150 }
4151
4152 /*
4153 * Flush the contents of the RAM cache into the SVM's memory.
4154 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4155 */
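/*
 * Walk sketch: after syncing the dirty log into each block's bitmap, the
 * loop below repeatedly asks migration_bitmap_find_dirty() for the next
 * dirty page offset; once the offset runs past the block's used_length it
 * moves on to the next block, otherwise it clears the bit and copies that
 * one TARGET_PAGE from colo_cache into the block's host memory.
 */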
4156 static void colo_flush_ram_cache(void)
4157 {
4158 RAMBlock *block = NULL;
4159 void *dst_host;
4160 void *src_host;
4161 unsigned long offset = 0;
4162
4163 memory_global_dirty_log_sync();
4164 rcu_read_lock();
4165 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4166 migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
4167 }
4168 rcu_read_unlock();
4169
4170 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4171 rcu_read_lock();
4172 block = QLIST_FIRST_RCU(&ram_list.blocks);
4173
4174 while (block) {
4175 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4176
4177 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4178 offset = 0;
4179 block = QLIST_NEXT_RCU(block, next);
4180 } else {
4181 migration_bitmap_clear_dirty(ram_state, block, offset);
4182 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4183 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4184 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4185 }
4186 }
4187
4188 rcu_read_unlock();
4189 trace_colo_flush_ram_cache_end();
4190 }
4191
4192 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4193 {
4194 int flags = 0, ret = 0, invalid_flags = 0;
4195 static uint64_t seq_iter;
4196 int len = 0;
4197 /*
4198 * If the system is running in postcopy mode, page inserts into host memory must
4199 * be atomic
4200 */
4201 bool postcopy_running = postcopy_is_running();
4202 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
4203 bool postcopy_advised = postcopy_is_advised();
4204
4205 seq_iter++;
4206
4207 if (version_id != 4) {
4208 ret = -EINVAL;
4209 }
4210
4211 if (!migrate_use_compression()) {
4212 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4213 }
4214 /* This RCU critical section can be very long running.
4215 * When RCU reclamation in this code starts to become frequent,
4216 * it will be necessary to reduce the granularity of this
4217 * critical section.
4218 */
4219 rcu_read_lock();
4220
4221 if (postcopy_running) {
4222 ret = ram_load_postcopy(f);
4223 }
4224
4225 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4226 ram_addr_t addr, total_ram_bytes;
4227 void *host = NULL;
4228 uint8_t ch;
4229
4230 addr = qemu_get_be64(f);
4231 flags = addr & ~TARGET_PAGE_MASK;
4232 addr &= TARGET_PAGE_MASK;
4233
4234 if (flags & invalid_flags) {
4235 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4236 error_report("Received an unexpected compressed page");
4237 }
4238
4239 ret = -EINVAL;
4240 break;
4241 }
4242
4243 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4244 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4245 RAMBlock *block = ram_block_from_stream(f, flags);
4246
4247 /*
4248 * After going into COLO state, we should load the page into the colo_cache.
4249 */
4250 if (migration_incoming_in_colo_state()) {
4251 host = colo_cache_from_block_offset(block, addr);
4252 } else {
4253 host = host_from_ram_block_offset(block, addr);
4254 }
4255 if (!host) {
4256 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4257 ret = -EINVAL;
4258 break;
4259 }
4260
4261 if (!migration_incoming_in_colo_state()) {
4262 ramblock_recv_bitmap_set(block, host);
4263 }
4264
4265 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4266 }
4267
4268 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4269 case RAM_SAVE_FLAG_MEM_SIZE:
4270 /* Synchronize RAM block list */
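/*
 * Stream layout parsed below (as read by this loop): 'addr' carries the
 * total RAM size; then, per block, a 1-byte idstr length, the idstr bytes
 * and a be64 used_length, optionally followed by a be64 page size (when
 * postcopy was advised and the block's page size differs from the host
 * page size) and, with ignore-shared enabled, a be64 GPA plus a 1-byte
 * "ignored" flag.
 */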
4271 total_ram_bytes = addr;
4272 while (!ret && total_ram_bytes) {
4273 RAMBlock *block;
4274 char id[256];
4275 ram_addr_t length;
4276
4277 len = qemu_get_byte(f);
4278 qemu_get_buffer(f, (uint8_t *)id, len);
4279 id[len] = 0;
4280 length = qemu_get_be64(f);
4281
4282 block = qemu_ram_block_by_name(id);
4283 if (block && !qemu_ram_is_migratable(block)) {
4284 error_report("block %s should not be migrated !", id);
4285 ret = -EINVAL;
4286 } else if (block) {
4287 if (length != block->used_length) {
4288 Error *local_err = NULL;
4289
4290 ret = qemu_ram_resize(block, length,
4291 &local_err);
4292 if (local_err) {
4293 error_report_err(local_err);
4294 }
4295 }
4296 /* For postcopy we need to check hugepage sizes match */
4297 if (postcopy_advised &&
4298 block->page_size != qemu_host_page_size) {
4299 uint64_t remote_page_size = qemu_get_be64(f);
4300 if (remote_page_size != block->page_size) {
4301 error_report("Mismatched RAM page size %s "
4302 "(local) %zd != %" PRId64,
4303 id, block->page_size,
4304 remote_page_size);
4305 ret = -EINVAL;
4306 }
4307 }
4308 if (migrate_ignore_shared()) {
4309 hwaddr addr = qemu_get_be64(f);
4310 bool ignored = qemu_get_byte(f);
4311 if (ignored != ramblock_is_ignored(block)) {
4312 error_report("RAM block %s should %s be migrated",
4313 id, ignored ? "" : "not");
4314 ret = -EINVAL;
4315 }
4316 if (ramblock_is_ignored(block) &&
4317 block->mr->addr != addr) {
4318 error_report("Mismatched GPAs for block %s "
4319 "%" PRId64 "!= %" PRId64,
4320 id, (uint64_t)addr,
4321 (uint64_t)block->mr->addr);
4322 ret = -EINVAL;
4323 }
4324 }
4325 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4326 block->idstr);
4327 } else {
4328 error_report("Unknown ramblock \"%s\", cannot "
4329 "accept migration", id);
4330 ret = -EINVAL;
4331 }
4332
4333 total_ram_bytes -= length;
4334 }
4335 break;
4336
4337 case RAM_SAVE_FLAG_ZERO:
4338 ch = qemu_get_byte(f);
4339 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4340 break;
4341
4342 case RAM_SAVE_FLAG_PAGE:
4343 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4344 break;
4345
4346 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4347 len = qemu_get_be32(f);
4348 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4349 error_report("Invalid compressed data length: %d", len);
4350 ret = -EINVAL;
4351 break;
4352 }
4353 decompress_data_with_multi_threads(f, host, len);
4354 break;
4355
4356 case RAM_SAVE_FLAG_XBZRLE:
4357 if (load_xbzrle(f, addr, host) < 0) {
4358 error_report("Failed to decompress XBZRLE page at "
4359 RAM_ADDR_FMT, addr);
4360 ret = -EINVAL;
4361 break;
4362 }
4363 break;
4364 case RAM_SAVE_FLAG_EOS:
4365 /* normal exit */
4366 multifd_recv_sync_main();
4367 break;
4368 default:
4369 if (flags & RAM_SAVE_FLAG_HOOK) {
4370 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4371 } else {
4372 error_report("Unknown combination of migration flags: %#x",
4373 flags);
4374 ret = -EINVAL;
4375 }
4376 }
4377 if (!ret) {
4378 ret = qemu_file_get_error(f);
4379 }
4380 }
4381
4382 ret |= wait_for_decompress_done();
4383 rcu_read_unlock();
4384 trace_ram_load_complete(ret, seq_iter);
4385
4386 if (!ret && migration_incoming_in_colo_state()) {
4387 colo_flush_ram_cache();
4388 }
4389 return ret;
4390 }
4391
4392 static bool ram_has_postcopy(void *opaque)
4393 {
4394 RAMBlock *rb;
4395 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4396 if (ramblock_is_pmem(rb)) {
4397 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4398 "is not supported now!", rb->idstr, rb->host);
4399 return false;
4400 }
4401 }
4402
4403 return migrate_postcopy_ram();
4404 }
4405
4406 /* Sync all the dirty bitmaps with the destination VM. */
4407 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4408 {
4409 RAMBlock *block;
4410 QEMUFile *file = s->to_dst_file;
4411 int ramblock_count = 0;
4412
4413 trace_ram_dirty_bitmap_sync_start();
4414
4415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4416 qemu_savevm_send_recv_bitmap(file, block->idstr);
4417 trace_ram_dirty_bitmap_request(block->idstr);
4418 ramblock_count++;
4419 }
4420
4421 trace_ram_dirty_bitmap_sync_wait();
4422
4423 /* Wait until all the ramblocks' dirty bitmaps are synced */
4424 while (ramblock_count--) {
4425 qemu_sem_wait(&s->rp_state.rp_sem);
4426 }
4427
4428 trace_ram_dirty_bitmap_sync_complete();
4429
4430 return 0;
4431 }
4432
4433 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4434 {
4435 qemu_sem_post(&s->rp_state.rp_sem);
4436 }
4437
4438 /*
4439 * Read the received bitmap and invert it to form the initial dirty bitmap.
4440 * This is only used when a postcopy migration is paused and we want to
4441 * resume from a middle point.
4442 */
4443 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4444 {
4445 int ret = -EINVAL;
4446 QEMUFile *file = s->rp_state.from_dst_file;
4447 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4448 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4449 uint64_t size, end_mark;
4450
4451 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4452
4453 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4454 error_report("%s: incorrect state %s", __func__,
4455 MigrationStatus_str(s->state));
4456 return -EINVAL;
4457 }
4458
4459 /*
4460 * Note: see comments in ramblock_recv_bitmap_send() on why we
4461 * need the endianness conversion and the padding.
4462 */
4463 local_size = ROUND_UP(local_size, 8);
4464
4465 /* Add padding */
4466 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
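/*
 * Hypothetical example: for nbits = 100, DIV_ROUND_UP(100, 8) = 13 bytes,
 * rounded up to local_size = 16; bitmap_new(100 + BITS_PER_LONG) allocates
 * at least 24 bytes on a 64-bit host, so the padded read below always fits.
 */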
4467
4468 size = qemu_get_be64(file);
4469
4470 /* The size of the bitmap should match our ramblock */
4471 if (size != local_size) {
4472 error_report("%s: ramblock '%s' bitmap size mismatch "
4473 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4474 block->idstr, size, local_size);
4475 ret = -EINVAL;
4476 goto out;
4477 }
4478
4479 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4480 end_mark = qemu_get_be64(file);
4481
4482 ret = qemu_file_get_error(file);
4483 if (ret || size != local_size) {
4484 error_report("%s: read bitmap failed for ramblock '%s': %d"
4485 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4486 __func__, block->idstr, ret, local_size, size);
4487 ret = -EIO;
4488 goto out;
4489 }
4490
4491 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4492 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4493 __func__, block->idstr, end_mark);
4494 ret = -EINVAL;
4495 goto out;
4496 }
4497
4498 /*
4499 * Endianness conversion. We are in postcopy (though paused).
4500 * The dirty bitmap won't change. We can directly modify it.
4501 */
4502 bitmap_from_le(block->bmap, le_bitmap, nbits);
4503
4504 /*
4505 * What we received is the "received bitmap". Invert it to form the
4506 * initial dirty bitmap for this ramblock.
4507 */
4508 bitmap_complement(block->bmap, block->bmap, nbits);
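/*
 * Tiny example (hypothetical): if the received bitmap has bits 0, 1 and 3
 * set (those pages already reached the destination), the complement leaves
 * only bit 2 set, i.e. only page 2 is still dirty and must be resent after
 * the postcopy resume.
 */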
4509
4510 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4511
4512 /*
4513 * We succeeded in syncing the bitmap for the current ramblock. If this is
4514 * the last one to sync, we need to notify the main send thread.
4515 */
4516 ram_dirty_bitmap_reload_notify(s);
4517
4518 ret = 0;
4519 out:
4520 g_free(le_bitmap);
4521 return ret;
4522 }
4523
4524 static int ram_resume_prepare(MigrationState *s, void *opaque)
4525 {
4526 RAMState *rs = *(RAMState **)opaque;
4527 int ret;
4528
4529 ret = ram_dirty_bitmap_sync_all(s, rs);
4530 if (ret) {
4531 return ret;
4532 }
4533
4534 ram_state_resume_prepare(rs, s->to_dst_file);
4535
4536 return 0;
4537 }
4538
4539 static SaveVMHandlers savevm_ram_handlers = {
4540 .save_setup = ram_save_setup,
4541 .save_live_iterate = ram_save_iterate,
4542 .save_live_complete_postcopy = ram_save_complete,
4543 .save_live_complete_precopy = ram_save_complete,
4544 .has_postcopy = ram_has_postcopy,
4545 .save_live_pending = ram_save_pending,
4546 .load_state = ram_load,
4547 .save_cleanup = ram_save_cleanup,
4548 .load_setup = ram_load_setup,
4549 .load_cleanup = ram_load_cleanup,
4550 .resume_prepare = ram_resume_prepare,
4551 };
4552
4553 void ram_mig_init(void)
4554 {
4555 qemu_mutex_init(&XBZRLE.lock);
4556 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4557 }