migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60
61 /***********************************************************/
62 /* ram save/restore */
63
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
65 * worked for pages that were filled with the same char. We switched
66 * it to only search for the zero value, and renamed it to avoid
67 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68 */
69
70 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO 0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE 0x08
74 #define RAM_SAVE_FLAG_EOS 0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE 0x40
77 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
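/*
 * Illustrative sketch: these flags are ORed into the low bits of the
 * page address written by save_page_header(), which works because page
 * offsets are aligned to TARGET_PAGE_SIZE.  A receiver can split the
 * two roughly like this (hypothetical decoding, for illustration only):
 *
 *   uint64_t addr = qemu_get_be64(f);
 *   unsigned int flags = addr & ~TARGET_PAGE_MASK;   // low bits: flags
 *   addr &= TARGET_PAGE_MASK;                        // aligned page address
 */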
79
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82 return buffer_is_zero(p, size);
83 }
84
85 XBZRLECacheStats xbzrle_counters;
86
87 /* This struct contains the XBZRLE cache and a static page
88 used by the compression */
89 static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
101 } XBZRLE;
102
103 static void XBZRLE_cache_lock(void)
104 {
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114
115 /**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in the main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by the XBZRLE.lock mutex.
122 *
123 * Returns 0 for success or -1 for error
124 *
125 * @new_size: new cache size
126 * @errp: set *errp if the check failed, with reason
127 */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130 PageCache *new_cache;
131 int64_t ret = 0;
132
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
142 return 0;
143 }
144
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149 if (!new_cache) {
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
157 out:
158 XBZRLE_cache_unlock();
159 return ret;
160 }
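/*
 * For reference, the resize above is normally driven via QMP; a request
 * such as the following (size in bytes, value chosen for illustration)
 * reaches this function through qmp_migrate_set_cache_size():
 *
 *   { "execute": "migrate-set-cache-size",
 *     "arguments": { "value": 536870912 } }
 */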
161
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164 return !qemu_ram_is_migratable(block) ||
165 (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
170 INTERNAL_RAMBLOCK_FOREACH(block) \
171 if (ramblock_is_ignored(block)) {} else
172
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
174 INTERNAL_RAMBLOCK_FOREACH(block) \
175 if (!qemu_ram_is_migratable(block)) {} else
176
177 #undef RAMBLOCK_FOREACH
178
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181 RAMBlock *block;
182 int ret = 0;
183
184 rcu_read_lock();
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
191 rcu_read_unlock();
192 return ret;
193 }
194
195 static void ramblock_recv_map_init(void)
196 {
197 RAMBlock *rb;
198
199 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200 assert(!rb->receivedmap);
201 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 }
203 }
204
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 rb->receivedmap);
209 }
210
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222 size_t nr)
223 {
224 bitmap_set_atomic(rb->receivedmap,
225 ramblock_recv_bitmap_offset(host_addr, rb),
226 nr);
227 }
228
229 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231 /*
232 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233 *
234 * Returns >0 if success with sent bytes, or <0 if error.
235 */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237 const char *block_name)
238 {
239 RAMBlock *block = qemu_ram_block_by_name(block_name);
240 unsigned long *le_bitmap, nbits;
241 uint64_t size;
242
243 if (!block) {
244 error_report("%s: invalid block name: %s", __func__, block_name);
245 return -1;
246 }
247
248 nbits = block->used_length >> TARGET_PAGE_BITS;
249
250 /*
251 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252 * machines we may need 4 more bytes for padding (see below
253 * comment). So extend it a bit beforehand.
254 */
255 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256
257 /*
258 * Always use little endian when sending the bitmap. This is
259 * required so that source and destination VMs do not need to use
260 * the same endianness. (Note: big endian won't work.)
261 */
262 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264 /* Size of the bitmap, in bytes */
265 size = DIV_ROUND_UP(nbits, 8);
266
267 /*
268 * size is always aligned to 8 bytes for 64bit machines, but it
269 * may not be true for 32bit machines. We need this padding to
270 * make sure the migration can survive even between 32bit and
271 * 64bit machines.
272 */
273 size = ROUND_UP(size, 8);
274
275 qemu_put_be64(file, size);
276 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277 /*
278 * Mark the end, in case the middle part gets corrupted for some
279 * mysterious reason.
280 */
281 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282 qemu_fflush(file);
283
284 g_free(le_bitmap);
285
286 if (qemu_file_get_error(file)) {
287 return qemu_file_get_error(file);
288 }
289
290 return size + sizeof(size);
291 }
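/*
 * Sketch of the stream layout produced above for one block:
 *
 *   qemu_put_be64(file, size);                         // bitmap size in bytes
 *   qemu_put_buffer(file, le_bitmap, size);            // bitmap, little endian
 *   qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);  // 0x0123456789abcdef
 *
 * where size = ROUND_UP(DIV_ROUND_UP(nbits, 8), 8).  The return value,
 * size + sizeof(size), counts the bitmap plus the leading length field;
 * the ending marker is not included.
 */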
292
293 /*
294 * An outstanding page request, on the source, having been received
295 * and queued
296 */
297 struct RAMSrcPageRequest {
298 RAMBlock *rb;
299 hwaddr offset;
300 hwaddr len;
301
302 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304
305 /* State of RAM for migration */
306 struct RAMState {
307 /* QEMUFile used for this migration */
308 QEMUFile *f;
309 /* Last block that we have visited searching for dirty pages */
310 RAMBlock *last_seen_block;
311 /* Last block from where we have sent data */
312 RAMBlock *last_sent_block;
313 /* Last dirty target page we have sent */
314 ram_addr_t last_page;
315 /* last ram version we have seen */
316 uint32_t last_version;
317 /* We are in the first round */
318 bool ram_bulk_stage;
319 /* The free page optimization is enabled */
320 bool fpo_enabled;
321 /* How many times we have dirty too many pages */
322 int dirty_rate_high_cnt;
323 /* these variables are used for bitmap sync */
324 /* last time we did a full bitmap_sync */
325 int64_t time_last_bitmap_sync;
326 /* bytes transferred at start_time */
327 uint64_t bytes_xfer_prev;
328 /* number of dirty pages since start_time */
329 uint64_t num_dirty_pages_period;
330 /* xbzrle misses since the beginning of the period */
331 uint64_t xbzrle_cache_miss_prev;
332
333 /* compression statistics since the beginning of the period */
334 /* number of times no free thread was available to compress data */
335 uint64_t compress_thread_busy_prev;
336 /* number of bytes after compression */
337 uint64_t compressed_size_prev;
338 /* amount of compressed pages */
339 uint64_t compress_pages_prev;
340
341 /* total handled target pages at the beginning of period */
342 uint64_t target_page_count_prev;
343 /* total handled target pages since start */
344 uint64_t target_page_count;
345 /* number of dirty bits in the bitmap */
346 uint64_t migration_dirty_pages;
347 /* Protects modification of the bitmap and migration dirty pages */
348 QemuMutex bitmap_mutex;
349 /* The RAMBlock used in the last src_page_requests */
350 RAMBlock *last_req_rb;
351 /* Queue of outstanding page requests from the destination */
352 QemuMutex src_page_req_mutex;
353 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356
357 static RAMState *ram_state;
358
359 static NotifierWithReturnList precopy_notifier_list;
360
361 void precopy_infrastructure_init(void)
362 {
363 notifier_with_return_list_init(&precopy_notifier_list);
364 }
365
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368 notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373 notifier_with_return_remove(n);
374 }
375
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378 PrecopyNotifyData pnd;
379 pnd.reason = reason;
380 pnd.errp = errp;
381
382 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384
385 void precopy_enable_free_page_optimization(void)
386 {
387 if (!ram_state) {
388 return;
389 }
390
391 ram_state->fpo_enabled = true;
392 }
393
394 uint64_t ram_bytes_remaining(void)
395 {
396 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397 0;
398 }
399
400 MigrationStats ram_counters;
401
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404 /* Current block being searched */
405 RAMBlock *block;
406 /* Current page to search from */
407 unsigned long page;
408 /* Set once we wrap around */
409 bool complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412
413 CompressionStats compression_counters;
414
415 struct CompressParam {
416 bool done;
417 bool quit;
418 bool zero_page;
419 QEMUFile *file;
420 QemuMutex mutex;
421 QemuCond cond;
422 RAMBlock *block;
423 ram_addr_t offset;
424
425 /* internally used fields */
426 z_stream stream;
427 uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430
431 struct DecompressParam {
432 bool done;
433 bool quit;
434 QemuMutex mutex;
435 QemuCond cond;
436 void *des;
437 uint8_t *compbuf;
438 int len;
439 z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446 * one of the compression threads has finished the compression.
447 * comp_done_lock is used together with comp_done_cond.
448 */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461 ram_addr_t offset, uint8_t *source_buf);
462
463 static void *do_data_compress(void *opaque)
464 {
465 CompressParam *param = opaque;
466 RAMBlock *block;
467 ram_addr_t offset;
468 bool zero_page;
469
470 qemu_mutex_lock(&param->mutex);
471 while (!param->quit) {
472 if (param->block) {
473 block = param->block;
474 offset = param->offset;
475 param->block = NULL;
476 qemu_mutex_unlock(&param->mutex);
477
478 zero_page = do_compress_ram_page(param->file, &param->stream,
479 block, offset, param->originbuf);
480
481 qemu_mutex_lock(&comp_done_lock);
482 param->done = true;
483 param->zero_page = zero_page;
484 qemu_cond_signal(&comp_done_cond);
485 qemu_mutex_unlock(&comp_done_lock);
486
487 qemu_mutex_lock(&param->mutex);
488 } else {
489 qemu_cond_wait(&param->cond, &param->mutex);
490 }
491 }
492 qemu_mutex_unlock(&param->mutex);
493
494 return NULL;
495 }
496
497 static void compress_threads_save_cleanup(void)
498 {
499 int i, thread_count;
500
501 if (!migrate_use_compression() || !comp_param) {
502 return;
503 }
504
505 thread_count = migrate_compress_threads();
506 for (i = 0; i < thread_count; i++) {
507 /*
508 * we use it as an indicator of whether the thread has been
509 * properly initialized or not
510 */
511 if (!comp_param[i].file) {
512 break;
513 }
514
515 qemu_mutex_lock(&comp_param[i].mutex);
516 comp_param[i].quit = true;
517 qemu_cond_signal(&comp_param[i].cond);
518 qemu_mutex_unlock(&comp_param[i].mutex);
519
520 qemu_thread_join(compress_threads + i);
521 qemu_mutex_destroy(&comp_param[i].mutex);
522 qemu_cond_destroy(&comp_param[i].cond);
523 deflateEnd(&comp_param[i].stream);
524 g_free(comp_param[i].originbuf);
525 qemu_fclose(comp_param[i].file);
526 comp_param[i].file = NULL;
527 }
528 qemu_mutex_destroy(&comp_done_lock);
529 qemu_cond_destroy(&comp_done_cond);
530 g_free(compress_threads);
531 g_free(comp_param);
532 compress_threads = NULL;
533 comp_param = NULL;
534 }
535
536 static int compress_threads_save_setup(void)
537 {
538 int i, thread_count;
539
540 if (!migrate_use_compression()) {
541 return 0;
542 }
543 thread_count = migrate_compress_threads();
544 compress_threads = g_new0(QemuThread, thread_count);
545 comp_param = g_new0(CompressParam, thread_count);
546 qemu_cond_init(&comp_done_cond);
547 qemu_mutex_init(&comp_done_lock);
548 for (i = 0; i < thread_count; i++) {
549 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550 if (!comp_param[i].originbuf) {
551 goto exit;
552 }
553
554 if (deflateInit(&comp_param[i].stream,
555 migrate_compress_level()) != Z_OK) {
556 g_free(comp_param[i].originbuf);
557 goto exit;
558 }
559
560 /* comp_param[i].file is just used as a dummy buffer to save data,
561 * set its ops to empty.
562 */
563 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564 comp_param[i].done = true;
565 comp_param[i].quit = false;
566 qemu_mutex_init(&comp_param[i].mutex);
567 qemu_cond_init(&comp_param[i].cond);
568 qemu_thread_create(compress_threads + i, "compress",
569 do_data_compress, comp_param + i,
570 QEMU_THREAD_JOINABLE);
571 }
572 return 0;
573
574 exit:
575 compress_threads_save_cleanup();
576 return -1;
577 }
578
579 /* Multiple fd's */
580
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585
586 /* This value needs to be a multiple of qemu_target_page_size() */
587 #define MULTIFD_PACKET_SIZE (512 * 1024)
588
589 typedef struct {
590 uint32_t magic;
591 uint32_t version;
592 unsigned char uuid[16]; /* QemuUUID */
593 uint8_t id;
594 uint8_t unused1[7]; /* Reserved for future use */
595 uint64_t unused2[4]; /* Reserved for future use */
596 } __attribute__((packed)) MultiFDInit_t;
597
598 typedef struct {
599 uint32_t magic;
600 uint32_t version;
601 uint32_t flags;
602 /* maximum number of allocated pages */
603 uint32_t pages_alloc;
604 uint32_t pages_used;
605 /* size of the next packet that contains pages */
606 uint32_t next_packet_size;
607 uint64_t packet_num;
608 uint64_t unused[4]; /* Reserved for future use */
609 char ramblock[256];
610 uint64_t offset[];
611 } __attribute__((packed)) MultiFDPacket_t;
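/*
 * Rough worked example of the sizes involved, assuming a 4 KiB target
 * page size: MULTIFD_PACKET_SIZE / 4096 = 128 pages per packet, so each
 * channel allocates
 *
 *   packet_len = sizeof(MultiFDPacket_t) + 128 * sizeof(ram_addr_t)
 *
 * for the header (see multifd_save_setup()/multifd_load_setup() below),
 * and next_packet_size announces at most 128 * 4096 = 512 KiB of page
 * data following that header.
 */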
612
613 typedef struct {
614 /* number of used pages */
615 uint32_t used;
616 /* number of allocated pages */
617 uint32_t allocated;
618 /* global number of generated multifd packets */
619 uint64_t packet_num;
620 /* offset of each page */
621 ram_addr_t *offset;
622 /* pointer to each page */
623 struct iovec *iov;
624 RAMBlock *block;
625 } MultiFDPages_t;
626
627 typedef struct {
628 /* these fields are not changed once the thread is created */
629 /* channel number */
630 uint8_t id;
631 /* channel thread name */
632 char *name;
633 /* channel thread id */
634 QemuThread thread;
635 /* communication channel */
636 QIOChannel *c;
637 /* sem where to wait for more work */
638 QemuSemaphore sem;
639 /* this mutex protects the following parameters */
640 QemuMutex mutex;
641 /* is this channel thread running */
642 bool running;
643 /* should this thread finish */
644 bool quit;
645 /* thread has work to do */
646 int pending_job;
647 /* array of pages to send */
648 MultiFDPages_t *pages;
649 /* packet allocated len */
650 uint32_t packet_len;
651 /* pointer to the packet */
652 MultiFDPacket_t *packet;
653 /* multifd flags for each packet */
654 uint32_t flags;
655 /* size of the next packet that contains pages */
656 uint32_t next_packet_size;
657 /* global number of generated multifd packets */
658 uint64_t packet_num;
659 /* thread local variables */
660 /* packets sent through this channel */
661 uint64_t num_packets;
662 /* pages sent through this channel */
663 uint64_t num_pages;
664 } MultiFDSendParams;
665
666 typedef struct {
667 /* these fields are not changed once the thread is created */
668 /* channel number */
669 uint8_t id;
670 /* channel thread name */
671 char *name;
672 /* channel thread id */
673 QemuThread thread;
674 /* communication channel */
675 QIOChannel *c;
676 /* this mutex protects the following parameters */
677 QemuMutex mutex;
678 /* is this channel thread running */
679 bool running;
680 /* array of pages to receive */
681 MultiFDPages_t *pages;
682 /* packet allocated len */
683 uint32_t packet_len;
684 /* pointer to the packet */
685 MultiFDPacket_t *packet;
686 /* multifd flags for each packet */
687 uint32_t flags;
688 /* global number of generated multifd packets */
689 uint64_t packet_num;
690 /* thread local variables */
691 /* size of the next packet that contains pages */
692 uint32_t next_packet_size;
693 /* packets received through this channel */
694 uint64_t num_packets;
695 /* pages received through this channel */
696 uint64_t num_pages;
697 /* syncs main thread and channels */
698 QemuSemaphore sem_sync;
699 } MultiFDRecvParams;
700
701 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
702 {
703 MultiFDInit_t msg;
704 int ret;
705
706 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
707 msg.version = cpu_to_be32(MULTIFD_VERSION);
708 msg.id = p->id;
709 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
710
711 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
712 if (ret != 0) {
713 return -1;
714 }
715 return 0;
716 }
717
718 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
719 {
720 MultiFDInit_t msg;
721 int ret;
722
723 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
724 if (ret != 0) {
725 return -1;
726 }
727
728 msg.magic = be32_to_cpu(msg.magic);
729 msg.version = be32_to_cpu(msg.version);
730
731 if (msg.magic != MULTIFD_MAGIC) {
732 error_setg(errp, "multifd: received packet magic %x "
733 "expected %x", msg.magic, MULTIFD_MAGIC);
734 return -1;
735 }
736
737 if (msg.version != MULTIFD_VERSION) {
738 error_setg(errp, "multifd: received packet version %d "
739 "expected %d", msg.version, MULTIFD_VERSION);
740 return -1;
741 }
742
743 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
744 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
745 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
746
747 error_setg(errp, "multifd: received uuid '%s' and expected "
748 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
749 g_free(uuid);
750 g_free(msg_uuid);
751 return -1;
752 }
753
754 if (msg.id > migrate_multifd_channels()) {
755 error_setg(errp, "multifd: received channel id %d "
756 "but only %d channels exist", msg.id, migrate_multifd_channels());
757 return -1;
758 }
759
760 return msg.id;
761 }
762
763 static MultiFDPages_t *multifd_pages_init(size_t size)
764 {
765 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
766
767 pages->allocated = size;
768 pages->iov = g_new0(struct iovec, size);
769 pages->offset = g_new0(ram_addr_t, size);
770
771 return pages;
772 }
773
774 static void multifd_pages_clear(MultiFDPages_t *pages)
775 {
776 pages->used = 0;
777 pages->allocated = 0;
778 pages->packet_num = 0;
779 pages->block = NULL;
780 g_free(pages->iov);
781 pages->iov = NULL;
782 g_free(pages->offset);
783 pages->offset = NULL;
784 g_free(pages);
785 }
786
787 static void multifd_send_fill_packet(MultiFDSendParams *p)
788 {
789 MultiFDPacket_t *packet = p->packet;
790 uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
791 int i;
792
793 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
794 packet->version = cpu_to_be32(MULTIFD_VERSION);
795 packet->flags = cpu_to_be32(p->flags);
796 packet->pages_alloc = cpu_to_be32(page_max);
797 packet->pages_used = cpu_to_be32(p->pages->used);
798 packet->next_packet_size = cpu_to_be32(p->next_packet_size);
799 packet->packet_num = cpu_to_be64(p->packet_num);
800
801 if (p->pages->block) {
802 strncpy(packet->ramblock, p->pages->block->idstr, 256);
803 }
804
805 for (i = 0; i < p->pages->used; i++) {
806 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
807 }
808 }
809
810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
811 {
812 MultiFDPacket_t *packet = p->packet;
813 uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
814 RAMBlock *block;
815 int i;
816
817 packet->magic = be32_to_cpu(packet->magic);
818 if (packet->magic != MULTIFD_MAGIC) {
819 error_setg(errp, "multifd: received packet "
820 "magic %x and expected magic %x",
821 packet->magic, MULTIFD_MAGIC);
822 return -1;
823 }
824
825 packet->version = be32_to_cpu(packet->version);
826 if (packet->version != MULTIFD_VERSION) {
827 error_setg(errp, "multifd: received packet "
828 "version %d and expected version %d",
829 packet->version, MULTIFD_VERSION);
830 return -1;
831 }
832
833 p->flags = be32_to_cpu(packet->flags);
834
835 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
836 /*
837 * If we received a packet that is 100 times bigger than expected,
838 * just stop the migration. The factor of 100 is an arbitrary limit.
839 */
840 if (packet->pages_alloc > pages_max * 100) {
841 error_setg(errp, "multifd: received packet "
842 "with size %d and expected a maximum size of %d",
843 packet->pages_alloc, pages_max * 100) ;
844 return -1;
845 }
846 /*
847 * We received a packet that is bigger than expected but inside
848 * reasonable limits (see previous comment). Just reallocate.
849 */
850 if (packet->pages_alloc > p->pages->allocated) {
851 multifd_pages_clear(p->pages);
852 p->pages = multifd_pages_init(packet->pages_alloc);
853 }
854
855 p->pages->used = be32_to_cpu(packet->pages_used);
856 if (p->pages->used > packet->pages_alloc) {
857 error_setg(errp, "multifd: received packet "
858 "with %d pages and expected maximum pages are %d",
859 p->pages->used, packet->pages_alloc) ;
860 return -1;
861 }
862
863 p->next_packet_size = be32_to_cpu(packet->next_packet_size);
864 p->packet_num = be64_to_cpu(packet->packet_num);
865
866 if (p->pages->used) {
867 /* make sure that the ramblock name is NUL-terminated */
868 packet->ramblock[255] = 0;
869 block = qemu_ram_block_by_name(packet->ramblock);
870 if (!block) {
871 error_setg(errp, "multifd: unknown ram block %s",
872 packet->ramblock);
873 return -1;
874 }
875 }
876
877 for (i = 0; i < p->pages->used; i++) {
878 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
879
880 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
881 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
882 " (max " RAM_ADDR_FMT ")",
883 offset, block->used_length);
884 return -1;
885 }
886 p->pages->iov[i].iov_base = block->host + offset;
887 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
888 }
889
890 return 0;
891 }
892
893 struct {
894 MultiFDSendParams *params;
895 /* number of created threads */
896 int count;
897 /* array of pages to send */
898 MultiFDPages_t *pages;
899 /* syncs main thread and channels */
900 QemuSemaphore sem_sync;
901 /* global number of generated multifd packets */
902 uint64_t packet_num;
903 /* send channels ready */
904 QemuSemaphore channels_ready;
905 } *multifd_send_state;
906
907 /*
908 * How do we use multifd_send_state->pages and channel->pages?
909 *
910 * We create a "pages" struct for each channel, plus a main one. Each
911 * time we need to send a batch of pages we exchange the one in
912 * multifd_send_state with the one of the channel that is sending it.
913 * There are two reasons for that:
914 * - to avoid doing so many mallocs during migration
915 * - to make it easier to know what to free at the end of migration
916 *
917 * This way we always know who owns each "pages" struct, and we don't
918 * need any locking. It belongs either to the migration thread or to
919 * the channel thread. Switching is safe because the migration thread
920 * holds the channel mutex while changing it, and the channel thread
921 * must have finished with its own copy, otherwise pending_job could
922 * not be zero.
923 */
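/*
 * In sketch form, the exchange done by multifd_send_pages() below,
 * under p->mutex and only for a channel whose pending_job is zero, is:
 *
 *   MultiFDPages_t *pages = multifd_send_state->pages;  // the full batch
 *   multifd_send_state->pages = p->pages;               // empty struct back to main
 *   p->pages = pages;                                   // channel owns the batch
 */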
924
925 static void multifd_send_pages(void)
926 {
927 int i;
928 static int next_channel;
929 MultiFDSendParams *p = NULL; /* make gcc happy */
930 MultiFDPages_t *pages = multifd_send_state->pages;
931 uint64_t transferred;
932
933 qemu_sem_wait(&multifd_send_state->channels_ready);
934 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
935 p = &multifd_send_state->params[i];
936
937 qemu_mutex_lock(&p->mutex);
938 if (!p->pending_job) {
939 p->pending_job++;
940 next_channel = (i + 1) % migrate_multifd_channels();
941 break;
942 }
943 qemu_mutex_unlock(&p->mutex);
944 }
945 p->pages->used = 0;
946
947 p->packet_num = multifd_send_state->packet_num++;
948 p->pages->block = NULL;
949 multifd_send_state->pages = p->pages;
950 p->pages = pages;
951 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
952 ram_counters.multifd_bytes += transferred;
953 ram_counters.transferred += transferred;
954 qemu_mutex_unlock(&p->mutex);
955 qemu_sem_post(&p->sem);
956 }
957
958 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
959 {
960 MultiFDPages_t *pages = multifd_send_state->pages;
961
962 if (!pages->block) {
963 pages->block = block;
964 }
965
966 if (pages->block == block) {
967 pages->offset[pages->used] = offset;
968 pages->iov[pages->used].iov_base = block->host + offset;
969 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
970 pages->used++;
971
972 if (pages->used < pages->allocated) {
973 return;
974 }
975 }
976
977 multifd_send_pages();
978
979 if (pages->block != block) {
980 multifd_queue_page(block, offset);
981 }
982 }
983
984 static void multifd_send_terminate_threads(Error *err)
985 {
986 int i;
987
988 if (err) {
989 MigrationState *s = migrate_get_current();
990 migrate_set_error(s, err);
991 if (s->state == MIGRATION_STATUS_SETUP ||
992 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
993 s->state == MIGRATION_STATUS_DEVICE ||
994 s->state == MIGRATION_STATUS_ACTIVE) {
995 migrate_set_state(&s->state, s->state,
996 MIGRATION_STATUS_FAILED);
997 }
998 }
999
1000 for (i = 0; i < migrate_multifd_channels(); i++) {
1001 MultiFDSendParams *p = &multifd_send_state->params[i];
1002
1003 qemu_mutex_lock(&p->mutex);
1004 p->quit = true;
1005 qemu_sem_post(&p->sem);
1006 qemu_mutex_unlock(&p->mutex);
1007 }
1008 }
1009
1010 void multifd_save_cleanup(void)
1011 {
1012 int i;
1013
1014 if (!migrate_use_multifd()) {
1015 return;
1016 }
1017 multifd_send_terminate_threads(NULL);
1018 for (i = 0; i < migrate_multifd_channels(); i++) {
1019 MultiFDSendParams *p = &multifd_send_state->params[i];
1020
1021 if (p->running) {
1022 qemu_thread_join(&p->thread);
1023 }
1024 socket_send_channel_destroy(p->c);
1025 p->c = NULL;
1026 qemu_mutex_destroy(&p->mutex);
1027 qemu_sem_destroy(&p->sem);
1028 g_free(p->name);
1029 p->name = NULL;
1030 multifd_pages_clear(p->pages);
1031 p->pages = NULL;
1032 p->packet_len = 0;
1033 g_free(p->packet);
1034 p->packet = NULL;
1035 }
1036 qemu_sem_destroy(&multifd_send_state->channels_ready);
1037 qemu_sem_destroy(&multifd_send_state->sem_sync);
1038 g_free(multifd_send_state->params);
1039 multifd_send_state->params = NULL;
1040 multifd_pages_clear(multifd_send_state->pages);
1041 multifd_send_state->pages = NULL;
1042 g_free(multifd_send_state);
1043 multifd_send_state = NULL;
1044 }
1045
1046 static void multifd_send_sync_main(void)
1047 {
1048 int i;
1049
1050 if (!migrate_use_multifd()) {
1051 return;
1052 }
1053 if (multifd_send_state->pages->used) {
1054 multifd_send_pages();
1055 }
1056 for (i = 0; i < migrate_multifd_channels(); i++) {
1057 MultiFDSendParams *p = &multifd_send_state->params[i];
1058
1059 trace_multifd_send_sync_main_signal(p->id);
1060
1061 qemu_mutex_lock(&p->mutex);
1062
1063 p->packet_num = multifd_send_state->packet_num++;
1064 p->flags |= MULTIFD_FLAG_SYNC;
1065 p->pending_job++;
1066 qemu_mutex_unlock(&p->mutex);
1067 qemu_sem_post(&p->sem);
1068 }
1069 for (i = 0; i < migrate_multifd_channels(); i++) {
1070 MultiFDSendParams *p = &multifd_send_state->params[i];
1071
1072 trace_multifd_send_sync_main_wait(p->id);
1073 qemu_sem_wait(&multifd_send_state->sem_sync);
1074 }
1075 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1076 }
1077
1078 static void *multifd_send_thread(void *opaque)
1079 {
1080 MultiFDSendParams *p = opaque;
1081 Error *local_err = NULL;
1082 int ret;
1083
1084 trace_multifd_send_thread_start(p->id);
1085 rcu_register_thread();
1086
1087 if (multifd_send_initial_packet(p, &local_err) < 0) {
1088 goto out;
1089 }
1090 /* initial packet */
1091 p->num_packets = 1;
1092
1093 while (true) {
1094 qemu_sem_wait(&p->sem);
1095 qemu_mutex_lock(&p->mutex);
1096
1097 if (p->pending_job) {
1098 uint32_t used = p->pages->used;
1099 uint64_t packet_num = p->packet_num;
1100 uint32_t flags = p->flags;
1101
1102 p->next_packet_size = used * qemu_target_page_size();
1103 multifd_send_fill_packet(p);
1104 p->flags = 0;
1105 p->num_packets++;
1106 p->num_pages += used;
1107 p->pages->used = 0;
1108 qemu_mutex_unlock(&p->mutex);
1109
1110 trace_multifd_send(p->id, packet_num, used, flags,
1111 p->next_packet_size);
1112
1113 ret = qio_channel_write_all(p->c, (void *)p->packet,
1114 p->packet_len, &local_err);
1115 if (ret != 0) {
1116 break;
1117 }
1118
1119 if (used) {
1120 ret = qio_channel_writev_all(p->c, p->pages->iov,
1121 used, &local_err);
1122 if (ret != 0) {
1123 break;
1124 }
1125 }
1126
1127 qemu_mutex_lock(&p->mutex);
1128 p->pending_job--;
1129 qemu_mutex_unlock(&p->mutex);
1130
1131 if (flags & MULTIFD_FLAG_SYNC) {
1132 qemu_sem_post(&multifd_send_state->sem_sync);
1133 }
1134 qemu_sem_post(&multifd_send_state->channels_ready);
1135 } else if (p->quit) {
1136 qemu_mutex_unlock(&p->mutex);
1137 break;
1138 } else {
1139 qemu_mutex_unlock(&p->mutex);
1140 /* sometimes there are spurious wakeups */
1141 }
1142 }
1143
1144 out:
1145 if (local_err) {
1146 multifd_send_terminate_threads(local_err);
1147 }
1148
1149 qemu_mutex_lock(&p->mutex);
1150 p->running = false;
1151 qemu_mutex_unlock(&p->mutex);
1152
1153 rcu_unregister_thread();
1154 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1155
1156 return NULL;
1157 }
1158
1159 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1160 {
1161 MultiFDSendParams *p = opaque;
1162 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1163 Error *local_err = NULL;
1164
1165 if (qio_task_propagate_error(task, &local_err)) {
1166 migrate_set_error(migrate_get_current(), local_err);
1167 multifd_save_cleanup();
1168 } else {
1169 p->c = QIO_CHANNEL(sioc);
1170 qio_channel_set_delay(p->c, false);
1171 p->running = true;
1172 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1173 QEMU_THREAD_JOINABLE);
1174
1175 atomic_inc(&multifd_send_state->count);
1176 }
1177 }
1178
1179 int multifd_save_setup(void)
1180 {
1181 int thread_count;
1182 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1183 uint8_t i;
1184
1185 if (!migrate_use_multifd()) {
1186 return 0;
1187 }
1188 thread_count = migrate_multifd_channels();
1189 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1190 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1191 atomic_set(&multifd_send_state->count, 0);
1192 multifd_send_state->pages = multifd_pages_init(page_count);
1193 qemu_sem_init(&multifd_send_state->sem_sync, 0);
1194 qemu_sem_init(&multifd_send_state->channels_ready, 0);
1195
1196 for (i = 0; i < thread_count; i++) {
1197 MultiFDSendParams *p = &multifd_send_state->params[i];
1198
1199 qemu_mutex_init(&p->mutex);
1200 qemu_sem_init(&p->sem, 0);
1201 p->quit = false;
1202 p->pending_job = 0;
1203 p->id = i;
1204 p->pages = multifd_pages_init(page_count);
1205 p->packet_len = sizeof(MultiFDPacket_t)
1206 + sizeof(ram_addr_t) * page_count;
1207 p->packet = g_malloc0(p->packet_len);
1208 p->name = g_strdup_printf("multifdsend_%d", i);
1209 socket_send_channel_create(multifd_new_send_channel_async, p);
1210 }
1211 return 0;
1212 }
1213
1214 struct {
1215 MultiFDRecvParams *params;
1216 /* number of created threads */
1217 int count;
1218 /* syncs main thread and channels */
1219 QemuSemaphore sem_sync;
1220 /* global number of generated multifd packets */
1221 uint64_t packet_num;
1222 } *multifd_recv_state;
1223
1224 static void multifd_recv_terminate_threads(Error *err)
1225 {
1226 int i;
1227
1228 if (err) {
1229 MigrationState *s = migrate_get_current();
1230 migrate_set_error(s, err);
1231 if (s->state == MIGRATION_STATUS_SETUP ||
1232 s->state == MIGRATION_STATUS_ACTIVE) {
1233 migrate_set_state(&s->state, s->state,
1234 MIGRATION_STATUS_FAILED);
1235 }
1236 }
1237
1238 for (i = 0; i < migrate_multifd_channels(); i++) {
1239 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1240
1241 qemu_mutex_lock(&p->mutex);
1242 /* We could arrive here for two reasons:
1243 - normal quit, i.e. everything went fine, just finished
1244 - error quit: We close the channels so the channel threads
1245 finish the qio_channel_read_all_eof() */
1246 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1247 qemu_mutex_unlock(&p->mutex);
1248 }
1249 }
1250
1251 int multifd_load_cleanup(Error **errp)
1252 {
1253 int i;
1254 int ret = 0;
1255
1256 if (!migrate_use_multifd()) {
1257 return 0;
1258 }
1259 multifd_recv_terminate_threads(NULL);
1260 for (i = 0; i < migrate_multifd_channels(); i++) {
1261 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1262
1263 if (p->running) {
1264 qemu_thread_join(&p->thread);
1265 }
1266 object_unref(OBJECT(p->c));
1267 p->c = NULL;
1268 qemu_mutex_destroy(&p->mutex);
1269 qemu_sem_destroy(&p->sem_sync);
1270 g_free(p->name);
1271 p->name = NULL;
1272 multifd_pages_clear(p->pages);
1273 p->pages = NULL;
1274 p->packet_len = 0;
1275 g_free(p->packet);
1276 p->packet = NULL;
1277 }
1278 qemu_sem_destroy(&multifd_recv_state->sem_sync);
1279 g_free(multifd_recv_state->params);
1280 multifd_recv_state->params = NULL;
1281 g_free(multifd_recv_state);
1282 multifd_recv_state = NULL;
1283
1284 return ret;
1285 }
1286
1287 static void multifd_recv_sync_main(void)
1288 {
1289 int i;
1290
1291 if (!migrate_use_multifd()) {
1292 return;
1293 }
1294 for (i = 0; i < migrate_multifd_channels(); i++) {
1295 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1296
1297 trace_multifd_recv_sync_main_wait(p->id);
1298 qemu_sem_wait(&multifd_recv_state->sem_sync);
1299 qemu_mutex_lock(&p->mutex);
1300 if (multifd_recv_state->packet_num < p->packet_num) {
1301 multifd_recv_state->packet_num = p->packet_num;
1302 }
1303 qemu_mutex_unlock(&p->mutex);
1304 }
1305 for (i = 0; i < migrate_multifd_channels(); i++) {
1306 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1307
1308 trace_multifd_recv_sync_main_signal(p->id);
1309 qemu_sem_post(&p->sem_sync);
1310 }
1311 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1312 }
1313
1314 static void *multifd_recv_thread(void *opaque)
1315 {
1316 MultiFDRecvParams *p = opaque;
1317 Error *local_err = NULL;
1318 int ret;
1319
1320 trace_multifd_recv_thread_start(p->id);
1321 rcu_register_thread();
1322
1323 while (true) {
1324 uint32_t used;
1325 uint32_t flags;
1326
1327 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1328 p->packet_len, &local_err);
1329 if (ret == 0) { /* EOF */
1330 break;
1331 }
1332 if (ret == -1) { /* Error */
1333 break;
1334 }
1335
1336 qemu_mutex_lock(&p->mutex);
1337 ret = multifd_recv_unfill_packet(p, &local_err);
1338 if (ret) {
1339 qemu_mutex_unlock(&p->mutex);
1340 break;
1341 }
1342
1343 used = p->pages->used;
1344 flags = p->flags;
1345 trace_multifd_recv(p->id, p->packet_num, used, flags,
1346 p->next_packet_size);
1347 p->num_packets++;
1348 p->num_pages += used;
1349 qemu_mutex_unlock(&p->mutex);
1350
1351 if (used) {
1352 ret = qio_channel_readv_all(p->c, p->pages->iov,
1353 used, &local_err);
1354 if (ret != 0) {
1355 break;
1356 }
1357 }
1358
1359 if (flags & MULTIFD_FLAG_SYNC) {
1360 qemu_sem_post(&multifd_recv_state->sem_sync);
1361 qemu_sem_wait(&p->sem_sync);
1362 }
1363 }
1364
1365 if (local_err) {
1366 multifd_recv_terminate_threads(local_err);
1367 }
1368 qemu_mutex_lock(&p->mutex);
1369 p->running = false;
1370 qemu_mutex_unlock(&p->mutex);
1371
1372 rcu_unregister_thread();
1373 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1374
1375 return NULL;
1376 }
1377
1378 int multifd_load_setup(void)
1379 {
1380 int thread_count;
1381 uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1382 uint8_t i;
1383
1384 if (!migrate_use_multifd()) {
1385 return 0;
1386 }
1387 thread_count = migrate_multifd_channels();
1388 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1389 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1390 atomic_set(&multifd_recv_state->count, 0);
1391 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1392
1393 for (i = 0; i < thread_count; i++) {
1394 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1395
1396 qemu_mutex_init(&p->mutex);
1397 qemu_sem_init(&p->sem_sync, 0);
1398 p->id = i;
1399 p->pages = multifd_pages_init(page_count);
1400 p->packet_len = sizeof(MultiFDPacket_t)
1401 + sizeof(ram_addr_t) * page_count;
1402 p->packet = g_malloc0(p->packet_len);
1403 p->name = g_strdup_printf("multifdrecv_%d", i);
1404 }
1405 return 0;
1406 }
1407
1408 bool multifd_recv_all_channels_created(void)
1409 {
1410 int thread_count = migrate_multifd_channels();
1411
1412 if (!migrate_use_multifd()) {
1413 return true;
1414 }
1415
1416 return thread_count == atomic_read(&multifd_recv_state->count);
1417 }
1418
1419 /*
1420 * Try to receive all multifd channels to get ready for the migration.
1421 * - Return true and do not set @errp when correctly receiving all channels;
1422 * - Return false and do not set @errp when correctly receiving the current one;
1423 * - Return false and set @errp when failing to receive the current channel.
1424 */
1425 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1426 {
1427 MultiFDRecvParams *p;
1428 Error *local_err = NULL;
1429 int id;
1430
1431 id = multifd_recv_initial_packet(ioc, &local_err);
1432 if (id < 0) {
1433 multifd_recv_terminate_threads(local_err);
1434 error_propagate_prepend(errp, local_err,
1435 "failed to receive packet"
1436 " via multifd channel %d: ",
1437 atomic_read(&multifd_recv_state->count));
1438 return false;
1439 }
1440
1441 p = &multifd_recv_state->params[id];
1442 if (p->c != NULL) {
1443 error_setg(&local_err, "multifd: received id '%d' is already set up",
1444 id);
1445 multifd_recv_terminate_threads(local_err);
1446 error_propagate(errp, local_err);
1447 return false;
1448 }
1449 p->c = ioc;
1450 object_ref(OBJECT(ioc));
1451 /* initial packet */
1452 p->num_packets = 1;
1453
1454 p->running = true;
1455 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1456 QEMU_THREAD_JOINABLE);
1457 atomic_inc(&multifd_recv_state->count);
1458 return atomic_read(&multifd_recv_state->count) ==
1459 migrate_multifd_channels();
1460 }
1461
1462 /**
1463 * save_page_header: write page header to wire
1464 *
1465 * If the block is different from the last one sent, it also writes the block identification
1466 *
1467 * Returns the number of bytes written
1468 *
1469 * @f: QEMUFile where to send the data
1470 * @block: block that contains the page we want to send
1471 * @offset: offset inside the block for the page
1472 * in the lower bits, it contains flags
1473 */
1474 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1475 ram_addr_t offset)
1476 {
1477 size_t size, len;
1478
1479 if (block == rs->last_sent_block) {
1480 offset |= RAM_SAVE_FLAG_CONTINUE;
1481 }
1482 qemu_put_be64(f, offset);
1483 size = 8;
1484
1485 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1486 len = strlen(block->idstr);
1487 qemu_put_byte(f, len);
1488 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1489 size += 1 + len;
1490 rs->last_sent_block = block;
1491 }
1492 return size;
1493 }
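/*
 * Sketch of what save_page_header() emits:
 *
 *   qemu_put_be64(f, offset | flags);          // always, 8 bytes
 *   // only when RAM_SAVE_FLAG_CONTINUE is not set (block changed):
 *   qemu_put_byte(f, strlen(block->idstr));    // 1 byte
 *   qemu_put_buffer(f, block->idstr, len);     // len bytes, no NUL terminator
 *
 * so the returned size is either 8 or 8 + 1 + strlen(idstr).
 */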
1494
1495 /**
1496 * mig_throttle_guest_down: throttle down the guest
1497 *
1498 * Reduce amount of guest cpu execution to hopefully slow down memory
1499 * writes. If guest dirty memory rate is reduced below the rate at
1500 * which we can transfer pages to the destination then we should be
1501 * able to complete migration. Some workloads dirty memory way too
1502 * fast and will not effectively converge, even with auto-converge.
1503 */
1504 static void mig_throttle_guest_down(void)
1505 {
1506 MigrationState *s = migrate_get_current();
1507 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1508 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1509 int pct_max = s->parameters.max_cpu_throttle;
1510
1511 /* We have not started throttling yet. Let's start it. */
1512 if (!cpu_throttle_active()) {
1513 cpu_throttle_set(pct_initial);
1514 } else {
1515 /* Throttling already on, just increase the rate */
1516 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1517 pct_max));
1518 }
1519 }
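/*
 * Worked example with hypothetical parameter values: if
 * cpu_throttle_initial is 20, cpu_throttle_increment is 10 and
 * max_cpu_throttle is 99, successive calls throttle the guest CPUs at
 * 20%, 30%, 40%, ... with the value saturating at 99%.
 */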
1520
1521 /**
1522 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1523 *
1524 * @rs: current RAM state
1525 * @current_addr: address for the zero page
1526 *
1527 * Update the xbzrle cache to reflect a page that's been sent as all 0.
1528 * The important thing is that a stale (not-yet-0'd) page be replaced
1529 * by the new data.
1530 * As a bonus, if the page wasn't in the cache it gets added so that
1531 * when a small write is made into the 0'd page it gets XBZRLE sent.
1532 */
1533 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1534 {
1535 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1536 return;
1537 }
1538
1539 /* We don't care if this fails to allocate a new cache page
1540 * as long as it updated an old one */
1541 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1542 ram_counters.dirty_sync_count);
1543 }
1544
1545 #define ENCODING_FLAG_XBZRLE 0x1
1546
1547 /**
1548 * save_xbzrle_page: compress and send current page
1549 *
1550 * Returns: 1 means that we wrote the page
1551 * 0 means that page is identical to the one already sent
1552 * -1 means that xbzrle would be longer than normal
1553 *
1554 * @rs: current RAM state
1555 * @current_data: pointer to the address of the page contents
1556 * @current_addr: addr of the page
1557 * @block: block that contains the page we want to send
1558 * @offset: offset inside the block for the page
1559 * @last_stage: if we are at the completion stage
1560 */
1561 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1562 ram_addr_t current_addr, RAMBlock *block,
1563 ram_addr_t offset, bool last_stage)
1564 {
1565 int encoded_len = 0, bytes_xbzrle;
1566 uint8_t *prev_cached_page;
1567
1568 if (!cache_is_cached(XBZRLE.cache, current_addr,
1569 ram_counters.dirty_sync_count)) {
1570 xbzrle_counters.cache_miss++;
1571 if (!last_stage) {
1572 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1573 ram_counters.dirty_sync_count) == -1) {
1574 return -1;
1575 } else {
1576 /* update *current_data when the page has been
1577 inserted into cache */
1578 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1579 }
1580 }
1581 return -1;
1582 }
1583
1584 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1585
1586 /* save current buffer into memory */
1587 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1588
1589 /* XBZRLE encoding (if there is no overflow) */
1590 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1591 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1592 TARGET_PAGE_SIZE);
1593 if (encoded_len == 0) {
1594 trace_save_xbzrle_page_skipping();
1595 return 0;
1596 } else if (encoded_len == -1) {
1597 trace_save_xbzrle_page_overflow();
1598 xbzrle_counters.overflow++;
1599 /* update data in the cache */
1600 if (!last_stage) {
1601 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1602 *current_data = prev_cached_page;
1603 }
1604 return -1;
1605 }
1606
1607 /* we need to update the data in the cache, in order to get the same data */
1608 if (!last_stage) {
1609 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1610 }
1611
1612 /* Send XBZRLE based compressed page */
1613 bytes_xbzrle = save_page_header(rs, rs->f, block,
1614 offset | RAM_SAVE_FLAG_XBZRLE);
1615 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1616 qemu_put_be16(rs->f, encoded_len);
1617 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1618 bytes_xbzrle += encoded_len + 1 + 2;
1619 xbzrle_counters.pages++;
1620 xbzrle_counters.bytes += bytes_xbzrle;
1621 ram_counters.transferred += bytes_xbzrle;
1622
1623 return 1;
1624 }
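/*
 * For reference, an XBZRLE-compressed page as written above looks like:
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE);  // 8 bytes (+idstr)
 *   qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);            // 1 byte
 *   qemu_put_be16(rs->f, encoded_len);                     // 2 bytes
 *   qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 *
 * which is why bytes_xbzrle adds encoded_len + 1 + 2 on top of the header.
 */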
1625
1626 /**
1627 * migration_bitmap_find_dirty: find the next dirty page from start
1628 *
1629 * Returns the page offset within memory region of the start of a dirty page
1630 *
1631 * @rs: current RAM state
1632 * @rb: RAMBlock where to search for dirty pages
1633 * @start: page where we start the search
1634 */
1635 static inline
1636 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1637 unsigned long start)
1638 {
1639 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1640 unsigned long *bitmap = rb->bmap;
1641 unsigned long next;
1642
1643 if (ramblock_is_ignored(rb)) {
1644 return size;
1645 }
1646
1647 /*
1648 * When the free page optimization is enabled, we need to check the bitmap
1649 * to send the non-free pages rather than all the pages in the bulk stage.
1650 */
1651 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1652 next = start + 1;
1653 } else {
1654 next = find_next_bit(bitmap, size, start);
1655 }
1656
1657 return next;
1658 }
1659
1660 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1661 RAMBlock *rb,
1662 unsigned long page)
1663 {
1664 bool ret;
1665
1666 qemu_mutex_lock(&rs->bitmap_mutex);
1667 ret = test_and_clear_bit(page, rb->bmap);
1668
1669 if (ret) {
1670 rs->migration_dirty_pages--;
1671 }
1672 qemu_mutex_unlock(&rs->bitmap_mutex);
1673
1674 return ret;
1675 }
1676
1677 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1678 ram_addr_t length)
1679 {
1680 rs->migration_dirty_pages +=
1681 cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
1682 &rs->num_dirty_pages_period);
1683 }
1684
1685 /**
1686 * ram_pagesize_summary: calculate all the pagesizes of a VM
1687 *
1688 * Returns a summary bitmap of the page sizes of all RAMBlocks
1689 *
1690 * For VMs with just normal pages this is equivalent to the host page
1691 * size. If it's got some huge pages then it's the OR of all the
1692 * different page sizes.
1693 */
1694 uint64_t ram_pagesize_summary(void)
1695 {
1696 RAMBlock *block;
1697 uint64_t summary = 0;
1698
1699 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1700 summary |= block->page_size;
1701 }
1702
1703 return summary;
1704 }
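/*
 * Worked example: a guest whose RAM blocks are backed by 4 KiB pages
 * plus some 2 MiB hugepages would report 0x1000 | 0x200000 = 0x201000.
 */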
1705
1706 uint64_t ram_get_total_transferred_pages(void)
1707 {
1708 return ram_counters.normal + ram_counters.duplicate +
1709 compression_counters.pages + xbzrle_counters.pages;
1710 }
1711
1712 static void migration_update_rates(RAMState *rs, int64_t end_time)
1713 {
1714 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1715 double compressed_size;
1716
1717 /* calculate period counters */
1718 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1719 / (end_time - rs->time_last_bitmap_sync);
1720
1721 if (!page_count) {
1722 return;
1723 }
1724
1725 if (migrate_use_xbzrle()) {
1726 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1727 rs->xbzrle_cache_miss_prev) / page_count;
1728 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1729 }
1730
1731 if (migrate_use_compression()) {
1732 compression_counters.busy_rate = (double)(compression_counters.busy -
1733 rs->compress_thread_busy_prev) / page_count;
1734 rs->compress_thread_busy_prev = compression_counters.busy;
1735
1736 compressed_size = compression_counters.compressed_size -
1737 rs->compressed_size_prev;
1738 if (compressed_size) {
1739 double uncompressed_size = (compression_counters.pages -
1740 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1741
1742 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1743 compression_counters.compression_rate =
1744 uncompressed_size / compressed_size;
1745
1746 rs->compress_pages_prev = compression_counters.pages;
1747 rs->compressed_size_prev = compression_counters.compressed_size;
1748 }
1749 }
1750 }
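/*
 * Worked example for the ratio above, assuming a 4 KiB target page: if
 * 1024 pages (4 MiB of guest data) were compressed into 1 MiB of stream
 * data during the period, compression_rate ends up as 4.0.
 */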
1751
1752 static void migration_bitmap_sync(RAMState *rs)
1753 {
1754 RAMBlock *block;
1755 int64_t end_time;
1756 uint64_t bytes_xfer_now;
1757
1758 ram_counters.dirty_sync_count++;
1759
1760 if (!rs->time_last_bitmap_sync) {
1761 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1762 }
1763
1764 trace_migration_bitmap_sync_start();
1765 memory_global_dirty_log_sync();
1766
1767 qemu_mutex_lock(&rs->bitmap_mutex);
1768 rcu_read_lock();
1769 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1770 migration_bitmap_sync_range(rs, block, block->used_length);
1771 }
1772 ram_counters.remaining = ram_bytes_remaining();
1773 rcu_read_unlock();
1774 qemu_mutex_unlock(&rs->bitmap_mutex);
1775
1776 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1777
1778 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1779
1780 /* more than 1 second = 1000 milliseconds */
1781 if (end_time > rs->time_last_bitmap_sync + 1000) {
1782 bytes_xfer_now = ram_counters.transferred;
1783
1784 /* During block migration the auto-converge logic incorrectly detects
1785 * that ram migration makes no progress. Avoid this by disabling the
1786 * throttling logic during the bulk phase of block migration. */
1787 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1788 /* The following detection logic can be refined later. For now:
1789 check whether the bytes dirtied in this period exceed half of
1790 the bytes that were transferred since the last time we were in
1791 this routine. If that happens twice in a row, start or increase
1792 throttling. */
1793
1794 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1795 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1796 (++rs->dirty_rate_high_cnt >= 2)) {
1797 trace_migration_throttle();
1798 rs->dirty_rate_high_cnt = 0;
1799 mig_throttle_guest_down();
1800 }
1801 }
1802
1803 migration_update_rates(rs, end_time);
1804
1805 rs->target_page_count_prev = rs->target_page_count;
1806
1807 /* reset period counters */
1808 rs->time_last_bitmap_sync = end_time;
1809 rs->num_dirty_pages_period = 0;
1810 rs->bytes_xfer_prev = bytes_xfer_now;
1811 }
1812 if (migrate_use_events()) {
1813 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1814 }
1815 }
1816
1817 static void migration_bitmap_sync_precopy(RAMState *rs)
1818 {
1819 Error *local_err = NULL;
1820
1821 /*
1822 * The current notifier usage is just an optimization to migration, so we
1823 * don't stop the normal migration process in the error case.
1824 */
1825 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1826 error_report_err(local_err);
1827 }
1828
1829 migration_bitmap_sync(rs);
1830
1831 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1832 error_report_err(local_err);
1833 }
1834 }
1835
1836 /**
1837 * save_zero_page_to_file: send the zero page to the file
1838 *
1839 * Returns the size of data written to the file, 0 means the page is not
1840 * a zero page
1841 *
1842 * @rs: current RAM state
1843 * @file: the file where the data is saved
1844 * @block: block that contains the page we want to send
1845 * @offset: offset inside the block for the page
1846 */
1847 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1848 RAMBlock *block, ram_addr_t offset)
1849 {
1850 uint8_t *p = block->host + offset;
1851 int len = 0;
1852
1853 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1854 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1855 qemu_put_byte(file, 0);
1856 len += 1;
1857 }
1858 return len;
1859 }
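/*
 * On the wire a zero page is therefore just the page header with
 * RAM_SAVE_FLAG_ZERO set followed by a single 0x00 byte, so the value
 * returned here is either 0 (not a zero page) or the header size + 1.
 */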
1860
1861 /**
1862 * save_zero_page: send the zero page to the stream
1863 *
1864 * Returns the number of pages written.
1865 *
1866 * @rs: current RAM state
1867 * @block: block that contains the page we want to send
1868 * @offset: offset inside the block for the page
1869 */
1870 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1871 {
1872 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1873
1874 if (len) {
1875 ram_counters.duplicate++;
1876 ram_counters.transferred += len;
1877 return 1;
1878 }
1879 return -1;
1880 }
1881
1882 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1883 {
1884 if (!migrate_release_ram() || !migration_in_postcopy()) {
1885 return;
1886 }
1887
1888 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1889 }
1890
1891 /*
1892 * @pages: the number of pages written by the control path,
1893 * < 0 - error
1894 * > 0 - number of pages written
1895 *
1896 * Return true if the page has been saved, otherwise false is returned.
1897 */
1898 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1899 int *pages)
1900 {
1901 uint64_t bytes_xmit = 0;
1902 int ret;
1903
1904 *pages = -1;
1905 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1906 &bytes_xmit);
1907 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1908 return false;
1909 }
1910
1911 if (bytes_xmit) {
1912 ram_counters.transferred += bytes_xmit;
1913 *pages = 1;
1914 }
1915
1916 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1917 return true;
1918 }
1919
1920 if (bytes_xmit > 0) {
1921 ram_counters.normal++;
1922 } else if (bytes_xmit == 0) {
1923 ram_counters.duplicate++;
1924 }
1925
1926 return true;
1927 }
1928
1929 /*
1930 * save_normal_page: directly send the page to the stream
1931 *
1932 * Returns the number of pages written.
1933 *
1934 * @rs: current RAM state
1935 * @block: block that contains the page we want to send
1936 * @offset: offset inside the block for the page
1937 * @buf: the page to be sent
1938 * @async: send the page asynchronously
1939 */
1940 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1941 uint8_t *buf, bool async)
1942 {
1943 ram_counters.transferred += save_page_header(rs, rs->f, block,
1944 offset | RAM_SAVE_FLAG_PAGE);
1945 if (async) {
1946 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1947 migrate_release_ram() &&
1948 migration_in_postcopy());
1949 } else {
1950 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1951 }
1952 ram_counters.transferred += TARGET_PAGE_SIZE;
1953 ram_counters.normal++;
1954 return 1;
1955 }
1956
1957 /**
1958 * ram_save_page: send the given page to the stream
1959 *
1960 * Returns the number of pages written.
1961 * < 0 - error
1962 * >=0 - Number of pages written - this might legally be 0
1963 * if xbzrle noticed the page was the same.
1964 *
1965 * @rs: current RAM state
1966 * @pss: data about the page we want to send
1968 * @last_stage: if we are at the completion stage
1969 */
1970 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1971 {
1972 int pages = -1;
1973 uint8_t *p;
1974 bool send_async = true;
1975 RAMBlock *block = pss->block;
1976 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1977 ram_addr_t current_addr = block->offset + offset;
1978
1979 p = block->host + offset;
1980 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1981
1982 XBZRLE_cache_lock();
1983 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1984 migrate_use_xbzrle()) {
1985 pages = save_xbzrle_page(rs, &p, current_addr, block,
1986 offset, last_stage);
1987 if (!last_stage) {
1988 /* Can't send this cached data async, since the cache page
1989 * might get updated before it gets to the wire
1990 */
1991 send_async = false;
1992 }
1993 }
1994
1995 /* XBZRLE overflow or normal page */
1996 if (pages == -1) {
1997 pages = save_normal_page(rs, block, offset, p, send_async);
1998 }
1999
2000 XBZRLE_cache_unlock();
2001
2002 return pages;
2003 }
2004
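/**
 * ram_save_multifd_page: queue a page for transmission by the multifd threads
 *
 * Returns the number of pages written (always 1; the page is only queued here)
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */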
2005 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2006 ram_addr_t offset)
2007 {
2008 multifd_queue_page(block, offset);
2009 ram_counters.normal++;
2010
2011 return 1;
2012 }
2013
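/**
 * do_compress_ram_page: compress one page into the given stream
 *
 * Called from the compression threads. If the page turns out to be a
 * zero page, it is sent as such and no compressed data is emitted.
 *
 * Returns true if the page was a zero page, false otherwise
 *
 * @f: QEMUFile owned by the compression thread
 * @stream: zlib stream used by this thread
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @source_buf: scratch buffer the page is copied into before compression
 */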
2014 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2015 ram_addr_t offset, uint8_t *source_buf)
2016 {
2017 RAMState *rs = ram_state;
2018 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2019 bool zero_page = false;
2020 int ret;
2021
2022 if (save_zero_page_to_file(rs, f, block, offset)) {
2023 zero_page = true;
2024 goto exit;
2025 }
2026
2027 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2028
2029 /*
2030 * copy it to an internal buffer to avoid it being modified by the VM,
2031 * so that we can catch errors during compression and
2032 * decompression
2033 */
2034 memcpy(source_buf, p, TARGET_PAGE_SIZE);
2035 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2036 if (ret < 0) {
2037 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2038 error_report("compressed data failed!");
2039 return false;
2040 }
2041
2042 exit:
2043 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2044 return zero_page;
2045 }
2046
2047 static void
2048 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2049 {
2050 ram_counters.transferred += bytes_xmit;
2051
2052 if (param->zero_page) {
2053 ram_counters.duplicate++;
2054 return;
2055 }
2056
2057 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2058 compression_counters.compressed_size += bytes_xmit - 8;
2059 compression_counters.pages++;
2060 }
2061
2062 static bool save_page_use_compression(RAMState *rs);
2063
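/**
 * flush_compressed_data: wait for the compression threads to finish and
 * flush their output into the main migration stream
 *
 * No-op when compression is not in use.
 *
 * @rs: current RAM state
 */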
2064 static void flush_compressed_data(RAMState *rs)
2065 {
2066 int idx, len, thread_count;
2067
2068 if (!save_page_use_compression(rs)) {
2069 return;
2070 }
2071 thread_count = migrate_compress_threads();
2072
2073 qemu_mutex_lock(&comp_done_lock);
2074 for (idx = 0; idx < thread_count; idx++) {
2075 while (!comp_param[idx].done) {
2076 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2077 }
2078 }
2079 qemu_mutex_unlock(&comp_done_lock);
2080
2081 for (idx = 0; idx < thread_count; idx++) {
2082 qemu_mutex_lock(&comp_param[idx].mutex);
2083 if (!comp_param[idx].quit) {
2084 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2085 /*
2086 * it's safe to fetch zero_page without holding comp_done_lock
2087 * as there is no further request submitted to the thread,
2088 * i.e., the thread should be waiting for a request at this point.
2089 */
2090 update_compress_thread_counts(&comp_param[idx], len);
2091 }
2092 qemu_mutex_unlock(&comp_param[idx].mutex);
2093 }
2094 }
2095
2096 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2097 ram_addr_t offset)
2098 {
2099 param->block = block;
2100 param->offset = offset;
2101 }
2102
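/**
 * compress_page_with_multi_thread: hand a page to an idle compression thread
 *
 * Returns 1 if a thread took the page, or -1 if no thread was free and
 * 'compress-wait-thread' is not set (the caller then sends it as a normal page)
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */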
2103 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2104 ram_addr_t offset)
2105 {
2106 int idx, thread_count, bytes_xmit = -1, pages = -1;
2107 bool wait = migrate_compress_wait_thread();
2108
2109 thread_count = migrate_compress_threads();
2110 qemu_mutex_lock(&comp_done_lock);
2111 retry:
2112 for (idx = 0; idx < thread_count; idx++) {
2113 if (comp_param[idx].done) {
2114 comp_param[idx].done = false;
2115 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2116 qemu_mutex_lock(&comp_param[idx].mutex);
2117 set_compress_params(&comp_param[idx], block, offset);
2118 qemu_cond_signal(&comp_param[idx].cond);
2119 qemu_mutex_unlock(&comp_param[idx].mutex);
2120 pages = 1;
2121 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2122 break;
2123 }
2124 }
2125
2126 /*
2127 * wait for the free thread if the user specifies 'compress-wait-thread',
2128 * otherwise we will post the page out in the main thread as a normal page.
2129 */
2130 if (pages < 0 && wait) {
2131 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2132 goto retry;
2133 }
2134 qemu_mutex_unlock(&comp_done_lock);
2135
2136 return pages;
2137 }
2138
2139 /**
2140 * find_dirty_block: find the next dirty page and update any state
2141 * associated with the search process.
2142 *
2143 * Returns true if a page is found
2144 *
2145 * @rs: current RAM state
2146 * @pss: data about the state of the current dirty page scan
2147 * @again: set to false if the search has scanned the whole of RAM
2148 */
2149 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2150 {
2151 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2152 if (pss->complete_round && pss->block == rs->last_seen_block &&
2153 pss->page >= rs->last_page) {
2154 /*
2155 * We've been once around the RAM and haven't found anything.
2156 * Give up.
2157 */
2158 *again = false;
2159 return false;
2160 }
2161 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2162 /* Didn't find anything in this RAM Block */
2163 pss->page = 0;
2164 pss->block = QLIST_NEXT_RCU(pss->block, next);
2165 if (!pss->block) {
2166 /*
2167 * If memory migration starts over, we will meet a dirtied page
2168 * which may still exist in the compression threads' ring, so we
2169 * should flush the compressed data to make sure the new page
2170 * is not overwritten by the old one in the destination.
2171 *
2172 * Also, if xbzrle is on, stop using the data compression at this
2173 * point. In theory, xbzrle can do better than compression.
2174 */
2175 flush_compressed_data(rs);
2176
2177 /* Hit the end of the list */
2178 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2179 /* Flag that we've looped */
2180 pss->complete_round = true;
2181 rs->ram_bulk_stage = false;
2182 }
2183 /* Didn't find anything this time, but try again on the new block */
2184 *again = true;
2185 return false;
2186 } else {
2187 /* Can go around again, but... */
2188 *again = true;
2189 /* We've found something so probably don't need to */
2190 return true;
2191 }
2192 }
2193
2194 /**
2195 * unqueue_page: gets a page off the queue
2196 *
2197 * Helper for 'get_queued_page' - gets a page off the queue
2198 *
2199 * Returns the block of the page (or NULL if none available)
2200 *
2201 * @rs: current RAM state
2202 * @offset: used to return the offset within the RAMBlock
2203 */
2204 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2205 {
2206 RAMBlock *block = NULL;
2207
2208 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2209 return NULL;
2210 }
2211
2212 qemu_mutex_lock(&rs->src_page_req_mutex);
2213 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2214 struct RAMSrcPageRequest *entry =
2215 QSIMPLEQ_FIRST(&rs->src_page_requests);
2216 block = entry->rb;
2217 *offset = entry->offset;
2218
2219 if (entry->len > TARGET_PAGE_SIZE) {
2220 entry->len -= TARGET_PAGE_SIZE;
2221 entry->offset += TARGET_PAGE_SIZE;
2222 } else {
2223 memory_region_unref(block->mr);
2224 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2225 g_free(entry);
2226 migration_consume_urgent_request();
2227 }
2228 }
2229 qemu_mutex_unlock(&rs->src_page_req_mutex);
2230
2231 return block;
2232 }
2233
2234 /**
2235 * get_queued_page: unqueue a page from the postcopy requests
2236 *
2237 * Skips pages that are already sent (!dirty)
2238 *
2239 * Returns true if a queued page is found
2240 *
2241 * @rs: current RAM state
2242 * @pss: data about the state of the current dirty page scan
2243 */
2244 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2245 {
2246 RAMBlock *block;
2247 ram_addr_t offset;
2248 bool dirty;
2249
2250 do {
2251 block = unqueue_page(rs, &offset);
2252 /*
2253 * We're sending this page, and since it's postcopy nothing else
2254 * will dirty it, and we must make sure it doesn't get sent again
2255 * even if this queue request was received after the background
2256 * search already sent it.
2257 */
2258 if (block) {
2259 unsigned long page;
2260
2261 page = offset >> TARGET_PAGE_BITS;
2262 dirty = test_bit(page, block->bmap);
2263 if (!dirty) {
2264 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2265 page, test_bit(page, block->unsentmap));
2266 } else {
2267 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2268 }
2269 }
2270
2271 } while (block && !dirty);
2272
2273 if (block) {
2274 /*
2275 * As soon as we start servicing pages out of order, then we have
2276 * to kill the bulk stage, since the bulk stage assumes
2277 * in (migration_bitmap_find_and_reset_dirty) that every page is
2278 * dirty, that's no longer true.
2279 */
2280 rs->ram_bulk_stage = false;
2281
2282 /*
2283 * We want the background search to continue from the queued page
2284 * since the guest is likely to want other pages near to the page
2285 * it just requested.
2286 */
2287 pss->block = block;
2288 pss->page = offset >> TARGET_PAGE_BITS;
2289 }
2290
2291 return !!block;
2292 }
2293
2294 /**
2295 * migration_page_queue_free: drop any remaining pages in the ram
2296 * request queue
2297 *
2298 * It should be empty at the end anyway, but in error cases there may
2299 * be some left. In that case, we drop them.
2300 *
2301 */
2302 static void migration_page_queue_free(RAMState *rs)
2303 {
2304 struct RAMSrcPageRequest *mspr, *next_mspr;
2305 /* This queue should generally be empty - but in the case of a failed
2306 * migration it might have some leftover entries.
2307 */
2308 rcu_read_lock();
2309 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2310 memory_region_unref(mspr->rb->mr);
2311 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2312 g_free(mspr);
2313 }
2314 rcu_read_unlock();
2315 }
2316
2317 /**
2318 * ram_save_queue_pages: queue the page for transmission
2319 *
2320 * A request from postcopy destination for example.
2321 *
2322 * Returns zero on success or negative on error
2323 *
2324 * @rbname: Name of the RAMBlock of the request. NULL means the
2325 * same as the last one.
2326 * @start: starting address from the start of the RAMBlock
2327 * @len: length (in bytes) to send
2328 */
2329 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2330 {
2331 RAMBlock *ramblock;
2332 RAMState *rs = ram_state;
2333
2334 ram_counters.postcopy_requests++;
2335 rcu_read_lock();
2336 if (!rbname) {
2337 /* Reuse last RAMBlock */
2338 ramblock = rs->last_req_rb;
2339
2340 if (!ramblock) {
2341 /*
2342 * Shouldn't happen, we can't reuse the last RAMBlock if
2343 * it's the 1st request.
2344 */
2345 error_report("ram_save_queue_pages no previous block");
2346 goto err;
2347 }
2348 } else {
2349 ramblock = qemu_ram_block_by_name(rbname);
2350
2351 if (!ramblock) {
2352 /* We shouldn't be asked for a non-existent RAMBlock */
2353 error_report("ram_save_queue_pages no block '%s'", rbname);
2354 goto err;
2355 }
2356 rs->last_req_rb = ramblock;
2357 }
2358 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2359 if (start+len > ramblock->used_length) {
2360 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2361 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2362 __func__, start, len, ramblock->used_length);
2363 goto err;
2364 }
2365
2366 struct RAMSrcPageRequest *new_entry =
2367 g_malloc0(sizeof(struct RAMSrcPageRequest));
2368 new_entry->rb = ramblock;
2369 new_entry->offset = start;
2370 new_entry->len = len;
2371
2372 memory_region_ref(ramblock->mr);
2373 qemu_mutex_lock(&rs->src_page_req_mutex);
2374 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2375 migration_make_urgent_request();
2376 qemu_mutex_unlock(&rs->src_page_req_mutex);
2377 rcu_read_unlock();
2378
2379 return 0;
2380
2381 err:
2382 rcu_read_unlock();
2383 return -1;
2384 }
2385
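/**
 * save_page_use_compression: check whether the current page should go
 * through the compression threads
 *
 * Returns true if compression is enabled and either we are still in the
 * bulk stage or xbzrle is disabled; false otherwise.
 *
 * @rs: current RAM state
 */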
2386 static bool save_page_use_compression(RAMState *rs)
2387 {
2388 if (!migrate_use_compression()) {
2389 return false;
2390 }
2391
2392 /*
2393 * If xbzrle is on, stop using the data compression after the first
2394 * round of migration even if compression is enabled. In theory,
2395 * xbzrle can do better than compression.
2396 */
2397 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2398 return true;
2399 }
2400
2401 return false;
2402 }
2403
2404 /*
2405 * try to compress the page before posting it out, return true if the page
2406 * has been properly handled by compression, otherwise needs other
2407 * paths to handle it
2408 */
2409 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2410 {
2411 if (!save_page_use_compression(rs)) {
2412 return false;
2413 }
2414
2415 /*
2416 * When starting the process of a new block, the first page of
2417 * the block should be sent out before other pages in the same
2418 * block, and all the pages in the last block should have been sent
2419 * out. Keeping this order is important, because the 'cont' flag
2420 * is used to avoid resending the block name.
2421 *
2422 * We post the first page as a normal page, as compression takes
2423 * significant CPU resources.
2424 */
2425 if (block != rs->last_sent_block) {
2426 flush_compressed_data(rs);
2427 return false;
2428 }
2429
2430 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2431 return true;
2432 }
2433
2434 compression_counters.busy++;
2435 return false;
2436 }
2437
2438 /**
2439 * ram_save_target_page: save one target page
2440 *
2441 * Returns the number of pages written
2442 *
2443 * @rs: current RAM state
2444 * @pss: data about the page we want to send
2445 * @last_stage: if we are at the completion stage
2446 */
2447 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2448 bool last_stage)
2449 {
2450 RAMBlock *block = pss->block;
2451 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2452 int res;
2453
2454 if (control_save_page(rs, block, offset, &res)) {
2455 return res;
2456 }
2457
2458 if (save_compress_page(rs, block, offset)) {
2459 return 1;
2460 }
2461
2462 res = save_zero_page(rs, block, offset);
2463 if (res > 0) {
2464 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2465 * page would be stale
2466 */
2467 if (!save_page_use_compression(rs)) {
2468 XBZRLE_cache_lock();
2469 xbzrle_cache_zero_page(rs, block->offset + offset);
2470 XBZRLE_cache_unlock();
2471 }
2472 ram_release_pages(block->idstr, offset, res);
2473 return res;
2474 }
2475
2476 /*
2477 * do not use multifd for compression as the first page in the new
2478 * block should be posted out before sending the compressed page
2479 */
2480 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2481 return ram_save_multifd_page(rs, block, offset);
2482 }
2483
2484 return ram_save_page(rs, pss, last_stage);
2485 }
2486
2487 /**
2488 * ram_save_host_page: save a whole host page
2489 *
2490 * Starting at *offset, send pages up to the end of the current host
2491 * page. It's valid for the initial offset to point into the middle of
2492 * a host page, in which case the remainder of the host page is sent.
2493 * Only dirty target pages are sent. Note that the host page size may
2494 * be a huge page for this block.
2495 * The saving stops at the boundary of the used_length of the block
2496 * if the RAMBlock isn't a multiple of the host page size.
2497 *
2498 * Returns the number of pages written or negative on error
2499 *
2500 * @rs: current RAM state
2502 * @pss: data about the page we want to send
2503 * @last_stage: if we are at the completion stage
2504 */
2505 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2506 bool last_stage)
2507 {
2508 int tmppages, pages = 0;
2509 size_t pagesize_bits =
2510 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2511
2512 if (ramblock_is_ignored(pss->block)) {
2513 error_report("block %s should not be migrated !", pss->block->idstr);
2514 return 0;
2515 }
2516
2517 do {
2518 /* Check if the page is dirty and if so, send it */
2519 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2520 pss->page++;
2521 continue;
2522 }
2523
2524 tmppages = ram_save_target_page(rs, pss, last_stage);
2525 if (tmppages < 0) {
2526 return tmppages;
2527 }
2528
2529 pages += tmppages;
2530 if (pss->block->unsentmap) {
2531 clear_bit(pss->page, pss->block->unsentmap);
2532 }
2533
2534 pss->page++;
2535 } while ((pss->page & (pagesize_bits - 1)) &&
2536 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2537
2538 /* The offset we leave with is the last one we looked at */
2539 pss->page--;
2540 return pages;
2541 }
2542
2543 /**
2544 * ram_find_and_save_block: finds a dirty page and sends it to f
2545 *
2546 * Called within an RCU critical section.
2547 *
2548 * Returns the number of pages written where zero means no dirty pages,
2549 * or negative on error
2550 *
2551 * @rs: current RAM state
2552 * @last_stage: if we are at the completion stage
2553 *
2554 * On systems where host-page-size > target-page-size it will send all the
2555 * pages in a host page that are dirty.
2556 */
2557
2558 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2559 {
2560 PageSearchStatus pss;
2561 int pages = 0;
2562 bool again, found;
2563
2564 /* No dirty page as there is zero RAM */
2565 if (!ram_bytes_total()) {
2566 return pages;
2567 }
2568
2569 pss.block = rs->last_seen_block;
2570 pss.page = rs->last_page;
2571 pss.complete_round = false;
2572
2573 if (!pss.block) {
2574 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2575 }
2576
2577 do {
2578 again = true;
2579 found = get_queued_page(rs, &pss);
2580
2581 if (!found) {
2582 /* priority queue empty, so just search for something dirty */
2583 found = find_dirty_block(rs, &pss, &again);
2584 }
2585
2586 if (found) {
2587 pages = ram_save_host_page(rs, &pss, last_stage);
2588 }
2589 } while (!pages && again);
2590
2591 rs->last_seen_block = pss.block;
2592 rs->last_page = pss.page;
2593
2594 return pages;
2595 }
2596
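/**
 * acct_update_position: update the RAM counters for pages whose data was
 * transferred by the caller rather than by the normal page path
 *
 * @f: QEMUFile the data was written to
 * @size: number of bytes transferred
 * @zero: true if the pages were zero pages
 */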
2597 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2598 {
2599 uint64_t pages = size / TARGET_PAGE_SIZE;
2600
2601 if (zero) {
2602 ram_counters.duplicate += pages;
2603 } else {
2604 ram_counters.normal += pages;
2605 ram_counters.transferred += size;
2606 qemu_update_position(f, size);
2607 }
2608 }
2609
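/**
 * ram_bytes_total_common: total size of the RAM to be handled
 *
 * Returns the sum of used_length over the relevant RAMBlocks
 *
 * @count_ignored: if true, also include blocks that are otherwise
 * ignored for migration
 */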
2610 static uint64_t ram_bytes_total_common(bool count_ignored)
2611 {
2612 RAMBlock *block;
2613 uint64_t total = 0;
2614
2615 rcu_read_lock();
2616 if (count_ignored) {
2617 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2618 total += block->used_length;
2619 }
2620 } else {
2621 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2622 total += block->used_length;
2623 }
2624 }
2625 rcu_read_unlock();
2626 return total;
2627 }
2628
2629 uint64_t ram_bytes_total(void)
2630 {
2631 return ram_bytes_total_common(false);
2632 }
2633
2634 static void xbzrle_load_setup(void)
2635 {
2636 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2637 }
2638
2639 static void xbzrle_load_cleanup(void)
2640 {
2641 g_free(XBZRLE.decoded_buf);
2642 XBZRLE.decoded_buf = NULL;
2643 }
2644
2645 static void ram_state_cleanup(RAMState **rsp)
2646 {
2647 if (*rsp) {
2648 migration_page_queue_free(*rsp);
2649 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2650 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2651 g_free(*rsp);
2652 *rsp = NULL;
2653 }
2654 }
2655
2656 static void xbzrle_cleanup(void)
2657 {
2658 XBZRLE_cache_lock();
2659 if (XBZRLE.cache) {
2660 cache_fini(XBZRLE.cache);
2661 g_free(XBZRLE.encoded_buf);
2662 g_free(XBZRLE.current_buf);
2663 g_free(XBZRLE.zero_target_page);
2664 XBZRLE.cache = NULL;
2665 XBZRLE.encoded_buf = NULL;
2666 XBZRLE.current_buf = NULL;
2667 XBZRLE.zero_target_page = NULL;
2668 }
2669 XBZRLE_cache_unlock();
2670 }
2671
2672 static void ram_save_cleanup(void *opaque)
2673 {
2674 RAMState **rsp = opaque;
2675 RAMBlock *block;
2676
2677 /* The caller holds the iothread lock or is in a bottom half, so there is
2678 * no write race against the migration bitmap
2679 */
2680 memory_global_dirty_log_stop();
2681
2682 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2683 g_free(block->bmap);
2684 block->bmap = NULL;
2685 g_free(block->unsentmap);
2686 block->unsentmap = NULL;
2687 }
2688
2689 xbzrle_cleanup();
2690 compress_threads_save_cleanup();
2691 ram_state_cleanup(rsp);
2692 }
2693
2694 static void ram_state_reset(RAMState *rs)
2695 {
2696 rs->last_seen_block = NULL;
2697 rs->last_sent_block = NULL;
2698 rs->last_page = 0;
2699 rs->last_version = ram_list.version;
2700 rs->ram_bulk_stage = true;
2701 rs->fpo_enabled = false;
2702 }
2703
2704 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2705
2706 /*
2707 * 'expected' is the value you expect the bitmap mostly to be full
2708 * of; it won't bother printing lines that are all this value.
2709 * 'todump' is the bitmap to dump; 'pages' is its length in pages.
2710 */
2711 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2712 unsigned long pages)
2713 {
2714 int64_t cur;
2715 int64_t linelen = 128;
2716 char linebuf[129];
2717
2718 for (cur = 0; cur < pages; cur += linelen) {
2719 int64_t curb;
2720 bool found = false;
2721 /*
2722 * Last line; catch the case where the line length
2723 * is longer than remaining ram
2724 */
2725 if (cur + linelen > pages) {
2726 linelen = pages - cur;
2727 }
2728 for (curb = 0; curb < linelen; curb++) {
2729 bool thisbit = test_bit(cur + curb, todump);
2730 linebuf[curb] = thisbit ? '1' : '.';
2731 found = found || (thisbit != expected);
2732 }
2733 if (found) {
2734 linebuf[curb] = '\0';
2735 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2736 }
2737 }
2738 }
2739
2740 /* **** functions for postcopy ***** */
2741
2742 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2743 {
2744 struct RAMBlock *block;
2745
2746 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2747 unsigned long *bitmap = block->bmap;
2748 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2749 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2750
2751 while (run_start < range) {
2752 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2753 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2754 (run_end - run_start) << TARGET_PAGE_BITS);
2755 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2756 }
2757 }
2758 }
2759
2760 /**
2761 * postcopy_send_discard_bm_ram: discard a RAMBlock
2762 *
2763 * Returns zero on success
2764 *
2765 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2766 * Note: At this point the 'unsentmap' is the processed bitmap combined
2767 * with the dirtymap; so a '1' means it's either dirty or unsent.
2768 *
2769 * @ms: current migration state
2770 * @pds: state for postcopy
2771 * @block: RAMBlock to discard
2773 */
2774 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2775 PostcopyDiscardState *pds,
2776 RAMBlock *block)
2777 {
2778 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2779 unsigned long current;
2780 unsigned long *unsentmap = block->unsentmap;
2781
2782 for (current = 0; current < end; ) {
2783 unsigned long one = find_next_bit(unsentmap, end, current);
2784
2785 if (one <= end) {
2786 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2787 unsigned long discard_length;
2788
2789 if (zero >= end) {
2790 discard_length = end - one;
2791 } else {
2792 discard_length = zero - one;
2793 }
2794 if (discard_length) {
2795 postcopy_discard_send_range(ms, pds, one, discard_length);
2796 }
2797 current = one + discard_length;
2798 } else {
2799 current = one;
2800 }
2801 }
2802
2803 return 0;
2804 }
2805
2806 /**
2807 * postcopy_each_ram_send_discard: discard all RAMBlocks
2808 *
2809 * Returns 0 for success or negative for error
2810 *
2811 * Utility for the outgoing postcopy code.
2812 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2813 * passing it bitmap indexes and name.
2814 * (qemu_ram_foreach_block ends up passing unscaled lengths
2815 * which would mean postcopy code would have to deal with target page)
2816 *
2817 * @ms: current migration state
2818 */
2819 static int postcopy_each_ram_send_discard(MigrationState *ms)
2820 {
2821 struct RAMBlock *block;
2822 int ret;
2823
2824 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2825 PostcopyDiscardState *pds =
2826 postcopy_discard_send_init(ms, block->idstr);
2827
2828 /*
2829 * Postcopy sends chunks of bitmap over the wire, but it
2830 * just needs indexes at this point, avoids it having
2831 * target page specific code.
2832 */
2833 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2834 postcopy_discard_send_finish(ms, pds);
2835 if (ret) {
2836 return ret;
2837 }
2838 }
2839
2840 return 0;
2841 }
2842
2843 /**
2844 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2845 *
2846 * Helper for postcopy_chunk_hostpages; it's called twice to
2847 * canonicalize the two bitmaps, that are similar, but one is
2848 * inverted.
2849 *
2850 * Postcopy requires that all target pages in a hostpage are dirty or
2851 * clean, not a mix. This function canonicalizes the bitmaps.
2852 *
2853 * @ms: current migration state
2854 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2855 * otherwise we need to canonicalize partially dirty host pages
2856 * @block: block that contains the page we want to canonicalize
2857 * @pds: state for postcopy
2858 */
2859 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2860 RAMBlock *block,
2861 PostcopyDiscardState *pds)
2862 {
2863 RAMState *rs = ram_state;
2864 unsigned long *bitmap = block->bmap;
2865 unsigned long *unsentmap = block->unsentmap;
2866 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2867 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2868 unsigned long run_start;
2869
2870 if (block->page_size == TARGET_PAGE_SIZE) {
2871 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2872 return;
2873 }
2874
2875 if (unsent_pass) {
2876 /* Find a sent page */
2877 run_start = find_next_zero_bit(unsentmap, pages, 0);
2878 } else {
2879 /* Find a dirty page */
2880 run_start = find_next_bit(bitmap, pages, 0);
2881 }
2882
2883 while (run_start < pages) {
2884 bool do_fixup = false;
2885 unsigned long fixup_start_addr;
2886 unsigned long host_offset;
2887
2888 /*
2889 * If the start of this run of pages is in the middle of a host
2890 * page, then we need to fixup this host page.
2891 */
2892 host_offset = run_start % host_ratio;
2893 if (host_offset) {
2894 do_fixup = true;
2895 run_start -= host_offset;
2896 fixup_start_addr = run_start;
2897 /* For the next pass */
2898 run_start = run_start + host_ratio;
2899 } else {
2900 /* Find the end of this run */
2901 unsigned long run_end;
2902 if (unsent_pass) {
2903 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2904 } else {
2905 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2906 }
2907 /*
2908 * If the end isn't at the start of a host page, then the
2909 * run doesn't finish at the end of a host page
2910 * and we need to discard.
2911 */
2912 host_offset = run_end % host_ratio;
2913 if (host_offset) {
2914 do_fixup = true;
2915 fixup_start_addr = run_end - host_offset;
2916 /*
2917 * This host page has gone, the next loop iteration starts
2918 * from after the fixup
2919 */
2920 run_start = fixup_start_addr + host_ratio;
2921 } else {
2922 /*
2923 * No discards on this iteration, next loop starts from
2924 * next sent/dirty page
2925 */
2926 run_start = run_end + 1;
2927 }
2928 }
2929
2930 if (do_fixup) {
2931 unsigned long page;
2932
2933 /* Tell the destination to discard this page */
2934 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2935 /* For the unsent_pass we:
2936 * discard partially sent pages
2937 * For the !unsent_pass (dirty) we:
2938 * discard partially dirty pages that were sent
2939 * (any partially sent pages were already discarded
2940 * by the previous unsent_pass)
2941 */
2942 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2943 host_ratio);
2944 }
2945
2946 /* Clean up the bitmap */
2947 for (page = fixup_start_addr;
2948 page < fixup_start_addr + host_ratio; page++) {
2949 /* All pages in this host page are now not sent */
2950 set_bit(page, unsentmap);
2951
2952 /*
2953 * Remark them as dirty, updating the count for any pages
2954 * that weren't previously dirty.
2955 */
2956 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2957 }
2958 }
2959
2960 if (unsent_pass) {
2961 /* Find the next sent page for the next iteration */
2962 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2963 } else {
2964 /* Find the next dirty page for the next iteration */
2965 run_start = find_next_bit(bitmap, pages, run_start);
2966 }
2967 }
2968 }
2969
2970 /**
2971 * postcopy_chunk_hostpages: discard any partially sent host page
2972 *
2973 * Utility for the outgoing postcopy code.
2974 *
2975 * Discard any partially sent host-page size chunks, mark any partially
2976 * dirty host-page size chunks as all dirty. In this case the host-page
2977 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2978 *
2979 * Returns zero on success
2980 *
2981 * @ms: current migration state
2982 * @block: block we want to work with
2983 */
2984 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2985 {
2986 PostcopyDiscardState *pds =
2987 postcopy_discard_send_init(ms, block->idstr);
2988
2989 /* First pass: Discard all partially sent host pages */
2990 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2991 /*
2992 * Second pass: Ensure that all partially dirty host pages are made
2993 * fully dirty.
2994 */
2995 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2996
2997 postcopy_discard_send_finish(ms, pds);
2998 return 0;
2999 }
3000
3001 /**
3002 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3003 *
3004 * Returns zero on success
3005 *
3006 * Transmit the set of pages to be discarded after precopy to the target;
3007 * these are pages that:
3008 * a) Have been previously transmitted but are now dirty again
3009 * b) Have never been transmitted; this ensures that
3010 * any pages on the destination that have been mapped by background
3011 * tasks get discarded (transparent huge pages are the specific concern)
3012 * Hopefully this is pretty sparse
3013 *
3014 * @ms: current migration state
3015 */
3016 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3017 {
3018 RAMState *rs = ram_state;
3019 RAMBlock *block;
3020 int ret;
3021
3022 rcu_read_lock();
3023
3024 /* This should be our last sync, the src is now paused */
3025 migration_bitmap_sync(rs);
3026
3027 /* Easiest way to make sure we don't resume in the middle of a host-page */
3028 rs->last_seen_block = NULL;
3029 rs->last_sent_block = NULL;
3030 rs->last_page = 0;
3031
3032 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3033 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3034 unsigned long *bitmap = block->bmap;
3035 unsigned long *unsentmap = block->unsentmap;
3036
3037 if (!unsentmap) {
3038 /* We don't have a safe way to resize the sentmap, so
3039 * if the bitmap was resized it will be NULL at this
3040 * point.
3041 */
3042 error_report("migration ram resized during precopy phase");
3043 rcu_read_unlock();
3044 return -EINVAL;
3045 }
3046 /* Deal with TPS != HPS and huge pages */
3047 ret = postcopy_chunk_hostpages(ms, block);
3048 if (ret) {
3049 rcu_read_unlock();
3050 return ret;
3051 }
3052
3053 /*
3054 * Update the unsentmap to be unsentmap = unsentmap | dirty
3055 */
3056 bitmap_or(unsentmap, unsentmap, bitmap, pages);
3057 #ifdef DEBUG_POSTCOPY
3058 ram_debug_dump_bitmap(unsentmap, true, pages);
3059 #endif
3060 }
3061 trace_ram_postcopy_send_discard_bitmap();
3062
3063 ret = postcopy_each_ram_send_discard(ms);
3064 rcu_read_unlock();
3065
3066 return ret;
3067 }
3068
3069 /**
3070 * ram_discard_range: discard dirtied pages at the beginning of postcopy
3071 *
3072 * Returns zero on success
3073 *
3074 * @rbname: name of the RAMBlock of the request. NULL means the
3075 * same as the last one.
3076 * @start: starting offset (in bytes) within the RAMBlock
3077 * @length: length (in bytes) to discard
3078 */
3079 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3080 {
3081 int ret = -1;
3082
3083 trace_ram_discard_range(rbname, start, length);
3084
3085 rcu_read_lock();
3086 RAMBlock *rb = qemu_ram_block_by_name(rbname);
3087
3088 if (!rb) {
3089 error_report("ram_discard_range: Failed to find block '%s'", rbname);
3090 goto err;
3091 }
3092
3093 /*
3094 * On source VM, we don't need to update the received bitmap since
3095 * we don't even have one.
3096 */
3097 if (rb->receivedmap) {
3098 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3099 length >> qemu_target_page_bits());
3100 }
3101
3102 ret = ram_block_discard_range(rb, start, length);
3103
3104 err:
3105 rcu_read_unlock();
3106
3107 return ret;
3108 }
3109
3110 /*
3111 * For every allocation, we will try not to crash the VM if the
3112 * allocation fails.
3113 */
3114 static int xbzrle_init(void)
3115 {
3116 Error *local_err = NULL;
3117
3118 if (!migrate_use_xbzrle()) {
3119 return 0;
3120 }
3121
3122 XBZRLE_cache_lock();
3123
3124 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3125 if (!XBZRLE.zero_target_page) {
3126 error_report("%s: Error allocating zero page", __func__);
3127 goto err_out;
3128 }
3129
3130 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3131 TARGET_PAGE_SIZE, &local_err);
3132 if (!XBZRLE.cache) {
3133 error_report_err(local_err);
3134 goto free_zero_page;
3135 }
3136
3137 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3138 if (!XBZRLE.encoded_buf) {
3139 error_report("%s: Error allocating encoded_buf", __func__);
3140 goto free_cache;
3141 }
3142
3143 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3144 if (!XBZRLE.current_buf) {
3145 error_report("%s: Error allocating current_buf", __func__);
3146 goto free_encoded_buf;
3147 }
3148
3149 /* We are all good */
3150 XBZRLE_cache_unlock();
3151 return 0;
3152
3153 free_encoded_buf:
3154 g_free(XBZRLE.encoded_buf);
3155 XBZRLE.encoded_buf = NULL;
3156 free_cache:
3157 cache_fini(XBZRLE.cache);
3158 XBZRLE.cache = NULL;
3159 free_zero_page:
3160 g_free(XBZRLE.zero_target_page);
3161 XBZRLE.zero_target_page = NULL;
3162 err_out:
3163 XBZRLE_cache_unlock();
3164 return -ENOMEM;
3165 }
3166
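/**
 * ram_state_init: allocate and initialize the global RAMState
 *
 * Returns 0 for success, -1 if the allocation failed
 *
 * @rsp: where to store the newly allocated RAMState
 */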
3167 static int ram_state_init(RAMState **rsp)
3168 {
3169 *rsp = g_try_new0(RAMState, 1);
3170
3171 if (!*rsp) {
3172 error_report("%s: Init ramstate fail", __func__);
3173 return -1;
3174 }
3175
3176 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3177 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3178 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3179
3180 /*
3181 * Count the total number of pages used by ram blocks not including any
3182 * gaps due to alignment or unplugs.
3183 */
3184 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3185
3186 ram_state_reset(*rsp);
3187
3188 return 0;
3189 }
3190
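/**
 * ram_list_init_bitmaps: allocate the per-RAMBlock dirty bitmaps
 *
 * Every page starts out as dirty; when postcopy is enabled an
 * 'unsentmap' is allocated as well, with every page marked unsent.
 * Does nothing if there is no RAM.
 */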
3191 static void ram_list_init_bitmaps(void)
3192 {
3193 RAMBlock *block;
3194 unsigned long pages;
3195
3196 /* Skip setting bitmap if there is no RAM */
3197 if (ram_bytes_total()) {
3198 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3199 pages = block->max_length >> TARGET_PAGE_BITS;
3200 block->bmap = bitmap_new(pages);
3201 bitmap_set(block->bmap, 0, pages);
3202 if (migrate_postcopy_ram()) {
3203 block->unsentmap = bitmap_new(pages);
3204 bitmap_set(block->unsentmap, 0, pages);
3205 }
3206 }
3207 }
3208 }
3209
3210 static void ram_init_bitmaps(RAMState *rs)
3211 {
3212 /* For memory_global_dirty_log_start below. */
3213 qemu_mutex_lock_iothread();
3214 qemu_mutex_lock_ramlist();
3215 rcu_read_lock();
3216
3217 ram_list_init_bitmaps();
3218 memory_global_dirty_log_start();
3219 migration_bitmap_sync_precopy(rs);
3220
3221 rcu_read_unlock();
3222 qemu_mutex_unlock_ramlist();
3223 qemu_mutex_unlock_iothread();
3224 }
3225
3226 static int ram_init_all(RAMState **rsp)
3227 {
3228 if (ram_state_init(rsp)) {
3229 return -1;
3230 }
3231
3232 if (xbzrle_init()) {
3233 ram_state_cleanup(rsp);
3234 return -1;
3235 }
3236
3237 ram_init_bitmaps(*rsp);
3238
3239 return 0;
3240 }
3241
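/**
 * ram_state_resume_prepare: recompute RAMState for a postcopy resume
 *
 * Recounts the dirty pages from the per-block bitmaps, resets the page
 * search state, disables the bulk stage and points the RAMState at the
 * new output file.
 *
 * @rs: current RAM state
 * @out: the QEMUFile that will be used for the resumed migration
 */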
3242 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3243 {
3244 RAMBlock *block;
3245 uint64_t pages = 0;
3246
3247 /*
3248 * Postcopy is not using xbzrle/compression, so no need for that.
3249 * Also, since the source is already halted, we don't need to care
3250 * about dirty page logging either.
3251 */
3252
3253 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3254 pages += bitmap_count_one(block->bmap,
3255 block->used_length >> TARGET_PAGE_BITS);
3256 }
3257
3258 /* This may not be aligned with current bitmaps. Recalculate. */
3259 rs->migration_dirty_pages = pages;
3260
3261 rs->last_seen_block = NULL;
3262 rs->last_sent_block = NULL;
3263 rs->last_page = 0;
3264 rs->last_version = ram_list.version;
3265 /*
3266 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3267 * matter what we have sent.
3268 */
3269 rs->ram_bulk_stage = false;
3270
3271 /* Update RAMState cache of output QEMUFile */
3272 rs->f = out;
3273
3274 trace_ram_state_resume_prepare(pages);
3275 }
3276
3277 /*
3278 * This function clears bits of the free pages reported by the caller from the
3279 * migration dirty bitmap. @addr is the host address corresponding to the
3280 * start of the continuous guest free pages, and @len is the total bytes of
3281 * those pages.
3282 */
3283 void qemu_guest_free_page_hint(void *addr, size_t len)
3284 {
3285 RAMBlock *block;
3286 ram_addr_t offset;
3287 size_t used_len, start, npages;
3288 MigrationState *s = migrate_get_current();
3289
3290 /* This function is currently expected to be used during live migration */
3291 if (!migration_is_setup_or_active(s->state)) {
3292 return;
3293 }
3294
3295 for (; len > 0; len -= used_len, addr += used_len) {
3296 block = qemu_ram_block_from_host(addr, false, &offset);
3297 if (unlikely(!block || offset >= block->used_length)) {
3298 /*
3299 * The implementation might not support RAMBlock resize during
3300 * live migration, but it could happen in theory with future
3301 * updates. So we add a check here to capture that case.
3302 */
3303 error_report_once("%s unexpected error", __func__);
3304 return;
3305 }
3306
3307 if (len <= block->used_length - offset) {
3308 used_len = len;
3309 } else {
3310 used_len = block->used_length - offset;
3311 }
3312
3313 start = offset >> TARGET_PAGE_BITS;
3314 npages = used_len >> TARGET_PAGE_BITS;
3315
3316 qemu_mutex_lock(&ram_state->bitmap_mutex);
3317 ram_state->migration_dirty_pages -=
3318 bitmap_count_one_with_offset(block->bmap, start, npages);
3319 bitmap_clear(block->bmap, start, npages);
3320 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3321 }
3322 }
3323
3324 /*
3325 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3326 * a long-running RCU critical section. When RCU reclaims in the code
3327 * start to become numerous, it will be necessary to reduce the
3328 * granularity of these critical sections.
3329 */
3330
3331 /**
3332 * ram_save_setup: Setup RAM for migration
3333 *
3334 * Returns zero to indicate success and negative for error
3335 *
3336 * @f: QEMUFile where to send the data
3337 * @opaque: RAMState pointer
3338 */
3339 static int ram_save_setup(QEMUFile *f, void *opaque)
3340 {
3341 RAMState **rsp = opaque;
3342 RAMBlock *block;
3343
3344 if (compress_threads_save_setup()) {
3345 return -1;
3346 }
3347
3348 /* migration has already set up the bitmap, reuse it. */
3349 if (!migration_in_colo_state()) {
3350 if (ram_init_all(rsp) != 0) {
3351 compress_threads_save_cleanup();
3352 return -1;
3353 }
3354 }
3355 (*rsp)->f = f;
3356
3357 rcu_read_lock();
3358
3359 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3360
3361 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3362 qemu_put_byte(f, strlen(block->idstr));
3363 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3364 qemu_put_be64(f, block->used_length);
3365 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3366 qemu_put_be64(f, block->page_size);
3367 }
3368 if (migrate_ignore_shared()) {
3369 qemu_put_be64(f, block->mr->addr);
3370 qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3371 }
3372 }
3373
3374 rcu_read_unlock();
3375
3376 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3377 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3378
3379 multifd_send_sync_main();
3380 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3381 qemu_fflush(f);
3382
3383 return 0;
3384 }
3385
3386 /**
3387 * ram_save_iterate: iterative stage for migration
3388 *
3389 * Returns zero to indicate success and negative for error
3390 *
3391 * @f: QEMUFile where to send the data
3392 * @opaque: RAMState pointer
3393 */
3394 static int ram_save_iterate(QEMUFile *f, void *opaque)
3395 {
3396 RAMState **temp = opaque;
3397 RAMState *rs = *temp;
3398 int ret;
3399 int i;
3400 int64_t t0;
3401 int done = 0;
3402
3403 if (blk_mig_bulk_active()) {
3404 /* Avoid transferring ram during bulk phase of block migration as
3405 * the bulk phase will usually take a long time and transferring
3406 * ram updates during that time is pointless. */
3407 goto out;
3408 }
3409
3410 rcu_read_lock();
3411 if (ram_list.version != rs->last_version) {
3412 ram_state_reset(rs);
3413 }
3414
3415 /* Read version before ram_list.blocks */
3416 smp_rmb();
3417
3418 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3419
3420 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3421 i = 0;
3422 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3423 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3424 int pages;
3425
3426 if (qemu_file_get_error(f)) {
3427 break;
3428 }
3429
3430 pages = ram_find_and_save_block(rs, false);
3431 /* no more pages to send */
3432 if (pages == 0) {
3433 done = 1;
3434 break;
3435 }
3436
3437 if (pages < 0) {
3438 qemu_file_set_error(f, pages);
3439 break;
3440 }
3441
3442 rs->target_page_count += pages;
3443
3444 /* we want to check in the 1st loop, just in case it was the 1st time
3445 and we had to sync the dirty bitmap.
3446 qemu_clock_get_ns() is a bit expensive, so we only check every few
3447 iterations
3448 */
3449 if ((i & 63) == 0) {
3450 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3451 if (t1 > MAX_WAIT) {
3452 trace_ram_save_iterate_big_wait(t1, i);
3453 break;
3454 }
3455 }
3456 i++;
3457 }
3458 rcu_read_unlock();
3459
3460 /*
3461 * Must occur before EOS (or any QEMUFile operation)
3462 * because of RDMA protocol.
3463 */
3464 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3465
3466 multifd_send_sync_main();
3467 out:
3468 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3469 qemu_fflush(f);
3470 ram_counters.transferred += 8;
3471
3472 ret = qemu_file_get_error(f);
3473 if (ret < 0) {
3474 return ret;
3475 }
3476
3477 return done;
3478 }
3479
3480 /**
3481 * ram_save_complete: function called to send the remaining amount of ram
3482 *
3483 * Returns zero to indicate success or negative on error
3484 *
3485 * Called with iothread lock
3486 *
3487 * @f: QEMUFile where to send the data
3488 * @opaque: RAMState pointer
3489 */
3490 static int ram_save_complete(QEMUFile *f, void *opaque)
3491 {
3492 RAMState **temp = opaque;
3493 RAMState *rs = *temp;
3494 int ret = 0;
3495
3496 rcu_read_lock();
3497
3498 if (!migration_in_postcopy()) {
3499 migration_bitmap_sync_precopy(rs);
3500 }
3501
3502 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3503
3504 /* try transferring iterative blocks of memory */
3505
3506 /* flush all remaining blocks regardless of rate limiting */
3507 while (true) {
3508 int pages;
3509
3510 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3511 /* no more blocks to send */
3512 if (pages == 0) {
3513 break;
3514 }
3515 if (pages < 0) {
3516 ret = pages;
3517 break;
3518 }
3519 }
3520
3521 flush_compressed_data(rs);
3522 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3523
3524 rcu_read_unlock();
3525
3526 multifd_send_sync_main();
3527 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3528 qemu_fflush(f);
3529
3530 return ret;
3531 }
3532
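/**
 * ram_save_pending: estimate how much data is still to be sent
 *
 * When not already in postcopy and the remaining size falls below
 * @max_size, the dirty bitmap is re-synced to get an up-to-date
 * estimate. The result is reported as postcopiable when postcopy-ram
 * is enabled, precopy-only otherwise.
 *
 * @f: QEMUFile where to send the data (not used here)
 * @opaque: RAMState pointer
 * @max_size: threshold below which the bitmap is re-synced
 * @res_precopy_only: data that must be sent before postcopy starts
 * @res_compatible: data that can be sent either before or during postcopy
 * @res_postcopy_only: data that can only be sent during postcopy
 */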
3533 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3534 uint64_t *res_precopy_only,
3535 uint64_t *res_compatible,
3536 uint64_t *res_postcopy_only)
3537 {
3538 RAMState **temp = opaque;
3539 RAMState *rs = *temp;
3540 uint64_t remaining_size;
3541
3542 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3543
3544 if (!migration_in_postcopy() &&
3545 remaining_size < max_size) {
3546 qemu_mutex_lock_iothread();
3547 rcu_read_lock();
3548 migration_bitmap_sync_precopy(rs);
3549 rcu_read_unlock();
3550 qemu_mutex_unlock_iothread();
3551 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3552 }
3553
3554 if (migrate_postcopy_ram()) {
3555 /* We can do postcopy, and all the data is postcopiable */
3556 *res_compatible += remaining_size;
3557 } else {
3558 *res_precopy_only += remaining_size;
3559 }
3560 }
3561
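/**
 * load_xbzrle: decode one XBZRLE-encoded page from the stream
 *
 * Returns 0 for success or -1 on error (wrong encoding flag, oversized
 * length, or decode failure)
 *
 * @f: QEMUFile where to read the data from
 * @addr: address of the page being loaded (not used here)
 * @host: host address of the destination page
 */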
3562 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3563 {
3564 unsigned int xh_len;
3565 int xh_flags;
3566 uint8_t *loaded_data;
3567
3568 /* extract RLE header */
3569 xh_flags = qemu_get_byte(f);
3570 xh_len = qemu_get_be16(f);
3571
3572 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3573 error_report("Failed to load XBZRLE page - wrong compression!");
3574 return -1;
3575 }
3576
3577 if (xh_len > TARGET_PAGE_SIZE) {
3578 error_report("Failed to load XBZRLE page - len overflow!");
3579 return -1;
3580 }
3581 loaded_data = XBZRLE.decoded_buf;
3582 /* load data and decode */
3583 /* it can change loaded_data to point to an internal buffer */
3584 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3585
3586 /* decode RLE */
3587 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3588 TARGET_PAGE_SIZE) == -1) {
3589 error_report("Failed to load XBZRLE page - decode error!");
3590 return -1;
3591 }
3592
3593 return 0;
3594 }
3595
3596 /**
3597 * ram_block_from_stream: read a RAMBlock id from the migration stream
3598 *
3599 * Must be called from within a rcu critical section.
3600 *
3601 * Returns a pointer from within the RCU-protected ram_list.
3602 *
3603 * @f: QEMUFile where to read the data from
3604 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3605 */
3606 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3607 {
3608 static RAMBlock *block = NULL;
3609 char id[256];
3610 uint8_t len;
3611
3612 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3613 if (!block) {
3614 error_report("Ack, bad migration stream!");
3615 return NULL;
3616 }
3617 return block;
3618 }
3619
3620 len = qemu_get_byte(f);
3621 qemu_get_buffer(f, (uint8_t *)id, len);
3622 id[len] = 0;
3623
3624 block = qemu_ram_block_by_name(id);
3625 if (!block) {
3626 error_report("Can't find block %s", id);
3627 return NULL;
3628 }
3629
3630 if (ramblock_is_ignored(block)) {
3631 error_report("block %s should not be migrated !", id);
3632 return NULL;
3633 }
3634
3635 return block;
3636 }
3637
3638 static inline void *host_from_ram_block_offset(RAMBlock *block,
3639 ram_addr_t offset)
3640 {
3641 if (!offset_in_ramblock(block, offset)) {
3642 return NULL;
3643 }
3644
3645 return block->host + offset;
3646 }
3647
3648 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3649 ram_addr_t offset)
3650 {
3651 if (!offset_in_ramblock(block, offset)) {
3652 return NULL;
3653 }
3654 if (!block->colo_cache) {
3655 error_report("%s: colo_cache is NULL in block :%s",
3656 __func__, block->idstr);
3657 return NULL;
3658 }
3659
3660 /*
3661 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3662 * It helps us decide which pages in the RAM cache should be flushed
3663 * into the VM's RAM later.
3664 */
3665 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3666 ram_state->migration_dirty_pages++;
3667 }
3668 return block->colo_cache + offset;
3669 }
3670
3671 /**
3672 * ram_handle_compressed: handle the zero page case
3673 *
3674 * If a page (or a whole RDMA chunk) has been
3675 * determined to be zero, then zap it.
3676 *
3677 * @host: host address for the zero page
3678 * @ch: what the page is filled from. We only support zero
3679 * @size: size of the zero page
3680 */
3681 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3682 {
3683 if (ch != 0 || !is_zero_range(host, size)) {
3684 memset(host, ch, size);
3685 }
3686 }
3687
3688 /* return the size after decompression, or negative value on error */
3689 static int
3690 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3691 const uint8_t *source, size_t source_len)
3692 {
3693 int err;
3694
3695 err = inflateReset(stream);
3696 if (err != Z_OK) {
3697 return -1;
3698 }
3699
3700 stream->avail_in = source_len;
3701 stream->next_in = (uint8_t *)source;
3702 stream->avail_out = dest_len;
3703 stream->next_out = dest;
3704
3705 err = inflate(stream, Z_NO_FLUSH);
3706 if (err != Z_STREAM_END) {
3707 return -1;
3708 }
3709
3710 return stream->total_out;
3711 }
3712
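/**
 * do_data_decompress: body of the decompression threads
 *
 * Waits for work posted by decompress_data_with_multi_threads(),
 * decompresses the buffer into the destination page and signals
 * completion on decomp_done_cond.
 *
 * @opaque: DecompressParam for this thread
 */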
3713 static void *do_data_decompress(void *opaque)
3714 {
3715 DecompressParam *param = opaque;
3716 unsigned long pagesize;
3717 uint8_t *des;
3718 int len, ret;
3719
3720 qemu_mutex_lock(&param->mutex);
3721 while (!param->quit) {
3722 if (param->des) {
3723 des = param->des;
3724 len = param->len;
3725 param->des = 0;
3726 qemu_mutex_unlock(&param->mutex);
3727
3728 pagesize = TARGET_PAGE_SIZE;
3729
3730 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3731 param->compbuf, len);
3732 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3733 error_report("decompress data failed");
3734 qemu_file_set_error(decomp_file, ret);
3735 }
3736
3737 qemu_mutex_lock(&decomp_done_lock);
3738 param->done = true;
3739 qemu_cond_signal(&decomp_done_cond);
3740 qemu_mutex_unlock(&decomp_done_lock);
3741
3742 qemu_mutex_lock(&param->mutex);
3743 } else {
3744 qemu_cond_wait(&param->cond, &param->mutex);
3745 }
3746 }
3747 qemu_mutex_unlock(&param->mutex);
3748
3749 return NULL;
3750 }
3751
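/**
 * wait_for_decompress_done: wait until every decompression thread is idle
 *
 * Returns the error state of the file the decompressed data came from,
 * or 0 when compression is not in use.
 */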
3752 static int wait_for_decompress_done(void)
3753 {
3754 int idx, thread_count;
3755
3756 if (!migrate_use_compression()) {
3757 return 0;
3758 }
3759
3760 thread_count = migrate_decompress_threads();
3761 qemu_mutex_lock(&decomp_done_lock);
3762 for (idx = 0; idx < thread_count; idx++) {
3763 while (!decomp_param[idx].done) {
3764 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3765 }
3766 }
3767 qemu_mutex_unlock(&decomp_done_lock);
3768 return qemu_file_get_error(decomp_file);
3769 }
3770
3771 static void compress_threads_load_cleanup(void)
3772 {
3773 int i, thread_count;
3774
3775 if (!migrate_use_compression()) {
3776 return;
3777 }
3778 thread_count = migrate_decompress_threads();
3779 for (i = 0; i < thread_count; i++) {
3780 /*
3781 * we use it as an indicator of whether the thread is
3782 * properly initialized or not
3783 */
3784 if (!decomp_param[i].compbuf) {
3785 break;
3786 }
3787
3788 qemu_mutex_lock(&decomp_param[i].mutex);
3789 decomp_param[i].quit = true;
3790 qemu_cond_signal(&decomp_param[i].cond);
3791 qemu_mutex_unlock(&decomp_param[i].mutex);
3792 }
3793 for (i = 0; i < thread_count; i++) {
3794 if (!decomp_param[i].compbuf) {
3795 break;
3796 }
3797
3798 qemu_thread_join(decompress_threads + i);
3799 qemu_mutex_destroy(&decomp_param[i].mutex);
3800 qemu_cond_destroy(&decomp_param[i].cond);
3801 inflateEnd(&decomp_param[i].stream);
3802 g_free(decomp_param[i].compbuf);
3803 decomp_param[i].compbuf = NULL;
3804 }
3805 g_free(decompress_threads);
3806 g_free(decomp_param);
3807 decompress_threads = NULL;
3808 decomp_param = NULL;
3809 decomp_file = NULL;
3810 }
3811
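/**
 * compress_threads_load_setup: start the decompression threads
 *
 * Returns 0 for success, -1 if a zlib stream could not be initialized
 *
 * @f: QEMUFile that decompression errors will be reported against
 */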
3812 static int compress_threads_load_setup(QEMUFile *f)
3813 {
3814 int i, thread_count;
3815
3816 if (!migrate_use_compression()) {
3817 return 0;
3818 }
3819
3820 thread_count = migrate_decompress_threads();
3821 decompress_threads = g_new0(QemuThread, thread_count);
3822 decomp_param = g_new0(DecompressParam, thread_count);
3823 qemu_mutex_init(&decomp_done_lock);
3824 qemu_cond_init(&decomp_done_cond);
3825 decomp_file = f;
3826 for (i = 0; i < thread_count; i++) {
3827 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3828 goto exit;
3829 }
3830
3831 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3832 qemu_mutex_init(&decomp_param[i].mutex);
3833 qemu_cond_init(&decomp_param[i].cond);
3834 decomp_param[i].done = true;
3835 decomp_param[i].quit = false;
3836 qemu_thread_create(decompress_threads + i, "decompress",
3837 do_data_decompress, decomp_param + i,
3838 QEMU_THREAD_JOINABLE);
3839 }
3840 return 0;
3841 exit:
3842 compress_threads_load_cleanup();
3843 return -1;
3844 }
3845
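/**
 * decompress_data_with_multi_threads: feed a compressed page to an idle
 * decompression thread
 *
 * Blocks until a thread becomes available if they are all busy.
 *
 * @f: QEMUFile where to read the compressed data from
 * @host: host address the page will be decompressed into
 * @len: length of the compressed data
 */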
3846 static void decompress_data_with_multi_threads(QEMUFile *f,
3847 void *host, int len)
3848 {
3849 int idx, thread_count;
3850
3851 thread_count = migrate_decompress_threads();
3852 qemu_mutex_lock(&decomp_done_lock);
3853 while (true) {
3854 for (idx = 0; idx < thread_count; idx++) {
3855 if (decomp_param[idx].done) {
3856 decomp_param[idx].done = false;
3857 qemu_mutex_lock(&decomp_param[idx].mutex);
3858 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3859 decomp_param[idx].des = host;
3860 decomp_param[idx].len = len;
3861 qemu_cond_signal(&decomp_param[idx].cond);
3862 qemu_mutex_unlock(&decomp_param[idx].mutex);
3863 break;
3864 }
3865 }
3866 if (idx < thread_count) {
3867 break;
3868 } else {
3869 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3870 }
3871 }
3872 qemu_mutex_unlock(&decomp_done_lock);
3873 }
3874
3875 /*
3876 * colo cache: this is for the secondary VM, where we cache the whole
3877 * memory of the secondary VM. The global lock needs to be held
3878 * to call this helper.
3879 */
3880 int colo_init_ram_cache(void)
3881 {
3882 RAMBlock *block;
3883
3884 rcu_read_lock();
3885 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3886 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3887 NULL,
3888 false);
3889 if (!block->colo_cache) {
3890 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3891 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3892 block->used_length);
3893 goto out_locked;
3894 }
3895 memcpy(block->colo_cache, block->host, block->used_length);
3896 }
3897 rcu_read_unlock();
3898 /*
3899 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3900 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3901 * we use the same name 'ram_bitmap' as for migration.
3902 */
3903 if (ram_bytes_total()) {
3904 RAMBlock *block;
3905
3906 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3907 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3908
3909 block->bmap = bitmap_new(pages);
3910 bitmap_set(block->bmap, 0, pages);
3911 }
3912 }
3913 ram_state = g_new0(RAMState, 1);
3914 ram_state->migration_dirty_pages = 0;
3915 qemu_mutex_init(&ram_state->bitmap_mutex);
3916 memory_global_dirty_log_start();
3917
3918 return 0;
3919
3920 out_locked:
3921
3922 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3923 if (block->colo_cache) {
3924 qemu_anon_ram_free(block->colo_cache, block->used_length);
3925 block->colo_cache = NULL;
3926 }
3927 }
3928
3929 rcu_read_unlock();
3930 return -errno;
3931 }
3932
3933 /* The global lock must be held when calling this helper */
3934 void colo_release_ram_cache(void)
3935 {
3936 RAMBlock *block;
3937
3938 memory_global_dirty_log_stop();
3939 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3940 g_free(block->bmap);
3941 block->bmap = NULL;
3942 }
3943
3944 rcu_read_lock();
3945
3946 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3947 if (block->colo_cache) {
3948 qemu_anon_ram_free(block->colo_cache, block->used_length);
3949 block->colo_cache = NULL;
3950 }
3951 }
3952
3953 rcu_read_unlock();
3954 qemu_mutex_destroy(&ram_state->bitmap_mutex);
3955 g_free(ram_state);
3956 ram_state = NULL;
3957 }
3958
3959 /**
3960 * ram_load_setup: set up RAM for the incoming side of migration
3961 *
3962 * Returns zero to indicate success and negative for error
3963 *
3964 * @f: QEMUFile where to receive the data
3965 * @opaque: RAMState pointer
3966 */
3967 static int ram_load_setup(QEMUFile *f, void *opaque)
3968 {
3969 if (compress_threads_load_setup(f)) {
3970 return -1;
3971 }
3972
3973 xbzrle_load_setup();
3974 ramblock_recv_map_init();
3975
3976 return 0;
3977 }
3978
3979 static int ram_load_cleanup(void *opaque)
3980 {
3981 RAMBlock *rb;
3982
3983 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3984 if (ramblock_is_pmem(rb)) {
3985 pmem_persist(rb->host, rb->used_length);
3986 }
3987 }
3988
3989 xbzrle_load_cleanup();
3990 compress_threads_load_cleanup();
3991
3992 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3993 g_free(rb->receivedmap);
3994 rb->receivedmap = NULL;
3995 }
3996
3997 return 0;
3998 }
3999
4000 /**
4001 * ram_postcopy_incoming_init: allocate postcopy data structures
4002 *
4003 * Returns 0 on success and negative on error
4004 *
4005 * @mis: current migration incoming state
4006 *
4007 * Allocate the data structures etc. needed by incoming migration with
4008 * postcopy-ram. postcopy-ram's similarly named postcopy_ram_incoming_init()
4009 * does the work.
4010 */
4011 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4012 {
4013 return postcopy_ram_incoming_init(mis);
4014 }
4015
4016 /**
4017 * ram_load_postcopy: load a page in postcopy case
4018 *
4019 * Returns 0 for success or -errno in case of error
4020 *
4021 * Called in postcopy mode by ram_load().
4022 * rcu_read_lock is taken prior to this being called.
4023 *
4024 * @f: QEMUFile to receive the data from
4025 */
4026 static int ram_load_postcopy(QEMUFile *f)
4027 {
4028 int flags = 0, ret = 0;
4029 bool place_needed = false;
4030 bool matches_target_page_size = false;
4031 MigrationIncomingState *mis = migration_incoming_get_current();
4032 /* Temporary page that is later 'placed' */
4033 void *postcopy_host_page = postcopy_get_tmp_page(mis);
4034 void *last_host = NULL;
4035 bool all_zero = false;
4036
4037 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4038 ram_addr_t addr;
4039 void *host = NULL;
4040 void *page_buffer = NULL;
4041 void *place_source = NULL;
4042 RAMBlock *block = NULL;
4043 uint8_t ch;
4044
4045 addr = qemu_get_be64(f);
4046
4047 /*
4048 * If there is a QEMUFile error, we should stop here; "addr"
4049 * may be invalid
4050 */
4051 ret = qemu_file_get_error(f);
4052 if (ret) {
4053 break;
4054 }
4055
4056 flags = addr & ~TARGET_PAGE_MASK;
4057 addr &= TARGET_PAGE_MASK;
4058
4059 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4060 place_needed = false;
4061 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4062 block = ram_block_from_stream(f, flags);
4063
4064 host = host_from_ram_block_offset(block, addr);
4065 if (!host) {
4066 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4067 ret = -EINVAL;
4068 break;
4069 }
4070 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4071 /*
4072 * Postcopy requires that we place whole host pages atomically;
4073 * these may be huge pages for RAMBlocks that are backed by
4074 * hugetlbfs.
4075 * To make it atomic, the data is read into a temporary page
4076 * that's moved into place later.
4077 * The migration protocol uses, possibly smaller, target pages;
4078 * however, the source ensures it always sends all the components
4079 * of a host page in order.
4080 */
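/*
 * Worked example: on a target with 4 KiB pages, a RAMBlock backed by
 * 2 MiB hugepages holds 512 target pages per host page.  The first 511
 * are only accumulated in postcopy_host_page; place_needed becomes true
 * for the 512th, at which point the whole 2 MiB host page is placed
 * atomically below.
 */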
4081 page_buffer = postcopy_host_page +
4082 ((uintptr_t)host & (block->page_size - 1));
4083 /* If all target pages of the host page are zero we can optimise the place; reset all_zero at the first target page */
4084 if (!((uintptr_t)host & (block->page_size - 1))) {
4085 all_zero = true;
4086 } else {
4087 /* not the first target page within the host page */
4088 if (host != (last_host + TARGET_PAGE_SIZE)) {
4089 error_report("Non-sequential target page %p/%p",
4090 host, last_host);
4091 ret = -EINVAL;
4092 break;
4093 }
4094 }
4095
4096
4097 /*
4098 * If it's the last part of a host page then we place the host
4099 * page
4100 */
4101 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4102 (block->page_size - 1)) == 0;
4103 place_source = postcopy_host_page;
4104 }
4105 last_host = host;
4106
4107 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4108 case RAM_SAVE_FLAG_ZERO:
4109 ch = qemu_get_byte(f);
4110 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4111 if (ch) {
4112 all_zero = false;
4113 }
4114 break;
4115
4116 case RAM_SAVE_FLAG_PAGE:
4117 all_zero = false;
4118 if (!matches_target_page_size) {
4119 /* For huge pages, we always use the temporary buffer */
4120 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4121 } else {
4122 /*
4123 * For small pages that match the target page size, we
4124 * avoid the qemu_file copy. Instead we directly use
4125 * the buffer of QEMUFile to place the page. Note: we
4126 * cannot do any QEMUFile operation before using that
4127 * buffer to make sure the buffer is valid when
4128 * placing the page.
4129 */
4130 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4131 TARGET_PAGE_SIZE);
4132 }
4133 break;
4134 case RAM_SAVE_FLAG_EOS:
4135 /* normal exit */
4136 multifd_recv_sync_main();
4137 break;
4138 default:
4139 error_report("Unknown combination of migration flags: %#x"
4140 " (postcopy mode)", flags);
4141 ret = -EINVAL;
4142 break;
4143 }
4144
4145 /* Detect any possible file errors */
4146 if (!ret && qemu_file_get_error(f)) {
4147 ret = qemu_file_get_error(f);
4148 }
4149
4150 if (!ret && place_needed) {
4151 /* This gets called at the last target page in the host page */
4152 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4153
4154 if (all_zero) {
4155 ret = postcopy_place_page_zero(mis, place_dest,
4156 block);
4157 } else {
4158 ret = postcopy_place_page(mis, place_dest,
4159 place_source, block);
4160 }
4161 }
4162 }
4163
4164 return ret;
4165 }
4166
4167 static bool postcopy_is_advised(void)
4168 {
4169 PostcopyState ps = postcopy_state_get();
4170 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4171 }
4172
4173 static bool postcopy_is_running(void)
4174 {
4175 PostcopyState ps = postcopy_state_get();
4176 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4177 }
4178
4179 /*
4180 * Flush content of RAM cache into SVM's memory.
4181 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4182 */
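/*
 * The dirty log is first synced into each block's bmap; the walk below
 * then copies every dirty target page from colo_cache into the SVM's RAM
 * and clears its bit, advancing to the next block once
 * migration_bitmap_find_dirty() runs past the block's used_length.
 */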
4183 static void colo_flush_ram_cache(void)
4184 {
4185 RAMBlock *block = NULL;
4186 void *dst_host;
4187 void *src_host;
4188 unsigned long offset = 0;
4189
4190 memory_global_dirty_log_sync();
4191 rcu_read_lock();
4192 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4193 migration_bitmap_sync_range(ram_state, block, block->used_length);
4194 }
4195 rcu_read_unlock();
4196
4197 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4198 rcu_read_lock();
4199 block = QLIST_FIRST_RCU(&ram_list.blocks);
4200
4201 while (block) {
4202 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4203
4204 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4205 offset = 0;
4206 block = QLIST_NEXT_RCU(block, next);
4207 } else {
4208 migration_bitmap_clear_dirty(ram_state, block, offset);
4209 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4210 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4211 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4212 }
4213 }
4214
4215 rcu_read_unlock();
4216 trace_colo_flush_ram_cache_end();
4217 }
4218
4219 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4220 {
4221 int flags = 0, ret = 0, invalid_flags = 0;
4222 static uint64_t seq_iter;
4223 int len = 0;
4224 /*
4225 * If the system is running in postcopy mode, page inserts into host memory must
4226 * be atomic
4227 */
4228 bool postcopy_running = postcopy_is_running();
4229 /* ADVISE is earlier; it shows that the source has the postcopy capability enabled */
4230 bool postcopy_advised = postcopy_is_advised();
4231
4232 seq_iter++;
4233
4234 if (version_id != 4) {
4235 ret = -EINVAL;
4236 }
4237
4238 if (!migrate_use_compression()) {
4239 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4240 }
4241 /* This RCU critical section can be very long-running.
4242 * When RCU reclaims in the code become numerous,
4243 * it will be necessary to reduce the granularity of this
4244 * critical section.
4245 */
4246 rcu_read_lock();
4247
4248 if (postcopy_running) {
4249 ret = ram_load_postcopy(f);
4250 }
4251
4252 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4253 ram_addr_t addr, total_ram_bytes;
4254 void *host = NULL;
4255 uint8_t ch;
4256
4257 addr = qemu_get_be64(f);
4258 flags = addr & ~TARGET_PAGE_MASK;
4259 addr &= TARGET_PAGE_MASK;
4260
4261 if (flags & invalid_flags) {
4262 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4263 error_report("Received an unexpected compressed page");
4264 }
4265
4266 ret = -EINVAL;
4267 break;
4268 }
4269
4270 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4271 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4272 RAMBlock *block = ram_block_from_stream(f, flags);
4273
4274 /*
4275 * After going into COLO, we should load the page into colo_cache.
4276 */
4277 if (migration_incoming_in_colo_state()) {
4278 host = colo_cache_from_block_offset(block, addr);
4279 } else {
4280 host = host_from_ram_block_offset(block, addr);
4281 }
4282 if (!host) {
4283 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4284 ret = -EINVAL;
4285 break;
4286 }
4287
4288 if (!migration_incoming_in_colo_state()) {
4289 ramblock_recv_bitmap_set(block, host);
4290 }
4291
4292 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4293 }
4294
4295 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4296 case RAM_SAVE_FLAG_MEM_SIZE:
4297 /* Synchronize RAM block list */
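/*
 * Stream layout of this record: the flags/addr word carries the total
 * byte count, followed by one tuple per ramblock:
 *   1-byte id length, the id string, 8-byte used_length,
 *   then optionally an 8-byte remote page size (consumed here when
 *   postcopy was advised and block->page_size != qemu_host_page_size),
 *   and optionally an 8-byte GPA plus a 1-byte "ignored" flag (when
 *   ignore-shared is enabled).
 */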
4298 total_ram_bytes = addr;
4299 while (!ret && total_ram_bytes) {
4300 RAMBlock *block;
4301 char id[256];
4302 ram_addr_t length;
4303
4304 len = qemu_get_byte(f);
4305 qemu_get_buffer(f, (uint8_t *)id, len);
4306 id[len] = 0;
4307 length = qemu_get_be64(f);
4308
4309 block = qemu_ram_block_by_name(id);
4310 if (block && !qemu_ram_is_migratable(block)) {
4311 error_report("block %s should not be migrated!", id);
4312 ret = -EINVAL;
4313 } else if (block) {
4314 if (length != block->used_length) {
4315 Error *local_err = NULL;
4316
4317 ret = qemu_ram_resize(block, length,
4318 &local_err);
4319 if (local_err) {
4320 error_report_err(local_err);
4321 }
4322 }
4323 /* For postcopy we need to check hugepage sizes match */
4324 if (postcopy_advised &&
4325 block->page_size != qemu_host_page_size) {
4326 uint64_t remote_page_size = qemu_get_be64(f);
4327 if (remote_page_size != block->page_size) {
4328 error_report("Mismatched RAM page size %s "
4329 "(local) %zd != %" PRId64,
4330 id, block->page_size,
4331 remote_page_size);
4332 ret = -EINVAL;
4333 }
4334 }
4335 if (migrate_ignore_shared()) {
4336 hwaddr addr = qemu_get_be64(f);
4337 bool ignored = qemu_get_byte(f);
4338 if (ignored != ramblock_is_ignored(block)) {
4339 error_report("RAM block %s should %sbe migrated",
4340 id, ignored ? "" : "not ");
4341 ret = -EINVAL;
4342 }
4343 if (ramblock_is_ignored(block) &&
4344 block->mr->addr != addr) {
4345 error_report("Mismatched GPAs for block %s "
4346 "%" PRId64 " != %" PRId64,
4347 id, (uint64_t)addr,
4348 (uint64_t)block->mr->addr);
4349 ret = -EINVAL;
4350 }
4351 }
4352 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4353 block->idstr);
4354 } else {
4355 error_report("Unknown ramblock \"%s\", cannot "
4356 "accept migration", id);
4357 ret = -EINVAL;
4358 }
4359
4360 total_ram_bytes -= length;
4361 }
4362 break;
4363
4364 case RAM_SAVE_FLAG_ZERO:
4365 ch = qemu_get_byte(f);
4366 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4367 break;
4368
4369 case RAM_SAVE_FLAG_PAGE:
4370 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4371 break;
4372
4373 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4374 len = qemu_get_be32(f);
4375 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4376 error_report("Invalid compressed data length: %d", len);
4377 ret = -EINVAL;
4378 break;
4379 }
4380 decompress_data_with_multi_threads(f, host, len);
4381 break;
4382
4383 case RAM_SAVE_FLAG_XBZRLE:
4384 if (load_xbzrle(f, addr, host) < 0) {
4385 error_report("Failed to decompress XBZRLE page at "
4386 RAM_ADDR_FMT, addr);
4387 ret = -EINVAL;
4388 break;
4389 }
4390 break;
4391 case RAM_SAVE_FLAG_EOS:
4392 /* normal exit */
4393 multifd_recv_sync_main();
4394 break;
4395 default:
4396 if (flags & RAM_SAVE_FLAG_HOOK) {
4397 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4398 } else {
4399 error_report("Unknown combination of migration flags: %#x",
4400 flags);
4401 ret = -EINVAL;
4402 }
4403 }
4404 if (!ret) {
4405 ret = qemu_file_get_error(f);
4406 }
4407 }
4408
4409 ret |= wait_for_decompress_done();
4410 rcu_read_unlock();
4411 trace_ram_load_complete(ret, seq_iter);
4412
4413 if (!ret && migration_incoming_in_colo_state()) {
4414 colo_flush_ram_cache();
4415 }
4416 return ret;
4417 }
4418
4419 static bool ram_has_postcopy(void *opaque)
4420 {
4421 RAMBlock *rb;
4422 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4423 if (ramblock_is_pmem(rb)) {
4424 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4425 "is not supported now!", rb->idstr, rb->host);
4426 return false;
4427 }
4428 }
4429
4430 return migrate_postcopy_ram();
4431 }
4432
4433 /* Sync all the dirty bitmaps with the destination VM. */
4434 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4435 {
4436 RAMBlock *block;
4437 QEMUFile *file = s->to_dst_file;
4438 int ramblock_count = 0;
4439
4440 trace_ram_dirty_bitmap_sync_start();
4441
4442 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4443 qemu_savevm_send_recv_bitmap(file, block->idstr);
4444 trace_ram_dirty_bitmap_request(block->idstr);
4445 ramblock_count++;
4446 }
4447
4448 trace_ram_dirty_bitmap_sync_wait();
4449
4450 /* Wait until all the ramblocks' dirty bitmaps are synced */
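/*
 * Each bitmap arrives asynchronously on the return path:
 * ram_dirty_bitmap_reload() posts rp_sem once per ramblock (via
 * ram_dirty_bitmap_reload_notify()), so waiting ramblock_count times
 * here blocks until every requested bitmap has been received.
 */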
4451 while (ramblock_count--) {
4452 qemu_sem_wait(&s->rp_state.rp_sem);
4453 }
4454
4455 trace_ram_dirty_bitmap_sync_complete();
4456
4457 return 0;
4458 }
4459
4460 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4461 {
4462 qemu_sem_post(&s->rp_state.rp_sem);
4463 }
4464
4465 /*
4466 * Read the received bitmap and invert it to form the initial dirty bitmap.
4467 * This is only used when a postcopy migration has been paused and wants
4468 * to resume from a middle point.
4469 */
4470 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4471 {
4472 int ret = -EINVAL;
4473 QEMUFile *file = s->rp_state.from_dst_file;
4474 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4475 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4476 uint64_t size, end_mark;
4477
4478 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4479
4480 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4481 error_report("%s: incorrect state %s", __func__,
4482 MigrationStatus_str(s->state));
4483 return -EINVAL;
4484 }
4485
4486 /*
4487 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4488 * need the endianness conversion and the padding.
4489 */
4490 local_size = ROUND_UP(local_size, 8);
4491
4492 /* Add padding */
4493 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
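/*
 * Example: a block of 100 target pages needs DIV_ROUND_UP(100, 8) = 13
 * bytes of bitmap, rounded up to 16 so the stream stays 64-bit aligned.
 * Allocating nbits + BITS_PER_LONG bits locally guarantees the padded
 * read below cannot overrun le_bitmap.
 */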
4494
4495 size = qemu_get_be64(file);
4496
4497 /* The size of the bitmap should match our ramblock */
4498 if (size != local_size) {
4499 error_report("%s: ramblock '%s' bitmap size mismatch "
4500 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4501 block->idstr, size, local_size);
4502 ret = -EINVAL;
4503 goto out;
4504 }
4505
4506 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4507 end_mark = qemu_get_be64(file);
4508
4509 ret = qemu_file_get_error(file);
4510 if (ret || size != local_size) {
4511 error_report("%s: read bitmap failed for ramblock '%s': %d"
4512 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4513 __func__, block->idstr, ret, local_size, size);
4514 ret = -EIO;
4515 goto out;
4516 }
4517
4518 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4519 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4520 __func__, block->idstr, end_mark);
4521 ret = -EINVAL;
4522 goto out;
4523 }
4524
4525 /*
4526 * Endianness conversion. We are in postcopy (though paused).
4527 * The dirty bitmap won't change. We can directly modify it.
4528 */
4529 bitmap_from_le(block->bmap, le_bitmap, nbits);
4530
4531 /*
4532 * What we received is the "received bitmap". Invert it to form the
4533 * initial dirty bitmap for this ramblock.
4534 */
4535 bitmap_complement(block->bmap, block->bmap, nbits);
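/*
 * Example: with 4 pages and a received bitmap of 1011b (pages 0, 1 and 3
 * already received), the complement 0100b leaves only page 2 marked
 * dirty, so only page 2 is sent again after the resume.
 */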
4536
4537 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4538
4539 /*
4540 * We succeeded in syncing the bitmap for the current ramblock. If this is
4541 * the last one to sync, we need to notify the main send thread.
4542 */
4543 ram_dirty_bitmap_reload_notify(s);
4544
4545 ret = 0;
4546 out:
4547 g_free(le_bitmap);
4548 return ret;
4549 }
4550
4551 static int ram_resume_prepare(MigrationState *s, void *opaque)
4552 {
4553 RAMState *rs = *(RAMState **)opaque;
4554 int ret;
4555
4556 ret = ram_dirty_bitmap_sync_all(s, rs);
4557 if (ret) {
4558 return ret;
4559 }
4560
4561 ram_state_resume_prepare(rs, s->to_dst_file);
4562
4563 return 0;
4564 }
4565
4566 static SaveVMHandlers savevm_ram_handlers = {
4567 .save_setup = ram_save_setup,
4568 .save_live_iterate = ram_save_iterate,
4569 .save_live_complete_postcopy = ram_save_complete,
4570 .save_live_complete_precopy = ram_save_complete,
4571 .has_postcopy = ram_has_postcopy,
4572 .save_live_pending = ram_save_pending,
4573 .load_state = ram_load,
4574 .save_cleanup = ram_save_cleanup,
4575 .load_setup = ram_load_setup,
4576 .load_cleanup = ram_load_cleanup,
4577 .resume_prepare = ram_resume_prepare,
4578 };
4579
4580 void ram_mig_init(void)
4581 {
4582 qemu_mutex_init(&XBZRLE.lock);
4583 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4584 }