migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* XBZRLE statistics object, defined here with external linkage */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache, its lock and the scratch
 * buffers used around it.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 static bool ramblock_is_ignored(RAMBlock *block)
 163 {
 164     return !qemu_ram_is_migratable(block) ||
 165            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 166 }
 167
/*
 * Iterate over every RAMBlock for which ramblock_is_ignored() is false.
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Iterate over every RAMBlock for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/* RAMBLOCK_FOREACH is no longer defined past this point in the file */
#undef RAMBLOCK_FOREACH
 178
 179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 180 {
 181     RAMBlock *block;
 182     int ret = 0;
 183
 184     rcu_read_lock();
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     rcu_read_unlock();
 192     return ret;
 193 }
 194
 195 static void ramblock_recv_map_init(void)
 196 {
 197     RAMBlock *rb;
 198
 199     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 200         assert(!rb->receivedmap);
 201         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 202     }
 203 }
 204
 205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 206 {
 207     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 208                     rb->receivedmap);
 209 }
 210
 211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 212 {
 213     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 217 {
 218     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 219 }
 220
 221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 222                                     size_t nr)
 223 {
 224     bitmap_set_atomic(rb->receivedmap,
 225                       ramblock_recv_bitmap_offset(host_addr, rb),
 226                       nr);
 227 }
 228
 229 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 230
 231 /*
 232  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 233  *
 234  * Returns >0 if success with sent bytes, or <0 if error.
 235  */
 236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 237                                   const char *block_name)
 238 {
 239     RAMBlock *block = qemu_ram_block_by_name(block_name);
 240     unsigned long *le_bitmap, nbits;
 241     uint64_t size;
 242
 243     if (!block) {
 244         error_report("%s: invalid block name: %s", __func__, block_name);
 245         return -1;
 246     }
 247
 248     nbits = block->used_length >> TARGET_PAGE_BITS;
 249
 250     /*
 251      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 252      * machines we may need 4 more bytes for padding (see below
 253      * comment). So extend it a bit before hand.
 254      */
 255     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 256
 257     /*
 258      * Always use little endian when sending the bitmap. This is
 259      * required that when source and destination VMs are not using the
 260      * same endianess. (Note: big endian won't work.)
 261      */
 262     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 263
 264     /* Size of the bitmap, in bytes */
 265     size = DIV_ROUND_UP(nbits, 8);
 266
 267     /*
 268      * size is always aligned to 8 bytes for 64bit machines, but it
 269      * may not be true for 32bit machines. We need this padding to
 270      * make sure the migration can survive even between 32bit and
 271      * 64bit machines.
 272      */
 273     size = ROUND_UP(size, 8);
 274
 275     qemu_put_be64(file, size);
 276     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 277     /*
 278      * Mark as an end, in case the middle part is screwed up due to
 279      * some "misterious" reason.
 280      */
 281     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 282     qemu_fflush(file);
 283
 284     g_free(le_bitmap);
 285
 286     if (qemu_file_get_error(file)) {
 287         return qemu_file_get_error(file);
 288     }
 289
 290     return size + sizeof(size);
 291 }
 292
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* block the request refers to */
    RAMBlock *rb;
    /* start and length of the requested range */
    hwaddr    offset;
    hwaddr    len;

    /* linkage for a QSIMPLEQ of requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 304
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 356
/* File-wide RAM migration state; may be NULL, readers check before use */
static RAMState *ram_state;

/* List of notifiers that precopy_notify() walks */
static NotifierWithReturnList precopy_notifier_list;
 360
 361 void precopy_infrastructure_init(void)
 362 {
 363     notifier_with_return_list_init(&precopy_notifier_list);
 364 }
 365
 366 void precopy_add_notifier(NotifierWithReturn *n)
 367 {
 368     notifier_with_return_list_add(&precopy_notifier_list, n);
 369 }
 370
 371 void precopy_remove_notifier(NotifierWithReturn *n)
 372 {
 373     notifier_with_return_remove(n);
 374 }
 375
 376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 377 {
 378     PrecopyNotifyData pnd;
 379     pnd.reason = reason;
 380     pnd.errp = errp;
 381
 382     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 383 }
 384
 385 void precopy_enable_free_page_optimization(void)
 386 {
 387     if (!ram_state) {
 388         return;
 389     }
 390
 391     ram_state->fpo_enabled = true;
 392 }
 393
 394 uint64_t ram_bytes_remaining(void)
 395 {
 396     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 397                        0;
 398 }
 399
/* RAM migration counters, defined here with external linkage */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 412
/* Compression counters, defined here with external linkage */
CompressionStats compression_counters;

/* Per-worker state of one compression thread (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, when a page is finished */
    bool done;
    /* set to make the worker leave its loop */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* buffer file handed to do_compress_ram_page() */
    QEMUFile *file;
    /* mutex/cond guarding block and offset below */
    QemuMutex mutex;
    QemuCond cond;
    /* pending request: non-NULL block means there is a page to compress */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 430
/*
 * Per-worker state of one decompression thread.
 * NOTE(review): the worker itself is not in this part of the file; the
 * field roles are presumably parallel to CompressParam - confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data - confirm */
    void *des;
    /* presumably the compressed input and its length - confirm */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 442
/* One CompressParam and one QemuThread per compression worker */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration; called by do_data_compress() */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 462
 463 static void *do_data_compress(void *opaque)
 464 {
 465     CompressParam *param = opaque;
 466     RAMBlock *block;
 467     ram_addr_t offset;
 468     bool zero_page;
 469
 470     qemu_mutex_lock(&param->mutex);
 471     while (!param->quit) {
 472         if (param->block) {
 473             block = param->block;
 474             offset = param->offset;
 475             param->block = NULL;
 476             qemu_mutex_unlock(&param->mutex);
 477
 478             zero_page = do_compress_ram_page(param->file, &param->stream,
 479                                              block, offset, param->originbuf);
 480
 481             qemu_mutex_lock(&comp_done_lock);
 482             param->done = true;
 483             param->zero_page = zero_page;
 484             qemu_cond_signal(&comp_done_cond);
 485             qemu_mutex_unlock(&comp_done_lock);
 486
 487             qemu_mutex_lock(&param->mutex);
 488         } else {
 489             qemu_cond_wait(&param->cond, &param->mutex);
 490         }
 491     }
 492     qemu_mutex_unlock(&param->mutex);
 493
 494     return NULL;
 495 }
 496
 497 static void compress_threads_save_cleanup(void)
 498 {
 499     int i, thread_count;
 500
 501     if (!migrate_use_compression() || !comp_param) {
 502         return;
 503     }
 504
 505     thread_count = migrate_compress_threads();
 506     for (i = 0; i < thread_count; i++) {
 507         /*
 508          * we use it as a indicator which shows if the thread is
 509          * properly init'd or not
 510          */
 511         if (!comp_param[i].file) {
 512             break;
 513         }
 514
 515         qemu_mutex_lock(&comp_param[i].mutex);
 516         comp_param[i].quit = true;
 517         qemu_cond_signal(&comp_param[i].cond);
 518         qemu_mutex_unlock(&comp_param[i].mutex);
 519
 520         qemu_thread_join(compress_threads + i);
 521         qemu_mutex_destroy(&comp_param[i].mutex);
 522         qemu_cond_destroy(&comp_param[i].cond);
 523         deflateEnd(&comp_param[i].stream);
 524         g_free(comp_param[i].originbuf);
 525         qemu_fclose(comp_param[i].file);
 526         comp_param[i].file = NULL;
 527     }
 528     qemu_mutex_destroy(&comp_done_lock);
 529     qemu_cond_destroy(&comp_done_cond);
 530     g_free(compress_threads);
 531     g_free(comp_param);
 532     compress_threads = NULL;
 533     comp_param = NULL;
 534 }
 535
 536 static int compress_threads_save_setup(void)
 537 {
 538     int i, thread_count;
 539
 540     if (!migrate_use_compression()) {
 541         return 0;
 542     }
 543     thread_count = migrate_compress_threads();
 544     compress_threads = g_new0(QemuThread, thread_count);
 545     comp_param = g_new0(CompressParam, thread_count);
 546     qemu_cond_init(&comp_done_cond);
 547     qemu_mutex_init(&comp_done_lock);
 548     for (i = 0; i < thread_count; i++) {
 549         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 550         if (!comp_param[i].originbuf) {
 551             goto exit;
 552         }
 553
 554         if (deflateInit(&comp_param[i].stream,
 555                         migrate_compress_level()) != Z_OK) {
 556             g_free(comp_param[i].originbuf);
 557             goto exit;
 558         }
 559
 560         /* comp_param[i].file is just used as a dummy buffer to save data,
 561          * set its ops to empty.
 562          */
 563         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 564         comp_param[i].done = true;
 565         comp_param[i].quit = false;
 566         qemu_mutex_init(&comp_param[i].mutex);
 567         qemu_cond_init(&comp_param[i].cond);
 568         qemu_thread_create(compress_threads + i, "compress",
 569                            do_data_compress, comp_param + i,
 570                            QEMU_THREAD_JOINABLE);
 571     }
 572     return 0;
 573
 574 exit:
 575     compress_threads_save_cleanup();
 576     return -1;
 577 }
 578
/* Multiple fd's */

/* Magic and version carried by every multifd init message and packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Bit in the packet flags field */
#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 588
/*
 * First message exchanged on a multifd channel.  magic and version are
 * sent in big endian (see multifd_send_initial_packet()).
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 597
/*
 * Header of a multifd packet.  The integer fields are converted to big
 * endian by multifd_send_fill_packet() and back by
 * multifd_recv_unfill_packet().
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of entries of offset[] that are in use */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* idstr of the RAMBlock the offsets refer to */
    char ramblock[256];
    /* one offset per page */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 612
/* A batch of pages, all belonging to the same RAMBlock */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* block the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 626
/* State of one multifd send channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
}  MultiFDSendParams;
 665
/* State of one multifd receive channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 702
 703 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 704 {
 705     MultiFDInit_t msg;
 706     int ret;
 707
 708     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 709     msg.version = cpu_to_be32(MULTIFD_VERSION);
 710     msg.id = p->id;
 711     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 712
 713     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 714     if (ret != 0) {
 715         return -1;
 716     }
 717     return 0;
 718 }
 719
 720 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 721 {
 722     MultiFDInit_t msg;
 723     int ret;
 724
 725     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 726     if (ret != 0) {
 727         return -1;
 728     }
 729
 730     msg.magic = be32_to_cpu(msg.magic);
 731     msg.version = be32_to_cpu(msg.version);
 732
 733     if (msg.magic != MULTIFD_MAGIC) {
 734         error_setg(errp, "multifd: received packet magic %x "
 735                    "expected %x", msg.magic, MULTIFD_MAGIC);
 736         return -1;
 737     }
 738
 739     if (msg.version != MULTIFD_VERSION) {
 740         error_setg(errp, "multifd: received packet version %d "
 741                    "expected %d", msg.version, MULTIFD_VERSION);
 742         return -1;
 743     }
 744
 745     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 746         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 747         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 748
 749         error_setg(errp, "multifd: received uuid '%s' and expected "
 750                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 751         g_free(uuid);
 752         g_free(msg_uuid);
 753         return -1;
 754     }
 755
 756     if (msg.id > migrate_multifd_channels()) {
 757         error_setg(errp, "multifd: received channel version %d "
 758                    "expected %d", msg.version, MULTIFD_VERSION);
 759         return -1;
 760     }
 761
 762     return msg.id;
 763 }
 764
 765 static MultiFDPages_t *multifd_pages_init(size_t size)
 766 {
 767     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 768
 769     pages->allocated = size;
 770     pages->iov = g_new0(struct iovec, size);
 771     pages->offset = g_new0(ram_addr_t, size);
 772
 773     return pages;
 774 }
 775
 776 static void multifd_pages_clear(MultiFDPages_t *pages)
 777 {
 778     pages->used = 0;
 779     pages->allocated = 0;
 780     pages->packet_num = 0;
 781     pages->block = NULL;
 782     g_free(pages->iov);
 783     pages->iov = NULL;
 784     g_free(pages->offset);
 785     pages->offset = NULL;
 786     g_free(pages);
 787 }
 788
 789 static void multifd_send_fill_packet(MultiFDSendParams *p)
 790 {
 791     MultiFDPacket_t *packet = p->packet;
 792     uint32_t page_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 793     int i;
 794
 795     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 796     packet->version = cpu_to_be32(MULTIFD_VERSION);
 797     packet->flags = cpu_to_be32(p->flags);
 798     packet->pages_alloc = cpu_to_be32(page_max);
 799     packet->pages_used = cpu_to_be32(p->pages->used);
 800     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 801     packet->packet_num = cpu_to_be64(p->packet_num);
 802
 803     if (p->pages->block) {
 804         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 805     }
 806
 807     for (i = 0; i < p->pages->used; i++) {
 808         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 809     }
 810 }
 811
 812 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 813 {
 814     MultiFDPacket_t *packet = p->packet;
 815     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 816     RAMBlock *block;
 817     int i;
 818
 819     packet->magic = be32_to_cpu(packet->magic);
 820     if (packet->magic != MULTIFD_MAGIC) {
 821         error_setg(errp, "multifd: received packet "
 822                    "magic %x and expected magic %x",
 823                    packet->magic, MULTIFD_MAGIC);
 824         return -1;
 825     }
 826
 827     packet->version = be32_to_cpu(packet->version);
 828     if (packet->version != MULTIFD_VERSION) {
 829         error_setg(errp, "multifd: received packet "
 830                    "version %d and expected version %d",
 831                    packet->version, MULTIFD_VERSION);
 832         return -1;
 833     }
 834
 835     p->flags = be32_to_cpu(packet->flags);
 836
 837     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 838     /*
 839      * If we recevied a packet that is 100 times bigger than expected
 840      * just stop migration.  It is a magic number.
 841      */
 842     if (packet->pages_alloc > pages_max * 100) {
 843         error_setg(errp, "multifd: received packet "
 844                    "with size %d and expected a maximum size of %d",
 845                    packet->pages_alloc, pages_max * 100) ;
 846         return -1;
 847     }
 848     /*
 849      * We received a packet that is bigger than expected but inside
 850      * reasonable limits (see previous comment).  Just reallocate.
 851      */
 852     if (packet->pages_alloc > p->pages->allocated) {
 853         multifd_pages_clear(p->pages);
 854         p->pages = multifd_pages_init(packet->pages_alloc);
 855     }
 856
 857     p->pages->used = be32_to_cpu(packet->pages_used);
 858     if (p->pages->used > packet->pages_alloc) {
 859         error_setg(errp, "multifd: received packet "
 860                    "with %d pages and expected maximum pages are %d",
 861                    p->pages->used, packet->pages_alloc) ;
 862         return -1;
 863     }
 864
 865     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 866     p->packet_num = be64_to_cpu(packet->packet_num);
 867
 868     if (p->pages->used) {
 869         /* make sure that ramblock is 0 terminated */
 870         packet->ramblock[255] = 0;
 871         block = qemu_ram_block_by_name(packet->ramblock);
 872         if (!block) {
 873             error_setg(errp, "multifd: unknown ram block %s",
 874                        packet->ramblock);
 875             return -1;
 876         }
 877     }
 878
 879     for (i = 0; i < p->pages->used; i++) {
 880         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 881
 882         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 883             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 884                        " (max " RAM_ADDR_FMT ")",
 885                        offset, block->max_length);
 886             return -1;
 887         }
 888         p->pages->iov[i].iov_base = block->host + offset;
 889         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 890     }
 891
 892     return 0;
 893 }
 894
/* Global state of the multifd send side */
struct {
    /* per-channel parameters, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 906
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create one pages struct for each channel, plus a main one.  Each
 * time that we need to send a batch of pages we swap the main one in
 * multifd_send_state with the one of the channel that is going to
 * send it.  There are two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * must have finished with its own pages, otherwise pending_job can't
 * be false.
 */
 924
 925 static int multifd_send_pages(void)
 926 {
 927     int i;
 928     static int next_channel;
 929     MultiFDSendParams *p = NULL; /* make happy gcc */
 930     MultiFDPages_t *pages = multifd_send_state->pages;
 931     uint64_t transferred;
 932
 933     qemu_sem_wait(&multifd_send_state->channels_ready);
 934     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 935         p = &multifd_send_state->params[i];
 936
 937         qemu_mutex_lock(&p->mutex);
 938         if (p->quit) {
 939             error_report("%s: channel %d has already quit!", __func__, i);
 940             qemu_mutex_unlock(&p->mutex);
 941             return -1;
 942         }
 943         if (!p->pending_job) {
 944             p->pending_job++;
 945             next_channel = (i + 1) % migrate_multifd_channels();
 946             break;
 947         }
 948         qemu_mutex_unlock(&p->mutex);
 949     }
 950     p->pages->used = 0;
 951
 952     p->packet_num = multifd_send_state->packet_num++;
 953     p->pages->block = NULL;
 954     multifd_send_state->pages = p->pages;
 955     p->pages = pages;
 956     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 957     ram_counters.multifd_bytes += transferred;
 958     ram_counters.transferred += transferred;;
 959     qemu_mutex_unlock(&p->mutex);
 960     qemu_sem_post(&p->sem);
 961
 962     return 1;
 963 }
 964
 965 static int multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 966 {
 967     MultiFDPages_t *pages = multifd_send_state->pages;
 968
 969     if (!pages->block) {
 970         pages->block = block;
 971     }
 972
 973     if (pages->block == block) {
 974         pages->offset[pages->used] = offset;
 975         pages->iov[pages->used].iov_base = block->host + offset;
 976         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 977         pages->used++;
 978
 979         if (pages->used < pages->allocated) {
 980             return 1;
 981         }
 982     }
 983
 984     if (multifd_send_pages() < 0) {
 985         return -1;
 986     }
 987
 988     if (pages->block != block) {
 989         return  multifd_queue_page(block, offset);
 990     }
 991
 992     return 1;
 993 }
 994
 995 static void multifd_send_terminate_threads(Error *err)
 996 {
 997     int i;
 998
 999     if (err) {
1000         MigrationState *s = migrate_get_current();
1001         migrate_set_error(s, err);
1002         if (s->state == MIGRATION_STATUS_SETUP ||
1003             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1004             s->state == MIGRATION_STATUS_DEVICE ||
1005             s->state == MIGRATION_STATUS_ACTIVE) {
1006             migrate_set_state(&s->state, s->state,
1007                               MIGRATION_STATUS_FAILED);
1008         }
1009     }
1010
1011     for (i = 0; i < migrate_multifd_channels(); i++) {
1012         MultiFDSendParams *p = &multifd_send_state->params[i];
1013
1014         qemu_mutex_lock(&p->mutex);
1015         p->quit = true;
1016         qemu_sem_post(&p->sem);
1017         qemu_mutex_unlock(&p->mutex);
1018     }
1019 }
1020
1021 void multifd_save_cleanup(void)
1022 {
1023     int i;
1024
1025     if (!migrate_use_multifd()) {
1026         return;
1027     }
1028     multifd_send_terminate_threads(NULL);
1029     for (i = 0; i < migrate_multifd_channels(); i++) {
1030         MultiFDSendParams *p = &multifd_send_state->params[i];
1031
1032         if (p->running) {
1033             qemu_thread_join(&p->thread);
1034         }
1035         socket_send_channel_destroy(p->c);
1036         p->c = NULL;
1037         qemu_mutex_destroy(&p->mutex);
1038         qemu_sem_destroy(&p->sem);
1039         g_free(p->name);
1040         p->name = NULL;
1041         multifd_pages_clear(p->pages);
1042         p->pages = NULL;
1043         p->packet_len = 0;
1044         g_free(p->packet);
1045         p->packet = NULL;
1046     }
1047     qemu_sem_destroy(&multifd_send_state->channels_ready);
1048     qemu_sem_destroy(&multifd_send_state->sem_sync);
1049     g_free(multifd_send_state->params);
1050     multifd_send_state->params = NULL;
1051     multifd_pages_clear(multifd_send_state->pages);
1052     multifd_send_state->pages = NULL;
1053     g_free(multifd_send_state);
1054     multifd_send_state = NULL;
1055 }
1056
1057 static void multifd_send_sync_main(void)
1058 {
1059     int i;
1060
1061     if (!migrate_use_multifd()) {
1062         return;
1063     }
1064     if (multifd_send_state->pages->used) {
1065         if (multifd_send_pages() < 0) {
1066             error_report("%s: multifd_send_pages fail", __func__);
1067             return;
1068         }
1069     }
1070     for (i = 0; i < migrate_multifd_channels(); i++) {
1071         MultiFDSendParams *p = &multifd_send_state->params[i];
1072
1073         trace_multifd_send_sync_main_signal(p->id);
1074
1075         qemu_mutex_lock(&p->mutex);
1076
1077         if (p->quit) {
1078             error_report("%s: channel %d has already quit", __func__, i);
1079             qemu_mutex_unlock(&p->mutex);
1080             return;
1081         }
1082
1083         p->packet_num = multifd_send_state->packet_num++;
1084         p->flags |= MULTIFD_FLAG_SYNC;
1085         p->pending_job++;
1086         qemu_mutex_unlock(&p->mutex);
1087         qemu_sem_post(&p->sem);
1088     }
1089     for (i = 0; i < migrate_multifd_channels(); i++) {
1090         MultiFDSendParams *p = &multifd_send_state->params[i];
1091
1092         trace_multifd_send_sync_main_wait(p->id);
1093         qemu_sem_wait(&multifd_send_state->sem_sync);
1094     }
1095     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1096 }
1097
1098 static void *multifd_send_thread(void *opaque)
1099 {
1100     MultiFDSendParams *p = opaque;
1101     Error *local_err = NULL;
1102     int ret = 0;
1103     uint32_t flags = 0;
1104
1105     trace_multifd_send_thread_start(p->id);
1106     rcu_register_thread();
1107
1108     if (multifd_send_initial_packet(p, &local_err) < 0) {
1109         goto out;
1110     }
1111     /* initial packet */
1112     p->num_packets = 1;
1113
1114     while (true) {
1115         qemu_sem_wait(&p->sem);
1116         qemu_mutex_lock(&p->mutex);
1117
1118         if (p->pending_job) {
1119             uint32_t used = p->pages->used;
1120             uint64_t packet_num = p->packet_num;
1121             flags = p->flags;
1122
1123             p->next_packet_size = used * qemu_target_page_size();
1124             multifd_send_fill_packet(p);
1125             p->flags = 0;
1126             p->num_packets++;
1127             p->num_pages += used;
1128             p->pages->used = 0;
1129             qemu_mutex_unlock(&p->mutex);
1130
1131             trace_multifd_send(p->id, packet_num, used, flags,
1132                                p->next_packet_size);
1133
1134             ret = qio_channel_write_all(p->c, (void *)p->packet,
1135                                         p->packet_len, &local_err);
1136             if (ret != 0) {
1137                 break;
1138             }
1139
1140             if (used) {
1141                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1142                                              used, &local_err);
1143                 if (ret != 0) {
1144                     break;
1145                 }
1146             }
1147
1148             qemu_mutex_lock(&p->mutex);
1149             p->pending_job--;
1150             qemu_mutex_unlock(&p->mutex);
1151
1152             if (flags & MULTIFD_FLAG_SYNC) {
1153                 qemu_sem_post(&multifd_send_state->sem_sync);
1154             }
1155             qemu_sem_post(&multifd_send_state->channels_ready);
1156         } else if (p->quit) {
1157             qemu_mutex_unlock(&p->mutex);
1158             break;
1159         } else {
1160             qemu_mutex_unlock(&p->mutex);
1161             /* sometimes there are spurious wakeups */
1162         }
1163     }
1164
1165 out:
1166     if (local_err) {
1167         multifd_send_terminate_threads(local_err);
1168     }
1169
1170     /*
1171      * Error happen, I will exit, but I can't just leave, tell
1172      * who pay attention to me.
1173      */
1174     if (ret != 0) {
1175         if (flags & MULTIFD_FLAG_SYNC) {
1176             qemu_sem_post(&multifd_send_state->sem_sync);
1177         }
1178         qemu_sem_post(&multifd_send_state->channels_ready);
1179     }
1180
1181     qemu_mutex_lock(&p->mutex);
1182     p->running = false;
1183     qemu_mutex_unlock(&p->mutex);
1184
1185     rcu_unregister_thread();
1186     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1187
1188     return NULL;
1189 }
1190
1191 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1192 {
1193     MultiFDSendParams *p = opaque;
1194     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1195     Error *local_err = NULL;
1196
1197     if (qio_task_propagate_error(task, &local_err)) {
1198         migrate_set_error(migrate_get_current(), local_err);
1199         multifd_save_cleanup();
1200     } else {
1201         p->c = QIO_CHANNEL(sioc);
1202         qio_channel_set_delay(p->c, false);
1203         p->running = true;
1204         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1205                            QEMU_THREAD_JOINABLE);
1206     }
1207 }
1208
1209 int multifd_save_setup(void)
1210 {
1211     int thread_count;
1212     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1213     uint8_t i;
1214
1215     if (!migrate_use_multifd()) {
1216         return 0;
1217     }
1218     thread_count = migrate_multifd_channels();
1219     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1220     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1221     multifd_send_state->pages = multifd_pages_init(page_count);
1222     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1223     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1224
1225     for (i = 0; i < thread_count; i++) {
1226         MultiFDSendParams *p = &multifd_send_state->params[i];
1227
1228         qemu_mutex_init(&p->mutex);
1229         qemu_sem_init(&p->sem, 0);
1230         p->quit = false;
1231         p->pending_job = 0;
1232         p->id = i;
1233         p->pages = multifd_pages_init(page_count);
1234         p->packet_len = sizeof(MultiFDPacket_t)
1235                       + sizeof(ram_addr_t) * page_count;
1236         p->packet = g_malloc0(p->packet_len);
1237         p->name = g_strdup_printf("multifdsend_%d", i);
1238         socket_send_channel_create(multifd_new_send_channel_async, p);
1239     }
1240     return 0;
1241 }
1242
1243 struct {
1244     MultiFDRecvParams *params;
1245     /* number of created threads */
1246     int count;
1247     /* syncs main thread and channels */
1248     QemuSemaphore sem_sync;
1249     /* global number of generated multifd packets */
1250     uint64_t packet_num;
1251 } *multifd_recv_state;
1252
1253 static void multifd_recv_terminate_threads(Error *err)
1254 {
1255     int i;
1256
1257     if (err) {
1258         MigrationState *s = migrate_get_current();
1259         migrate_set_error(s, err);
1260         if (s->state == MIGRATION_STATUS_SETUP ||
1261             s->state == MIGRATION_STATUS_ACTIVE) {
1262             migrate_set_state(&s->state, s->state,
1263                               MIGRATION_STATUS_FAILED);
1264         }
1265     }
1266
1267     for (i = 0; i < migrate_multifd_channels(); i++) {
1268         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1269
1270         qemu_mutex_lock(&p->mutex);
1271         p->quit = true;
1272         /* We could arrive here for two reasons:
1273            - normal quit, i.e. everything went fine, just finished
1274            - error quit: We close the channels so the channel threads
1275              finish the qio_channel_read_all_eof() */
1276         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1277         qemu_mutex_unlock(&p->mutex);
1278     }
1279 }
1280
1281 int multifd_load_cleanup(Error **errp)
1282 {
1283     int i;
1284     int ret = 0;
1285
1286     if (!migrate_use_multifd()) {
1287         return 0;
1288     }
1289     multifd_recv_terminate_threads(NULL);
1290     for (i = 0; i < migrate_multifd_channels(); i++) {
1291         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1292
1293         if (p->running) {
1294             p->quit = true;
1295             /*
1296              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1297              * however try to wakeup it without harm in cleanup phase.
1298              */
1299             qemu_sem_post(&p->sem_sync);
1300             qemu_thread_join(&p->thread);
1301         }
1302         object_unref(OBJECT(p->c));
1303         p->c = NULL;
1304         qemu_mutex_destroy(&p->mutex);
1305         qemu_sem_destroy(&p->sem_sync);
1306         g_free(p->name);
1307         p->name = NULL;
1308         multifd_pages_clear(p->pages);
1309         p->pages = NULL;
1310         p->packet_len = 0;
1311         g_free(p->packet);
1312         p->packet = NULL;
1313     }
1314     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1315     g_free(multifd_recv_state->params);
1316     multifd_recv_state->params = NULL;
1317     g_free(multifd_recv_state);
1318     multifd_recv_state = NULL;
1319
1320     return ret;
1321 }
1322
1323 static void multifd_recv_sync_main(void)
1324 {
1325     int i;
1326
1327     if (!migrate_use_multifd()) {
1328         return;
1329     }
1330     for (i = 0; i < migrate_multifd_channels(); i++) {
1331         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1332
1333         trace_multifd_recv_sync_main_wait(p->id);
1334         qemu_sem_wait(&multifd_recv_state->sem_sync);
1335     }
1336     for (i = 0; i < migrate_multifd_channels(); i++) {
1337         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1338
1339         qemu_mutex_lock(&p->mutex);
1340         if (multifd_recv_state->packet_num < p->packet_num) {
1341             multifd_recv_state->packet_num = p->packet_num;
1342         }
1343         qemu_mutex_unlock(&p->mutex);
1344         trace_multifd_recv_sync_main_signal(p->id);
1345         qemu_sem_post(&p->sem_sync);
1346     }
1347     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1348 }
1349
1350 static void *multifd_recv_thread(void *opaque)
1351 {
1352     MultiFDRecvParams *p = opaque;
1353     Error *local_err = NULL;
1354     int ret;
1355
1356     trace_multifd_recv_thread_start(p->id);
1357     rcu_register_thread();
1358
1359     while (true) {
1360         uint32_t used;
1361         uint32_t flags;
1362
1363         if (p->quit) {
1364             break;
1365         }
1366
1367         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1368                                        p->packet_len, &local_err);
1369         if (ret == 0) {   /* EOF */
1370             break;
1371         }
1372         if (ret == -1) {   /* Error */
1373             break;
1374         }
1375
1376         qemu_mutex_lock(&p->mutex);
1377         ret = multifd_recv_unfill_packet(p, &local_err);
1378         if (ret) {
1379             qemu_mutex_unlock(&p->mutex);
1380             break;
1381         }
1382
1383         used = p->pages->used;
1384         flags = p->flags;
1385         trace_multifd_recv(p->id, p->packet_num, used, flags,
1386                            p->next_packet_size);
1387         p->num_packets++;
1388         p->num_pages += used;
1389         qemu_mutex_unlock(&p->mutex);
1390
1391         if (used) {
1392             ret = qio_channel_readv_all(p->c, p->pages->iov,
1393                                         used, &local_err);
1394             if (ret != 0) {
1395                 break;
1396             }
1397         }
1398
1399         if (flags & MULTIFD_FLAG_SYNC) {
1400             qemu_sem_post(&multifd_recv_state->sem_sync);
1401             qemu_sem_wait(&p->sem_sync);
1402         }
1403     }
1404
1405     if (local_err) {
1406         multifd_recv_terminate_threads(local_err);
1407     }
1408     qemu_mutex_lock(&p->mutex);
1409     p->running = false;
1410     qemu_mutex_unlock(&p->mutex);
1411
1412     rcu_unregister_thread();
1413     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1414
1415     return NULL;
1416 }
1417
1418 int multifd_load_setup(void)
1419 {
1420     int thread_count;
1421     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1422     uint8_t i;
1423
1424     if (!migrate_use_multifd()) {
1425         return 0;
1426     }
1427     thread_count = migrate_multifd_channels();
1428     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1429     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1430     atomic_set(&multifd_recv_state->count, 0);
1431     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1432
1433     for (i = 0; i < thread_count; i++) {
1434         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1435
1436         qemu_mutex_init(&p->mutex);
1437         qemu_sem_init(&p->sem_sync, 0);
1438         p->quit = false;
1439         p->id = i;
1440         p->pages = multifd_pages_init(page_count);
1441         p->packet_len = sizeof(MultiFDPacket_t)
1442                       + sizeof(ram_addr_t) * page_count;
1443         p->packet = g_malloc0(p->packet_len);
1444         p->name = g_strdup_printf("multifdrecv_%d", i);
1445     }
1446     return 0;
1447 }
1448
1449 bool multifd_recv_all_channels_created(void)
1450 {
1451     int thread_count = migrate_multifd_channels();
1452
1453     if (!migrate_use_multifd()) {
1454         return true;
1455     }
1456
1457     return thread_count == atomic_read(&multifd_recv_state->count);
1458 }
1459
1460 /*
1461  * Try to receive all multifd channels to get ready for the migration.
1462  * - Return true and do not set @errp when correctly receving all channels;
1463  * - Return false and do not set @errp when correctly receiving the current one;
1464  * - Return false and set @errp when failing to receive the current channel.
1465  */
1466 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1467 {
1468     MultiFDRecvParams *p;
1469     Error *local_err = NULL;
1470     int id;
1471
1472     id = multifd_recv_initial_packet(ioc, &local_err);
1473     if (id < 0) {
1474         multifd_recv_terminate_threads(local_err);
1475         error_propagate_prepend(errp, local_err,
1476                                 "failed to receive packet"
1477                                 " via multifd channel %d: ",
1478                                 atomic_read(&multifd_recv_state->count));
1479         return false;
1480     }
1481
1482     p = &multifd_recv_state->params[id];
1483     if (p->c != NULL) {
1484         error_setg(&local_err, "multifd: received id '%d' already setup'",
1485                    id);
1486         multifd_recv_terminate_threads(local_err);
1487         error_propagate(errp, local_err);
1488         return false;
1489     }
1490     p->c = ioc;
1491     object_ref(OBJECT(ioc));
1492     /* initial packet */
1493     p->num_packets = 1;
1494
1495     p->running = true;
1496     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1497                        QEMU_THREAD_JOINABLE);
1498     atomic_inc(&multifd_recv_state->count);
1499     return atomic_read(&multifd_recv_state->count) ==
1500            migrate_multifd_channels();
1501 }
1502
1503 /**
1504  * save_page_header: write page header to wire
1505  *
1506  * If this is the 1st block, it also writes the block identification
1507  *
1508  * Returns the number of bytes written
1509  *
1510  * @f: QEMUFile where to send the data
1511  * @block: block that contains the page we want to send
1512  * @offset: offset inside the block for the page
1513  *          in the lower bits, it contains flags
1514  */
1515 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1516                                ram_addr_t offset)
1517 {
1518     size_t size, len;
1519
1520     if (block == rs->last_sent_block) {
1521         offset |= RAM_SAVE_FLAG_CONTINUE;
1522     }
1523     qemu_put_be64(f, offset);
1524     size = 8;
1525
1526     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1527         len = strlen(block->idstr);
1528         qemu_put_byte(f, len);
1529         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1530         size += 1 + len;
1531         rs->last_sent_block = block;
1532     }
1533     return size;
1534 }
1535
1536 /**
1537  * mig_throttle_guest_down: throotle down the guest
1538  *
1539  * Reduce amount of guest cpu execution to hopefully slow down memory
1540  * writes. If guest dirty memory rate is reduced below the rate at
1541  * which we can transfer pages to the destination then we should be
1542  * able to complete migration. Some workloads dirty memory way too
1543  * fast and will not effectively converge, even with auto-converge.
1544  */
1545 static void mig_throttle_guest_down(void)
1546 {
1547     MigrationState *s = migrate_get_current();
1548     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1549     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1550     int pct_max = s->parameters.max_cpu_throttle;
1551
1552     /* We have not started throttling yet. Let's start it. */
1553     if (!cpu_throttle_active()) {
1554         cpu_throttle_set(pct_initial);
1555     } else {
1556         /* Throttling already on, just increase the rate */
1557         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1558                          pct_max));
1559     }
1560 }
1561
1562 /**
1563  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1564  *
1565  * @rs: current RAM state
1566  * @current_addr: address for the zero page
1567  *
1568  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1569  * The important thing is that a stale (not-yet-0'd) page be replaced
1570  * by the new data.
1571  * As a bonus, if the page wasn't in the cache it gets added so that
1572  * when a small write is made into the 0'd page it gets XBZRLE sent.
1573  */
1574 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1575 {
1576     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1577         return;
1578     }
1579
1580     /* We don't care if this fails to allocate a new cache page
1581      * as long as it updated an old one */
1582     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1583                  ram_counters.dirty_sync_count);
1584 }
1585
1586 #define ENCODING_FLAG_XBZRLE 0x1
1587
1588 /**
1589  * save_xbzrle_page: compress and send current page
1590  *
1591  * Returns: 1 means that we wrote the page
1592  *          0 means that page is identical to the one already sent
1593  *          -1 means that xbzrle would be longer than normal
1594  *
1595  * @rs: current RAM state
1596  * @current_data: pointer to the address of the page contents
1597  * @current_addr: addr of the page
1598  * @block: block that contains the page we want to send
1599  * @offset: offset inside the block for the page
1600  * @last_stage: if we are at the completion stage
1601  */
1602 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1603                             ram_addr_t current_addr, RAMBlock *block,
1604                             ram_addr_t offset, bool last_stage)
1605 {
1606     int encoded_len = 0, bytes_xbzrle;
1607     uint8_t *prev_cached_page;
1608
1609     if (!cache_is_cached(XBZRLE.cache, current_addr,
1610                          ram_counters.dirty_sync_count)) {
1611         xbzrle_counters.cache_miss++;
1612         if (!last_stage) {
1613             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1614                              ram_counters.dirty_sync_count) == -1) {
1615                 return -1;
1616             } else {
1617                 /* update *current_data when the page has been
1618                    inserted into cache */
1619                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1620             }
1621         }
1622         return -1;
1623     }
1624
1625     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1626
1627     /* save current buffer into memory */
1628     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1629
1630     /* XBZRLE encoding (if there is no overflow) */
1631     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1632                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1633                                        TARGET_PAGE_SIZE);
1634
1635     /*
1636      * Update the cache contents, so that it corresponds to the data
1637      * sent, in all cases except where we skip the page.
1638      */
1639     if (!last_stage && encoded_len != 0) {
1640         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1641         /*
1642          * In the case where we couldn't compress, ensure that the caller
1643          * sends the data from the cache, since the guest might have
1644          * changed the RAM since we copied it.
1645          */
1646         *current_data = prev_cached_page;
1647     }
1648
1649     if (encoded_len == 0) {
1650         trace_save_xbzrle_page_skipping();
1651         return 0;
1652     } else if (encoded_len == -1) {
1653         trace_save_xbzrle_page_overflow();
1654         xbzrle_counters.overflow++;
1655         return -1;
1656     }
1657
1658     /* Send XBZRLE based compressed page */
1659     bytes_xbzrle = save_page_header(rs, rs->f, block,
1660                                     offset | RAM_SAVE_FLAG_XBZRLE);
1661     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1662     qemu_put_be16(rs->f, encoded_len);
1663     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1664     bytes_xbzrle += encoded_len + 1 + 2;
1665     xbzrle_counters.pages++;
1666     xbzrle_counters.bytes += bytes_xbzrle;
1667     ram_counters.transferred += bytes_xbzrle;
1668
1669     return 1;
1670 }
1671
1672 /**
1673  * migration_bitmap_find_dirty: find the next dirty page from start
1674  *
1675  * Returns the page offset within memory region of the start of a dirty page
1676  *
1677  * @rs: current RAM state
1678  * @rb: RAMBlock where to search for dirty pages
1679  * @start: page where we start the search
1680  */
1681 static inline
1682 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1683                                           unsigned long start)
1684 {
1685     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1686     unsigned long *bitmap = rb->bmap;
1687     unsigned long next;
1688
1689     if (ramblock_is_ignored(rb)) {
1690         return size;
1691     }
1692
1693     /*
1694      * When the free page optimization is enabled, we need to check the bitmap
1695      * to send the non-free pages rather than all the pages in the bulk stage.
1696      */
1697     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1698         next = start + 1;
1699     } else {
1700         next = find_next_bit(bitmap, size, start);
1701     }
1702
1703     return next;
1704 }
1705
1706 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1707                                                 RAMBlock *rb,
1708                                                 unsigned long page)
1709 {
1710     bool ret;
1711
1712     qemu_mutex_lock(&rs->bitmap_mutex);
1713
1714     /*
1715      * Clear dirty bitmap if needed.  This _must_ be called before we
1716      * send any of the page in the chunk because we need to make sure
1717      * we can capture further page content changes when we sync dirty
1718      * log the next time.  So as long as we are going to send any of
1719      * the page in the chunk we clear the remote dirty bitmap for all.
1720      * Clearing it earlier won't be a problem, but too late will.
1721      */
1722     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1723         uint8_t shift = rb->clear_bmap_shift;
1724         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1725         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1726
1727         /*
1728          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1729          * can make things easier sometimes since then start address
1730          * of the small chunk will always be 64 pages aligned so the
1731          * bitmap will always be aligned to unsigned long.  We should
1732          * even be able to remove this restriction but I'm simply
1733          * keeping it.
1734          */
1735         assert(shift >= 6);
1736         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1737         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1738     }
1739
1740     ret = test_and_clear_bit(page, rb->bmap);
1741
1742     if (ret) {
1743         rs->migration_dirty_pages--;
1744     }
1745     qemu_mutex_unlock(&rs->bitmap_mutex);
1746
1747     return ret;
1748 }
1749
1750 /* Called with RCU critical section */
1751 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb)
1752 {
1753     rs->migration_dirty_pages +=
1754         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1755                                               &rs->num_dirty_pages_period);
1756 }
1757
1758 /**
1759  * ram_pagesize_summary: calculate all the pagesizes of a VM
1760  *
1761  * Returns a summary bitmap of the page sizes of all RAMBlocks
1762  *
1763  * For VMs with just normal pages this is equivalent to the host page
1764  * size. If it's got some huge pages then it's the OR of all the
1765  * different page sizes.
1766  */
1767 uint64_t ram_pagesize_summary(void)
1768 {
1769     RAMBlock *block;
1770     uint64_t summary = 0;
1771
1772     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1773         summary |= block->page_size;
1774     }
1775
1776     return summary;
1777 }
1778
1779 uint64_t ram_get_total_transferred_pages(void)
1780 {
1781     return  ram_counters.normal + ram_counters.duplicate +
1782                 compression_counters.pages + xbzrle_counters.pages;
1783 }
1784
1785 static void migration_update_rates(RAMState *rs, int64_t end_time)
1786 {
1787     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1788     double compressed_size;
1789
1790     /* calculate period counters */
1791     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1792                 / (end_time - rs->time_last_bitmap_sync);
1793
1794     if (!page_count) {
1795         return;
1796     }
1797
1798     if (migrate_use_xbzrle()) {
1799         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1800             rs->xbzrle_cache_miss_prev) / page_count;
1801         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1802     }
1803
1804     if (migrate_use_compression()) {
1805         compression_counters.busy_rate = (double)(compression_counters.busy -
1806             rs->compress_thread_busy_prev) / page_count;
1807         rs->compress_thread_busy_prev = compression_counters.busy;
1808
1809         compressed_size = compression_counters.compressed_size -
1810                           rs->compressed_size_prev;
1811         if (compressed_size) {
1812             double uncompressed_size = (compression_counters.pages -
1813                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1814
1815             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1816             compression_counters.compression_rate =
1817                                         uncompressed_size / compressed_size;
1818
1819             rs->compress_pages_prev = compression_counters.pages;
1820             rs->compressed_size_prev = compression_counters.compressed_size;
1821         }
1822     }
1823 }
1824
/*
 * migration_bitmap_sync: run one dirty-bitmap synchronisation pass
 *
 * Increments ram_counters.dirty_sync_count, calls
 * memory_global_dirty_log_sync() and then migration_bitmap_sync_range() on
 * every non-ignored RAMBlock (under rs->bitmap_mutex and the RCU read lock),
 * and refreshes ram_counters.remaining.  When more than one second has
 * elapsed since rs->time_last_bitmap_sync it additionally runs the
 * auto-converge check, recomputes the rates and resets the period counters.
 * Finally a MIGRATION_PASS event is sent if events are enabled.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* First call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    rcu_read_lock();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        migration_bitmap_sync_range(rs, block);
    }
    ram_counters.remaining = ram_bytes_remaining();
    rcu_read_unlock();
    qemu_mutex_unlock(&rs->bitmap_mutex);

    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check whether the bytes dirtied in this period exceed half of
               the approx. amount of bytes that got transferred since the last
               time we were in this routine. If that happens twice, start or
               increase throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        /* Must run before time_last_bitmap_sync is overwritten below */
        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1889
1890 static void migration_bitmap_sync_precopy(RAMState *rs)
1891 {
1892     Error *local_err = NULL;
1893
1894     /*
1895      * The current notifier usage is just an optimization to migration, so we
1896      * don't stop the normal migration process in the error case.
1897      */
1898     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1899         error_report_err(local_err);
1900     }
1901
1902     migration_bitmap_sync(rs);
1903
1904     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1905         error_report_err(local_err);
1906     }
1907 }
1908
1909 /**
1910  * save_zero_page_to_file: send the zero page to the file
1911  *
1912  * Returns the size of data written to the file, 0 means the page is not
1913  * a zero page
1914  *
1915  * @rs: current RAM state
1916  * @file: the file where the data is saved
1917  * @block: block that contains the page we want to send
1918  * @offset: offset inside the block for the page
1919  */
1920 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1921                                   RAMBlock *block, ram_addr_t offset)
1922 {
1923     uint8_t *p = block->host + offset;
1924     int len = 0;
1925
1926     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1927         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1928         qemu_put_byte(file, 0);
1929         len += 1;
1930     }
1931     return len;
1932 }
1933
1934 /**
1935  * save_zero_page: send the zero page to the stream
1936  *
1937  * Returns the number of pages written.
1938  *
1939  * @rs: current RAM state
1940  * @block: block that contains the page we want to send
1941  * @offset: offset inside the block for the page
1942  */
1943 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1944 {
1945     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1946
1947     if (len) {
1948         ram_counters.duplicate++;
1949         ram_counters.transferred += len;
1950         return 1;
1951     }
1952     return -1;
1953 }
1954
1955 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1956 {
1957     if (!migrate_release_ram() || !migration_in_postcopy()) {
1958         return;
1959     }
1960
1961     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1962 }
1963
1964 /*
1965  * @pages: the number of pages written by the control path,
1966  *        < 0 - error
1967  *        > 0 - number of pages written
1968  *
1969  * Return true if the pages has been saved, otherwise false is returned.
1970  */
1971 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1972                               int *pages)
1973 {
1974     uint64_t bytes_xmit = 0;
1975     int ret;
1976
1977     *pages = -1;
1978     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1979                                 &bytes_xmit);
1980     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1981         return false;
1982     }
1983
1984     if (bytes_xmit) {
1985         ram_counters.transferred += bytes_xmit;
1986         *pages = 1;
1987     }
1988
1989     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1990         return true;
1991     }
1992
1993     if (bytes_xmit > 0) {
1994         ram_counters.normal++;
1995     } else if (bytes_xmit == 0) {
1996         ram_counters.duplicate++;
1997     }
1998
1999     return true;
2000 }
2001
2002 /*
2003  * directly send the page to the stream
2004  *
2005  * Returns the number of pages written.
2006  *
2007  * @rs: current RAM state
2008  * @block: block that contains the page we want to send
2009  * @offset: offset inside the block for the page
2010  * @buf: the page to be sent
2011  * @async: send to page asyncly
2012  */
2013 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2014                             uint8_t *buf, bool async)
2015 {
2016     ram_counters.transferred += save_page_header(rs, rs->f, block,
2017                                                  offset | RAM_SAVE_FLAG_PAGE);
2018     if (async) {
2019         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2020                               migrate_release_ram() &
2021                               migration_in_postcopy());
2022     } else {
2023         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2024     }
2025     ram_counters.transferred += TARGET_PAGE_SIZE;
2026     ram_counters.normal++;
2027     return 1;
2028 }
2029
2030 /**
2031  * ram_save_page: send the given page to the stream
2032  *
2033  * Returns the number of pages written.
2034  *          < 0 - error
2035  *          >=0 - Number of pages written - this might legally be 0
2036  *                if xbzrle noticed the page was the same.
2037  *
2038  * @rs: current RAM state
2039  * @block: block that contains the page we want to send
2040  * @offset: offset inside the block for the page
2041  * @last_stage: if we are at the completion stage
2042  */
2043 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2044 {
2045     int pages = -1;
2046     uint8_t *p;
2047     bool send_async = true;
2048     RAMBlock *block = pss->block;
2049     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2050     ram_addr_t current_addr = block->offset + offset;
2051
2052     p = block->host + offset;
2053     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2054
2055     XBZRLE_cache_lock();
2056     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2057         migrate_use_xbzrle()) {
2058         pages = save_xbzrle_page(rs, &p, current_addr, block,
2059                                  offset, last_stage);
2060         if (!last_stage) {
2061             /* Can't send this cached data async, since the cache page
2062              * might get updated before it gets to the wire
2063              */
2064             send_async = false;
2065         }
2066     }
2067
2068     /* XBZRLE overflow or normal page */
2069     if (pages == -1) {
2070         pages = save_normal_page(rs, block, offset, p, send_async);
2071     }
2072
2073     XBZRLE_cache_unlock();
2074
2075     return pages;
2076 }
2077
2078 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2079                                  ram_addr_t offset)
2080 {
2081     if (multifd_queue_page(block, offset) < 0) {
2082         return -1;
2083     }
2084     ram_counters.normal++;
2085
2086     return 1;
2087 }
2088
2089 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2090                                  ram_addr_t offset, uint8_t *source_buf)
2091 {
2092     RAMState *rs = ram_state;
2093     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2094     bool zero_page = false;
2095     int ret;
2096
2097     if (save_zero_page_to_file(rs, f, block, offset)) {
2098         zero_page = true;
2099         goto exit;
2100     }
2101
2102     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2103
2104     /*
2105      * copy it to a internal buffer to avoid it being modified by VM
2106      * so that we can catch up the error during compression and
2107      * decompression
2108      */
2109     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2110     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2111     if (ret < 0) {
2112         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2113         error_report("compressed data failed!");
2114         return false;
2115     }
2116
2117 exit:
2118     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2119     return zero_page;
2120 }
2121
2122 static void
2123 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2124 {
2125     ram_counters.transferred += bytes_xmit;
2126
2127     if (param->zero_page) {
2128         ram_counters.duplicate++;
2129         return;
2130     }
2131
2132     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2133     compression_counters.compressed_size += bytes_xmit - 8;
2134     compression_counters.pages++;
2135 }
2136
2137 static bool save_page_use_compression(RAMState *rs);
2138
2139 static void flush_compressed_data(RAMState *rs)
2140 {
2141     int idx, len, thread_count;
2142
2143     if (!save_page_use_compression(rs)) {
2144         return;
2145     }
2146     thread_count = migrate_compress_threads();
2147
2148     qemu_mutex_lock(&comp_done_lock);
2149     for (idx = 0; idx < thread_count; idx++) {
2150         while (!comp_param[idx].done) {
2151             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2152         }
2153     }
2154     qemu_mutex_unlock(&comp_done_lock);
2155
2156     for (idx = 0; idx < thread_count; idx++) {
2157         qemu_mutex_lock(&comp_param[idx].mutex);
2158         if (!comp_param[idx].quit) {
2159             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2160             /*
2161              * it's safe to fetch zero_page without holding comp_done_lock
2162              * as there is no further request submitted to the thread,
2163              * i.e, the thread should be waiting for a request at this point.
2164              */
2165             update_compress_thread_counts(&comp_param[idx], len);
2166         }
2167         qemu_mutex_unlock(&comp_param[idx].mutex);
2168     }
2169 }
2170
2171 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2172                                        ram_addr_t offset)
2173 {
2174     param->block = block;
2175     param->offset = offset;
2176 }
2177
2178 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2179                                            ram_addr_t offset)
2180 {
2181     int idx, thread_count, bytes_xmit = -1, pages = -1;
2182     bool wait = migrate_compress_wait_thread();
2183
2184     thread_count = migrate_compress_threads();
2185     qemu_mutex_lock(&comp_done_lock);
2186 retry:
2187     for (idx = 0; idx < thread_count; idx++) {
2188         if (comp_param[idx].done) {
2189             comp_param[idx].done = false;
2190             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2191             qemu_mutex_lock(&comp_param[idx].mutex);
2192             set_compress_params(&comp_param[idx], block, offset);
2193             qemu_cond_signal(&comp_param[idx].cond);
2194             qemu_mutex_unlock(&comp_param[idx].mutex);
2195             pages = 1;
2196             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2197             break;
2198         }
2199     }
2200
2201     /*
2202      * wait for the free thread if the user specifies 'compress-wait-thread',
2203      * otherwise we will post the page out in the main thread as normal page.
2204      */
2205     if (pages < 0 && wait) {
2206         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2207         goto retry;
2208     }
2209     qemu_mutex_unlock(&comp_done_lock);
2210
2211     return pages;
2212 }
2213
2214 /**
2215  * find_dirty_block: find the next dirty page and update any state
2216  * associated with the search process.
2217  *
2218  * Returns true if a page is found
2219  *
2220  * @rs: current RAM state
2221  * @pss: data about the state of the current dirty page scan
2222  * @again: set to false if the search has scanned the whole of RAM
2223  */
2224 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2225 {
2226     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2227     if (pss->complete_round && pss->block == rs->last_seen_block &&
2228         pss->page >= rs->last_page) {
2229         /*
2230          * We've been once around the RAM and haven't found anything.
2231          * Give up.
2232          */
2233         *again = false;
2234         return false;
2235     }
2236     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2237         /* Didn't find anything in this RAM Block */
2238         pss->page = 0;
2239         pss->block = QLIST_NEXT_RCU(pss->block, next);
2240         if (!pss->block) {
2241             /*
2242              * If memory migration starts over, we will meet a dirtied page
2243              * which may still exists in compression threads's ring, so we
2244              * should flush the compressed data to make sure the new page
2245              * is not overwritten by the old one in the destination.
2246              *
2247              * Also If xbzrle is on, stop using the data compression at this
2248              * point. In theory, xbzrle can do better than compression.
2249              */
2250             flush_compressed_data(rs);
2251
2252             /* Hit the end of the list */
2253             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2254             /* Flag that we've looped */
2255             pss->complete_round = true;
2256             rs->ram_bulk_stage = false;
2257         }
2258         /* Didn't find anything this time, but try again on the new block */
2259         *again = true;
2260         return false;
2261     } else {
2262         /* Can go around again, but... */
2263         *again = true;
2264         /* We've found something so probably don't need to */
2265         return true;
2266     }
2267 }
2268
2269 /**
2270  * unqueue_page: gets a page of the queue
2271  *
2272  * Helper for 'get_queued_page' - gets a page off the queue
2273  *
2274  * Returns the block of the page (or NULL if none available)
2275  *
2276  * @rs: current RAM state
2277  * @offset: used to return the offset within the RAMBlock
2278  */
2279 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2280 {
2281     RAMBlock *block = NULL;
2282
2283     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2284         return NULL;
2285     }
2286
2287     qemu_mutex_lock(&rs->src_page_req_mutex);
2288     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2289         struct RAMSrcPageRequest *entry =
2290                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2291         block = entry->rb;
2292         *offset = entry->offset;
2293
2294         if (entry->len > TARGET_PAGE_SIZE) {
2295             entry->len -= TARGET_PAGE_SIZE;
2296             entry->offset += TARGET_PAGE_SIZE;
2297         } else {
2298             memory_region_unref(block->mr);
2299             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2300             g_free(entry);
2301             migration_consume_urgent_request();
2302         }
2303     }
2304     qemu_mutex_unlock(&rs->src_page_req_mutex);
2305
2306     return block;
2307 }
2308
2309 /**
2310  * get_queued_page: unqueue a page from the postcopy requests
2311  *
2312  * Skips pages that are already sent (!dirty)
2313  *
2314  * Returns true if a queued page is found
2315  *
2316  * @rs: current RAM state
2317  * @pss: data about the state of the current dirty page scan
2318  */
2319 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2320 {
2321     RAMBlock  *block;
2322     ram_addr_t offset;
2323     bool dirty;
2324
2325     do {
2326         block = unqueue_page(rs, &offset);
2327         /*
2328          * We're sending this page, and since it's postcopy nothing else
2329          * will dirty it, and we must make sure it doesn't get sent again
2330          * even if this queue request was received after the background
2331          * search already sent it.
2332          */
2333         if (block) {
2334             unsigned long page;
2335
2336             page = offset >> TARGET_PAGE_BITS;
2337             dirty = test_bit(page, block->bmap);
2338             if (!dirty) {
2339                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2340                        page, test_bit(page, block->unsentmap));
2341             } else {
2342                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2343             }
2344         }
2345
2346     } while (block && !dirty);
2347
2348     if (block) {
2349         /*
2350          * As soon as we start servicing pages out of order, then we have
2351          * to kill the bulk stage, since the bulk stage assumes
2352          * in (migration_bitmap_find_and_reset_dirty) that every page is
2353          * dirty, that's no longer true.
2354          */
2355         rs->ram_bulk_stage = false;
2356
2357         /*
2358          * We want the background search to continue from the queued page
2359          * since the guest is likely to want other pages near to the page
2360          * it just requested.
2361          */
2362         pss->block = block;
2363         pss->page = offset >> TARGET_PAGE_BITS;
2364
2365         /*
2366          * This unqueued page would break the "one round" check, even is
2367          * really rare.
2368          */
2369         pss->complete_round = false;
2370     }
2371
2372     return !!block;
2373 }
2374
2375 /**
2376  * migration_page_queue_free: drop any remaining pages in the ram
2377  * request queue
2378  *
2379  * It should be empty at the end anyway, but in error cases there may
2380  * be some left.  in case that there is any page left, we drop it.
2381  *
2382  */
2383 static void migration_page_queue_free(RAMState *rs)
2384 {
2385     struct RAMSrcPageRequest *mspr, *next_mspr;
2386     /* This queue generally should be empty - but in the case of a failed
2387      * migration might have some droppings in.
2388      */
2389     rcu_read_lock();
2390     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2391         memory_region_unref(mspr->rb->mr);
2392         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2393         g_free(mspr);
2394     }
2395     rcu_read_unlock();
2396 }
2397
2398 /**
2399  * ram_save_queue_pages: queue the page for transmission
2400  *
2401  * A request from postcopy destination for example.
2402  *
2403  * Returns zero on success or negative on error
2404  *
2405  * @rbname: Name of the RAMBLock of the request. NULL means the
2406  *          same that last one.
2407  * @start: starting address from the start of the RAMBlock
2408  * @len: length (in bytes) to send
2409  */
2410 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2411 {
2412     RAMBlock *ramblock;
2413     RAMState *rs = ram_state;
2414
2415     ram_counters.postcopy_requests++;
2416     rcu_read_lock();
2417     if (!rbname) {
2418         /* Reuse last RAMBlock */
2419         ramblock = rs->last_req_rb;
2420
2421         if (!ramblock) {
2422             /*
2423              * Shouldn't happen, we can't reuse the last RAMBlock if
2424              * it's the 1st request.
2425              */
2426             error_report("ram_save_queue_pages no previous block");
2427             goto err;
2428         }
2429     } else {
2430         ramblock = qemu_ram_block_by_name(rbname);
2431
2432         if (!ramblock) {
2433             /* We shouldn't be asked for a non-existent RAMBlock */
2434             error_report("ram_save_queue_pages no block '%s'", rbname);
2435             goto err;
2436         }
2437         rs->last_req_rb = ramblock;
2438     }
2439     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2440     if (start+len > ramblock->used_length) {
2441         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2442                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2443                      __func__, start, len, ramblock->used_length);
2444         goto err;
2445     }
2446
2447     struct RAMSrcPageRequest *new_entry =
2448         g_malloc0(sizeof(struct RAMSrcPageRequest));
2449     new_entry->rb = ramblock;
2450     new_entry->offset = start;
2451     new_entry->len = len;
2452
2453     memory_region_ref(ramblock->mr);
2454     qemu_mutex_lock(&rs->src_page_req_mutex);
2455     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2456     migration_make_urgent_request();
2457     qemu_mutex_unlock(&rs->src_page_req_mutex);
2458     rcu_read_unlock();
2459
2460     return 0;
2461
2462 err:
2463     rcu_read_unlock();
2464     return -1;
2465 }
2466
2467 static bool save_page_use_compression(RAMState *rs)
2468 {
2469     if (!migrate_use_compression()) {
2470         return false;
2471     }
2472
2473     /*
2474      * If xbzrle is on, stop using the data compression after first
2475      * round of migration even if compression is enabled. In theory,
2476      * xbzrle can do better than compression.
2477      */
2478     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2479         return true;
2480     }
2481
2482     return false;
2483 }
2484
2485 /*
2486  * try to compress the page before posting it out, return true if the page
2487  * has been properly handled by compression, otherwise needs other
2488  * paths to handle it
2489  */
2490 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2491 {
2492     if (!save_page_use_compression(rs)) {
2493         return false;
2494     }
2495
2496     /*
2497      * When starting the process of a new block, the first page of
2498      * the block should be sent out before other pages in the same
2499      * block, and all the pages in last block should have been sent
2500      * out, keeping this order is important, because the 'cont' flag
2501      * is used to avoid resending the block name.
2502      *
2503      * We post the fist page as normal page as compression will take
2504      * much CPU resource.
2505      */
2506     if (block != rs->last_sent_block) {
2507         flush_compressed_data(rs);
2508         return false;
2509     }
2510
2511     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2512         return true;
2513     }
2514
2515     compression_counters.busy++;
2516     return false;
2517 }
2518
2519 /**
2520  * ram_save_target_page: save one target page
2521  *
2522  * Returns the number of pages written
2523  *
2524  * @rs: current RAM state
2525  * @pss: data about the page we want to send
2526  * @last_stage: if we are at the completion stage
2527  */
2528 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2529                                 bool last_stage)
2530 {
2531     RAMBlock *block = pss->block;
2532     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2533     int res;
2534
2535     if (control_save_page(rs, block, offset, &res)) {
2536         return res;
2537     }
2538
2539     if (save_compress_page(rs, block, offset)) {
2540         return 1;
2541     }
2542
2543     res = save_zero_page(rs, block, offset);
2544     if (res > 0) {
2545         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2546          * page would be stale
2547          */
2548         if (!save_page_use_compression(rs)) {
2549             XBZRLE_cache_lock();
2550             xbzrle_cache_zero_page(rs, block->offset + offset);
2551             XBZRLE_cache_unlock();
2552         }
2553         ram_release_pages(block->idstr, offset, res);
2554         return res;
2555     }
2556
2557     /*
2558      * do not use multifd for compression as the first page in the new
2559      * block should be posted out before sending the compressed page
2560      */
2561     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2562         return ram_save_multifd_page(rs, block, offset);
2563     }
2564
2565     return ram_save_page(rs, pss, last_stage);
2566 }
2567
2568 /**
2569  * ram_save_host_page: save a whole host page
2570  *
2571  * Starting at *offset send pages up to the end of the current host
2572  * page. It's valid for the initial offset to point into the middle of
2573  * a host page in which case the remainder of the hostpage is sent.
2574  * Only dirty target pages are sent. Note that the host page size may
2575  * be a huge page for this block.
2576  * The saving stops at the boundary of the used_length of the block
2577  * if the RAMBlock isn't a multiple of the host page size.
2578  *
2579  * Returns the number of pages written or negative on error
2580  *
2581  * @rs: current RAM state
2582  * @ms: current migration state
2583  * @pss: data about the page we want to send
2584  * @last_stage: if we are at the completion stage
2585  */
2586 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2587                               bool last_stage)
2588 {
2589     int tmppages, pages = 0;
2590     size_t pagesize_bits =
2591         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2592
2593     if (ramblock_is_ignored(pss->block)) {
2594         error_report("block %s should not be migrated !", pss->block->idstr);
2595         return 0;
2596     }
2597
2598     do {
2599         /* Check the pages is dirty and if it is send it */
2600         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2601             pss->page++;
2602             continue;
2603         }
2604
2605         tmppages = ram_save_target_page(rs, pss, last_stage);
2606         if (tmppages < 0) {
2607             return tmppages;
2608         }
2609
2610         pages += tmppages;
2611         if (pss->block->unsentmap) {
2612             clear_bit(pss->page, pss->block->unsentmap);
2613         }
2614
2615         pss->page++;
2616     } while ((pss->page & (pagesize_bits - 1)) &&
2617              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2618
2619     /* The offset we leave with is the last one we looked at */
2620     pss->page--;
2621     return pages;
2622 }
2623
2624 /**
2625  * ram_find_and_save_block: finds a dirty page and sends it to f
2626  *
2627  * Called within an RCU critical section.
2628  *
2629  * Returns the number of pages written where zero means no dirty pages,
2630  * or negative on error
2631  *
2632  * @rs: current RAM state
2633  * @last_stage: if we are at the completion stage
2634  *
2635  * On systems where host-page-size > target-page-size it will send all the
2636  * pages in a host page that are dirty.
2637  */
2638
2639 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2640 {
2641     PageSearchStatus pss;
2642     int pages = 0;
2643     bool again, found;
2644
2645     /* No dirty page as there is zero RAM */
2646     if (!ram_bytes_total()) {
2647         return pages;
2648     }
2649
2650     pss.block = rs->last_seen_block;
2651     pss.page = rs->last_page;
2652     pss.complete_round = false;
2653
2654     if (!pss.block) {
2655         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2656     }
2657
2658     do {
2659         again = true;
2660         found = get_queued_page(rs, &pss);
2661
2662         if (!found) {
2663             /* priority queue empty, so just search for something dirty */
2664             found = find_dirty_block(rs, &pss, &again);
2665         }
2666
2667         if (found) {
2668             pages = ram_save_host_page(rs, &pss, last_stage);
2669         }
2670     } while (!pages && again);
2671
2672     rs->last_seen_block = pss.block;
2673     rs->last_page = pss.page;
2674
2675     return pages;
2676 }
2677
2678 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2679 {
2680     uint64_t pages = size / TARGET_PAGE_SIZE;
2681
2682     if (zero) {
2683         ram_counters.duplicate += pages;
2684     } else {
2685         ram_counters.normal += pages;
2686         ram_counters.transferred += size;
2687         qemu_update_position(f, size);
2688     }
2689 }
2690
2691 static uint64_t ram_bytes_total_common(bool count_ignored)
2692 {
2693     RAMBlock *block;
2694     uint64_t total = 0;
2695
2696     rcu_read_lock();
2697     if (count_ignored) {
2698         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2699             total += block->used_length;
2700         }
2701     } else {
2702         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2703             total += block->used_length;
2704         }
2705     }
2706     rcu_read_unlock();
2707     return total;
2708 }
2709
/*
 * ram_bytes_total: total used_length, in bytes, of the RAM blocks that are
 * not ignored
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2714
2715 static void xbzrle_load_setup(void)
2716 {
2717     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2718 }
2719
2720 static void xbzrle_load_cleanup(void)
2721 {
2722     g_free(XBZRLE.decoded_buf);
2723     XBZRLE.decoded_buf = NULL;
2724 }
2725
2726 static void ram_state_cleanup(RAMState **rsp)
2727 {
2728     if (*rsp) {
2729         migration_page_queue_free(*rsp);
2730         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2731         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2732         g_free(*rsp);
2733         *rsp = NULL;
2734     }
2735 }
2736
2737 static void xbzrle_cleanup(void)
2738 {
2739     XBZRLE_cache_lock();
2740     if (XBZRLE.cache) {
2741         cache_fini(XBZRLE.cache);
2742         g_free(XBZRLE.encoded_buf);
2743         g_free(XBZRLE.current_buf);
2744         g_free(XBZRLE.zero_target_page);
2745         XBZRLE.cache = NULL;
2746         XBZRLE.encoded_buf = NULL;
2747         XBZRLE.current_buf = NULL;
2748         XBZRLE.zero_target_page = NULL;
2749     }
2750     XBZRLE_cache_unlock();
2751 }
2752
2753 static void ram_save_cleanup(void *opaque)
2754 {
2755     RAMState **rsp = opaque;
2756     RAMBlock *block;
2757
2758     /* caller have hold iothread lock or is in a bh, so there is
2759      * no writing race against the migration bitmap
2760      */
2761     memory_global_dirty_log_stop();
2762
2763     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2764         g_free(block->clear_bmap);
2765         block->clear_bmap = NULL;
2766         g_free(block->bmap);
2767         block->bmap = NULL;
2768         g_free(block->unsentmap);
2769         block->unsentmap = NULL;
2770     }
2771
2772     xbzrle_cleanup();
2773     compress_threads_save_cleanup();
2774     ram_state_cleanup(rsp);
2775 }
2776
2777 static void ram_state_reset(RAMState *rs)
2778 {
2779     rs->last_seen_block = NULL;
2780     rs->last_sent_block = NULL;
2781     rs->last_page = 0;
2782     rs->last_version = ram_list.version;
2783     rs->ram_bulk_stage = true;
2784     rs->fpo_enabled = false;
2785 }
2786
2787 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2788
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr for debugging
 *
 * The bitmap is printed in lines of up to 128 bits, '1' for a set bit and
 * '.' for a clear one, each line prefixed with the index of its first bit.
 *
 * @todump: bitmap to print.  The dirty bitmaps are kept per RAMBlock, so
 *          there is no default bitmap to fall back on; a NULL @todump is
 *          ignored instead of being dereferenced.
 * @expected: the value the bitmap is expected to be mostly full of; lines
 *            in which every bit has this value are not printed.
 * @pages: number of bits of @todump to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { DUMP_LINE_BITS = 128 };
    char linebuf[DUMP_LINE_BITS + 1];
    unsigned long cur;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += DUMP_LINE_BITS) {
        /* The last line may be shorter than a full line */
        unsigned long remaining = pages - cur;
        unsigned long linelen = remaining < DUMP_LINE_BITS ? remaining
                                                           : DUMP_LINE_BITS;
        unsigned long curb;
        bool found = false;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }

        /* Only lines holding at least one unexpected bit are interesting */
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2822
2823 /* **** functions for postcopy ***** */
2824
2825 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2826 {
2827     struct RAMBlock *block;
2828
2829     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2830         unsigned long *bitmap = block->bmap;
2831         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2832         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2833
2834         while (run_start < range) {
2835             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2836             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2837                               (run_end - run_start) << TARGET_PAGE_BITS);
2838             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2839         }
2840     }
2841 }
2842
2843 /**
2844  * postcopy_send_discard_bm_ram: discard a RAMBlock
2845  *
2846  * Returns zero on success
2847  *
2848  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2849  * Note: At this point the 'unsentmap' is the processed bitmap combined
2850  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2851  *
2852  * @ms: current migration state
2853  * @block: RAMBlock to discard
2854  */
2855 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2856 {
2857     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2858     unsigned long current;
2859     unsigned long *unsentmap = block->unsentmap;
2860
2861     for (current = 0; current < end; ) {
2862         unsigned long one = find_next_bit(unsentmap, end, current);
2863         unsigned long zero, discard_length;
2864
2865         if (one >= end) {
2866             break;
2867         }
2868
2869         zero = find_next_zero_bit(unsentmap, end, one + 1);
2870
2871         if (zero >= end) {
2872             discard_length = end - one;
2873         } else {
2874             discard_length = zero - one;
2875         }
2876         postcopy_discard_send_range(ms, one, discard_length);
2877         current = one + discard_length;
2878     }
2879
2880     return 0;
2881 }
2882
2883 /**
2884  * postcopy_each_ram_send_discard: discard all RAMBlocks
2885  *
2886  * Returns 0 for success or negative for error
2887  *
2888  * Utility for the outgoing postcopy code.
2889  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2890  *   passing it bitmap indexes and name.
2891  * (qemu_ram_foreach_block ends up passing unscaled lengths
2892  *  which would mean postcopy code would have to deal with target page)
2893  *
2894  * @ms: current migration state
2895  */
2896 static int postcopy_each_ram_send_discard(MigrationState *ms)
2897 {
2898     struct RAMBlock *block;
2899     int ret;
2900
2901     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2902         postcopy_discard_send_init(ms, block->idstr);
2903
2904         /*
2905          * Postcopy sends chunks of bitmap over the wire, but it
2906          * just needs indexes at this point, avoids it having
2907          * target page specific code.
2908          */
2909         ret = postcopy_send_discard_bm_ram(ms, block);
2910         postcopy_discard_send_finish(ms);
2911         if (ret) {
2912             return ret;
2913         }
2914     }
2915
2916     return 0;
2917 }
2918
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps: every host
 * page that a run starts or ends in the middle of is marked entirely
 * unsent and entirely dirty, and (where applicable) a discard for that
 * host page is sent to the destination.
 *
 * Nothing is done for blocks whose page size equals TARGET_PAGE_SIZE.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages that make up one host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    /* Start of the current run; later reused to hold the end of that run */
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {

        /*
         * If the run starts on a host page boundary the start is fine;
         * move on to the end of the run and check that instead.  If the
         * run starts in the middle of a host page, run_start is left
         * untouched and the fixup below handles that host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            if (unsent_pass) {
                run_start = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        /*
         * run_start is now either an unaligned run start or an unaligned
         * run end; in both cases the host page containing it is mixed.
         */
        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Resume the search after the host page being fixed up */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, fixup_start_addr, host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
3018
3019 /**
3020  * postcopy_chunk_hostpages: discard any partially sent host page
3021  *
3022  * Utility for the outgoing postcopy code.
3023  *
3024  * Discard any partially sent host-page size chunks, mark any partially
3025  * dirty host-page size chunks as all dirty.  In this case the host-page
3026  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3027  *
3028  * Returns zero on success
3029  *
3030  * @ms: current migration state
3031  * @block: block we want to work with
3032  */
3033 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3034 {
3035     postcopy_discard_send_init(ms, block->idstr);
3036
3037     /* First pass: Discard all partially sent host pages */
3038     postcopy_chunk_hostpages_pass(ms, true, block);
3039     /*
3040      * Second pass: Ensure that all partially dirty host pages are made
3041      * fully dirty.
3042      */
3043     postcopy_chunk_hostpages_pass(ms, false, block);
3044
3045     postcopy_discard_send_finish(ms);
3046     return 0;
3047 }
3048
3049 /**
3050  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3051  *
3052  * Returns zero on success
3053  *
3054  * Transmit the set of pages to be discarded after precopy to the target
3055  * these are pages that:
3056  *     a) Have been previously transmitted but are now dirty again
3057  *     b) Pages that have never been transmitted, this ensures that
3058  *        any pages on the destination that have been mapped by background
3059  *        tasks get discarded (transparent huge pages is the specific concern)
3060  * Hopefully this is pretty sparse
3061  *
3062  * @ms: current migration state
3063  */
3064 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3065 {
3066     RAMState *rs = ram_state;
3067     RAMBlock *block;
3068     int ret;
3069
3070     rcu_read_lock();
3071
3072     /* This should be our last sync, the src is now paused */
3073     migration_bitmap_sync(rs);
3074
3075     /* Easiest way to make sure we don't resume in the middle of a host-page */
3076     rs->last_seen_block = NULL;
3077     rs->last_sent_block = NULL;
3078     rs->last_page = 0;
3079
3080     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3081         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3082         unsigned long *bitmap = block->bmap;
3083         unsigned long *unsentmap = block->unsentmap;
3084
3085         if (!unsentmap) {
3086             /* We don't have a safe way to resize the sentmap, so
3087              * if the bitmap was resized it will be NULL at this
3088              * point.
3089              */
3090             error_report("migration ram resized during precopy phase");
3091             rcu_read_unlock();
3092             return -EINVAL;
3093         }
3094         /* Deal with TPS != HPS and huge pages */
3095         ret = postcopy_chunk_hostpages(ms, block);
3096         if (ret) {
3097             rcu_read_unlock();
3098             return ret;
3099         }
3100
3101         /*
3102          * Update the unsentmap to be unsentmap = unsentmap | dirty
3103          */
3104         bitmap_or(unsentmap, unsentmap, bitmap, pages);
3105 #ifdef DEBUG_POSTCOPY
3106         ram_debug_dump_bitmap(unsentmap, true, pages);
3107 #endif
3108     }
3109     trace_ram_postcopy_send_discard_bitmap();
3110
3111     ret = postcopy_each_ram_send_discard(ms);
3112     rcu_read_unlock();
3113
3114     return ret;
3115 }
3116
3117 /**
3118  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3119  *
3120  * Returns zero on success
3121  *
3122  * @rbname: name of the RAMBlock of the request. NULL means the
3123  *          same that last one.
3124  * @start: RAMBlock starting page
3125  * @length: RAMBlock size
3126  */
3127 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3128 {
3129     int ret = -1;
3130
3131     trace_ram_discard_range(rbname, start, length);
3132
3133     rcu_read_lock();
3134     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3135
3136     if (!rb) {
3137         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3138         goto err;
3139     }
3140
3141     /*
3142      * On source VM, we don't need to update the received bitmap since
3143      * we don't even have one.
3144      */
3145     if (rb->receivedmap) {
3146         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3147                      length >> qemu_target_page_bits());
3148     }
3149
3150     ret = ram_block_discard_range(rb, start, length);
3151
3152 err:
3153     rcu_read_unlock();
3154
3155     return ret;
3156 }
3157
3158 /*
3159  * For every allocation, we will try not to crash the VM if the
3160  * allocation failed.
3161  */
3162 static int xbzrle_init(void)
3163 {
3164     Error *local_err = NULL;
3165
3166     if (!migrate_use_xbzrle()) {
3167         return 0;
3168     }
3169
3170     XBZRLE_cache_lock();
3171
3172     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3173     if (!XBZRLE.zero_target_page) {
3174         error_report("%s: Error allocating zero page", __func__);
3175         goto err_out;
3176     }
3177
3178     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3179                               TARGET_PAGE_SIZE, &local_err);
3180     if (!XBZRLE.cache) {
3181         error_report_err(local_err);
3182         goto free_zero_page;
3183     }
3184
3185     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3186     if (!XBZRLE.encoded_buf) {
3187         error_report("%s: Error allocating encoded_buf", __func__);
3188         goto free_cache;
3189     }
3190
3191     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3192     if (!XBZRLE.current_buf) {
3193         error_report("%s: Error allocating current_buf", __func__);
3194         goto free_encoded_buf;
3195     }
3196
3197     /* We are all good */
3198     XBZRLE_cache_unlock();
3199     return 0;
3200
3201 free_encoded_buf:
3202     g_free(XBZRLE.encoded_buf);
3203     XBZRLE.encoded_buf = NULL;
3204 free_cache:
3205     cache_fini(XBZRLE.cache);
3206     XBZRLE.cache = NULL;
3207 free_zero_page:
3208     g_free(XBZRLE.zero_target_page);
3209     XBZRLE.zero_target_page = NULL;
3210 err_out:
3211     XBZRLE_cache_unlock();
3212     return -ENOMEM;
3213 }
3214
3215 static int ram_state_init(RAMState **rsp)
3216 {
3217     *rsp = g_try_new0(RAMState, 1);
3218
3219     if (!*rsp) {
3220         error_report("%s: Init ramstate fail", __func__);
3221         return -1;
3222     }
3223
3224     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3225     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3226     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3227
3228     /*
3229      * Count the total number of pages used by ram blocks not including any
3230      * gaps due to alignment or unplugs.
3231      * This must match with the initial values of dirty bitmap.
3232      */
3233     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3234     ram_state_reset(*rsp);
3235
3236     return 0;
3237 }
3238
3239 static void ram_list_init_bitmaps(void)
3240 {
3241     MigrationState *ms = migrate_get_current();
3242     RAMBlock *block;
3243     unsigned long pages;
3244     uint8_t shift;
3245
3246     /* Skip setting bitmap if there is no RAM */
3247     if (ram_bytes_total()) {
3248         shift = ms->clear_bitmap_shift;
3249         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3250             error_report("clear_bitmap_shift (%u) too big, using "
3251                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3252             shift = CLEAR_BITMAP_SHIFT_MAX;
3253         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3254             error_report("clear_bitmap_shift (%u) too small, using "
3255                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3256             shift = CLEAR_BITMAP_SHIFT_MIN;
3257         }
3258
3259         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3260             pages = block->max_length >> TARGET_PAGE_BITS;
3261             /*
3262              * The initial dirty bitmap for migration must be set with all
3263              * ones to make sure we'll migrate every guest RAM page to
3264              * destination.
3265              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3266              * new migration after a failed migration, ram_list.
3267              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3268              * guest memory.
3269              */
3270             block->bmap = bitmap_new(pages);
3271             bitmap_set(block->bmap, 0, pages);
3272             block->clear_bmap_shift = shift;
3273             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3274             if (migrate_postcopy_ram()) {
3275                 block->unsentmap = bitmap_new(pages);
3276                 bitmap_set(block->unsentmap, 0, pages);
3277             }
3278         }
3279     }
3280 }
3281
/*
 * Create the migration bitmaps, start global dirty logging and perform
 * the first precopy bitmap sync for @rs.
 *
 * The three steps run with the iothread lock, the ramlist lock and the
 * RCU read lock held; the locks are taken in that order and released in
 * the reverse order.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    migration_bitmap_sync_precopy(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3297
3298 static int ram_init_all(RAMState **rsp)
3299 {
3300     if (ram_state_init(rsp)) {
3301         return -1;
3302     }
3303
3304     if (xbzrle_init()) {
3305         ram_state_cleanup(rsp);
3306         return -1;
3307     }
3308
3309     ram_init_bitmaps(*rsp);
3310
3311     return 0;
3312 }
3313
3314 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3315 {
3316     RAMBlock *block;
3317     uint64_t pages = 0;
3318
3319     /*
3320      * Postcopy is not using xbzrle/compression, so no need for that.
3321      * Also, since source are already halted, we don't need to care
3322      * about dirty page logging as well.
3323      */
3324
3325     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3326         pages += bitmap_count_one(block->bmap,
3327                                   block->used_length >> TARGET_PAGE_BITS);
3328     }
3329
3330     /* This may not be aligned with current bitmaps. Recalculate. */
3331     rs->migration_dirty_pages = pages;
3332
3333     rs->last_seen_block = NULL;
3334     rs->last_sent_block = NULL;
3335     rs->last_page = 0;
3336     rs->last_version = ram_list.version;
3337     /*
3338      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3339      * matter what we have sent.
3340      */
3341     rs->ram_bulk_stage = false;
3342
3343     /* Update RAMState cache of output QEMUFile */
3344     rs->f = out;
3345
3346     trace_ram_state_resume_prepare(pages);
3347 }
3348
3349 /*
3350  * This function clears bits of the free pages reported by the caller from the
3351  * migration dirty bitmap. @addr is the host address corresponding to the
3352  * start of the continuous guest free pages, and @len is the total bytes of
3353  * those pages.
3354  */
3355 void qemu_guest_free_page_hint(void *addr, size_t len)
3356 {
3357     RAMBlock *block;
3358     ram_addr_t offset;
3359     size_t used_len, start, npages;
3360     MigrationState *s = migrate_get_current();
3361
3362     /* This function is currently expected to be used during live migration */
3363     if (!migration_is_setup_or_active(s->state)) {
3364         return;
3365     }
3366
3367     for (; len > 0; len -= used_len, addr += used_len) {
3368         block = qemu_ram_block_from_host(addr, false, &offset);
3369         if (unlikely(!block || offset >= block->used_length)) {
3370             /*
3371              * The implementation might not support RAMBlock resize during
3372              * live migration, but it could happen in theory with future
3373              * updates. So we add a check here to capture that case.
3374              */
3375             error_report_once("%s unexpected error", __func__);
3376             return;
3377         }
3378
3379         if (len <= block->used_length - offset) {
3380             used_len = len;
3381         } else {
3382             used_len = block->used_length - offset;
3383         }
3384
3385         start = offset >> TARGET_PAGE_BITS;
3386         npages = used_len >> TARGET_PAGE_BITS;
3387
3388         qemu_mutex_lock(&ram_state->bitmap_mutex);
3389         ram_state->migration_dirty_pages -=
3390                       bitmap_count_one_with_offset(block->bmap, start, npages);
3391         bitmap_clear(block->bmap, start, npages);
3392         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3393     }
3394 }
3395
3396 /*
3397  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3398  * long-running RCU critical section.  When rcu-reclaims in the code
3399  * start to become numerous it will be necessary to reduce the
3400  * granularity of these critical sections.
3401  */
3402
3403 /**
3404  * ram_save_setup: Setup RAM for migration
3405  *
3406  * Returns zero to indicate success and negative for error
3407  *
3408  * @f: QEMUFile where to send the data
3409  * @opaque: RAMState pointer
3410  */
3411 static int ram_save_setup(QEMUFile *f, void *opaque)
3412 {
3413     RAMState **rsp = opaque;
3414     RAMBlock *block;
3415
3416     if (compress_threads_save_setup()) {
3417         return -1;
3418     }
3419
3420     /* migration has already setup the bitmap, reuse it. */
3421     if (!migration_in_colo_state()) {
3422         if (ram_init_all(rsp) != 0) {
3423             compress_threads_save_cleanup();
3424             return -1;
3425         }
3426     }
3427     (*rsp)->f = f;
3428
3429     rcu_read_lock();
3430
3431     qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3432
3433     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3434         qemu_put_byte(f, strlen(block->idstr));
3435         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3436         qemu_put_be64(f, block->used_length);
3437         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3438             qemu_put_be64(f, block->page_size);
3439         }
3440         if (migrate_ignore_shared()) {
3441             qemu_put_be64(f, block->mr->addr);
3442         }
3443     }
3444
3445     rcu_read_unlock();
3446
3447     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3448     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3449
3450     multifd_send_sync_main();
3451     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3452     qemu_fflush(f);
3453
3454     return 0;
3455 }
3456
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends pages with ram_find_and_save_block() until the rate limit is hit
 * (and no page requests are queued), the file reports an error, no more
 * pages are found, or more than MAX_WAIT ms have been spent.  The round
 * is always terminated with an EOS marker.
 *
 * Returns 1 when ram_find_and_save_block() found no more pages to send,
 * 0 otherwise, and a negative value when @f is in an error state
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    rcu_read_lock();
    /* The RAM list changed since the last round: restart the scan */
    if (ram_list.version != rs->last_version) {
        ram_state_reset(rs);
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    /* Queued page requests are served even when the rate limit is hit */
    while ((ret = qemu_file_rate_limit(f)) == 0 ||
            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        int pages;

        if (qemu_file_get_error(f)) {
            break;
        }

        pages = ram_find_and_save_block(rs, false);
        /* no more pages to sent */
        if (pages == 0) {
            done = 1;
            break;
        }

        /* A negative page count is an error; record it on the file */
        if (pages < 0) {
            qemu_file_set_error(f, pages);
            break;
        }

        rs->target_page_count += pages;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_clock_get_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            /* Elapsed time in milliseconds */
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    multifd_send_sync_main();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);
    /* Account for the 8 bytes of the EOS marker written above */
    ram_counters.transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
3550
3551 /**
3552  * ram_save_complete: function called to send the remaining amount of ram
3553  *
3554  * Returns zero to indicate success or negative on error
3555  *
3556  * Called with iothread lock
3557  *
3558  * @f: QEMUFile where to send the data
3559  * @opaque: RAMState pointer
3560  */
3561 static int ram_save_complete(QEMUFile *f, void *opaque)
3562 {
3563     RAMState **temp = opaque;
3564     RAMState *rs = *temp;
3565     int ret = 0;
3566
3567     rcu_read_lock();
3568
3569     if (!migration_in_postcopy()) {
3570         migration_bitmap_sync_precopy(rs);
3571     }
3572
3573     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3574
3575     /* try transferring iterative blocks of memory */
3576
3577     /* flush all remaining blocks regardless of rate limiting */
3578     while (true) {
3579         int pages;
3580
3581         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3582         /* no more blocks to sent */
3583         if (pages == 0) {
3584             break;
3585         }
3586         if (pages < 0) {
3587             ret = pages;
3588             break;
3589         }
3590     }
3591
3592     flush_compressed_data(rs);
3593     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3594
3595     rcu_read_unlock();
3596
3597     multifd_send_sync_main();
3598     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3599     qemu_fflush(f);
3600
3601     return ret;
3602 }
3603
3604 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3605                              uint64_t *res_precopy_only,
3606                              uint64_t *res_compatible,
3607                              uint64_t *res_postcopy_only)
3608 {
3609     RAMState **temp = opaque;
3610     RAMState *rs = *temp;
3611     uint64_t remaining_size;
3612
3613     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3614
3615     if (!migration_in_postcopy() &&
3616         remaining_size < max_size) {
3617         qemu_mutex_lock_iothread();
3618         rcu_read_lock();
3619         migration_bitmap_sync_precopy(rs);
3620         rcu_read_unlock();
3621         qemu_mutex_unlock_iothread();
3622         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3623     }
3624
3625     if (migrate_postcopy_ram()) {
3626         /* We can do postcopy, and all the data is postcopiable */
3627         *res_compatible += remaining_size;
3628     } else {
3629         *res_precopy_only += remaining_size;
3630     }
3631 }
3632
3633 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3634 {
3635     unsigned int xh_len;
3636     int xh_flags;
3637     uint8_t *loaded_data;
3638
3639     /* extract RLE header */
3640     xh_flags = qemu_get_byte(f);
3641     xh_len = qemu_get_be16(f);
3642
3643     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3644         error_report("Failed to load XBZRLE page - wrong compression!");
3645         return -1;
3646     }
3647
3648     if (xh_len > TARGET_PAGE_SIZE) {
3649         error_report("Failed to load XBZRLE page - len overflow!");
3650         return -1;
3651     }
3652     loaded_data = XBZRLE.decoded_buf;
3653     /* load data and decode */
3654     /* it can change loaded_data to point to an internal buffer */
3655     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3656
3657     /* decode RLE */
3658     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3659                              TARGET_PAGE_SIZE) == -1) {
3660         error_report("Failed to load XBZRLE page - decode error!");
3661         return -1;
3662     }
3663
3664     return 0;
3665 }
3666
3667 /**
3668  * ram_block_from_stream: read a RAMBlock id from the migration stream
3669  *
3670  * Must be called from within a rcu critical section.
3671  *
3672  * Returns a pointer from within the RCU-protected ram_list.
3673  *
3674  * @f: QEMUFile where to read the data from
3675  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3676  */
3677 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3678 {
3679     static RAMBlock *block = NULL;
3680     char id[256];
3681     uint8_t len;
3682
3683     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3684         if (!block) {
3685             error_report("Ack, bad migration stream!");
3686             return NULL;
3687         }
3688         return block;
3689     }
3690
3691     len = qemu_get_byte(f);
3692     qemu_get_buffer(f, (uint8_t *)id, len);
3693     id[len] = 0;
3694
3695     block = qemu_ram_block_by_name(id);
3696     if (!block) {
3697         error_report("Can't find block %s", id);
3698         return NULL;
3699     }
3700
3701     if (ramblock_is_ignored(block)) {
3702         error_report("block %s should not be migrated !", id);
3703         return NULL;
3704     }
3705
3706     return block;
3707 }
3708
3709 static inline void *host_from_ram_block_offset(RAMBlock *block,
3710                                                ram_addr_t offset)
3711 {
3712     if (!offset_in_ramblock(block, offset)) {
3713         return NULL;
3714     }
3715
3716     return block->host + offset;
3717 }
3718
3719 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3720                                                  ram_addr_t offset)
3721 {
3722     if (!offset_in_ramblock(block, offset)) {
3723         return NULL;
3724     }
3725     if (!block->colo_cache) {
3726         error_report("%s: colo_cache is NULL in block :%s",
3727                      __func__, block->idstr);
3728         return NULL;
3729     }
3730
3731     /*
3732     * During colo checkpoint, we need bitmap of these migrated pages.
3733     * It help us to decide which pages in ram cache should be flushed
3734     * into VM's RAM later.
3735     */
3736     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3737         ram_state->migration_dirty_pages++;
3738     }
3739     return block->colo_cache + offset;
3740 }
3741
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Skip the write when zeros are requested and the range is zero already */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3758
3759 /* return the size after decompression, or negative value on error */
3760 static int
3761 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3762                      const uint8_t *source, size_t source_len)
3763 {
3764     int err;
3765
3766     err = inflateReset(stream);
3767     if (err != Z_OK) {
3768         return -1;
3769     }
3770
3771     stream->avail_in = source_len;
3772     stream->next_in = (uint8_t *)source;
3773     stream->avail_out = dest_len;
3774     stream->next_out = dest;
3775
3776     err = inflate(stream, Z_NO_FLUSH);
3777     if (err != Z_STREAM_END) {
3778         return -1;
3779     }
3780
3781     return stream->total_out;
3782 }
3783
3784 static void *do_data_decompress(void *opaque)
3785 {
3786     DecompressParam *param = opaque;
3787     unsigned long pagesize;
3788     uint8_t *des;
3789     int len, ret;
3790
3791     qemu_mutex_lock(&param->mutex);
3792     while (!param->quit) {
3793         if (param->des) {
3794             des = param->des;
3795             len = param->len;
3796             param->des = 0;
3797             qemu_mutex_unlock(&param->mutex);
3798
3799             pagesize = TARGET_PAGE_SIZE;
3800
3801             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3802                                        param->compbuf, len);
3803             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3804                 error_report("decompress data failed");
3805                 qemu_file_set_error(decomp_file, ret);
3806             }
3807
3808             qemu_mutex_lock(&decomp_done_lock);
3809             param->done = true;
3810             qemu_cond_signal(&decomp_done_cond);
3811             qemu_mutex_unlock(&decomp_done_lock);
3812
3813             qemu_mutex_lock(&param->mutex);
3814         } else {
3815             qemu_cond_wait(&param->cond, &param->mutex);
3816         }
3817     }
3818     qemu_mutex_unlock(&param->mutex);
3819
3820     return NULL;
3821 }
3822
3823 static int wait_for_decompress_done(void)
3824 {
3825     int idx, thread_count;
3826
3827     if (!migrate_use_compression()) {
3828         return 0;
3829     }
3830
3831     thread_count = migrate_decompress_threads();
3832     qemu_mutex_lock(&decomp_done_lock);
3833     for (idx = 0; idx < thread_count; idx++) {
3834         while (!decomp_param[idx].done) {
3835             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3836         }
3837     }
3838     qemu_mutex_unlock(&decomp_done_lock);
3839     return qemu_file_get_error(decomp_file);
3840 }
3841
3842 static void compress_threads_load_cleanup(void)
3843 {
3844     int i, thread_count;
3845
3846     if (!migrate_use_compression()) {
3847         return;
3848     }
3849     thread_count = migrate_decompress_threads();
3850     for (i = 0; i < thread_count; i++) {
3851         /*
3852          * we use it as a indicator which shows if the thread is
3853          * properly init'd or not
3854          */
3855         if (!decomp_param[i].compbuf) {
3856             break;
3857         }
3858
3859         qemu_mutex_lock(&decomp_param[i].mutex);
3860         decomp_param[i].quit = true;
3861         qemu_cond_signal(&decomp_param[i].cond);
3862         qemu_mutex_unlock(&decomp_param[i].mutex);
3863     }
3864     for (i = 0; i < thread_count; i++) {
3865         if (!decomp_param[i].compbuf) {
3866             break;
3867         }
3868
3869         qemu_thread_join(decompress_threads + i);
3870         qemu_mutex_destroy(&decomp_param[i].mutex);
3871         qemu_cond_destroy(&decomp_param[i].cond);
3872         inflateEnd(&decomp_param[i].stream);
3873         g_free(decomp_param[i].compbuf);
3874         decomp_param[i].compbuf = NULL;
3875     }
3876     g_free(decompress_threads);
3877     g_free(decomp_param);
3878     decompress_threads = NULL;
3879     decomp_param = NULL;
3880     decomp_file = NULL;
3881 }
3882
3883 static int compress_threads_load_setup(QEMUFile *f)
3884 {
3885     int i, thread_count;
3886
3887     if (!migrate_use_compression()) {
3888         return 0;
3889     }
3890
3891     thread_count = migrate_decompress_threads();
3892     decompress_threads = g_new0(QemuThread, thread_count);
3893     decomp_param = g_new0(DecompressParam, thread_count);
3894     qemu_mutex_init(&decomp_done_lock);
3895     qemu_cond_init(&decomp_done_cond);
3896     decomp_file = f;
3897     for (i = 0; i < thread_count; i++) {
3898         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3899             goto exit;
3900         }
3901
3902         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3903         qemu_mutex_init(&decomp_param[i].mutex);
3904         qemu_cond_init(&decomp_param[i].cond);
3905         decomp_param[i].done = true;
3906         decomp_param[i].quit = false;
3907         qemu_thread_create(decompress_threads + i, "decompress",
3908                            do_data_decompress, decomp_param + i,
3909                            QEMU_THREAD_JOINABLE);
3910     }
3911     return 0;
3912 exit:
3913     compress_threads_load_cleanup();
3914     return -1;
3915 }
3916
3917 static void decompress_data_with_multi_threads(QEMUFile *f,
3918                                                void *host, int len)
3919 {
3920     int idx, thread_count;
3921
3922     thread_count = migrate_decompress_threads();
3923     qemu_mutex_lock(&decomp_done_lock);
3924     while (true) {
3925         for (idx = 0; idx < thread_count; idx++) {
3926             if (decomp_param[idx].done) {
3927                 decomp_param[idx].done = false;
3928                 qemu_mutex_lock(&decomp_param[idx].mutex);
3929                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3930                 decomp_param[idx].des = host;
3931                 decomp_param[idx].len = len;
3932                 qemu_cond_signal(&decomp_param[idx].cond);
3933                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3934                 break;
3935             }
3936         }
3937         if (idx < thread_count) {
3938             break;
3939         } else {
3940             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3941         }
3942     }
3943     qemu_mutex_unlock(&decomp_done_lock);
3944 }
3945
3946 /*
3947  * colo cache: this is for secondary VM, we cache the whole
3948  * memory of the secondary VM, it is need to hold the global lock
3949  * to call this helper.
3950  */
3951 int colo_init_ram_cache(void)
3952 {
3953     RAMBlock *block;
3954
3955     rcu_read_lock();
3956     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3957         block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3958                                                 NULL,
3959                                                 false);
3960         if (!block->colo_cache) {
3961             error_report("%s: Can't alloc memory for COLO cache of block %s,"
3962                          "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3963                          block->used_length);
3964             goto out_locked;
3965         }
3966         memcpy(block->colo_cache, block->host, block->used_length);
3967     }
3968     rcu_read_unlock();
3969     /*
3970     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3971     * with to decide which page in cache should be flushed into SVM's RAM. Here
3972     * we use the same name 'ram_bitmap' as for migration.
3973     */
3974     if (ram_bytes_total()) {
3975         RAMBlock *block;
3976
3977         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3978             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3979
3980             block->bmap = bitmap_new(pages);
3981             bitmap_set(block->bmap, 0, pages);
3982         }
3983     }
3984     ram_state = g_new0(RAMState, 1);
3985     ram_state->migration_dirty_pages = 0;
3986     qemu_mutex_init(&ram_state->bitmap_mutex);
3987     memory_global_dirty_log_start();
3988
3989     return 0;
3990
3991 out_locked:
3992
3993     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3994         if (block->colo_cache) {
3995             qemu_anon_ram_free(block->colo_cache, block->used_length);
3996             block->colo_cache = NULL;
3997         }
3998     }
3999
4000     rcu_read_unlock();
4001     return -errno;
4002 }
4003
4004 /* It is need to hold the global lock to call this helper */
4005 void colo_release_ram_cache(void)
4006 {
4007     RAMBlock *block;
4008
4009     memory_global_dirty_log_stop();
4010     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4011         g_free(block->bmap);
4012         block->bmap = NULL;
4013     }
4014
4015     rcu_read_lock();
4016
4017     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4018         if (block->colo_cache) {
4019             qemu_anon_ram_free(block->colo_cache, block->used_length);
4020             block->colo_cache = NULL;
4021         }
4022     }
4023
4024     rcu_read_unlock();
4025     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4026     g_free(ram_state);
4027     ram_state = NULL;
4028 }
4029
4030 /**
4031  * ram_load_setup: Setup RAM for migration incoming side
4032  *
4033  * Returns zero to indicate success and negative for error
4034  *
4035  * @f: QEMUFile where to receive the data
4036  * @opaque: RAMState pointer
4037  */
4038 static int ram_load_setup(QEMUFile *f, void *opaque)
4039 {
4040     if (compress_threads_load_setup(f)) {
4041         return -1;
4042     }
4043
4044     xbzrle_load_setup();
4045     ramblock_recv_map_init();
4046
4047     return 0;
4048 }
4049
4050 static int ram_load_cleanup(void *opaque)
4051 {
4052     RAMBlock *rb;
4053
4054     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4055         if (ramblock_is_pmem(rb)) {
4056             pmem_persist(rb->host, rb->used_length);
4057         }
4058     }
4059
4060     xbzrle_load_cleanup();
4061     compress_threads_load_cleanup();
4062
4063     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4064         g_free(rb->receivedmap);
4065         rb->receivedmap = NULL;
4066     }
4067
4068     return 0;
4069 }
4070
4071 /**
4072  * ram_postcopy_incoming_init: allocate postcopy data structures
4073  *
4074  * Returns 0 for success and negative if there was one error
4075  *
4076  * @mis: current migration incoming state
4077  *
4078  * Allocate data structures etc needed by incoming migration with
4079  * postcopy-ram. postcopy-ram's similarly names
4080  * postcopy_ram_incoming_init does the work.
4081  */
4082 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4083 {
4084     return postcopy_ram_incoming_init(mis);
4085 }
4086
4087 /**
4088  * ram_load_postcopy: load a page in postcopy case
4089  *
4090  * Returns 0 for success or -errno in case of error
4091  *
4092  * Called in postcopy mode by ram_load().
4093  * rcu_read_lock is taken prior to this being called.
4094  *
4095  * @f: QEMUFile where to send the data
4096  */
4097 static int ram_load_postcopy(QEMUFile *f)
4098 {
4099     int flags = 0, ret = 0;
4100     bool place_needed = false;
4101     bool matches_target_page_size = false;
4102     MigrationIncomingState *mis = migration_incoming_get_current();
4103     /* Temporary page that is later 'placed' */
4104     void *postcopy_host_page = postcopy_get_tmp_page(mis);
4105     void *last_host = NULL;
4106     bool all_zero = false;
4107
4108     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4109         ram_addr_t addr;
4110         void *host = NULL;
4111         void *page_buffer = NULL;
4112         void *place_source = NULL;
4113         RAMBlock *block = NULL;
4114         uint8_t ch;
4115
4116         addr = qemu_get_be64(f);
4117
4118         /*
4119          * If qemu file error, we should stop here, and then "addr"
4120          * may be invalid
4121          */
4122         ret = qemu_file_get_error(f);
4123         if (ret) {
4124             break;
4125         }
4126
4127         flags = addr & ~TARGET_PAGE_MASK;
4128         addr &= TARGET_PAGE_MASK;
4129
4130         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4131         place_needed = false;
4132         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4133             block = ram_block_from_stream(f, flags);
4134
4135             host = host_from_ram_block_offset(block, addr);
4136             if (!host) {
4137                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4138                 ret = -EINVAL;
4139                 break;
4140             }
4141             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4142             /*
4143              * Postcopy requires that we place whole host pages atomically;
4144              * these may be huge pages for RAMBlocks that are backed by
4145              * hugetlbfs.
4146              * To make it atomic, the data is read into a temporary page
4147              * that's moved into place later.
4148              * The migration protocol uses,  possibly smaller, target-pages
4149              * however the source ensures it always sends all the components
4150              * of a host page in order.
4151              */
4152             page_buffer = postcopy_host_page +
4153                           ((uintptr_t)host & (block->page_size - 1));
4154             /* If all TP are zero then we can optimise the place */
4155             if (!((uintptr_t)host & (block->page_size - 1))) {
4156                 all_zero = true;
4157             } else {
4158                 /* not the 1st TP within the HP */
4159                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4160                     error_report("Non-sequential target page %p/%p",
4161                                   host, last_host);
4162                     ret = -EINVAL;
4163                     break;
4164                 }
4165             }
4166
4167
4168             /*
4169              * If it's the last part of a host page then we place the host
4170              * page
4171              */
4172             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4173                                      (block->page_size - 1)) == 0;
4174             place_source = postcopy_host_page;
4175         }
4176         last_host = host;
4177
4178         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4179         case RAM_SAVE_FLAG_ZERO:
4180             ch = qemu_get_byte(f);
4181             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4182             if (ch) {
4183                 all_zero = false;
4184             }
4185             break;
4186
4187         case RAM_SAVE_FLAG_PAGE:
4188             all_zero = false;
4189             if (!matches_target_page_size) {
4190                 /* For huge pages, we always use temporary buffer */
4191                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4192             } else {
4193                 /*
4194                  * For small pages that matches target page size, we
4195                  * avoid the qemu_file copy.  Instead we directly use
4196                  * the buffer of QEMUFile to place the page.  Note: we
4197                  * cannot do any QEMUFile operation before using that
4198                  * buffer to make sure the buffer is valid when
4199                  * placing the page.
4200                  */
4201                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4202                                          TARGET_PAGE_SIZE);
4203             }
4204             break;
4205         case RAM_SAVE_FLAG_EOS:
4206             /* normal exit */
4207             multifd_recv_sync_main();
4208             break;
4209         default:
4210             error_report("Unknown combination of migration flags: %#x"
4211                          " (postcopy mode)", flags);
4212             ret = -EINVAL;
4213             break;
4214         }
4215
4216         /* Detect for any possible file errors */
4217         if (!ret && qemu_file_get_error(f)) {
4218             ret = qemu_file_get_error(f);
4219         }
4220
4221         if (!ret && place_needed) {
4222             /* This gets called at the last target page in the host page */
4223             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4224
4225             if (all_zero) {
4226                 ret = postcopy_place_page_zero(mis, place_dest,
4227                                                block);
4228             } else {
4229                 ret = postcopy_place_page(mis, place_dest,
4230                                           place_source, block);
4231             }
4232         }
4233     }
4234
4235     return ret;
4236 }
4237
4238 static bool postcopy_is_advised(void)
4239 {
4240     PostcopyState ps = postcopy_state_get();
4241     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4242 }
4243
4244 static bool postcopy_is_running(void)
4245 {
4246     PostcopyState ps = postcopy_state_get();
4247     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4248 }
4249
4250 /*
4251  * Flush content of RAM cache into SVM's memory.
4252  * Only flush the pages that be dirtied by PVM or SVM or both.
4253  */
4254 static void colo_flush_ram_cache(void)
4255 {
4256     RAMBlock *block = NULL;
4257     void *dst_host;
4258     void *src_host;
4259     unsigned long offset = 0;
4260
4261     memory_global_dirty_log_sync();
4262     rcu_read_lock();
4263     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4264         migration_bitmap_sync_range(ram_state, block);
4265     }
4266     rcu_read_unlock();
4267
4268     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4269     rcu_read_lock();
4270     block = QLIST_FIRST_RCU(&ram_list.blocks);
4271
4272     while (block) {
4273         offset = migration_bitmap_find_dirty(ram_state, block, offset);
4274
4275         if (offset << TARGET_PAGE_BITS >= block->used_length) {
4276             offset = 0;
4277             block = QLIST_NEXT_RCU(block, next);
4278         } else {
4279             migration_bitmap_clear_dirty(ram_state, block, offset);
4280             dst_host = block->host + (offset << TARGET_PAGE_BITS);
4281             src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4282             memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4283         }
4284     }
4285
4286     rcu_read_unlock();
4287     trace_colo_flush_ram_cache_end();
4288 }
4289
4290 /**
4291  * ram_load_precopy: load pages in precopy case
4292  *
4293  * Returns 0 for success or -errno in case of error
4294  *
4295  * Called in precopy mode by ram_load().
4296  * rcu_read_lock is taken prior to this being called.
4297  *
4298  * @f: QEMUFile where to send the data
4299  */
4300 static int ram_load_precopy(QEMUFile *f)
4301 {
4302     int flags = 0, ret = 0, invalid_flags = 0, len = 0;
4303     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4304     bool postcopy_advised = postcopy_is_advised();
4305     if (!migrate_use_compression()) {
4306         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4307     }
4308
4309     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4310         ram_addr_t addr, total_ram_bytes;
4311         void *host = NULL;
4312         uint8_t ch;
4313
4314         addr = qemu_get_be64(f);
4315         flags = addr & ~TARGET_PAGE_MASK;
4316         addr &= TARGET_PAGE_MASK;
4317
4318         if (flags & invalid_flags) {
4319             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4320                 error_report("Received an unexpected compressed page");
4321             }
4322
4323             ret = -EINVAL;
4324             break;
4325         }
4326
4327         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4328                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4329             RAMBlock *block = ram_block_from_stream(f, flags);
4330
4331             /*
4332              * After going into COLO, we should load the Page into colo_cache.
4333              */
4334             if (migration_incoming_in_colo_state()) {
4335                 host = colo_cache_from_block_offset(block, addr);
4336             } else {
4337                 host = host_from_ram_block_offset(block, addr);
4338             }
4339             if (!host) {
4340                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4341                 ret = -EINVAL;
4342                 break;
4343             }
4344
4345             if (!migration_incoming_in_colo_state()) {
4346                 ramblock_recv_bitmap_set(block, host);
4347             }
4348
4349             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4350         }
4351
4352         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4353         case RAM_SAVE_FLAG_MEM_SIZE:
4354             /* Synchronize RAM block list */
4355             total_ram_bytes = addr;
4356             while (!ret && total_ram_bytes) {
4357                 RAMBlock *block;
4358                 char id[256];
4359                 ram_addr_t length;
4360
4361                 len = qemu_get_byte(f);
4362                 qemu_get_buffer(f, (uint8_t *)id, len);
4363                 id[len] = 0;
4364                 length = qemu_get_be64(f);
4365
4366                 block = qemu_ram_block_by_name(id);
4367                 if (block && !qemu_ram_is_migratable(block)) {
4368                     error_report("block %s should not be migrated !", id);
4369                     ret = -EINVAL;
4370                 } else if (block) {
4371                     if (length != block->used_length) {
4372                         Error *local_err = NULL;
4373
4374                         ret = qemu_ram_resize(block, length,
4375                                               &local_err);
4376                         if (local_err) {
4377                             error_report_err(local_err);
4378                         }
4379                     }
4380                     /* For postcopy we need to check hugepage sizes match */
4381                     if (postcopy_advised &&
4382                         block->page_size != qemu_host_page_size) {
4383                         uint64_t remote_page_size = qemu_get_be64(f);
4384                         if (remote_page_size != block->page_size) {
4385                             error_report("Mismatched RAM page size %s "
4386                                          "(local) %zd != %" PRId64,
4387                                          id, block->page_size,
4388                                          remote_page_size);
4389                             ret = -EINVAL;
4390                         }
4391                     }
4392                     if (migrate_ignore_shared()) {
4393                         hwaddr addr = qemu_get_be64(f);
4394                         if (ramblock_is_ignored(block) &&
4395                             block->mr->addr != addr) {
4396                             error_report("Mismatched GPAs for block %s "
4397                                          "%" PRId64 "!= %" PRId64,
4398                                          id, (uint64_t)addr,
4399                                          (uint64_t)block->mr->addr);
4400                             ret = -EINVAL;
4401                         }
4402                     }
4403                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4404                                           block->idstr);
4405                 } else {
4406                     error_report("Unknown ramblock \"%s\", cannot "
4407                                  "accept migration", id);
4408                     ret = -EINVAL;
4409                 }
4410
4411                 total_ram_bytes -= length;
4412             }
4413             break;
4414
4415         case RAM_SAVE_FLAG_ZERO:
4416             ch = qemu_get_byte(f);
4417             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4418             break;
4419
4420         case RAM_SAVE_FLAG_PAGE:
4421             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4422             break;
4423
4424         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4425             len = qemu_get_be32(f);
4426             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4427                 error_report("Invalid compressed data length: %d", len);
4428                 ret = -EINVAL;
4429                 break;
4430             }
4431             decompress_data_with_multi_threads(f, host, len);
4432             break;
4433
4434         case RAM_SAVE_FLAG_XBZRLE:
4435             if (load_xbzrle(f, addr, host) < 0) {
4436                 error_report("Failed to decompress XBZRLE page at "
4437                              RAM_ADDR_FMT, addr);
4438                 ret = -EINVAL;
4439                 break;
4440             }
4441             break;
4442         case RAM_SAVE_FLAG_EOS:
4443             /* normal exit */
4444             multifd_recv_sync_main();
4445             break;
4446         default:
4447             if (flags & RAM_SAVE_FLAG_HOOK) {
4448                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4449             } else {
4450                 error_report("Unknown combination of migration flags: %#x",
4451                              flags);
4452                 ret = -EINVAL;
4453             }
4454         }
4455         if (!ret) {
4456             ret = qemu_file_get_error(f);
4457         }
4458     }
4459
4460     return ret;
4461 }
4462
4463 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4464 {
4465     int ret = 0;
4466     static uint64_t seq_iter;
4467     /*
4468      * If system is running in postcopy mode, page inserts to host memory must
4469      * be atomic
4470      */
4471     bool postcopy_running = postcopy_is_running();
4472
4473     seq_iter++;
4474
4475     if (version_id != 4) {
4476         return -EINVAL;
4477     }
4478
4479     /*
4480      * This RCU critical section can be very long running.
4481      * When RCU reclaims in the code start to become numerous,
4482      * it will be necessary to reduce the granularity of this
4483      * critical section.
4484      */
4485     rcu_read_lock();
4486
4487     if (postcopy_running) {
4488         ret = ram_load_postcopy(f);
4489     } else {
4490         ret = ram_load_precopy(f);
4491     }
4492
4493     ret |= wait_for_decompress_done();
4494     rcu_read_unlock();
4495     trace_ram_load_complete(ret, seq_iter);
4496
4497     if (!ret  && migration_incoming_in_colo_state()) {
4498         colo_flush_ram_cache();
4499     }
4500     return ret;
4501 }
4502
4503 static bool ram_has_postcopy(void *opaque)
4504 {
4505     RAMBlock *rb;
4506     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4507         if (ramblock_is_pmem(rb)) {
4508             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4509                          "is not supported now!", rb->idstr, rb->host);
4510             return false;
4511         }
4512     }
4513
4514     return migrate_postcopy_ram();
4515 }
4516
4517 /* Sync all the dirty bitmap with destination VM.  */
4518 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4519 {
4520     RAMBlock *block;
4521     QEMUFile *file = s->to_dst_file;
4522     int ramblock_count = 0;
4523
4524     trace_ram_dirty_bitmap_sync_start();
4525
4526     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4527         qemu_savevm_send_recv_bitmap(file, block->idstr);
4528         trace_ram_dirty_bitmap_request(block->idstr);
4529         ramblock_count++;
4530     }
4531
4532     trace_ram_dirty_bitmap_sync_wait();
4533
4534     /* Wait until all the ramblocks' dirty bitmap synced */
4535     while (ramblock_count--) {
4536         qemu_sem_wait(&s->rp_state.rp_sem);
4537     }
4538
4539     trace_ram_dirty_bitmap_sync_complete();
4540
4541     return 0;
4542 }
4543
4544 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4545 {
4546     qemu_sem_post(&s->rp_state.rp_sem);
4547 }
4548
4549 /*
4550  * Read the received bitmap, revert it as the initial dirty bitmap.
4551  * This is only used when the postcopy migration is paused but wants
4552  * to resume from a middle point.
4553  */
4554 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4555 {
4556     int ret = -EINVAL;
4557     QEMUFile *file = s->rp_state.from_dst_file;
4558     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4559     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4560     uint64_t size, end_mark;
4561
4562     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4563
4564     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4565         error_report("%s: incorrect state %s", __func__,
4566                      MigrationStatus_str(s->state));
4567         return -EINVAL;
4568     }
4569
4570     /*
4571      * Note: see comments in ramblock_recv_bitmap_send() on why we
4572      * need the endianess convertion, and the paddings.
4573      */
4574     local_size = ROUND_UP(local_size, 8);
4575
4576     /* Add paddings */
4577     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4578
4579     size = qemu_get_be64(file);
4580
4581     /* The size of the bitmap should match with our ramblock */
4582     if (size != local_size) {
4583         error_report("%s: ramblock '%s' bitmap size mismatch "
4584                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4585                      block->idstr, size, local_size);
4586         ret = -EINVAL;
4587         goto out;
4588     }
4589
4590     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4591     end_mark = qemu_get_be64(file);
4592
4593     ret = qemu_file_get_error(file);
4594     if (ret || size != local_size) {
4595         error_report("%s: read bitmap failed for ramblock '%s': %d"
4596                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4597                      __func__, block->idstr, ret, local_size, size);
4598         ret = -EIO;
4599         goto out;
4600     }
4601
4602     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4603         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4604                      __func__, block->idstr, end_mark);
4605         ret = -EINVAL;
4606         goto out;
4607     }
4608
4609     /*
4610      * Endianess convertion. We are during postcopy (though paused).
4611      * The dirty bitmap won't change. We can directly modify it.
4612      */
4613     bitmap_from_le(block->bmap, le_bitmap, nbits);
4614
4615     /*
4616      * What we received is "received bitmap". Revert it as the initial
4617      * dirty bitmap for this ramblock.
4618      */
4619     bitmap_complement(block->bmap, block->bmap, nbits);
4620
4621     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4622
4623     /*
4624      * We succeeded to sync bitmap for current ramblock. If this is
4625      * the last one to sync, we need to notify the main send thread.
4626      */
4627     ram_dirty_bitmap_reload_notify(s);
4628
4629     ret = 0;
4630 out:
4631     g_free(le_bitmap);
4632     return ret;
4633 }
4634
4635 static int ram_resume_prepare(MigrationState *s, void *opaque)
4636 {
4637     RAMState *rs = *(RAMState **)opaque;
4638     int ret;
4639
4640     ret = ram_dirty_bitmap_sync_all(s, rs);
4641     if (ret) {
4642         return ret;
4643     }
4644
4645     ram_state_resume_prepare(rs, s->to_dst_file);
4646
4647     return 0;
4648 }
4649
4650 static SaveVMHandlers savevm_ram_handlers = {
4651     .save_setup = ram_save_setup,
4652     .save_live_iterate = ram_save_iterate,
4653     .save_live_complete_postcopy = ram_save_complete,
4654     .save_live_complete_precopy = ram_save_complete,
4655     .has_postcopy = ram_has_postcopy,
4656     .save_live_pending = ram_save_pending,
4657     .load_state = ram_load,
4658     .save_cleanup = ram_save_cleanup,
4659     .load_setup = ram_load_setup,
4660     .load_cleanup = ram_load_cleanup,
4661     .resume_prepare = ram_resume_prepare,
4662 };
4663
4664 void ram_mig_init(void)
4665 {
4666     qemu_mutex_init(&XBZRLE.lock);
4667     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4668 }