git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
multifd: Only send pages when packet are not empty
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60
61 /***********************************************************/
62 /* ram save/restore */
63
 64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 65 * worked for pages that were filled with the same char. We switched
 66 * it to only search for the zero value. To avoid confusion with
 67 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
68 */
69
70 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO 0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE 0x08
74 #define RAM_SAVE_FLAG_EOS 0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE 0x40
 77 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
79
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82 return buffer_is_zero(p, size);
83 }
84
85 XBZRLECacheStats xbzrle_counters;
86
 87 /* This struct contains the XBZRLE cache and a static page
 88 used by the compression */
89 static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
101 } XBZRLE;
102
103 static void XBZRLE_cache_lock(void)
104 {
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114
115 /**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
 118 * This function is called from qmp_migrate_set_cache_size in the main
 119 * thread, possibly while a migration is in progress. A running
 120 * migration may be using the cache and might finish during this call,
 121 * hence changes to the cache are protected by XBZRLE.lock.
122 *
123 * Returns 0 for success or -1 for error
124 *
125 * @new_size: new cache size
126 * @errp: set *errp if the check failed, with reason
127 */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130 PageCache *new_cache;
131 int64_t ret = 0;
132
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
142 return 0;
143 }
144
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149 if (!new_cache) {
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
157 out:
158 XBZRLE_cache_unlock();
159 return ret;
160 }
161
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164 return !qemu_ram_is_migratable(block) ||
165 (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
170 INTERNAL_RAMBLOCK_FOREACH(block) \
171 if (ramblock_is_ignored(block)) {} else
172
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
174 INTERNAL_RAMBLOCK_FOREACH(block) \
175 if (!qemu_ram_is_migratable(block)) {} else
176
177 #undef RAMBLOCK_FOREACH
178
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181 RAMBlock *block;
182 int ret = 0;
183
184 rcu_read_lock();
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
191 rcu_read_unlock();
192 return ret;
193 }
194
195 static void ramblock_recv_map_init(void)
196 {
197 RAMBlock *rb;
198
199 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200 assert(!rb->receivedmap);
201 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 }
203 }
204
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 rb->receivedmap);
209 }
210
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222 size_t nr)
223 {
224 bitmap_set_atomic(rb->receivedmap,
225 ramblock_recv_bitmap_offset(host_addr, rb),
226 nr);
227 }
228
229 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231 /*
232 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233 *
 234 * Returns >0 on success (the number of bytes sent), or <0 on error.
235 */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237 const char *block_name)
238 {
239 RAMBlock *block = qemu_ram_block_by_name(block_name);
240 unsigned long *le_bitmap, nbits;
241 uint64_t size;
242
243 if (!block) {
244 error_report("%s: invalid block name: %s", __func__, block_name);
245 return -1;
246 }
247
248 nbits = block->used_length >> TARGET_PAGE_BITS;
249
250 /*
251 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252 * machines we may need 4 more bytes for padding (see below
 253 * comment). So extend it a bit beforehand.
254 */
255 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256
257 /*
258 * Always use little endian when sending the bitmap. This is
 259 * required so that it works even when source and destination VMs are
 260 * not using the same endianness. (Note: big endian won't work.)
261 */
262 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264 /* Size of the bitmap, in bytes */
265 size = DIV_ROUND_UP(nbits, 8);
266
267 /*
 268 * size is always aligned to 8 bytes for 64bit machines, but that
 269 * may not be true for 32bit machines. We need this padding to
270 * make sure the migration can survive even between 32bit and
271 * 64bit machines.
272 */
273 size = ROUND_UP(size, 8);
274
275 qemu_put_be64(file, size);
276 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277 /*
278 * Mark as an end, in case the middle part is screwed up due to
 279 * some "mysterious" reason.
280 */
281 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282 qemu_fflush(file);
283
284 g_free(le_bitmap);
285
286 if (qemu_file_get_error(file)) {
287 return qemu_file_get_error(file);
288 }
289
290 return size + sizeof(size);
291 }
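
/*
 * Worked example of the wire format above (assuming 4 KiB target pages,
 * purely for illustration): a RAMBlock with a used_length of 1 GiB has
 * nbits = 262144, so the bitmap occupies DIV_ROUND_UP(262144, 8) = 32768
 * bytes, which is already 8-byte aligned. The stream then carries
 * be64(32768) + 32768 bytes of little-endian bitmap +
 * be64(RAMBLOCK_RECV_BITMAP_ENDING), and the function returns
 * 32768 + 8 = 32776.
 */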
292
293 /*
294 * An outstanding page request, on the source, having been received
295 * and queued
296 */
297 struct RAMSrcPageRequest {
298 RAMBlock *rb;
299 hwaddr offset;
300 hwaddr len;
301
302 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304
305 /* State of RAM for migration */
306 struct RAMState {
307 /* QEMUFile used for this migration */
308 QEMUFile *f;
309 /* Last block that we have visited searching for dirty pages */
310 RAMBlock *last_seen_block;
311 /* Last block from where we have sent data */
312 RAMBlock *last_sent_block;
313 /* Last dirty target page we have sent */
314 ram_addr_t last_page;
315 /* last ram version we have seen */
316 uint32_t last_version;
317 /* We are in the first round */
318 bool ram_bulk_stage;
319 /* The free page optimization is enabled */
320 bool fpo_enabled;
321 /* How many times we have dirty too many pages */
322 int dirty_rate_high_cnt;
323 /* these variables are used for bitmap sync */
324 /* last time we did a full bitmap_sync */
325 int64_t time_last_bitmap_sync;
326 /* bytes transferred at start_time */
327 uint64_t bytes_xfer_prev;
328 /* number of dirty pages since start_time */
329 uint64_t num_dirty_pages_period;
330 /* xbzrle misses since the beginning of the period */
331 uint64_t xbzrle_cache_miss_prev;
332
333 /* compression statistics since the beginning of the period */
 334 /* number of times there was no free thread to compress data */
 335 uint64_t compress_thread_busy_prev;
 336 /* amount of bytes after compression */
 337 uint64_t compressed_size_prev;
 338 /* number of compressed pages */
339 uint64_t compress_pages_prev;
340
341 /* total handled target pages at the beginning of period */
342 uint64_t target_page_count_prev;
343 /* total handled target pages since start */
344 uint64_t target_page_count;
345 /* number of dirty bits in the bitmap */
346 uint64_t migration_dirty_pages;
347 /* Protects modification of the bitmap and migration dirty pages */
348 QemuMutex bitmap_mutex;
349 /* The RAMBlock used in the last src_page_requests */
350 RAMBlock *last_req_rb;
351 /* Queue of outstanding page requests from the destination */
352 QemuMutex src_page_req_mutex;
353 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356
357 static RAMState *ram_state;
358
359 static NotifierWithReturnList precopy_notifier_list;
360
361 void precopy_infrastructure_init(void)
362 {
363 notifier_with_return_list_init(&precopy_notifier_list);
364 }
365
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368 notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373 notifier_with_return_remove(n);
374 }
375
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378 PrecopyNotifyData pnd;
379 pnd.reason = reason;
380 pnd.errp = errp;
381
382 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384
385 void precopy_enable_free_page_optimization(void)
386 {
387 if (!ram_state) {
388 return;
389 }
390
391 ram_state->fpo_enabled = true;
392 }
393
394 uint64_t ram_bytes_remaining(void)
395 {
396 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397 0;
398 }
399
400 MigrationStats ram_counters;
401
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404 /* Current block being searched */
405 RAMBlock *block;
406 /* Current page to search from */
407 unsigned long page;
408 /* Set once we wrap around */
409 bool complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412
413 CompressionStats compression_counters;
414
415 struct CompressParam {
416 bool done;
417 bool quit;
418 bool zero_page;
419 QEMUFile *file;
420 QemuMutex mutex;
421 QemuCond cond;
422 RAMBlock *block;
423 ram_addr_t offset;
424
425 /* internally used fields */
426 z_stream stream;
427 uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430
431 struct DecompressParam {
432 bool done;
433 bool quit;
434 QemuMutex mutex;
435 QemuCond cond;
436 void *des;
437 uint8_t *compbuf;
438 int len;
439 z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446 * one of the compression threads has finished the compression.
 447 * comp_done_lock is used together with comp_done_cond.
448 */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461 ram_addr_t offset, uint8_t *source_buf);
462
463 static void *do_data_compress(void *opaque)
464 {
465 CompressParam *param = opaque;
466 RAMBlock *block;
467 ram_addr_t offset;
468 bool zero_page;
469
470 qemu_mutex_lock(&param->mutex);
471 while (!param->quit) {
472 if (param->block) {
473 block = param->block;
474 offset = param->offset;
475 param->block = NULL;
476 qemu_mutex_unlock(&param->mutex);
477
478 zero_page = do_compress_ram_page(param->file, &param->stream,
479 block, offset, param->originbuf);
480
481 qemu_mutex_lock(&comp_done_lock);
482 param->done = true;
483 param->zero_page = zero_page;
484 qemu_cond_signal(&comp_done_cond);
485 qemu_mutex_unlock(&comp_done_lock);
486
487 qemu_mutex_lock(&param->mutex);
488 } else {
489 qemu_cond_wait(&param->cond, &param->mutex);
490 }
491 }
492 qemu_mutex_unlock(&param->mutex);
493
494 return NULL;
495 }
496
497 static void compress_threads_save_cleanup(void)
498 {
499 int i, thread_count;
500
501 if (!migrate_use_compression() || !comp_param) {
502 return;
503 }
504
505 thread_count = migrate_compress_threads();
506 for (i = 0; i < thread_count; i++) {
507 /*
 508 * we use it as an indicator which shows if the thread is
 509 * properly initialized or not
510 */
511 if (!comp_param[i].file) {
512 break;
513 }
514
515 qemu_mutex_lock(&comp_param[i].mutex);
516 comp_param[i].quit = true;
517 qemu_cond_signal(&comp_param[i].cond);
518 qemu_mutex_unlock(&comp_param[i].mutex);
519
520 qemu_thread_join(compress_threads + i);
521 qemu_mutex_destroy(&comp_param[i].mutex);
522 qemu_cond_destroy(&comp_param[i].cond);
523 deflateEnd(&comp_param[i].stream);
524 g_free(comp_param[i].originbuf);
525 qemu_fclose(comp_param[i].file);
526 comp_param[i].file = NULL;
527 }
528 qemu_mutex_destroy(&comp_done_lock);
529 qemu_cond_destroy(&comp_done_cond);
530 g_free(compress_threads);
531 g_free(comp_param);
532 compress_threads = NULL;
533 comp_param = NULL;
534 }
535
536 static int compress_threads_save_setup(void)
537 {
538 int i, thread_count;
539
540 if (!migrate_use_compression()) {
541 return 0;
542 }
543 thread_count = migrate_compress_threads();
544 compress_threads = g_new0(QemuThread, thread_count);
545 comp_param = g_new0(CompressParam, thread_count);
546 qemu_cond_init(&comp_done_cond);
547 qemu_mutex_init(&comp_done_lock);
548 for (i = 0; i < thread_count; i++) {
549 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550 if (!comp_param[i].originbuf) {
551 goto exit;
552 }
553
554 if (deflateInit(&comp_param[i].stream,
555 migrate_compress_level()) != Z_OK) {
556 g_free(comp_param[i].originbuf);
557 goto exit;
558 }
559
560 /* comp_param[i].file is just used as a dummy buffer to save data,
561 * set its ops to empty.
562 */
563 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564 comp_param[i].done = true;
565 comp_param[i].quit = false;
566 qemu_mutex_init(&comp_param[i].mutex);
567 qemu_cond_init(&comp_param[i].cond);
568 qemu_thread_create(compress_threads + i, "compress",
569 do_data_compress, comp_param + i,
570 QEMU_THREAD_JOINABLE);
571 }
572 return 0;
573
574 exit:
575 compress_threads_save_cleanup();
576 return -1;
577 }
578
579 /* Multiple fd's */
580
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585
586 typedef struct {
587 uint32_t magic;
588 uint32_t version;
589 unsigned char uuid[16]; /* QemuUUID */
590 uint8_t id;
591 } __attribute__((packed)) MultiFDInit_t;
592
593 typedef struct {
594 uint32_t magic;
595 uint32_t version;
596 uint32_t flags;
597 uint32_t size;
598 uint32_t used;
599 uint64_t packet_num;
600 char ramblock[256];
601 uint64_t offset[];
602 } __attribute__((packed)) MultiFDPacket_t;
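
/*
 * Rough size sketch, assuming ram_addr_t is 8 bytes and a multifd page
 * count of 16 (illustrative value only): the packed header above is
 * 4*5 + 8 + 256 = 284 bytes, so one packet buffer holds
 * 284 + 16 * 8 = 412 bytes. On the wire each packet is this header
 * followed by "used" pages of TARGET_PAGE_SIZE sent as an iovec.
 */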
603
604 typedef struct {
605 /* number of used pages */
606 uint32_t used;
607 /* number of allocated pages */
608 uint32_t allocated;
609 /* global number of generated multifd packets */
610 uint64_t packet_num;
611 /* offset of each page */
612 ram_addr_t *offset;
613 /* pointer to each page */
614 struct iovec *iov;
615 RAMBlock *block;
616 } MultiFDPages_t;
617
618 typedef struct {
 619 /* these fields are not changed once the thread is created */
620 /* channel number */
621 uint8_t id;
622 /* channel thread name */
623 char *name;
624 /* channel thread id */
625 QemuThread thread;
626 /* communication channel */
627 QIOChannel *c;
628 /* sem where to wait for more work */
629 QemuSemaphore sem;
630 /* this mutex protects the following parameters */
631 QemuMutex mutex;
632 /* is this channel thread running */
633 bool running;
634 /* should this thread finish */
635 bool quit;
636 /* thread has work to do */
637 int pending_job;
 638 /* array of pages to send */
639 MultiFDPages_t *pages;
640 /* packet allocated len */
641 uint32_t packet_len;
642 /* pointer to the packet */
643 MultiFDPacket_t *packet;
644 /* multifd flags for each packet */
645 uint32_t flags;
646 /* global number of generated multifd packets */
647 uint64_t packet_num;
648 /* thread local variables */
649 /* packets sent through this channel */
650 uint64_t num_packets;
651 /* pages sent through this channel */
652 uint64_t num_pages;
653 /* syncs main thread and channels */
654 QemuSemaphore sem_sync;
655 } MultiFDSendParams;
656
657 typedef struct {
 658 /* these fields are not changed once the thread is created */
659 /* channel number */
660 uint8_t id;
661 /* channel thread name */
662 char *name;
663 /* channel thread id */
664 QemuThread thread;
665 /* communication channel */
666 QIOChannel *c;
667 /* this mutex protects the following parameters */
668 QemuMutex mutex;
669 /* is this channel thread running */
670 bool running;
671 /* array of pages to receive */
672 MultiFDPages_t *pages;
673 /* packet allocated len */
674 uint32_t packet_len;
675 /* pointer to the packet */
676 MultiFDPacket_t *packet;
677 /* multifd flags for each packet */
678 uint32_t flags;
679 /* global number of generated multifd packets */
680 uint64_t packet_num;
681 /* thread local variables */
 682 /* packets received through this channel */
 683 uint64_t num_packets;
 684 /* pages received through this channel */
685 uint64_t num_pages;
686 /* syncs main thread and channels */
687 QemuSemaphore sem_sync;
688 } MultiFDRecvParams;
689
690 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
691 {
692 MultiFDInit_t msg;
693 int ret;
694
695 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
696 msg.version = cpu_to_be32(MULTIFD_VERSION);
697 msg.id = p->id;
698 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
699
700 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
701 if (ret != 0) {
702 return -1;
703 }
704 return 0;
705 }
706
707 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
708 {
709 MultiFDInit_t msg;
710 int ret;
711
712 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
713 if (ret != 0) {
714 return -1;
715 }
716
717 msg.magic = be32_to_cpu(msg.magic);
718 msg.version = be32_to_cpu(msg.version);
719
720 if (msg.magic != MULTIFD_MAGIC) {
721 error_setg(errp, "multifd: received packet magic %x "
722 "expected %x", msg.magic, MULTIFD_MAGIC);
723 return -1;
724 }
725
726 if (msg.version != MULTIFD_VERSION) {
727 error_setg(errp, "multifd: received packet version %d "
728 "expected %d", msg.version, MULTIFD_VERSION);
729 return -1;
730 }
731
732 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
733 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
734 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
735
736 error_setg(errp, "multifd: received uuid '%s' and expected "
737 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
738 g_free(uuid);
739 g_free(msg_uuid);
740 return -1;
741 }
742
743 if (msg.id > migrate_multifd_channels()) {
 744 error_setg(errp, "multifd: received channel id %d is greater "
 745 "than number of channels %d", msg.id, migrate_multifd_channels());
746 return -1;
747 }
748
749 return msg.id;
750 }
751
752 static MultiFDPages_t *multifd_pages_init(size_t size)
753 {
754 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
755
756 pages->allocated = size;
757 pages->iov = g_new0(struct iovec, size);
758 pages->offset = g_new0(ram_addr_t, size);
759
760 return pages;
761 }
762
763 static void multifd_pages_clear(MultiFDPages_t *pages)
764 {
765 pages->used = 0;
766 pages->allocated = 0;
767 pages->packet_num = 0;
768 pages->block = NULL;
769 g_free(pages->iov);
770 pages->iov = NULL;
771 g_free(pages->offset);
772 pages->offset = NULL;
773 g_free(pages);
774 }
775
776 static void multifd_send_fill_packet(MultiFDSendParams *p)
777 {
778 MultiFDPacket_t *packet = p->packet;
779 int i;
780
781 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
782 packet->version = cpu_to_be32(MULTIFD_VERSION);
783 packet->flags = cpu_to_be32(p->flags);
784 packet->size = cpu_to_be32(migrate_multifd_page_count());
785 packet->used = cpu_to_be32(p->pages->used);
786 packet->packet_num = cpu_to_be64(p->packet_num);
787
788 if (p->pages->block) {
789 strncpy(packet->ramblock, p->pages->block->idstr, 256);
790 }
791
792 for (i = 0; i < p->pages->used; i++) {
793 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
794 }
795 }
796
797 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
798 {
799 MultiFDPacket_t *packet = p->packet;
800 RAMBlock *block;
801 int i;
802
803 packet->magic = be32_to_cpu(packet->magic);
804 if (packet->magic != MULTIFD_MAGIC) {
805 error_setg(errp, "multifd: received packet "
806 "magic %x and expected magic %x",
807 packet->magic, MULTIFD_MAGIC);
808 return -1;
809 }
810
811 packet->version = be32_to_cpu(packet->version);
812 if (packet->version != MULTIFD_VERSION) {
813 error_setg(errp, "multifd: received packet "
814 "version %d and expected version %d",
815 packet->version, MULTIFD_VERSION);
816 return -1;
817 }
818
819 p->flags = be32_to_cpu(packet->flags);
820
821 packet->size = be32_to_cpu(packet->size);
822 if (packet->size > migrate_multifd_page_count()) {
823 error_setg(errp, "multifd: received packet "
824 "with size %d and expected maximum size %d",
825 packet->size, migrate_multifd_page_count()) ;
826 return -1;
827 }
828
829 p->pages->used = be32_to_cpu(packet->used);
830 if (p->pages->used > packet->size) {
 831 error_setg(errp, "multifd: received packet "
 832 "with %d used pages and expected maximum of %d",
 833 p->pages->used, packet->size);
834 return -1;
835 }
836
837 p->packet_num = be64_to_cpu(packet->packet_num);
838
839 if (p->pages->used) {
840 /* make sure that ramblock is 0 terminated */
841 packet->ramblock[255] = 0;
842 block = qemu_ram_block_by_name(packet->ramblock);
843 if (!block) {
844 error_setg(errp, "multifd: unknown ram block %s",
845 packet->ramblock);
846 return -1;
847 }
848 }
849
850 for (i = 0; i < p->pages->used; i++) {
851 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
852
853 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
854 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
855 " (max " RAM_ADDR_FMT ")",
856 offset, block->max_length);
857 return -1;
858 }
859 p->pages->iov[i].iov_base = block->host + offset;
860 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
861 }
862
863 return 0;
864 }
865
866 struct {
867 MultiFDSendParams *params;
868 /* number of created threads */
869 int count;
 870 /* array of pages to send */
871 MultiFDPages_t *pages;
872 /* syncs main thread and channels */
873 QemuSemaphore sem_sync;
874 /* global number of generated multifd packets */
875 uint64_t packet_num;
876 /* send channels ready */
877 QemuSemaphore channels_ready;
878 } *multifd_send_state;
879
880 /*
 881 * How do we use multifd_send_state->pages and channel->pages?
 882 *
 883 * We create a pages struct for each channel, and a main one. Each
 884 * time we need to send a batch of pages we interchange the one in
 885 * multifd_send_state with the one in the channel that is sending it.
 886 * There are two reasons for that:
 887 * - to avoid doing so many mallocs during migration
 888 * - to make it easier to know what to free at the end of migration
 889 *
 890 * This way we always know who is the owner of each "pages" struct,
 891 * and we don't need any locking. It belongs to the migration thread
 892 * or to the channel thread. Switching is safe because the migration
 893 * thread uses the channel mutex when changing it, and the channel
 894 * thread must have finished with its own, otherwise pending_job can't
 895 * be false.
896 */
897
898 static void multifd_send_pages(void)
899 {
900 int i;
901 static int next_channel;
 902 MultiFDSendParams *p = NULL; /* make gcc happy */
903 MultiFDPages_t *pages = multifd_send_state->pages;
904 uint64_t transferred;
905
906 qemu_sem_wait(&multifd_send_state->channels_ready);
907 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
908 p = &multifd_send_state->params[i];
909
910 qemu_mutex_lock(&p->mutex);
911 if (!p->pending_job) {
912 p->pending_job++;
913 next_channel = (i + 1) % migrate_multifd_channels();
914 break;
915 }
916 qemu_mutex_unlock(&p->mutex);
917 }
918 p->pages->used = 0;
919
920 p->packet_num = multifd_send_state->packet_num++;
921 p->pages->block = NULL;
922 multifd_send_state->pages = p->pages;
923 p->pages = pages;
924 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
925 ram_counters.multifd_bytes += transferred;
 926 ram_counters.transferred += transferred;
927 qemu_mutex_unlock(&p->mutex);
928 qemu_sem_post(&p->sem);
929 }
930
931 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
932 {
933 MultiFDPages_t *pages = multifd_send_state->pages;
934
935 if (!pages->block) {
936 pages->block = block;
937 }
938
939 if (pages->block == block) {
940 pages->offset[pages->used] = offset;
941 pages->iov[pages->used].iov_base = block->host + offset;
942 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
943 pages->used++;
944
945 if (pages->used < pages->allocated) {
946 return;
947 }
948 }
949
950 multifd_send_pages();
951
952 if (pages->block != block) {
953 multifd_queue_page(block, offset);
954 }
955 }
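
/*
 * For illustration (assuming an allocation of 16 pages per batch): the
 * first 15 calls for the same RAMBlock only queue offsets; the 16th call
 * fills the batch and triggers multifd_send_pages(). A call for a
 * different RAMBlock also flushes the current batch first, and the new
 * page is then queued into the freshly swapped-in pages struct via the
 * recursive call above.
 */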
956
957 static void multifd_send_terminate_threads(Error *err)
958 {
959 int i;
960
961 if (err) {
962 MigrationState *s = migrate_get_current();
963 migrate_set_error(s, err);
964 if (s->state == MIGRATION_STATUS_SETUP ||
965 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
966 s->state == MIGRATION_STATUS_DEVICE ||
967 s->state == MIGRATION_STATUS_ACTIVE) {
968 migrate_set_state(&s->state, s->state,
969 MIGRATION_STATUS_FAILED);
970 }
971 }
972
973 for (i = 0; i < migrate_multifd_channels(); i++) {
974 MultiFDSendParams *p = &multifd_send_state->params[i];
975
976 qemu_mutex_lock(&p->mutex);
977 p->quit = true;
978 qemu_sem_post(&p->sem);
979 qemu_mutex_unlock(&p->mutex);
980 }
981 }
982
983 void multifd_save_cleanup(void)
984 {
985 int i;
986
987 if (!migrate_use_multifd()) {
988 return;
989 }
990 multifd_send_terminate_threads(NULL);
991 for (i = 0; i < migrate_multifd_channels(); i++) {
992 MultiFDSendParams *p = &multifd_send_state->params[i];
993
994 if (p->running) {
995 qemu_thread_join(&p->thread);
996 }
997 socket_send_channel_destroy(p->c);
998 p->c = NULL;
999 qemu_mutex_destroy(&p->mutex);
1000 qemu_sem_destroy(&p->sem);
1001 qemu_sem_destroy(&p->sem_sync);
1002 g_free(p->name);
1003 p->name = NULL;
1004 multifd_pages_clear(p->pages);
1005 p->pages = NULL;
1006 p->packet_len = 0;
1007 g_free(p->packet);
1008 p->packet = NULL;
1009 }
1010 qemu_sem_destroy(&multifd_send_state->channels_ready);
1011 qemu_sem_destroy(&multifd_send_state->sem_sync);
1012 g_free(multifd_send_state->params);
1013 multifd_send_state->params = NULL;
1014 multifd_pages_clear(multifd_send_state->pages);
1015 multifd_send_state->pages = NULL;
1016 g_free(multifd_send_state);
1017 multifd_send_state = NULL;
1018 }
1019
1020 static void multifd_send_sync_main(void)
1021 {
1022 int i;
1023
1024 if (!migrate_use_multifd()) {
1025 return;
1026 }
1027 if (multifd_send_state->pages->used) {
1028 multifd_send_pages();
1029 }
1030 for (i = 0; i < migrate_multifd_channels(); i++) {
1031 MultiFDSendParams *p = &multifd_send_state->params[i];
1032
1033 trace_multifd_send_sync_main_signal(p->id);
1034
1035 qemu_mutex_lock(&p->mutex);
1036
1037 p->packet_num = multifd_send_state->packet_num++;
1038 p->flags |= MULTIFD_FLAG_SYNC;
1039 p->pending_job++;
1040 qemu_mutex_unlock(&p->mutex);
1041 qemu_sem_post(&p->sem);
1042 }
1043 for (i = 0; i < migrate_multifd_channels(); i++) {
1044 MultiFDSendParams *p = &multifd_send_state->params[i];
1045
1046 trace_multifd_send_sync_main_wait(p->id);
1047 qemu_sem_wait(&multifd_send_state->sem_sync);
1048 }
1049 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1050 }
1051
1052 static void *multifd_send_thread(void *opaque)
1053 {
1054 MultiFDSendParams *p = opaque;
1055 Error *local_err = NULL;
1056 int ret;
1057
1058 trace_multifd_send_thread_start(p->id);
1059 rcu_register_thread();
1060
1061 if (multifd_send_initial_packet(p, &local_err) < 0) {
1062 goto out;
1063 }
1064 /* initial packet */
1065 p->num_packets = 1;
1066
1067 while (true) {
1068 qemu_sem_wait(&p->sem);
1069 qemu_mutex_lock(&p->mutex);
1070
1071 if (p->pending_job) {
1072 uint32_t used = p->pages->used;
1073 uint64_t packet_num = p->packet_num;
1074 uint32_t flags = p->flags;
1075
1076 multifd_send_fill_packet(p);
1077 p->flags = 0;
1078 p->num_packets++;
1079 p->num_pages += used;
1080 p->pages->used = 0;
1081 qemu_mutex_unlock(&p->mutex);
1082
1083 trace_multifd_send(p->id, packet_num, used, flags);
1084
1085 ret = qio_channel_write_all(p->c, (void *)p->packet,
1086 p->packet_len, &local_err);
1087 if (ret != 0) {
1088 break;
1089 }
1090
1091 if (used) {
1092 ret = qio_channel_writev_all(p->c, p->pages->iov,
1093 used, &local_err);
1094 if (ret != 0) {
1095 break;
1096 }
1097 }
1098
1099 qemu_mutex_lock(&p->mutex);
1100 p->pending_job--;
1101 qemu_mutex_unlock(&p->mutex);
1102
1103 if (flags & MULTIFD_FLAG_SYNC) {
1104 qemu_sem_post(&multifd_send_state->sem_sync);
1105 }
1106 qemu_sem_post(&multifd_send_state->channels_ready);
1107 } else if (p->quit) {
1108 qemu_mutex_unlock(&p->mutex);
1109 break;
1110 } else {
1111 qemu_mutex_unlock(&p->mutex);
1112 /* sometimes there are spurious wakeups */
1113 }
1114 }
1115
1116 out:
1117 if (local_err) {
1118 multifd_send_terminate_threads(local_err);
1119 }
1120
1121 qemu_mutex_lock(&p->mutex);
1122 p->running = false;
1123 qemu_mutex_unlock(&p->mutex);
1124
1125 rcu_unregister_thread();
1126 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1127
1128 return NULL;
1129 }
1130
1131 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1132 {
1133 MultiFDSendParams *p = opaque;
1134 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1135 Error *local_err = NULL;
1136
1137 if (qio_task_propagate_error(task, &local_err)) {
1138 migrate_set_error(migrate_get_current(), local_err);
1139 multifd_save_cleanup();
1140 } else {
1141 p->c = QIO_CHANNEL(sioc);
1142 qio_channel_set_delay(p->c, false);
1143 p->running = true;
1144 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1145 QEMU_THREAD_JOINABLE);
1146
1147 atomic_inc(&multifd_send_state->count);
1148 }
1149 }
1150
1151 int multifd_save_setup(void)
1152 {
1153 int thread_count;
1154 uint32_t page_count = migrate_multifd_page_count();
1155 uint8_t i;
1156
1157 if (!migrate_use_multifd()) {
1158 return 0;
1159 }
1160 thread_count = migrate_multifd_channels();
1161 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1162 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1163 atomic_set(&multifd_send_state->count, 0);
1164 multifd_send_state->pages = multifd_pages_init(page_count);
1165 qemu_sem_init(&multifd_send_state->sem_sync, 0);
1166 qemu_sem_init(&multifd_send_state->channels_ready, 0);
1167
1168 for (i = 0; i < thread_count; i++) {
1169 MultiFDSendParams *p = &multifd_send_state->params[i];
1170
1171 qemu_mutex_init(&p->mutex);
1172 qemu_sem_init(&p->sem, 0);
1173 qemu_sem_init(&p->sem_sync, 0);
1174 p->quit = false;
1175 p->pending_job = 0;
1176 p->id = i;
1177 p->pages = multifd_pages_init(page_count);
1178 p->packet_len = sizeof(MultiFDPacket_t)
1179 + sizeof(ram_addr_t) * page_count;
1180 p->packet = g_malloc0(p->packet_len);
1181 p->name = g_strdup_printf("multifdsend_%d", i);
1182 socket_send_channel_create(multifd_new_send_channel_async, p);
1183 }
1184 return 0;
1185 }
1186
1187 struct {
1188 MultiFDRecvParams *params;
1189 /* number of created threads */
1190 int count;
1191 /* syncs main thread and channels */
1192 QemuSemaphore sem_sync;
1193 /* global number of generated multifd packets */
1194 uint64_t packet_num;
1195 } *multifd_recv_state;
1196
1197 static void multifd_recv_terminate_threads(Error *err)
1198 {
1199 int i;
1200
1201 if (err) {
1202 MigrationState *s = migrate_get_current();
1203 migrate_set_error(s, err);
1204 if (s->state == MIGRATION_STATUS_SETUP ||
1205 s->state == MIGRATION_STATUS_ACTIVE) {
1206 migrate_set_state(&s->state, s->state,
1207 MIGRATION_STATUS_FAILED);
1208 }
1209 }
1210
1211 for (i = 0; i < migrate_multifd_channels(); i++) {
1212 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1213
1214 qemu_mutex_lock(&p->mutex);
1215 /* We could arrive here for two reasons:
1216 - normal quit, i.e. everything went fine, just finished
1217 - error quit: We close the channels so the channel threads
1218 finish the qio_channel_read_all_eof() */
1219 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1220 qemu_mutex_unlock(&p->mutex);
1221 }
1222 }
1223
1224 int multifd_load_cleanup(Error **errp)
1225 {
1226 int i;
1227 int ret = 0;
1228
1229 if (!migrate_use_multifd()) {
1230 return 0;
1231 }
1232 multifd_recv_terminate_threads(NULL);
1233 for (i = 0; i < migrate_multifd_channels(); i++) {
1234 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1235
1236 if (p->running) {
1237 qemu_thread_join(&p->thread);
1238 }
1239 object_unref(OBJECT(p->c));
1240 p->c = NULL;
1241 qemu_mutex_destroy(&p->mutex);
1242 qemu_sem_destroy(&p->sem_sync);
1243 g_free(p->name);
1244 p->name = NULL;
1245 multifd_pages_clear(p->pages);
1246 p->pages = NULL;
1247 p->packet_len = 0;
1248 g_free(p->packet);
1249 p->packet = NULL;
1250 }
1251 qemu_sem_destroy(&multifd_recv_state->sem_sync);
1252 g_free(multifd_recv_state->params);
1253 multifd_recv_state->params = NULL;
1254 g_free(multifd_recv_state);
1255 multifd_recv_state = NULL;
1256
1257 return ret;
1258 }
1259
1260 static void multifd_recv_sync_main(void)
1261 {
1262 int i;
1263
1264 if (!migrate_use_multifd()) {
1265 return;
1266 }
1267 for (i = 0; i < migrate_multifd_channels(); i++) {
1268 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1269
1270 trace_multifd_recv_sync_main_wait(p->id);
1271 qemu_sem_wait(&multifd_recv_state->sem_sync);
1272 qemu_mutex_lock(&p->mutex);
1273 if (multifd_recv_state->packet_num < p->packet_num) {
1274 multifd_recv_state->packet_num = p->packet_num;
1275 }
1276 qemu_mutex_unlock(&p->mutex);
1277 }
1278 for (i = 0; i < migrate_multifd_channels(); i++) {
1279 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1280
1281 trace_multifd_recv_sync_main_signal(p->id);
1282 qemu_sem_post(&p->sem_sync);
1283 }
1284 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1285 }
1286
1287 static void *multifd_recv_thread(void *opaque)
1288 {
1289 MultiFDRecvParams *p = opaque;
1290 Error *local_err = NULL;
1291 int ret;
1292
1293 trace_multifd_recv_thread_start(p->id);
1294 rcu_register_thread();
1295
1296 while (true) {
1297 uint32_t used;
1298 uint32_t flags;
1299
1300 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1301 p->packet_len, &local_err);
1302 if (ret == 0) { /* EOF */
1303 break;
1304 }
1305 if (ret == -1) { /* Error */
1306 break;
1307 }
1308
1309 qemu_mutex_lock(&p->mutex);
1310 ret = multifd_recv_unfill_packet(p, &local_err);
1311 if (ret) {
1312 qemu_mutex_unlock(&p->mutex);
1313 break;
1314 }
1315
1316 used = p->pages->used;
1317 flags = p->flags;
1318 trace_multifd_recv(p->id, p->packet_num, used, flags);
1319 p->num_packets++;
1320 p->num_pages += used;
1321 qemu_mutex_unlock(&p->mutex);
1322
1323 if (used) {
1324 ret = qio_channel_readv_all(p->c, p->pages->iov,
1325 used, &local_err);
1326 if (ret != 0) {
1327 break;
1328 }
1329 }
1330
1331 if (flags & MULTIFD_FLAG_SYNC) {
1332 qemu_sem_post(&multifd_recv_state->sem_sync);
1333 qemu_sem_wait(&p->sem_sync);
1334 }
1335 }
1336
1337 if (local_err) {
1338 multifd_recv_terminate_threads(local_err);
1339 }
1340 qemu_mutex_lock(&p->mutex);
1341 p->running = false;
1342 qemu_mutex_unlock(&p->mutex);
1343
1344 rcu_unregister_thread();
1345 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1346
1347 return NULL;
1348 }
1349
1350 int multifd_load_setup(void)
1351 {
1352 int thread_count;
1353 uint32_t page_count = migrate_multifd_page_count();
1354 uint8_t i;
1355
1356 if (!migrate_use_multifd()) {
1357 return 0;
1358 }
1359 thread_count = migrate_multifd_channels();
1360 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1361 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1362 atomic_set(&multifd_recv_state->count, 0);
1363 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1364
1365 for (i = 0; i < thread_count; i++) {
1366 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1367
1368 qemu_mutex_init(&p->mutex);
1369 qemu_sem_init(&p->sem_sync, 0);
1370 p->id = i;
1371 p->pages = multifd_pages_init(page_count);
1372 p->packet_len = sizeof(MultiFDPacket_t)
1373 + sizeof(ram_addr_t) * page_count;
1374 p->packet = g_malloc0(p->packet_len);
1375 p->name = g_strdup_printf("multifdrecv_%d", i);
1376 }
1377 return 0;
1378 }
1379
1380 bool multifd_recv_all_channels_created(void)
1381 {
1382 int thread_count = migrate_multifd_channels();
1383
1384 if (!migrate_use_multifd()) {
1385 return true;
1386 }
1387
1388 return thread_count == atomic_read(&multifd_recv_state->count);
1389 }
1390
1391 /*
1392 * Try to receive all multifd channels to get ready for the migration.
 1393 * - Return true and do not set @errp when correctly receiving all channels;
1394 * - Return false and do not set @errp when correctly receiving the current one;
1395 * - Return false and set @errp when failing to receive the current channel.
1396 */
1397 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1398 {
1399 MultiFDRecvParams *p;
1400 Error *local_err = NULL;
1401 int id;
1402
1403 id = multifd_recv_initial_packet(ioc, &local_err);
1404 if (id < 0) {
1405 multifd_recv_terminate_threads(local_err);
1406 error_propagate_prepend(errp, local_err,
1407 "failed to receive packet"
1408 " via multifd channel %d: ",
1409 atomic_read(&multifd_recv_state->count));
1410 return false;
1411 }
1412
1413 p = &multifd_recv_state->params[id];
1414 if (p->c != NULL) {
 1415 error_setg(&local_err, "multifd: received id '%d' already setup",
1416 id);
1417 multifd_recv_terminate_threads(local_err);
1418 error_propagate(errp, local_err);
1419 return false;
1420 }
1421 p->c = ioc;
1422 object_ref(OBJECT(ioc));
1423 /* initial packet */
1424 p->num_packets = 1;
1425
1426 p->running = true;
1427 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1428 QEMU_THREAD_JOINABLE);
1429 atomic_inc(&multifd_recv_state->count);
1430 return atomic_read(&multifd_recv_state->count) ==
1431 migrate_multifd_channels();
1432 }
1433
1434 /**
1435 * save_page_header: write page header to wire
1436 *
1437 * If this is the 1st block, it also writes the block identification
1438 *
1439 * Returns the number of bytes written
1440 *
1441 * @f: QEMUFile where to send the data
1442 * @block: block that contains the page we want to send
1443 * @offset: offset inside the block for the page
 1444 * (the lower bits contain flags)
1445 */
1446 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1447 ram_addr_t offset)
1448 {
1449 size_t size, len;
1450
1451 if (block == rs->last_sent_block) {
1452 offset |= RAM_SAVE_FLAG_CONTINUE;
1453 }
1454 qemu_put_be64(f, offset);
1455 size = 8;
1456
1457 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1458 len = strlen(block->idstr);
1459 qemu_put_byte(f, len);
1460 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1461 size += 1 + len;
1462 rs->last_sent_block = block;
1463 }
1464 return size;
1465 }
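
/*
 * Illustrative encoding (hypothetical values): sending the page at offset
 * 0x3000 of a block named "pc.ram" for the first time emits
 * be64(0x3000 | RAM_SAVE_FLAG_PAGE), then the length byte 6 and the
 * string "pc.ram", i.e. 8 + 1 + 6 = 15 bytes. The next page from the
 * same block gets RAM_SAVE_FLAG_CONTINUE or'ed into the offset and only
 * the 8-byte header is written.
 */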
1466
1467 /**
 1468 * mig_throttle_guest_down: throttle down the guest
 1469 *
 1470 * Reduce the amount of guest CPU execution to hopefully slow down memory
1471 * writes. If guest dirty memory rate is reduced below the rate at
1472 * which we can transfer pages to the destination then we should be
1473 * able to complete migration. Some workloads dirty memory way too
1474 * fast and will not effectively converge, even with auto-converge.
1475 */
1476 static void mig_throttle_guest_down(void)
1477 {
1478 MigrationState *s = migrate_get_current();
1479 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1480 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1481 int pct_max = s->parameters.max_cpu_throttle;
1482
1483 /* We have not started throttling yet. Let's start it. */
1484 if (!cpu_throttle_active()) {
1485 cpu_throttle_set(pct_initial);
1486 } else {
1487 /* Throttling already on, just increase the rate */
1488 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1489 pct_max));
1490 }
1491 }
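
/*
 * Example progression (with illustrative parameter values): if
 * cpu_throttle_initial is 20, cpu_throttle_increment is 10 and
 * max_cpu_throttle is 99, successive calls throttle the guest at
 * 20%, 30%, 40%, ... with the rate capped at 99%.
 */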
1492
1493 /**
1494 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1495 *
1496 * @rs: current RAM state
1497 * @current_addr: address for the zero page
1498 *
1499 * Update the xbzrle cache to reflect a page that's been sent as all 0.
1500 * The important thing is that a stale (not-yet-0'd) page be replaced
1501 * by the new data.
1502 * As a bonus, if the page wasn't in the cache it gets added so that
1503 * when a small write is made into the 0'd page it gets XBZRLE sent.
1504 */
1505 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1506 {
1507 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1508 return;
1509 }
1510
1511 /* We don't care if this fails to allocate a new cache page
1512 * as long as it updated an old one */
1513 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1514 ram_counters.dirty_sync_count);
1515 }
1516
1517 #define ENCODING_FLAG_XBZRLE 0x1
1518
1519 /**
1520 * save_xbzrle_page: compress and send current page
1521 *
1522 * Returns: 1 means that we wrote the page
1523 * 0 means that page is identical to the one already sent
1524 * -1 means that xbzrle would be longer than normal
1525 *
1526 * @rs: current RAM state
1527 * @current_data: pointer to the address of the page contents
1528 * @current_addr: addr of the page
1529 * @block: block that contains the page we want to send
1530 * @offset: offset inside the block for the page
1531 * @last_stage: if we are at the completion stage
1532 */
1533 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1534 ram_addr_t current_addr, RAMBlock *block,
1535 ram_addr_t offset, bool last_stage)
1536 {
1537 int encoded_len = 0, bytes_xbzrle;
1538 uint8_t *prev_cached_page;
1539
1540 if (!cache_is_cached(XBZRLE.cache, current_addr,
1541 ram_counters.dirty_sync_count)) {
1542 xbzrle_counters.cache_miss++;
1543 if (!last_stage) {
1544 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1545 ram_counters.dirty_sync_count) == -1) {
1546 return -1;
1547 } else {
1548 /* update *current_data when the page has been
1549 inserted into cache */
1550 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1551 }
1552 }
1553 return -1;
1554 }
1555
1556 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1557
1558 /* save current buffer into memory */
1559 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1560
1561 /* XBZRLE encoding (if there is no overflow) */
1562 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1563 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1564 TARGET_PAGE_SIZE);
1565 if (encoded_len == 0) {
1566 trace_save_xbzrle_page_skipping();
1567 return 0;
1568 } else if (encoded_len == -1) {
1569 trace_save_xbzrle_page_overflow();
1570 xbzrle_counters.overflow++;
1571 /* update data in the cache */
1572 if (!last_stage) {
1573 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1574 *current_data = prev_cached_page;
1575 }
1576 return -1;
1577 }
1578
1579 /* we need to update the data in the cache, in order to get the same data */
1580 if (!last_stage) {
1581 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1582 }
1583
1584 /* Send XBZRLE based compressed page */
1585 bytes_xbzrle = save_page_header(rs, rs->f, block,
1586 offset | RAM_SAVE_FLAG_XBZRLE);
1587 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1588 qemu_put_be16(rs->f, encoded_len);
1589 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1590 bytes_xbzrle += encoded_len + 1 + 2;
1591 xbzrle_counters.pages++;
1592 xbzrle_counters.bytes += bytes_xbzrle;
1593 ram_counters.transferred += bytes_xbzrle;
1594
1595 return 1;
1596 }
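
/*
 * Byte accounting sketch (illustrative numbers): for a page whose delta
 * encodes to 96 bytes and whose block was already announced (so the
 * header is the 8-byte offset only), bytes_xbzrle is
 * 8 + 1 (ENCODING_FLAG_XBZRLE) + 2 (length) + 96 = 107 bytes instead of
 * a full TARGET_PAGE_SIZE page.
 */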
1597
1598 /**
1599 * migration_bitmap_find_dirty: find the next dirty page from start
1600 *
1601 * Called with rcu_read_lock() to protect migration_bitmap
1602 *
1603 * Returns the byte offset within memory region of the start of a dirty page
1604 *
1605 * @rs: current RAM state
1606 * @rb: RAMBlock where to search for dirty pages
1607 * @start: page where we start the search
1608 */
1609 static inline
1610 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1611 unsigned long start)
1612 {
1613 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1614 unsigned long *bitmap = rb->bmap;
1615 unsigned long next;
1616
1617 if (ramblock_is_ignored(rb)) {
1618 return size;
1619 }
1620
1621 /*
1622 * When the free page optimization is enabled, we need to check the bitmap
1623 * to send the non-free pages rather than all the pages in the bulk stage.
1624 */
1625 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1626 next = start + 1;
1627 } else {
1628 next = find_next_bit(bitmap, size, start);
1629 }
1630
1631 return next;
1632 }
1633
1634 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1635 RAMBlock *rb,
1636 unsigned long page)
1637 {
1638 bool ret;
1639
1640 qemu_mutex_lock(&rs->bitmap_mutex);
1641 ret = test_and_clear_bit(page, rb->bmap);
1642
1643 if (ret) {
1644 rs->migration_dirty_pages--;
1645 }
1646 qemu_mutex_unlock(&rs->bitmap_mutex);
1647
1648 return ret;
1649 }
1650
1651 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1652 ram_addr_t start, ram_addr_t length)
1653 {
1654 rs->migration_dirty_pages +=
1655 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1656 &rs->num_dirty_pages_period);
1657 }
1658
1659 /**
1660 * ram_pagesize_summary: calculate all the pagesizes of a VM
1661 *
1662 * Returns a summary bitmap of the page sizes of all RAMBlocks
1663 *
1664 * For VMs with just normal pages this is equivalent to the host page
1665 * size. If it's got some huge pages then it's the OR of all the
1666 * different page sizes.
1667 */
1668 uint64_t ram_pagesize_summary(void)
1669 {
1670 RAMBlock *block;
1671 uint64_t summary = 0;
1672
1673 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1674 summary |= block->page_size;
1675 }
1676
1677 return summary;
1678 }
1679
1680 uint64_t ram_get_total_transferred_pages(void)
1681 {
1682 return ram_counters.normal + ram_counters.duplicate +
1683 compression_counters.pages + xbzrle_counters.pages;
1684 }
1685
1686 static void migration_update_rates(RAMState *rs, int64_t end_time)
1687 {
1688 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1689 double compressed_size;
1690
1691 /* calculate period counters */
1692 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1693 / (end_time - rs->time_last_bitmap_sync);
1694
1695 if (!page_count) {
1696 return;
1697 }
1698
1699 if (migrate_use_xbzrle()) {
1700 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1701 rs->xbzrle_cache_miss_prev) / page_count;
1702 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1703 }
1704
1705 if (migrate_use_compression()) {
1706 compression_counters.busy_rate = (double)(compression_counters.busy -
1707 rs->compress_thread_busy_prev) / page_count;
1708 rs->compress_thread_busy_prev = compression_counters.busy;
1709
1710 compressed_size = compression_counters.compressed_size -
1711 rs->compressed_size_prev;
1712 if (compressed_size) {
1713 double uncompressed_size = (compression_counters.pages -
1714 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1715
1716 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1717 compression_counters.compression_rate =
1718 uncompressed_size / compressed_size;
1719
1720 rs->compress_pages_prev = compression_counters.pages;
1721 rs->compressed_size_prev = compression_counters.compressed_size;
1722 }
1723 }
1724 }
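
/*
 * Compression ratio example (made-up period deltas, 4 KiB target pages):
 * if the threads compressed 51200 pages (200 MiB uncompressed) down to
 * 100 MiB in the period, compression_rate is 200 / 100 = 2.0.
 */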
1725
1726 static void migration_bitmap_sync(RAMState *rs)
1727 {
1728 RAMBlock *block;
1729 int64_t end_time;
1730 uint64_t bytes_xfer_now;
1731
1732 ram_counters.dirty_sync_count++;
1733
1734 if (!rs->time_last_bitmap_sync) {
1735 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1736 }
1737
1738 trace_migration_bitmap_sync_start();
1739 memory_global_dirty_log_sync();
1740
1741 qemu_mutex_lock(&rs->bitmap_mutex);
1742 rcu_read_lock();
1743 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1744 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1745 }
1746 ram_counters.remaining = ram_bytes_remaining();
1747 rcu_read_unlock();
1748 qemu_mutex_unlock(&rs->bitmap_mutex);
1749
1750 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1751
1752 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1753
 1754 /* more than 1 second = 1000 milliseconds */
1755 if (end_time > rs->time_last_bitmap_sync + 1000) {
1756 bytes_xfer_now = ram_counters.transferred;
1757
1758 /* During block migration the auto-converge logic incorrectly detects
1759 * that ram migration makes no progress. Avoid this by disabling the
1760 * throttling logic during the bulk phase of block migration. */
1761 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 1762 /* The following detection logic can be refined later. For now:
 1763 Check to see if the bytes dirtied in this period exceed half of
 1764 the approx. amount of bytes that got transferred since the last
 1765 time we were in this routine. If that happens twice, start or
 1766 increase throttling */
1767
1768 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1769 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1770 (++rs->dirty_rate_high_cnt >= 2)) {
1771 trace_migration_throttle();
1772 rs->dirty_rate_high_cnt = 0;
1773 mig_throttle_guest_down();
1774 }
1775 }
1776
1777 migration_update_rates(rs, end_time);
1778
1779 rs->target_page_count_prev = rs->target_page_count;
1780
1781 /* reset period counters */
1782 rs->time_last_bitmap_sync = end_time;
1783 rs->num_dirty_pages_period = 0;
1784 rs->bytes_xfer_prev = bytes_xfer_now;
1785 }
1786 if (migrate_use_events()) {
1787 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1788 }
1789 }
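
/*
 * Throttle trigger example (made-up numbers, 4 KiB target pages): in a
 * period where roughly 400 MB were transferred, dirtying 60000 pages
 * (~245 MB) exceeds half of that (200 MB), so dirty_rate_high_cnt is
 * bumped; if the next period looks the same, the counter reaches 2 and
 * mig_throttle_guest_down() is called.
 */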
1790
1791 static void migration_bitmap_sync_precopy(RAMState *rs)
1792 {
1793 Error *local_err = NULL;
1794
1795 /*
1796 * The current notifier usage is just an optimization to migration, so we
1797 * don't stop the normal migration process in the error case.
1798 */
1799 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1800 error_report_err(local_err);
1801 }
1802
1803 migration_bitmap_sync(rs);
1804
1805 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1806 error_report_err(local_err);
1807 }
1808 }
1809
1810 /**
1811 * save_zero_page_to_file: send the zero page to the file
1812 *
1813 * Returns the size of data written to the file, 0 means the page is not
1814 * a zero page
1815 *
1816 * @rs: current RAM state
1817 * @file: the file where the data is saved
1818 * @block: block that contains the page we want to send
1819 * @offset: offset inside the block for the page
1820 */
1821 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1822 RAMBlock *block, ram_addr_t offset)
1823 {
1824 uint8_t *p = block->host + offset;
1825 int len = 0;
1826
1827 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1828 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1829 qemu_put_byte(file, 0);
1830 len += 1;
1831 }
1832 return len;
1833 }
1834
1835 /**
1836 * save_zero_page: send the zero page to the stream
1837 *
1838 * Returns the number of pages written.
1839 *
1840 * @rs: current RAM state
1841 * @block: block that contains the page we want to send
1842 * @offset: offset inside the block for the page
1843 */
1844 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1845 {
1846 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1847
1848 if (len) {
1849 ram_counters.duplicate++;
1850 ram_counters.transferred += len;
1851 return 1;
1852 }
1853 return -1;
1854 }
1855
1856 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1857 {
1858 if (!migrate_release_ram() || !migration_in_postcopy()) {
1859 return;
1860 }
1861
1862 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1863 }
1864
1865 /*
1866 * @pages: the number of pages written by the control path,
1867 * < 0 - error
1868 * > 0 - number of pages written
1869 *
 1870 * Return true if the page has been saved, otherwise false is returned.
1871 */
1872 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1873 int *pages)
1874 {
1875 uint64_t bytes_xmit = 0;
1876 int ret;
1877
1878 *pages = -1;
1879 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1880 &bytes_xmit);
1881 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1882 return false;
1883 }
1884
1885 if (bytes_xmit) {
1886 ram_counters.transferred += bytes_xmit;
1887 *pages = 1;
1888 }
1889
1890 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1891 return true;
1892 }
1893
1894 if (bytes_xmit > 0) {
1895 ram_counters.normal++;
1896 } else if (bytes_xmit == 0) {
1897 ram_counters.duplicate++;
1898 }
1899
1900 return true;
1901 }
1902
1903 /*
1904 * directly send the page to the stream
1905 *
1906 * Returns the number of pages written.
1907 *
1908 * @rs: current RAM state
1909 * @block: block that contains the page we want to send
1910 * @offset: offset inside the block for the page
1911 * @buf: the page to be sent
 1912 * @async: send the page asynchronously
1913 */
1914 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1915 uint8_t *buf, bool async)
1916 {
1917 ram_counters.transferred += save_page_header(rs, rs->f, block,
1918 offset | RAM_SAVE_FLAG_PAGE);
1919 if (async) {
1920 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1921 migrate_release_ram() &
1922 migration_in_postcopy());
1923 } else {
1924 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1925 }
1926 ram_counters.transferred += TARGET_PAGE_SIZE;
1927 ram_counters.normal++;
1928 return 1;
1929 }
1930
1931 /**
1932 * ram_save_page: send the given page to the stream
1933 *
1934 * Returns the number of pages written.
1935 * < 0 - error
1936 * >=0 - Number of pages written - this might legally be 0
1937 * if xbzrle noticed the page was the same.
1938 *
1939 * @rs: current RAM state
1940 * @block: block that contains the page we want to send
1941 * @offset: offset inside the block for the page
1942 * @last_stage: if we are at the completion stage
1943 */
1944 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1945 {
1946 int pages = -1;
1947 uint8_t *p;
1948 bool send_async = true;
1949 RAMBlock *block = pss->block;
1950 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1951 ram_addr_t current_addr = block->offset + offset;
1952
1953 p = block->host + offset;
1954 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1955
1956 XBZRLE_cache_lock();
1957 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1958 migrate_use_xbzrle()) {
1959 pages = save_xbzrle_page(rs, &p, current_addr, block,
1960 offset, last_stage);
1961 if (!last_stage) {
1962 /* Can't send this cached data async, since the cache page
1963 * might get updated before it gets to the wire
1964 */
1965 send_async = false;
1966 }
1967 }
1968
1969 /* XBZRLE overflow or normal page */
1970 if (pages == -1) {
1971 pages = save_normal_page(rs, block, offset, p, send_async);
1972 }
1973
1974 XBZRLE_cache_unlock();
1975
1976 return pages;
1977 }
1978
1979 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1980 ram_addr_t offset)
1981 {
1982 multifd_queue_page(block, offset);
1983 ram_counters.normal++;
1984
1985 return 1;
1986 }
1987
1988 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1989 ram_addr_t offset, uint8_t *source_buf)
1990 {
1991 RAMState *rs = ram_state;
1992 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1993 bool zero_page = false;
1994 int ret;
1995
1996 if (save_zero_page_to_file(rs, f, block, offset)) {
1997 zero_page = true;
1998 goto exit;
1999 }
2000
2001 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2002
2003 /*
2004 * copy the page to an internal buffer to avoid it being modified by the
2005 * VM, so that we can catch any error during compression and
2006 * decompression
2007 */
2008 memcpy(source_buf, p, TARGET_PAGE_SIZE);
2009 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2010 if (ret < 0) {
2011 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2012 error_report("compressed data failed!");
2013 return false;
2014 }
2015
2016 exit:
2017 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2018 return zero_page;
2019 }
2020
2021 static void
2022 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2023 {
2024 ram_counters.transferred += bytes_xmit;
2025
2026 if (param->zero_page) {
2027 ram_counters.duplicate++;
2028 return;
2029 }
2030
2031 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2032 compression_counters.compressed_size += bytes_xmit - 8;
2033 compression_counters.pages++;
2034 }
2035
2036 static bool save_page_use_compression(RAMState *rs);
2037
2038 static void flush_compressed_data(RAMState *rs)
2039 {
2040 int idx, len, thread_count;
2041
2042 if (!save_page_use_compression(rs)) {
2043 return;
2044 }
2045 thread_count = migrate_compress_threads();
2046
2047 qemu_mutex_lock(&comp_done_lock);
2048 for (idx = 0; idx < thread_count; idx++) {
2049 while (!comp_param[idx].done) {
2050 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2051 }
2052 }
2053 qemu_mutex_unlock(&comp_done_lock);
2054
2055 for (idx = 0; idx < thread_count; idx++) {
2056 qemu_mutex_lock(&comp_param[idx].mutex);
2057 if (!comp_param[idx].quit) {
2058 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2059 /*
2060 * it's safe to fetch zero_page without holding comp_done_lock
2061 * as there is no further request submitted to the thread,
2062 * i.e, the thread should be waiting for a request at this point.
2063 */
2064 update_compress_thread_counts(&comp_param[idx], len);
2065 }
2066 qemu_mutex_unlock(&comp_param[idx].mutex);
2067 }
2068 }
2069
2070 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2071 ram_addr_t offset)
2072 {
2073 param->block = block;
2074 param->offset = offset;
2075 }
2076
2077 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2078 ram_addr_t offset)
2079 {
2080 int idx, thread_count, bytes_xmit = -1, pages = -1;
2081 bool wait = migrate_compress_wait_thread();
2082
2083 thread_count = migrate_compress_threads();
2084 qemu_mutex_lock(&comp_done_lock);
2085 retry:
2086 for (idx = 0; idx < thread_count; idx++) {
2087 if (comp_param[idx].done) {
2088 comp_param[idx].done = false;
2089 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2090 qemu_mutex_lock(&comp_param[idx].mutex);
2091 set_compress_params(&comp_param[idx], block, offset);
2092 qemu_cond_signal(&comp_param[idx].cond);
2093 qemu_mutex_unlock(&comp_param[idx].mutex);
2094 pages = 1;
2095 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2096 break;
2097 }
2098 }
2099
2100 /*
2101 * Wait for a free thread if the user specifies 'compress-wait-thread',
2102 * otherwise post the page out in the main thread as a normal page.
2103 */
2104 if (pages < 0 && wait) {
2105 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2106 goto retry;
2107 }
2108 qemu_mutex_unlock(&comp_done_lock);
2109
2110 return pages;
2111 }
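/*
 * A minimal sketch (assumptions marked) of the hand-off implemented above.
 * The caller owns comp_done_lock; each worker owns its own param mutex.
 * The worker loop is the compression thread function defined earlier in
 * this file (not shown here):
 *
 *   main thread                            worker idx
 *   -----------                            ----------
 *   lock(comp_done_lock)
 *   find idx with done == true
 *   done = false
 *   flush previous output into rs->f
 *   lock(param->mutex)
 *   param->block / param->offset = ...
 *   signal(param->cond)
 *   unlock(param->mutex)
 *   unlock(comp_done_lock)                 wakes, compresses the page into
 *                                          its own QEMUFile buffer, sets
 *                                          done = true, signals
 *                                          comp_done_cond
 *
 * If no worker is idle and 'compress-wait-thread' is set, the caller sleeps
 * on comp_done_cond instead of falling back to the normal page path.
 */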
2112
2113 /**
2114 * find_dirty_block: find the next dirty page and update any state
2115 * associated with the search process.
2116 *
2117 * Returns true if a page is found
2118 *
2119 * @rs: current RAM state
2120 * @pss: data about the state of the current dirty page scan
2121 * @again: set to false if the search has scanned the whole of RAM
2122 */
2123 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2124 {
2125 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2126 if (pss->complete_round && pss->block == rs->last_seen_block &&
2127 pss->page >= rs->last_page) {
2128 /*
2129 * We've been once around the RAM and haven't found anything.
2130 * Give up.
2131 */
2132 *again = false;
2133 return false;
2134 }
2135 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2136 /* Didn't find anything in this RAM Block */
2137 pss->page = 0;
2138 pss->block = QLIST_NEXT_RCU(pss->block, next);
2139 if (!pss->block) {
2140 /*
2141 * If memory migration starts over, we will meet a dirtied page
2142 * which may still exist in the compression threads' ring, so we
2143 * should flush the compressed data to make sure the new page
2144 * is not overwritten by the old one on the destination.
2145 *
2146 * Also, if xbzrle is on, stop using data compression at this
2147 * point. In theory, xbzrle can do better than compression.
2148 */
2149 flush_compressed_data(rs);
2150
2151 /* Hit the end of the list */
2152 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2153 /* Flag that we've looped */
2154 pss->complete_round = true;
2155 rs->ram_bulk_stage = false;
2156 }
2157 /* Didn't find anything this time, but try again on the new block */
2158 *again = true;
2159 return false;
2160 } else {
2161 /* Can go around again, but... */
2162 *again = true;
2163 /* We've found something so probably don't need to */
2164 return true;
2165 }
2166 }
2167
2168 /**
2169 * unqueue_page: gets a page off the queue
2170 *
2171 * Helper for 'get_queued_page' - gets a page off the queue
2172 *
2173 * Returns the block of the page (or NULL if none available)
2174 *
2175 * @rs: current RAM state
2176 * @offset: used to return the offset within the RAMBlock
2177 */
2178 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2179 {
2180 RAMBlock *block = NULL;
2181
2182 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2183 return NULL;
2184 }
2185
2186 qemu_mutex_lock(&rs->src_page_req_mutex);
2187 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2188 struct RAMSrcPageRequest *entry =
2189 QSIMPLEQ_FIRST(&rs->src_page_requests);
2190 block = entry->rb;
2191 *offset = entry->offset;
2192
2193 if (entry->len > TARGET_PAGE_SIZE) {
2194 entry->len -= TARGET_PAGE_SIZE;
2195 entry->offset += TARGET_PAGE_SIZE;
2196 } else {
2197 memory_region_unref(block->mr);
2198 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2199 g_free(entry);
2200 migration_consume_urgent_request();
2201 }
2202 }
2203 qemu_mutex_unlock(&rs->src_page_req_mutex);
2204
2205 return block;
2206 }
2207
2208 /**
2209 * get_queued_page: unqueue a page from the postcopy requests
2210 *
2211 * Skips pages that are already sent (!dirty)
2212 *
2213 * Returns true if a queued page is found
2214 *
2215 * @rs: current RAM state
2216 * @pss: data about the state of the current dirty page scan
2217 */
2218 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2219 {
2220 RAMBlock *block;
2221 ram_addr_t offset;
2222 bool dirty;
2223
2224 do {
2225 block = unqueue_page(rs, &offset);
2226 /*
2227 * We're sending this page, and since it's postcopy nothing else
2228 * will dirty it, and we must make sure it doesn't get sent again
2229 * even if this queue request was received after the background
2230 * search already sent it.
2231 */
2232 if (block) {
2233 unsigned long page;
2234
2235 page = offset >> TARGET_PAGE_BITS;
2236 dirty = test_bit(page, block->bmap);
2237 if (!dirty) {
2238 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2239 page, test_bit(page, block->unsentmap));
2240 } else {
2241 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2242 }
2243 }
2244
2245 } while (block && !dirty);
2246
2247 if (block) {
2248 /*
2249 * As soon as we start servicing pages out of order, then we have
2250 * to kill the bulk stage, since the bulk stage assumes
2251 * in (migration_bitmap_find_and_reset_dirty) that every page is
2252 * dirty, that's no longer true.
2253 */
2254 rs->ram_bulk_stage = false;
2255
2256 /*
2257 * We want the background search to continue from the queued page
2258 * since the guest is likely to want other pages near to the page
2259 * it just requested.
2260 */
2261 pss->block = block;
2262 pss->page = offset >> TARGET_PAGE_BITS;
2263 }
2264
2265 return !!block;
2266 }
2267
2268 /**
2269 * migration_page_queue_free: drop any remaining pages in the ram
2270 * request queue
2271 *
2272 * It should be empty at the end anyway, but in error cases there may
2273 * be some left; if any page is left, we drop it.
2274 *
2275 */
2276 static void migration_page_queue_free(RAMState *rs)
2277 {
2278 struct RAMSrcPageRequest *mspr, *next_mspr;
2279 /* This queue generally should be empty - but in the case of a failed
2280 * migration it might have some leftovers.
2281 */
2282 rcu_read_lock();
2283 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2284 memory_region_unref(mspr->rb->mr);
2285 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2286 g_free(mspr);
2287 }
2288 rcu_read_unlock();
2289 }
2290
2291 /**
2292 * ram_save_queue_pages: queue the page for transmission
2293 *
2294 * A request from postcopy destination for example.
2295 *
2296 * Returns zero on success or negative on error
2297 *
2298 * @rbname: Name of the RAMBlock of the request. NULL means the
2299 * same as the last one.
2300 * @start: starting address from the start of the RAMBlock
2301 * @len: length (in bytes) to send
2302 */
2303 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2304 {
2305 RAMBlock *ramblock;
2306 RAMState *rs = ram_state;
2307
2308 ram_counters.postcopy_requests++;
2309 rcu_read_lock();
2310 if (!rbname) {
2311 /* Reuse last RAMBlock */
2312 ramblock = rs->last_req_rb;
2313
2314 if (!ramblock) {
2315 /*
2316 * Shouldn't happen, we can't reuse the last RAMBlock if
2317 * it's the 1st request.
2318 */
2319 error_report("ram_save_queue_pages no previous block");
2320 goto err;
2321 }
2322 } else {
2323 ramblock = qemu_ram_block_by_name(rbname);
2324
2325 if (!ramblock) {
2326 /* We shouldn't be asked for a non-existent RAMBlock */
2327 error_report("ram_save_queue_pages no block '%s'", rbname);
2328 goto err;
2329 }
2330 rs->last_req_rb = ramblock;
2331 }
2332 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2333 if (start + len > ramblock->used_length) {
2334 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2335 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2336 __func__, start, len, ramblock->used_length);
2337 goto err;
2338 }
2339
2340 struct RAMSrcPageRequest *new_entry =
2341 g_malloc0(sizeof(struct RAMSrcPageRequest));
2342 new_entry->rb = ramblock;
2343 new_entry->offset = start;
2344 new_entry->len = len;
2345
2346 memory_region_ref(ramblock->mr);
2347 qemu_mutex_lock(&rs->src_page_req_mutex);
2348 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2349 migration_make_urgent_request();
2350 qemu_mutex_unlock(&rs->src_page_req_mutex);
2351 rcu_read_unlock();
2352
2353 return 0;
2354
2355 err:
2356 rcu_read_unlock();
2357 return -1;
2358 }
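/*
 * Usage sketch (hypothetical values): the postcopy destination asks the
 * source to prioritise a faulting page, and that request ends up here (the
 * caller lives outside this file), e.g.
 *
 *   ram_save_queue_pages("pc.ram", 0x7c2000, TARGET_PAGE_SIZE);
 *
 * The entry is appended to rs->src_page_requests and picked up by
 * get_queued_page() above before the background dirty scan continues.
 * "pc.ram" and the offset are made-up example values.
 */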
2359
2360 static bool save_page_use_compression(RAMState *rs)
2361 {
2362 if (!migrate_use_compression()) {
2363 return false;
2364 }
2365
2366 /*
2367 * If xbzrle is on, stop using the data compression after first
2368 * round of migration even if compression is enabled. In theory,
2369 * xbzrle can do better than compression.
2370 */
2371 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2372 return true;
2373 }
2374
2375 return false;
2376 }
2377
2378 /*
2379 * Try to compress the page before posting it out. Returns true if the
2380 * page has been properly handled by compression, otherwise it needs
2381 * other paths to handle it.
2382 */
2383 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2384 {
2385 if (!save_page_use_compression(rs)) {
2386 return false;
2387 }
2388
2389 /*
2390 * When starting the process of a new block, the first page of
2391 * the block should be sent out before other pages in the same
2392 * block, and all the pages in the last block should have been
2393 * sent out. Keeping this order is important, because the 'cont'
2394 * flag is used to avoid resending the block name.
2395 *
2396 * We post the first page as a normal page since compression will
2397 * take much CPU resource.
2398 */
2399 if (block != rs->last_sent_block) {
2400 flush_compressed_data(rs);
2401 return false;
2402 }
2403
2404 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2405 return true;
2406 }
2407
2408 compression_counters.busy++;
2409 return false;
2410 }
2411
2412 /**
2413 * ram_save_target_page: save one target page
2414 *
2415 * Returns the number of pages written
2416 *
2417 * @rs: current RAM state
2418 * @pss: data about the page we want to send
2419 * @last_stage: if we are at the completion stage
2420 */
2421 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2422 bool last_stage)
2423 {
2424 RAMBlock *block = pss->block;
2425 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2426 int res;
2427
2428 if (control_save_page(rs, block, offset, &res)) {
2429 return res;
2430 }
2431
2432 if (save_compress_page(rs, block, offset)) {
2433 return 1;
2434 }
2435
2436 res = save_zero_page(rs, block, offset);
2437 if (res > 0) {
2438 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2439 * page would be stale
2440 */
2441 if (!save_page_use_compression(rs)) {
2442 XBZRLE_cache_lock();
2443 xbzrle_cache_zero_page(rs, block->offset + offset);
2444 XBZRLE_cache_unlock();
2445 }
2446 ram_release_pages(block->idstr, offset, res);
2447 return res;
2448 }
2449
2450 /*
2451 * do not use multifd for compression as the first page in the new
2452 * block should be posted out before sending the compressed page
2453 */
2454 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2455 return ram_save_multifd_page(rs, block, offset);
2456 }
2457
2458 return ram_save_page(rs, pss, last_stage);
2459 }
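/*
 * Summary sketch of the decision order in ram_save_target_page() above,
 * for one target page (pseudo-code, not compiled):
 *
 *   if (control_save_page())             -> RDMA/control path took it
 *   else if (save_compress_page())       -> handed to a compression thread
 *   else if (save_zero_page() > 0)       -> header + one zero byte sent
 *   else if (multifd in use and
 *            compression not in use)     -> queued on a multifd channel
 *   else                                 -> ram_save_page(): xbzrle or raw
 */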
2460
2461 /**
2462 * ram_save_host_page: save a whole host page
2463 *
2464 * Starting at *offset send pages up to the end of the current host
2465 * page. It's valid for the initial offset to point into the middle of
2466 * a host page, in which case the remainder of the host page is sent.
2467 * Only dirty target pages are sent. Note that the host page size may
2468 * be a huge page for this block.
2469 * The saving stops at the boundary of the used_length of the block
2470 * if the RAMBlock isn't a multiple of the host page size.
2471 *
2472 * Returns the number of pages written or negative on error
2473 *
2474 * @rs: current RAM state
2476 * @pss: data about the page we want to send
2477 * @last_stage: if we are at the completion stage
2478 */
2479 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2480 bool last_stage)
2481 {
2482 int tmppages, pages = 0;
2483 size_t pagesize_bits =
2484 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2485
2486 if (ramblock_is_ignored(pss->block)) {
2487 error_report("block %s should not be migrated !", pss->block->idstr);
2488 return 0;
2489 }
2490
2491 do {
2492 /* Check if the page is dirty and if so, send it */
2493 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2494 pss->page++;
2495 continue;
2496 }
2497
2498 tmppages = ram_save_target_page(rs, pss, last_stage);
2499 if (tmppages < 0) {
2500 return tmppages;
2501 }
2502
2503 pages += tmppages;
2504 if (pss->block->unsentmap) {
2505 clear_bit(pss->page, pss->block->unsentmap);
2506 }
2507
2508 pss->page++;
2509 } while ((pss->page & (pagesize_bits - 1)) &&
2510 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2511
2512 /* The offset we leave with is the last one we looked at */
2513 pss->page--;
2514 return pages;
2515 }
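/*
 * Worked example (assumed sizes): for a RAMBlock backed by 2MiB huge pages
 * and a 4KiB target page, qemu_ram_pagesize() returns 2MiB, so
 * pagesize_bits == 512 and the loop condition
 *
 *   (pss->page & (pagesize_bits - 1)) != 0
 *
 * keeps sending target pages until pss->page reaches the next multiple of
 * 512, i.e. the next host-page boundary (or the end of used_length).
 */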
2516
2517 /**
2518 * ram_find_and_save_block: finds a dirty page and sends it to f
2519 *
2520 * Called within an RCU critical section.
2521 *
2522 * Returns the number of pages written where zero means no dirty pages,
2523 * or negative on error
2524 *
2525 * @rs: current RAM state
2526 * @last_stage: if we are at the completion stage
2527 *
2528 * On systems where host-page-size > target-page-size it will send all the
2529 * pages in a host page that are dirty.
2530 */
2531
2532 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2533 {
2534 PageSearchStatus pss;
2535 int pages = 0;
2536 bool again, found;
2537
2538 /* No dirty page as there is zero RAM */
2539 if (!ram_bytes_total()) {
2540 return pages;
2541 }
2542
2543 pss.block = rs->last_seen_block;
2544 pss.page = rs->last_page;
2545 pss.complete_round = false;
2546
2547 if (!pss.block) {
2548 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2549 }
2550
2551 do {
2552 again = true;
2553 found = get_queued_page(rs, &pss);
2554
2555 if (!found) {
2556 /* priority queue empty, so just search for something dirty */
2557 found = find_dirty_block(rs, &pss, &again);
2558 }
2559
2560 if (found) {
2561 pages = ram_save_host_page(rs, &pss, last_stage);
2562 }
2563 } while (!pages && again);
2564
2565 rs->last_seen_block = pss.block;
2566 rs->last_page = pss.page;
2567
2568 return pages;
2569 }
2570
2571 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2572 {
2573 uint64_t pages = size / TARGET_PAGE_SIZE;
2574
2575 if (zero) {
2576 ram_counters.duplicate += pages;
2577 } else {
2578 ram_counters.normal += pages;
2579 ram_counters.transferred += size;
2580 qemu_update_position(f, size);
2581 }
2582 }
2583
2584 static uint64_t ram_bytes_total_common(bool count_ignored)
2585 {
2586 RAMBlock *block;
2587 uint64_t total = 0;
2588
2589 rcu_read_lock();
2590 if (count_ignored) {
2591 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2592 total += block->used_length;
2593 }
2594 } else {
2595 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2596 total += block->used_length;
2597 }
2598 }
2599 rcu_read_unlock();
2600 return total;
2601 }
2602
2603 uint64_t ram_bytes_total(void)
2604 {
2605 return ram_bytes_total_common(false);
2606 }
2607
2608 static void xbzrle_load_setup(void)
2609 {
2610 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2611 }
2612
2613 static void xbzrle_load_cleanup(void)
2614 {
2615 g_free(XBZRLE.decoded_buf);
2616 XBZRLE.decoded_buf = NULL;
2617 }
2618
2619 static void ram_state_cleanup(RAMState **rsp)
2620 {
2621 if (*rsp) {
2622 migration_page_queue_free(*rsp);
2623 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2624 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2625 g_free(*rsp);
2626 *rsp = NULL;
2627 }
2628 }
2629
2630 static void xbzrle_cleanup(void)
2631 {
2632 XBZRLE_cache_lock();
2633 if (XBZRLE.cache) {
2634 cache_fini(XBZRLE.cache);
2635 g_free(XBZRLE.encoded_buf);
2636 g_free(XBZRLE.current_buf);
2637 g_free(XBZRLE.zero_target_page);
2638 XBZRLE.cache = NULL;
2639 XBZRLE.encoded_buf = NULL;
2640 XBZRLE.current_buf = NULL;
2641 XBZRLE.zero_target_page = NULL;
2642 }
2643 XBZRLE_cache_unlock();
2644 }
2645
2646 static void ram_save_cleanup(void *opaque)
2647 {
2648 RAMState **rsp = opaque;
2649 RAMBlock *block;
2650
2651 /* The caller must hold the iothread lock or be in a BH, so there is
2652 * no write race against this migration_bitmap
2653 */
2654 memory_global_dirty_log_stop();
2655
2656 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2657 g_free(block->bmap);
2658 block->bmap = NULL;
2659 g_free(block->unsentmap);
2660 block->unsentmap = NULL;
2661 }
2662
2663 xbzrle_cleanup();
2664 compress_threads_save_cleanup();
2665 ram_state_cleanup(rsp);
2666 }
2667
2668 static void ram_state_reset(RAMState *rs)
2669 {
2670 rs->last_seen_block = NULL;
2671 rs->last_sent_block = NULL;
2672 rs->last_page = 0;
2673 rs->last_version = ram_list.version;
2674 rs->ram_bulk_stage = true;
2675 rs->fpo_enabled = false;
2676 }
2677
2678 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2679
2680 /*
2681 * 'expected' is the value you expect the bitmap mostly to be full
2682 * of; it won't bother printing lines that are all this value.
2684 */
2685 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2686 unsigned long pages)
2687 {
2688 int64_t cur;
2689 int64_t linelen = 128;
2690 char linebuf[129];
2691
2692 for (cur = 0; cur < pages; cur += linelen) {
2693 int64_t curb;
2694 bool found = false;
2695 /*
2696 * Last line; catch the case where the line length
2697 * is longer than remaining ram
2698 */
2699 if (cur + linelen > pages) {
2700 linelen = pages - cur;
2701 }
2702 for (curb = 0; curb < linelen; curb++) {
2703 bool thisbit = test_bit(cur + curb, todump);
2704 linebuf[curb] = thisbit ? '1' : '.';
2705 found = found || (thisbit != expected);
2706 }
2707 if (found) {
2708 linebuf[curb] = '\0';
2709 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2710 }
2711 }
2712 }
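/*
 * Example of the stderr output produced above when expected == false, for
 * a bitmap with a few dirty pages near the start (illustrative values only,
 * line shortened; real lines are up to 128 characters wide):
 *
 *   0x00000000 : 11........1.....................
 *
 * Lines whose bits all match 'expected' are skipped entirely.
 */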
2713
2714 /* **** functions for postcopy ***** */
2715
2716 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2717 {
2718 struct RAMBlock *block;
2719
2720 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2721 unsigned long *bitmap = block->bmap;
2722 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2723 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2724
2725 while (run_start < range) {
2726 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2727 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2728 (run_end - run_start) << TARGET_PAGE_BITS);
2729 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2730 }
2731 }
2732 }
2733
2734 /**
2735 * postcopy_send_discard_bm_ram: discard a RAMBlock
2736 *
2737 * Returns zero on success
2738 *
2739 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2740 * Note: At this point the 'unsentmap' is the processed bitmap combined
2741 * with the dirtymap; so a '1' means it's either dirty or unsent.
2742 *
2743 * @ms: current migration state
2744 * @pds: state for postcopy
2745 * @block: RAMBlock whose unsentmap determines the ranges to discard
2747 */
2748 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2749 PostcopyDiscardState *pds,
2750 RAMBlock *block)
2751 {
2752 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2753 unsigned long current;
2754 unsigned long *unsentmap = block->unsentmap;
2755
2756 for (current = 0; current < end; ) {
2757 unsigned long one = find_next_bit(unsentmap, end, current);
2758
2759 if (one <= end) {
2760 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2761 unsigned long discard_length;
2762
2763 if (zero >= end) {
2764 discard_length = end - one;
2765 } else {
2766 discard_length = zero - one;
2767 }
2768 if (discard_length) {
2769 postcopy_discard_send_range(ms, pds, one, discard_length);
2770 }
2771 current = one + discard_length;
2772 } else {
2773 current = one;
2774 }
2775 }
2776
2777 return 0;
2778 }
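/*
 * Worked example (illustrative bitmap): with end == 16 and an unsentmap of
 * 0000111100110000 (bit 0 on the left), the loop above emits two discard
 * ranges via postcopy_discard_send_range():
 *
 *   (start = 4,  length = 4)   bits 4..7
 *   (start = 10, length = 2)   bits 10..11
 *
 * i.e. each maximal run of '1' bits (unsent or dirty pages) becomes one
 * range in the discard command.
 */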
2779
2780 /**
2781 * postcopy_each_ram_send_discard: discard all RAMBlocks
2782 *
2783 * Returns 0 for success or negative for error
2784 *
2785 * Utility for the outgoing postcopy code.
2786 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2787 * passing it bitmap indexes and name.
2788 * (qemu_ram_foreach_block ends up passing unscaled lengths
2789 * which would mean postcopy code would have to deal with target page)
2790 *
2791 * @ms: current migration state
2792 */
2793 static int postcopy_each_ram_send_discard(MigrationState *ms)
2794 {
2795 struct RAMBlock *block;
2796 int ret;
2797
2798 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2799 PostcopyDiscardState *pds =
2800 postcopy_discard_send_init(ms, block->idstr);
2801
2802 /*
2803 * Postcopy sends chunks of bitmap over the wire, but it
2804 * just needs indexes at this point, which avoids it having
2805 * target page specific code.
2806 */
2807 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2808 postcopy_discard_send_finish(ms, pds);
2809 if (ret) {
2810 return ret;
2811 }
2812 }
2813
2814 return 0;
2815 }
2816
2817 /**
2818 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2819 *
2820 * Helper for postcopy_chunk_hostpages; it's called twice to
2821 * canonicalize the two bitmaps, which are similar, but one is
2822 * inverted.
2823 *
2824 * Postcopy requires that all target pages in a hostpage are dirty or
2825 * clean, not a mix. This function canonicalizes the bitmaps.
2826 *
2827 * @ms: current migration state
2828 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2829 * otherwise we need to canonicalize partially dirty host pages
2830 * @block: block that contains the page we want to canonicalize
2831 * @pds: state for postcopy
2832 */
2833 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2834 RAMBlock *block,
2835 PostcopyDiscardState *pds)
2836 {
2837 RAMState *rs = ram_state;
2838 unsigned long *bitmap = block->bmap;
2839 unsigned long *unsentmap = block->unsentmap;
2840 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2841 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2842 unsigned long run_start;
2843
2844 if (block->page_size == TARGET_PAGE_SIZE) {
2845 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2846 return;
2847 }
2848
2849 if (unsent_pass) {
2850 /* Find a sent page */
2851 run_start = find_next_zero_bit(unsentmap, pages, 0);
2852 } else {
2853 /* Find a dirty page */
2854 run_start = find_next_bit(bitmap, pages, 0);
2855 }
2856
2857 while (run_start < pages) {
2858 bool do_fixup = false;
2859 unsigned long fixup_start_addr;
2860 unsigned long host_offset;
2861
2862 /*
2863 * If the start of this run of pages is in the middle of a host
2864 * page, then we need to fixup this host page.
2865 */
2866 host_offset = run_start % host_ratio;
2867 if (host_offset) {
2868 do_fixup = true;
2869 run_start -= host_offset;
2870 fixup_start_addr = run_start;
2871 /* For the next pass */
2872 run_start = run_start + host_ratio;
2873 } else {
2874 /* Find the end of this run */
2875 unsigned long run_end;
2876 if (unsent_pass) {
2877 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2878 } else {
2879 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2880 }
2881 /*
2882 * If the end isn't at the start of a host page, then the
2883 * run doesn't finish at the end of a host page
2884 * and we need to discard.
2885 */
2886 host_offset = run_end % host_ratio;
2887 if (host_offset) {
2888 do_fixup = true;
2889 fixup_start_addr = run_end - host_offset;
2890 /*
2891 * This host page has gone, the next loop iteration starts
2892 * from after the fixup
2893 */
2894 run_start = fixup_start_addr + host_ratio;
2895 } else {
2896 /*
2897 * No discards on this iteration, next loop starts from
2898 * next sent/dirty page
2899 */
2900 run_start = run_end + 1;
2901 }
2902 }
2903
2904 if (do_fixup) {
2905 unsigned long page;
2906
2907 /* Tell the destination to discard this page */
2908 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2909 /* For the unsent_pass we:
2910 * discard partially sent pages
2911 * For the !unsent_pass (dirty) we:
2912 * discard partially dirty pages that were sent
2913 * (any partially sent pages were already discarded
2914 * by the previous unsent_pass)
2915 */
2916 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2917 host_ratio);
2918 }
2919
2920 /* Clean up the bitmap */
2921 for (page = fixup_start_addr;
2922 page < fixup_start_addr + host_ratio; page++) {
2923 /* All pages in this host page are now not sent */
2924 set_bit(page, unsentmap);
2925
2926 /*
2927 * Remark them as dirty, updating the count for any pages
2928 * that weren't previously dirty.
2929 */
2930 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2931 }
2932 }
2933
2934 if (unsent_pass) {
2935 /* Find the next sent page for the next iteration */
2936 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2937 } else {
2938 /* Find the next dirty page for the next iteration */
2939 run_start = find_next_bit(bitmap, pages, run_start);
2940 }
2941 }
2942 }
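/*
 * Worked example (assumed sizes): host page 2MiB, target page 4KiB, so
 * host_ratio == 512.  Suppose the dirty pass (!unsent_pass) finds a run of
 * dirty bits starting at page 1000.  Since 1000 % 512 == 488, the run
 * starts in the middle of a host page: the fixup rewinds run_start to 512,
 * asks the destination to discard pages 512..1023 (if that host page had
 * been sent), and marks all 512 target pages of that host page both unsent
 * and dirty, so the whole host page is resent as one unit.
 */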
2943
2944 /**
2945 * postcopy_chunk_hostpages: discard any partially sent host page
2946 *
2947 * Utility for the outgoing postcopy code.
2948 *
2949 * Discard any partially sent host-page size chunks, mark any partially
2950 * dirty host-page size chunks as all dirty. In this case the host-page
2951 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2952 *
2953 * Returns zero on success
2954 *
2955 * @ms: current migration state
2956 * @block: block we want to work with
2957 */
2958 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2959 {
2960 PostcopyDiscardState *pds =
2961 postcopy_discard_send_init(ms, block->idstr);
2962
2963 /* First pass: Discard all partially sent host pages */
2964 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2965 /*
2966 * Second pass: Ensure that all partially dirty host pages are made
2967 * fully dirty.
2968 */
2969 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2970
2971 postcopy_discard_send_finish(ms, pds);
2972 return 0;
2973 }
2974
2975 /**
2976 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2977 *
2978 * Returns zero on success
2979 *
2980 * Transmit the set of pages to be discarded after precopy to the target;
2981 * these are pages that:
2982 * a) Have been previously transmitted but are now dirty again
2983 * b) Have never been transmitted; this ensures that
2984 * any pages on the destination that have been mapped by background
2985 * tasks get discarded (transparent huge pages are the specific concern)
2986 * Hopefully this is pretty sparse
2987 *
2988 * @ms: current migration state
2989 */
2990 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2991 {
2992 RAMState *rs = ram_state;
2993 RAMBlock *block;
2994 int ret;
2995
2996 rcu_read_lock();
2997
2998 /* This should be our last sync, the src is now paused */
2999 migration_bitmap_sync(rs);
3000
3001 /* Easiest way to make sure we don't resume in the middle of a host-page */
3002 rs->last_seen_block = NULL;
3003 rs->last_sent_block = NULL;
3004 rs->last_page = 0;
3005
3006 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3007 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3008 unsigned long *bitmap = block->bmap;
3009 unsigned long *unsentmap = block->unsentmap;
3010
3011 if (!unsentmap) {
3012 /* We don't have a safe way to resize the unsentmap, so
3013 * if the bitmap was resized it will be NULL at this
3014 * point.
3015 */
3016 error_report("migration ram resized during precopy phase");
3017 rcu_read_unlock();
3018 return -EINVAL;
3019 }
3020 /* Deal with TPS != HPS and huge pages */
3021 ret = postcopy_chunk_hostpages(ms, block);
3022 if (ret) {
3023 rcu_read_unlock();
3024 return ret;
3025 }
3026
3027 /*
3028 * Update the unsentmap to be unsentmap = unsentmap | dirty
3029 */
3030 bitmap_or(unsentmap, unsentmap, bitmap, pages);
3031 #ifdef DEBUG_POSTCOPY
3032 ram_debug_dump_bitmap(unsentmap, true, pages);
3033 #endif
3034 }
3035 trace_ram_postcopy_send_discard_bitmap();
3036
3037 ret = postcopy_each_ram_send_discard(ms);
3038 rcu_read_unlock();
3039
3040 return ret;
3041 }
3042
3043 /**
3044 * ram_discard_range: discard dirtied pages at the beginning of postcopy
3045 *
3046 * Returns zero on success
3047 *
3048 * @rbname: name of the RAMBlock of the request. NULL means the
3049 * same as the last one.
3050 * @start: byte offset within the RAMBlock
3051 * @length: number of bytes to discard
3052 */
3053 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3054 {
3055 int ret = -1;
3056
3057 trace_ram_discard_range(rbname, start, length);
3058
3059 rcu_read_lock();
3060 RAMBlock *rb = qemu_ram_block_by_name(rbname);
3061
3062 if (!rb) {
3063 error_report("ram_discard_range: Failed to find block '%s'", rbname);
3064 goto err;
3065 }
3066
3067 /*
3068 * On source VM, we don't need to update the received bitmap since
3069 * we don't even have one.
3070 */
3071 if (rb->receivedmap) {
3072 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3073 length >> qemu_target_page_bits());
3074 }
3075
3076 ret = ram_block_discard_range(rb, start, length);
3077
3078 err:
3079 rcu_read_unlock();
3080
3081 return ret;
3082 }
3083
3084 /*
3085 * For every allocation, we will try not to crash the VM if the
3086 * allocation fails.
3087 */
3088 static int xbzrle_init(void)
3089 {
3090 Error *local_err = NULL;
3091
3092 if (!migrate_use_xbzrle()) {
3093 return 0;
3094 }
3095
3096 XBZRLE_cache_lock();
3097
3098 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3099 if (!XBZRLE.zero_target_page) {
3100 error_report("%s: Error allocating zero page", __func__);
3101 goto err_out;
3102 }
3103
3104 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3105 TARGET_PAGE_SIZE, &local_err);
3106 if (!XBZRLE.cache) {
3107 error_report_err(local_err);
3108 goto free_zero_page;
3109 }
3110
3111 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3112 if (!XBZRLE.encoded_buf) {
3113 error_report("%s: Error allocating encoded_buf", __func__);
3114 goto free_cache;
3115 }
3116
3117 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3118 if (!XBZRLE.current_buf) {
3119 error_report("%s: Error allocating current_buf", __func__);
3120 goto free_encoded_buf;
3121 }
3122
3123 /* We are all good */
3124 XBZRLE_cache_unlock();
3125 return 0;
3126
3127 free_encoded_buf:
3128 g_free(XBZRLE.encoded_buf);
3129 XBZRLE.encoded_buf = NULL;
3130 free_cache:
3131 cache_fini(XBZRLE.cache);
3132 XBZRLE.cache = NULL;
3133 free_zero_page:
3134 g_free(XBZRLE.zero_target_page);
3135 XBZRLE.zero_target_page = NULL;
3136 err_out:
3137 XBZRLE_cache_unlock();
3138 return -ENOMEM;
3139 }
3140
3141 static int ram_state_init(RAMState **rsp)
3142 {
3143 *rsp = g_try_new0(RAMState, 1);
3144
3145 if (!*rsp) {
3146 error_report("%s: Init ramstate fail", __func__);
3147 return -1;
3148 }
3149
3150 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3151 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3152 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3153
3154 /*
3155 * Count the total number of pages used by ram blocks not including any
3156 * gaps due to alignment or unplugs.
3157 */
3158 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3159
3160 ram_state_reset(*rsp);
3161
3162 return 0;
3163 }
3164
3165 static void ram_list_init_bitmaps(void)
3166 {
3167 RAMBlock *block;
3168 unsigned long pages;
3169
3170 /* Skip setting bitmap if there is no RAM */
3171 if (ram_bytes_total()) {
3172 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3173 pages = block->max_length >> TARGET_PAGE_BITS;
3174 block->bmap = bitmap_new(pages);
3175 bitmap_set(block->bmap, 0, pages);
3176 if (migrate_postcopy_ram()) {
3177 block->unsentmap = bitmap_new(pages);
3178 bitmap_set(block->unsentmap, 0, pages);
3179 }
3180 }
3181 }
3182 }
3183
3184 static void ram_init_bitmaps(RAMState *rs)
3185 {
3186 /* For memory_global_dirty_log_start below. */
3187 qemu_mutex_lock_iothread();
3188 qemu_mutex_lock_ramlist();
3189 rcu_read_lock();
3190
3191 ram_list_init_bitmaps();
3192 memory_global_dirty_log_start();
3193 migration_bitmap_sync_precopy(rs);
3194
3195 rcu_read_unlock();
3196 qemu_mutex_unlock_ramlist();
3197 qemu_mutex_unlock_iothread();
3198 }
3199
3200 static int ram_init_all(RAMState **rsp)
3201 {
3202 if (ram_state_init(rsp)) {
3203 return -1;
3204 }
3205
3206 if (xbzrle_init()) {
3207 ram_state_cleanup(rsp);
3208 return -1;
3209 }
3210
3211 ram_init_bitmaps(*rsp);
3212
3213 return 0;
3214 }
3215
3216 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3217 {
3218 RAMBlock *block;
3219 uint64_t pages = 0;
3220
3221 /*
3222 * Postcopy is not using xbzrle/compression, so no need for that.
3223 * Also, since the source is already halted, we don't need to care
3224 * about dirty page logging either.
3225 */
3226
3227 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3228 pages += bitmap_count_one(block->bmap,
3229 block->used_length >> TARGET_PAGE_BITS);
3230 }
3231
3232 /* This may not be aligned with current bitmaps. Recalculate. */
3233 rs->migration_dirty_pages = pages;
3234
3235 rs->last_seen_block = NULL;
3236 rs->last_sent_block = NULL;
3237 rs->last_page = 0;
3238 rs->last_version = ram_list.version;
3239 /*
3240 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3241 * matter what we have sent.
3242 */
3243 rs->ram_bulk_stage = false;
3244
3245 /* Update RAMState cache of output QEMUFile */
3246 rs->f = out;
3247
3248 trace_ram_state_resume_prepare(pages);
3249 }
3250
3251 /*
3252 * This function clears bits of the free pages reported by the caller from the
3253 * migration dirty bitmap. @addr is the host address corresponding to the
3254 * start of the contiguous guest free pages, and @len is the total bytes of
3255 * those pages.
3256 */
3257 void qemu_guest_free_page_hint(void *addr, size_t len)
3258 {
3259 RAMBlock *block;
3260 ram_addr_t offset;
3261 size_t used_len, start, npages;
3262 MigrationState *s = migrate_get_current();
3263
3264 /* This function is currently expected to be used during live migration */
3265 if (!migration_is_setup_or_active(s->state)) {
3266 return;
3267 }
3268
3269 for (; len > 0; len -= used_len, addr += used_len) {
3270 block = qemu_ram_block_from_host(addr, false, &offset);
3271 if (unlikely(!block || offset >= block->used_length)) {
3272 /*
3273 * The implementation might not support RAMBlock resize during
3274 * live migration, but it could happen in theory with future
3275 * updates. So we add a check here to capture that case.
3276 */
3277 error_report_once("%s unexpected error", __func__);
3278 return;
3279 }
3280
3281 if (len <= block->used_length - offset) {
3282 used_len = len;
3283 } else {
3284 used_len = block->used_length - offset;
3285 }
3286
3287 start = offset >> TARGET_PAGE_BITS;
3288 npages = used_len >> TARGET_PAGE_BITS;
3289
3290 qemu_mutex_lock(&ram_state->bitmap_mutex);
3291 ram_state->migration_dirty_pages -=
3292 bitmap_count_one_with_offset(block->bmap, start, npages);
3293 bitmap_clear(block->bmap, start, npages);
3294 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3295 }
3296 }
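/*
 * Worked example (illustrative values): a hint whose addr maps into block B
 * at offset 0x100000 with len = 64KiB and 4KiB target pages clears
 * npages == 16 bits starting at start == offset >> TARGET_PAGE_BITS in B's
 * dirty bitmap, and decreases migration_dirty_pages by however many of
 * those 16 bits were actually set.  A hint that crosses a block boundary is
 * split across iterations of the loop above via used_len.
 */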
3297
3298 /*
3299 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3300 * a long-running RCU critical section. When rcu-reclaims in the code
3301 * start to become numerous it will be necessary to reduce the
3302 * granularity of these critical sections.
3303 */
3304
3305 /**
3306 * ram_save_setup: Setup RAM for migration
3307 *
3308 * Returns zero to indicate success and negative for error
3309 *
3310 * @f: QEMUFile where to send the data
3311 * @opaque: RAMState pointer
3312 */
3313 static int ram_save_setup(QEMUFile *f, void *opaque)
3314 {
3315 RAMState **rsp = opaque;
3316 RAMBlock *block;
3317
3318 if (compress_threads_save_setup()) {
3319 return -1;
3320 }
3321
3322 /* migration has already set up the bitmap, reuse it. */
3323 if (!migration_in_colo_state()) {
3324 if (ram_init_all(rsp) != 0) {
3325 compress_threads_save_cleanup();
3326 return -1;
3327 }
3328 }
3329 (*rsp)->f = f;
3330
3331 rcu_read_lock();
3332
3333 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3334
3335 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3336 qemu_put_byte(f, strlen(block->idstr));
3337 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3338 qemu_put_be64(f, block->used_length);
3339 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3340 qemu_put_be64(f, block->page_size);
3341 }
3342 if (migrate_ignore_shared()) {
3343 qemu_put_be64(f, block->mr->addr);
3344 qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3345 }
3346 }
3347
3348 rcu_read_unlock();
3349
3350 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3351 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3352
3353 multifd_send_sync_main();
3354 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3355 qemu_fflush(f);
3356
3357 return 0;
3358 }
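/*
 * Illustrative layout (not compiled) of the setup section produced above;
 * the per-block optional fields depend on the negotiated capabilities:
 *
 *   be64  total ram size | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *       u8    strlen(idstr)
 *       bytes idstr
 *       be64  used_length
 *       be64  page_size            (only with postcopy-ram and
 *                                   page_size != qemu_host_page_size)
 *       be64  mr->addr, u8 ignored (only with ignore-shared)
 *   be64  RAM_SAVE_FLAG_EOS
 */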
3359
3360 /**
3361 * ram_save_iterate: iterative stage for migration
3362 *
3363 * Returns zero to indicate success and negative for error
3364 *
3365 * @f: QEMUFile where to send the data
3366 * @opaque: RAMState pointer
3367 */
3368 static int ram_save_iterate(QEMUFile *f, void *opaque)
3369 {
3370 RAMState **temp = opaque;
3371 RAMState *rs = *temp;
3372 int ret;
3373 int i;
3374 int64_t t0;
3375 int done = 0;
3376
3377 if (blk_mig_bulk_active()) {
3378 /* Avoid transferring ram during bulk phase of block migration as
3379 * the bulk phase will usually take a long time and transferring
3380 * ram updates during that time is pointless. */
3381 goto out;
3382 }
3383
3384 rcu_read_lock();
3385 if (ram_list.version != rs->last_version) {
3386 ram_state_reset(rs);
3387 }
3388
3389 /* Read version before ram_list.blocks */
3390 smp_rmb();
3391
3392 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3393
3394 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3395 i = 0;
3396 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3397 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3398 int pages;
3399
3400 if (qemu_file_get_error(f)) {
3401 break;
3402 }
3403
3404 pages = ram_find_and_save_block(rs, false);
3405 /* no more pages to send */
3406 if (pages == 0) {
3407 done = 1;
3408 break;
3409 }
3410
3411 if (pages < 0) {
3412 qemu_file_set_error(f, pages);
3413 break;
3414 }
3415
3416 rs->target_page_count += pages;
3417
3418 /* we want to check in the 1st loop, just in case it was the 1st time
3419 and we had to sync the dirty bitmap.
3420 qemu_clock_get_ns() is a bit expensive, so we only check every
3421 few iterations
3422 */
3423 if ((i & 63) == 0) {
3424 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3425 if (t1 > MAX_WAIT) {
3426 trace_ram_save_iterate_big_wait(t1, i);
3427 break;
3428 }
3429 }
3430 i++;
3431 }
3432 rcu_read_unlock();
3433
3434 /*
3435 * Must occur before EOS (or any QEMUFile operation)
3436 * because of RDMA protocol.
3437 */
3438 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3439
3440 multifd_send_sync_main();
3441 out:
3442 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3443 qemu_fflush(f);
3444 ram_counters.transferred += 8;
3445
3446 ret = qemu_file_get_error(f);
3447 if (ret < 0) {
3448 return ret;
3449 }
3450
3451 return done;
3452 }
3453
3454 /**
3455 * ram_save_complete: function called to send the remaining amount of ram
3456 *
3457 * Returns zero to indicate success or negative on error
3458 *
3459 * Called with iothread lock
3460 *
3461 * @f: QEMUFile where to send the data
3462 * @opaque: RAMState pointer
3463 */
3464 static int ram_save_complete(QEMUFile *f, void *opaque)
3465 {
3466 RAMState **temp = opaque;
3467 RAMState *rs = *temp;
3468 int ret = 0;
3469
3470 rcu_read_lock();
3471
3472 if (!migration_in_postcopy()) {
3473 migration_bitmap_sync_precopy(rs);
3474 }
3475
3476 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3477
3478 /* try transferring iterative blocks of memory */
3479
3480 /* flush all remaining blocks regardless of rate limiting */
3481 while (true) {
3482 int pages;
3483
3484 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3485 /* no more blocks to send */
3486 if (pages == 0) {
3487 break;
3488 }
3489 if (pages < 0) {
3490 ret = pages;
3491 break;
3492 }
3493 }
3494
3495 flush_compressed_data(rs);
3496 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3497
3498 rcu_read_unlock();
3499
3500 multifd_send_sync_main();
3501 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3502 qemu_fflush(f);
3503
3504 return ret;
3505 }
3506
3507 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3508 uint64_t *res_precopy_only,
3509 uint64_t *res_compatible,
3510 uint64_t *res_postcopy_only)
3511 {
3512 RAMState **temp = opaque;
3513 RAMState *rs = *temp;
3514 uint64_t remaining_size;
3515
3516 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3517
3518 if (!migration_in_postcopy() &&
3519 remaining_size < max_size) {
3520 qemu_mutex_lock_iothread();
3521 rcu_read_lock();
3522 migration_bitmap_sync_precopy(rs);
3523 rcu_read_unlock();
3524 qemu_mutex_unlock_iothread();
3525 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3526 }
3527
3528 if (migrate_postcopy_ram()) {
3529 /* We can do postcopy, and all the data is postcopiable */
3530 *res_compatible += remaining_size;
3531 } else {
3532 *res_precopy_only += remaining_size;
3533 }
3534 }
3535
3536 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3537 {
3538 unsigned int xh_len;
3539 int xh_flags;
3540 uint8_t *loaded_data;
3541
3542 /* extract RLE header */
3543 xh_flags = qemu_get_byte(f);
3544 xh_len = qemu_get_be16(f);
3545
3546 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3547 error_report("Failed to load XBZRLE page - wrong compression!");
3548 return -1;
3549 }
3550
3551 if (xh_len > TARGET_PAGE_SIZE) {
3552 error_report("Failed to load XBZRLE page - len overflow!");
3553 return -1;
3554 }
3555 loaded_data = XBZRLE.decoded_buf;
3556 /* load data and decode */
3557 /* it can change loaded_data to point to an internal buffer */
3558 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3559
3560 /* decode RLE */
3561 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3562 TARGET_PAGE_SIZE) == -1) {
3563 error_report("Failed to load XBZRLE page - decode error!");
3564 return -1;
3565 }
3566
3567 return 0;
3568 }
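/*
 * Wire format consumed above (after the usual page header, which is handled
 * by the caller):
 *
 *   u8    xh_flags   must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len     encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE-encoded delta, applied to the existing
 *         contents of 'host' by xbzrle_decode_buffer()
 */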
3569
3570 /**
3571 * ram_block_from_stream: read a RAMBlock id from the migration stream
3572 *
3573 * Must be called from within a rcu critical section.
3574 *
3575 * Returns a pointer from within the RCU-protected ram_list.
3576 *
3577 * @f: QEMUFile where to read the data from
3578 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3579 */
3580 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3581 {
3582 static RAMBlock *block = NULL;
3583 char id[256];
3584 uint8_t len;
3585
3586 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3587 if (!block) {
3588 error_report("Ack, bad migration stream!");
3589 return NULL;
3590 }
3591 return block;
3592 }
3593
3594 len = qemu_get_byte(f);
3595 qemu_get_buffer(f, (uint8_t *)id, len);
3596 id[len] = 0;
3597
3598 block = qemu_ram_block_by_name(id);
3599 if (!block) {
3600 error_report("Can't find block %s", id);
3601 return NULL;
3602 }
3603
3604 if (ramblock_is_ignored(block)) {
3605 error_report("block %s should not be migrated !", id);
3606 return NULL;
3607 }
3608
3609 return block;
3610 }
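/*
 * Sketch of the two encodings handled above: if RAM_SAVE_FLAG_CONTINUE is
 * set in the page flags, no block name is on the wire and the previously
 * cached block is reused; otherwise the stream carries
 *
 *   u8    len
 *   bytes len bytes of idstr (not NUL-terminated on the wire)
 *
 * and the block is looked up by name.
 */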
3611
3612 static inline void *host_from_ram_block_offset(RAMBlock *block,
3613 ram_addr_t offset)
3614 {
3615 if (!offset_in_ramblock(block, offset)) {
3616 return NULL;
3617 }
3618
3619 return block->host + offset;
3620 }
3621
3622 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3623 ram_addr_t offset)
3624 {
3625 if (!offset_in_ramblock(block, offset)) {
3626 return NULL;
3627 }
3628 if (!block->colo_cache) {
3629 error_report("%s: colo_cache is NULL in block :%s",
3630 __func__, block->idstr);
3631 return NULL;
3632 }
3633
3634 /*
3635 * During colo checkpoint, we need a bitmap of these migrated pages.
3636 * It helps us decide which pages in the ram cache should be flushed
3637 * into the VM's RAM later.
3638 */
3639 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3640 ram_state->migration_dirty_pages++;
3641 }
3642 return block->colo_cache + offset;
3643 }
3644
3645 /**
3646 * ram_handle_compressed: handle the zero page case
3647 *
3648 * If a page (or a whole RDMA chunk) has been
3649 * determined to be zero, then zap it.
3650 *
3651 * @host: host address for the zero page
3652 * @ch: what the page is filled from. We only support zero
3653 * @size: size of the zero page
3654 */
3655 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3656 {
3657 if (ch != 0 || !is_zero_range(host, size)) {
3658 memset(host, ch, size);
3659 }
3660 }
3661
3662 /* return the size after decompression, or negative value on error */
3663 static int
3664 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3665 const uint8_t *source, size_t source_len)
3666 {
3667 int err;
3668
3669 err = inflateReset(stream);
3670 if (err != Z_OK) {
3671 return -1;
3672 }
3673
3674 stream->avail_in = source_len;
3675 stream->next_in = (uint8_t *)source;
3676 stream->avail_out = dest_len;
3677 stream->next_out = dest;
3678
3679 err = inflate(stream, Z_NO_FLUSH);
3680 if (err != Z_STREAM_END) {
3681 return -1;
3682 }
3683
3684 return stream->total_out;
3685 }
3686
3687 static void *do_data_decompress(void *opaque)
3688 {
3689 DecompressParam *param = opaque;
3690 unsigned long pagesize;
3691 uint8_t *des;
3692 int len, ret;
3693
3694 qemu_mutex_lock(&param->mutex);
3695 while (!param->quit) {
3696 if (param->des) {
3697 des = param->des;
3698 len = param->len;
3699 param->des = 0;
3700 qemu_mutex_unlock(&param->mutex);
3701
3702 pagesize = TARGET_PAGE_SIZE;
3703
3704 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3705 param->compbuf, len);
3706 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3707 error_report("decompress data failed");
3708 qemu_file_set_error(decomp_file, ret);
3709 }
3710
3711 qemu_mutex_lock(&decomp_done_lock);
3712 param->done = true;
3713 qemu_cond_signal(&decomp_done_cond);
3714 qemu_mutex_unlock(&decomp_done_lock);
3715
3716 qemu_mutex_lock(&param->mutex);
3717 } else {
3718 qemu_cond_wait(&param->cond, &param->mutex);
3719 }
3720 }
3721 qemu_mutex_unlock(&param->mutex);
3722
3723 return NULL;
3724 }
3725
3726 static int wait_for_decompress_done(void)
3727 {
3728 int idx, thread_count;
3729
3730 if (!migrate_use_compression()) {
3731 return 0;
3732 }
3733
3734 thread_count = migrate_decompress_threads();
3735 qemu_mutex_lock(&decomp_done_lock);
3736 for (idx = 0; idx < thread_count; idx++) {
3737 while (!decomp_param[idx].done) {
3738 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3739 }
3740 }
3741 qemu_mutex_unlock(&decomp_done_lock);
3742 return qemu_file_get_error(decomp_file);
3743 }
3744
3745 static void compress_threads_load_cleanup(void)
3746 {
3747 int i, thread_count;
3748
3749 if (!migrate_use_compression()) {
3750 return;
3751 }
3752 thread_count = migrate_decompress_threads();
3753 for (i = 0; i < thread_count; i++) {
3754 /*
3755 * we use it as an indicator of whether the thread is
3756 * properly initialized or not
3757 */
3758 if (!decomp_param[i].compbuf) {
3759 break;
3760 }
3761
3762 qemu_mutex_lock(&decomp_param[i].mutex);
3763 decomp_param[i].quit = true;
3764 qemu_cond_signal(&decomp_param[i].cond);
3765 qemu_mutex_unlock(&decomp_param[i].mutex);
3766 }
3767 for (i = 0; i < thread_count; i++) {
3768 if (!decomp_param[i].compbuf) {
3769 break;
3770 }
3771
3772 qemu_thread_join(decompress_threads + i);
3773 qemu_mutex_destroy(&decomp_param[i].mutex);
3774 qemu_cond_destroy(&decomp_param[i].cond);
3775 inflateEnd(&decomp_param[i].stream);
3776 g_free(decomp_param[i].compbuf);
3777 decomp_param[i].compbuf = NULL;
3778 }
3779 g_free(decompress_threads);
3780 g_free(decomp_param);
3781 decompress_threads = NULL;
3782 decomp_param = NULL;
3783 decomp_file = NULL;
3784 }
3785
3786 static int compress_threads_load_setup(QEMUFile *f)
3787 {
3788 int i, thread_count;
3789
3790 if (!migrate_use_compression()) {
3791 return 0;
3792 }
3793
3794 thread_count = migrate_decompress_threads();
3795 decompress_threads = g_new0(QemuThread, thread_count);
3796 decomp_param = g_new0(DecompressParam, thread_count);
3797 qemu_mutex_init(&decomp_done_lock);
3798 qemu_cond_init(&decomp_done_cond);
3799 decomp_file = f;
3800 for (i = 0; i < thread_count; i++) {
3801 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3802 goto exit;
3803 }
3804
3805 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3806 qemu_mutex_init(&decomp_param[i].mutex);
3807 qemu_cond_init(&decomp_param[i].cond);
3808 decomp_param[i].done = true;
3809 decomp_param[i].quit = false;
3810 qemu_thread_create(decompress_threads + i, "decompress",
3811 do_data_decompress, decomp_param + i,
3812 QEMU_THREAD_JOINABLE);
3813 }
3814 return 0;
3815 exit:
3816 compress_threads_load_cleanup();
3817 return -1;
3818 }
3819
3820 static void decompress_data_with_multi_threads(QEMUFile *f,
3821 void *host, int len)
3822 {
3823 int idx, thread_count;
3824
3825 thread_count = migrate_decompress_threads();
3826 qemu_mutex_lock(&decomp_done_lock);
3827 while (true) {
3828 for (idx = 0; idx < thread_count; idx++) {
3829 if (decomp_param[idx].done) {
3830 decomp_param[idx].done = false;
3831 qemu_mutex_lock(&decomp_param[idx].mutex);
3832 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3833 decomp_param[idx].des = host;
3834 decomp_param[idx].len = len;
3835 qemu_cond_signal(&decomp_param[idx].cond);
3836 qemu_mutex_unlock(&decomp_param[idx].mutex);
3837 break;
3838 }
3839 }
3840 if (idx < thread_count) {
3841 break;
3842 } else {
3843 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3844 }
3845 }
3846 qemu_mutex_unlock(&decomp_done_lock);
3847 }
3848
3849 /*
3850 * colo cache: this is for the secondary VM. We cache the whole
3851 * memory of the secondary VM; the global lock must be held
3852 * to call this helper.
3853 */
3854 int colo_init_ram_cache(void)
3855 {
3856 RAMBlock *block;
3857
3858 rcu_read_lock();
3859 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3860 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3861 NULL,
3862 false);
3863 if (!block->colo_cache) {
3864 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3865 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3866 block->used_length);
3867 goto out_locked;
3868 }
3869 memcpy(block->colo_cache, block->host, block->used_length);
3870 }
3871 rcu_read_unlock();
3872 /*
3873 * Record the dirty pages that were sent by the PVM; we use this dirty
3874 * bitmap to decide which pages in the cache should be flushed into the
3875 * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3876 */
3877 if (ram_bytes_total()) {
3878 RAMBlock *block;
3879
3880 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3881 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3882
3883 block->bmap = bitmap_new(pages);
3884 bitmap_set(block->bmap, 0, pages);
3885 }
3886 }
3887 ram_state = g_new0(RAMState, 1);
3888 ram_state->migration_dirty_pages = 0;
3889 memory_global_dirty_log_start();
3890
3891 return 0;
3892
3893 out_locked:
3894
3895 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3896 if (block->colo_cache) {
3897 qemu_anon_ram_free(block->colo_cache, block->used_length);
3898 block->colo_cache = NULL;
3899 }
3900 }
3901
3902 rcu_read_unlock();
3903 return -errno;
3904 }
3905
3906 /* The global lock must be held to call this helper */
3907 void colo_release_ram_cache(void)
3908 {
3909 RAMBlock *block;
3910
3911 memory_global_dirty_log_stop();
3912 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3913 g_free(block->bmap);
3914 block->bmap = NULL;
3915 }
3916
3917 rcu_read_lock();
3918
3919 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3920 if (block->colo_cache) {
3921 qemu_anon_ram_free(block->colo_cache, block->used_length);
3922 block->colo_cache = NULL;
3923 }
3924 }
3925
3926 rcu_read_unlock();
3927 g_free(ram_state);
3928 ram_state = NULL;
3929 }
3930
3931 /**
3932 * ram_load_setup: Setup RAM for migration incoming side
3933 *
3934 * Returns zero to indicate success and negative for error
3935 *
3936 * @f: QEMUFile where to receive the data
3937 * @opaque: RAMState pointer
3938 */
3939 static int ram_load_setup(QEMUFile *f, void *opaque)
3940 {
3941 if (compress_threads_load_setup(f)) {
3942 return -1;
3943 }
3944
3945 xbzrle_load_setup();
3946 ramblock_recv_map_init();
3947
3948 return 0;
3949 }
3950
3951 static int ram_load_cleanup(void *opaque)
3952 {
3953 RAMBlock *rb;
3954
3955 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3956 if (ramblock_is_pmem(rb)) {
3957 pmem_persist(rb->host, rb->used_length);
3958 }
3959 }
3960
3961 xbzrle_load_cleanup();
3962 compress_threads_load_cleanup();
3963
3964 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3965 g_free(rb->receivedmap);
3966 rb->receivedmap = NULL;
3967 }
3968
3969 return 0;
3970 }
3971
3972 /**
3973 * ram_postcopy_incoming_init: allocate postcopy data structures
3974 *
3975 * Returns 0 for success and negative if there was an error
3976 *
3977 * @mis: current migration incoming state
3978 *
3979 * Allocate the data structures etc. needed by incoming migration with
3980 * postcopy-ram. postcopy-ram's similarly named
3981 * postcopy_ram_incoming_init() does the work.
3982 */
3983 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3984 {
3985 return postcopy_ram_incoming_init(mis);
3986 }
3987
3988 /**
3989 * ram_load_postcopy: load a page in postcopy case
3990 *
3991 * Returns 0 for success or -errno in case of error
3992 *
3993 * Called in postcopy mode by ram_load().
3994 * rcu_read_lock is taken prior to this being called.
3995 *
3996 * @f: QEMUFile to receive the data from
3997 */
3998 static int ram_load_postcopy(QEMUFile *f)
3999 {
4000 int flags = 0, ret = 0;
4001 bool place_needed = false;
4002 bool matches_target_page_size = false;
4003 MigrationIncomingState *mis = migration_incoming_get_current();
4004 /* Temporary page that is later 'placed' */
4005 void *postcopy_host_page = postcopy_get_tmp_page(mis);
4006 void *last_host = NULL;
4007 bool all_zero = false;
4008
4009 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4010 ram_addr_t addr;
4011 void *host = NULL;
4012 void *page_buffer = NULL;
4013 void *place_source = NULL;
4014 RAMBlock *block = NULL;
4015 uint8_t ch;
4016
4017 addr = qemu_get_be64(f);
4018
4019 /*
4020 * If there is a QEMUFile error we should stop here, since "addr"
4021 * may then be invalid
4022 */
4023 ret = qemu_file_get_error(f);
4024 if (ret) {
4025 break;
4026 }
4027
4028 flags = addr & ~TARGET_PAGE_MASK;
4029 addr &= TARGET_PAGE_MASK;
4030
4031 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4032 place_needed = false;
4033 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4034 block = ram_block_from_stream(f, flags);
4035
4036 host = host_from_ram_block_offset(block, addr);
4037 if (!host) {
4038 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4039 ret = -EINVAL;
4040 break;
4041 }
4042 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4043 /*
4044 * Postcopy requires that we place whole host pages atomically;
4045 * these may be huge pages for RAMBlocks that are backed by
4046 * hugetlbfs.
4047 * To make it atomic, the data is read into a temporary page
4048 * that's moved into place later.
4049 * The migration protocol uses (possibly smaller) target pages;
4050 * however, the source ensures it always sends all the components
4051 * of a host page in order.
4052 */
4053 page_buffer = postcopy_host_page +
4054 ((uintptr_t)host & (block->page_size - 1));
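/*
 * Worked example (assuming 2 MiB hugetlbfs host pages and 4 KiB target
 * pages): block->page_size - 1 == 0x1fffff, so for host == ...234000
 * the buffer offset is 0x34000, i.e. target page index 52 of the 512
 * target pages that make up this host page.
 */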
4055 /* If all target pages turn out to be zero we can optimise the placement */
4056 if (!((uintptr_t)host & (block->page_size - 1))) {
4057 all_zero = true;
4058 } else {
4059 /* not the first target page within the host page */
4060 if (host != (last_host + TARGET_PAGE_SIZE)) {
4061 error_report("Non-sequential target page %p/%p",
4062 host, last_host);
4063 ret = -EINVAL;
4064 break;
4065 }
4066 }
4067
4068
4069 /*
4070 * If it's the last part of a host page then we place the host
4071 * page
4072 */
4073 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4074 (block->page_size - 1)) == 0;
4075 place_source = postcopy_host_page;
4076 }
4077 last_host = host;
4078
4079 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4080 case RAM_SAVE_FLAG_ZERO:
4081 ch = qemu_get_byte(f);
4082 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4083 if (ch) {
4084 all_zero = false;
4085 }
4086 break;
4087
4088 case RAM_SAVE_FLAG_PAGE:
4089 all_zero = false;
4090 if (!matches_target_page_size) {
4091 /* For huge pages, we always use the temporary buffer */
4092 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4093 } else {
4094 /*
4095 * For small pages that match the target page size, we
4096 * avoid the qemu_file copy. Instead we directly use
4097 * the buffer of QEMUFile to place the page. Note: we
4098 * must not perform any QEMUFile operation before using
4099 * that buffer, to make sure it is still valid when
4100 * placing the page.
4101 */
4102 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4103 TARGET_PAGE_SIZE);
4104 }
4105 break;
4106 case RAM_SAVE_FLAG_EOS:
4107 /* normal exit */
4108 multifd_recv_sync_main();
4109 break;
4110 default:
4111 error_report("Unknown combination of migration flags: %#x"
4112 " (postcopy mode)", flags);
4113 ret = -EINVAL;
4114 break;
4115 }
4116
4117 /* Detect any possible file errors */
4118 if (!ret && qemu_file_get_error(f)) {
4119 ret = qemu_file_get_error(f);
4120 }
4121
4122 if (!ret && place_needed) {
4123 /* This gets called at the last target page in the host page */
4124 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
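/*
 * Worked example (2 MiB host page, 4 KiB target pages): 'host' points
 * at the final target page, so place_dest = host + 4 KiB - 2 MiB, i.e.
 * the start of the host page that is about to be placed.
 */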
4125
4126 if (all_zero) {
4127 ret = postcopy_place_page_zero(mis, place_dest,
4128 block);
4129 } else {
4130 ret = postcopy_place_page(mis, place_dest,
4131 place_source, block);
4132 }
4133 }
4134 }
4135
4136 return ret;
4137 }
4138
4139 static bool postcopy_is_advised(void)
4140 {
4141 PostcopyState ps = postcopy_state_get();
4142 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4143 }
4144
4145 static bool postcopy_is_running(void)
4146 {
4147 PostcopyState ps = postcopy_state_get();
4148 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4149 }
4150
4151 /*
4152 * Flush content of RAM cache into SVM's memory.
4153 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4154 */
4155 static void colo_flush_ram_cache(void)
4156 {
4157 RAMBlock *block = NULL;
4158 void *dst_host;
4159 void *src_host;
4160 unsigned long offset = 0;
4161
4162 memory_global_dirty_log_sync();
4163 rcu_read_lock();
4164 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4165 migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
4166 }
4167 rcu_read_unlock();
4168
4169 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4170 rcu_read_lock();
4171 block = QLIST_FIRST_RCU(&ram_list.blocks);
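/*
 * 'offset' below is a page index within 'block'; shifting it by
 * TARGET_PAGE_BITS converts it to a byte offset, which is why both the
 * bounds check against used_length and the host/colo_cache pointer
 * arithmetic use offset << TARGET_PAGE_BITS.
 */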
4172
4173 while (block) {
4174 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4175
4176 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4177 offset = 0;
4178 block = QLIST_NEXT_RCU(block, next);
4179 } else {
4180 migration_bitmap_clear_dirty(ram_state, block, offset);
4181 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4182 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4183 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4184 }
4185 }
4186
4187 rcu_read_unlock();
4188 trace_colo_flush_ram_cache_end();
4189 }
4190
4191 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4192 {
4193 int flags = 0, ret = 0, invalid_flags = 0;
4194 static uint64_t seq_iter;
4195 int len = 0;
4196 /*
4197 * If the system is running in postcopy mode, page inserts into host
4198 * memory must be atomic
4199 */
4200 bool postcopy_running = postcopy_is_running();
4201 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
4202 bool postcopy_advised = postcopy_is_advised();
4203
4204 seq_iter++;
4205
4206 if (version_id != 4) {
4207 ret = -EINVAL;
4208 }
4209
4210 if (!migrate_use_compression()) {
4211 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4212 }
4213 /* This RCU critical section can be very long running.
4214 * When RCU-based reclamation in the code starts to become frequent,
4215 * it will be necessary to reduce the granularity of this
4216 * critical section.
4217 */
4218 rcu_read_lock();
4219
4220 if (postcopy_running) {
4221 ret = ram_load_postcopy(f);
4222 }
4223
4224 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4225 ram_addr_t addr, total_ram_bytes;
4226 void *host = NULL;
4227 uint8_t ch;
4228
4229 addr = qemu_get_be64(f);
4230 flags = addr & ~TARGET_PAGE_MASK;
4231 addr &= TARGET_PAGE_MASK;
4232
4233 if (flags & invalid_flags) {
4234 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4235 error_report("Received an unexpected compressed page");
4236 }
4237
4238 ret = -EINVAL;
4239 break;
4240 }
4241
4242 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4243 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4244 RAMBlock *block = ram_block_from_stream(f, flags);
4245
4246 /*
4247 * After going into COLO, we should load the page into colo_cache.
4248 */
4249 if (migration_incoming_in_colo_state()) {
4250 host = colo_cache_from_block_offset(block, addr);
4251 } else {
4252 host = host_from_ram_block_offset(block, addr);
4253 }
4254 if (!host) {
4255 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4256 ret = -EINVAL;
4257 break;
4258 }
4259
4260 if (!migration_incoming_in_colo_state()) {
4261 ramblock_recv_bitmap_set(block, host);
4262 }
4263
4264 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4265 }
4266
4267 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4268 case RAM_SAVE_FLAG_MEM_SIZE:
4269 /* Synchronize RAM block list */
4270 total_ram_bytes = addr;
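/*
 * Each record parsed below is laid out as:
 *   <1 byte id length> <id bytes> <be64 used_length>
 * optionally followed by a be64 page size (when postcopy was advised
 * and the block's page size differs from the host page size) and, with
 * ignore-shared enabled, a be64 GPA plus a one-byte 'ignored' flag.
 */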
4271 while (!ret && total_ram_bytes) {
4272 RAMBlock *block;
4273 char id[256];
4274 ram_addr_t length;
4275
4276 len = qemu_get_byte(f);
4277 qemu_get_buffer(f, (uint8_t *)id, len);
4278 id[len] = 0;
4279 length = qemu_get_be64(f);
4280
4281 block = qemu_ram_block_by_name(id);
4282 if (block && !qemu_ram_is_migratable(block)) {
4283 error_report("block %s should not be migrated!", id);
4284 ret = -EINVAL;
4285 } else if (block) {
4286 if (length != block->used_length) {
4287 Error *local_err = NULL;
4288
4289 ret = qemu_ram_resize(block, length,
4290 &local_err);
4291 if (local_err) {
4292 error_report_err(local_err);
4293 }
4294 }
4295 /* For postcopy we need to check that hugepage sizes match */
4296 if (postcopy_advised &&
4297 block->page_size != qemu_host_page_size) {
4298 uint64_t remote_page_size = qemu_get_be64(f);
4299 if (remote_page_size != block->page_size) {
4300 error_report("Mismatched RAM page size %s "
4301 "(local) %zd != %" PRId64,
4302 id, block->page_size,
4303 remote_page_size);
4304 ret = -EINVAL;
4305 }
4306 }
4307 if (migrate_ignore_shared()) {
4308 hwaddr addr = qemu_get_be64(f);
4309 bool ignored = qemu_get_byte(f);
4310 if (ignored != ramblock_is_ignored(block)) {
4311 error_report("RAM block %s should%s be migrated",
4312 id, ignored ? "" : " not");
4313 ret = -EINVAL;
4314 }
4315 if (ramblock_is_ignored(block) &&
4316 block->mr->addr != addr) {
4317 error_report("Mismatched GPAs for block %s "
4318 "%" PRId64 "!= %" PRId64,
4319 id, (uint64_t)addr,
4320 (uint64_t)block->mr->addr);
4321 ret = -EINVAL;
4322 }
4323 }
4324 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4325 block->idstr);
4326 } else {
4327 error_report("Unknown ramblock \"%s\", cannot "
4328 "accept migration", id);
4329 ret = -EINVAL;
4330 }
4331
4332 total_ram_bytes -= length;
4333 }
4334 break;
4335
4336 case RAM_SAVE_FLAG_ZERO:
4337 ch = qemu_get_byte(f);
4338 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4339 break;
4340
4341 case RAM_SAVE_FLAG_PAGE:
4342 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4343 break;
4344
4345 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4346 len = qemu_get_be32(f);
4347 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4348 error_report("Invalid compressed data length: %d", len);
4349 ret = -EINVAL;
4350 break;
4351 }
4352 decompress_data_with_multi_threads(f, host, len);
4353 break;
4354
4355 case RAM_SAVE_FLAG_XBZRLE:
4356 if (load_xbzrle(f, addr, host) < 0) {
4357 error_report("Failed to decompress XBZRLE page at "
4358 RAM_ADDR_FMT, addr);
4359 ret = -EINVAL;
4360 break;
4361 }
4362 break;
4363 case RAM_SAVE_FLAG_EOS:
4364 /* normal exit */
4365 multifd_recv_sync_main();
4366 break;
4367 default:
4368 if (flags & RAM_SAVE_FLAG_HOOK) {
4369 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4370 } else {
4371 error_report("Unknown combination of migration flags: %#x",
4372 flags);
4373 ret = -EINVAL;
4374 }
4375 }
4376 if (!ret) {
4377 ret = qemu_file_get_error(f);
4378 }
4379 }
4380
4381 ret |= wait_for_decompress_done();
4382 rcu_read_unlock();
4383 trace_ram_load_complete(ret, seq_iter);
4384
4385 if (!ret && migration_incoming_in_colo_state()) {
4386 colo_flush_ram_cache();
4387 }
4388 return ret;
4389 }
4390
4391 static bool ram_has_postcopy(void *opaque)
4392 {
4393 RAMBlock *rb;
4394 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4395 if (ramblock_is_pmem(rb)) {
4396 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4397 "is not supported now!", rb->idstr, rb->host);
4398 return false;
4399 }
4400 }
4401
4402 return migrate_postcopy_ram();
4403 }
4404
4405 /* Sync all the dirty bitmaps with the destination VM. */
4406 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4407 {
4408 RAMBlock *block;
4409 QEMUFile *file = s->to_dst_file;
4410 int ramblock_count = 0;
4411
4412 trace_ram_dirty_bitmap_sync_start();
4413
4414 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4415 qemu_savevm_send_recv_bitmap(file, block->idstr);
4416 trace_ram_dirty_bitmap_request(block->idstr);
4417 ramblock_count++;
4418 }
4419
4420 trace_ram_dirty_bitmap_sync_wait();
4421
4422 /* Wait until all the ramblocks' dirty bitmaps have been synced */
4423 while (ramblock_count--) {
4424 qemu_sem_wait(&s->rp_state.rp_sem);
4425 }
4426
4427 trace_ram_dirty_bitmap_sync_complete();
4428
4429 return 0;
4430 }
4431
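/*
 * This pairs with the qemu_sem_wait() loop in ram_dirty_bitmap_sync_all():
 * the return path posts rp_sem once per ramblock whose bitmap has been
 * reloaded, so the sender unblocks only after every bitmap has arrived.
 */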
4432 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4433 {
4434 qemu_sem_post(&s->rp_state.rp_sem);
4435 }
4436
4437 /*
4438 * Read the received bitmap and invert it to form the initial dirty bitmap.
4439 * This is only used when a paused postcopy migration is to be resumed
4440 * from a middle point.
4441 */
4442 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4443 {
4444 int ret = -EINVAL;
4445 QEMUFile *file = s->rp_state.from_dst_file;
4446 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4447 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4448 uint64_t size, end_mark;
4449
4450 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4451
4452 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4453 error_report("%s: incorrect state %s", __func__,
4454 MigrationStatus_str(s->state));
4455 return -EINVAL;
4456 }
4457
4458 /*
4459 * Note: see comments in ramblock_recv_bitmap_send() on why we
4460 * need the endianness conversion and the padding.
4461 */
4462 local_size = ROUND_UP(local_size, 8);
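/*
 * Worked example: for a block of 100 target pages, nbits == 100 and
 * DIV_ROUND_UP(100, 8) == 13 bytes, rounded up here to 16 bytes so it
 * matches the 64-bit-aligned size the source sent; the extra
 * BITS_PER_LONG bits allocated below absorb that padding.
 */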
4463
4464 /* Add padding */
4465 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4466
4467 size = qemu_get_be64(file);
4468
4469 /* The size of the bitmap should match that of our ramblock */
4470 if (size != local_size) {
4471 error_report("%s: ramblock '%s' bitmap size mismatch "
4472 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4473 block->idstr, size, local_size);
4474 ret = -EINVAL;
4475 goto out;
4476 }
4477
4478 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4479 end_mark = qemu_get_be64(file);
4480
4481 ret = qemu_file_get_error(file);
4482 if (ret || size != local_size) {
4483 error_report("%s: read bitmap failed for ramblock '%s': %d"
4484 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4485 __func__, block->idstr, ret, local_size, size);
4486 ret = -EIO;
4487 goto out;
4488 }
4489
4490 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4491 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4492 __func__, block->idstr, end_mark);
4493 ret = -EINVAL;
4494 goto out;
4495 }
4496
4497 /*
4498 * Endianness conversion. We are in postcopy (though paused).
4499 * The dirty bitmap won't change. We can directly modify it.
4500 */
4501 bitmap_from_le(block->bmap, le_bitmap, nbits);
4502
4503 /*
4504 * What we received is the "received bitmap". Invert it to form the
4505 * initial dirty bitmap for this ramblock.
4506 */
4507 bitmap_complement(block->bmap, block->bmap, nbits);
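/*
 * In the received bitmap a set bit means "page already received", so
 * after the complement a set bit means the page still has to be
 * (re)sent when the migration resumes.
 */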
4508
4509 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4510
4511 /*
4512 * We have successfully synced the bitmap for the current ramblock. If
4513 * this is the last one to sync, we need to notify the main send thread.
4514 */
4515 ram_dirty_bitmap_reload_notify(s);
4516
4517 ret = 0;
4518 out:
4519 g_free(le_bitmap);
4520 return ret;
4521 }
4522
4523 static int ram_resume_prepare(MigrationState *s, void *opaque)
4524 {
4525 RAMState *rs = *(RAMState **)opaque;
4526 int ret;
4527
4528 ret = ram_dirty_bitmap_sync_all(s, rs);
4529 if (ret) {
4530 return ret;
4531 }
4532
4533 ram_state_resume_prepare(rs, s->to_dst_file);
4534
4535 return 0;
4536 }
4537
4538 static SaveVMHandlers savevm_ram_handlers = {
4539 .save_setup = ram_save_setup,
4540 .save_live_iterate = ram_save_iterate,
4541 .save_live_complete_postcopy = ram_save_complete,
4542 .save_live_complete_precopy = ram_save_complete,
4543 .has_postcopy = ram_has_postcopy,
4544 .save_live_pending = ram_save_pending,
4545 .load_state = ram_load,
4546 .save_cleanup = ram_save_cleanup,
4547 .load_setup = ram_load_setup,
4548 .load_cleanup = ram_load_cleanup,
4549 .resume_prepare = ram_resume_prepare,
4550 };
4551
4552 void ram_mig_init(void)
4553 {
4554 qemu_mutex_init(&XBZRLE.lock);
4555 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4556 }
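/*
 * Note that the version number 4 registered here is what ram_load()
 * checks against (version_id != 4) on the incoming side.
 */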