git.proxmox.com Git - mirror_qemu.git/blob - migration/ram.c
multifd: Rename "size" member to pages_alloc
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60
61 /***********************************************************/
62 /* ram save/restore */
63
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
65 * worked for pages that were filled with the same char. We switched
66 * it to only search for the zero value. And to avoid confusion with
67 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
68 */
69
70 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO 0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE 0x08
74 #define RAM_SAVE_FLAG_EOS 0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE 0x40
77 /* 0x80 is reserved in migration.h start with 0x100 next */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
79
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82 return buffer_is_zero(p, size);
83 }
84
85 XBZRLECacheStats xbzrle_counters;
86
87 /* struct contains XBZRLE cache and a static page
88 used by the compression */
89 static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
101 } XBZRLE;
102
103 static void XBZRLE_cache_lock(void)
104 {
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114
115 /**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in the main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by XBZRLE.lock().
122 *
123 * Returns 0 for success or -1 for error
124 *
125 * @new_size: new cache size
126 * @errp: set *errp if the check failed, with reason
127 */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130 PageCache *new_cache;
131 int64_t ret = 0;
132
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
142 return 0;
143 }
144
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149 if (!new_cache) {
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
157 out:
158 XBZRLE_cache_unlock();
159 return ret;
160 }
161
162 static bool ramblock_is_ignored(RAMBlock *block)
163 {
164 return !qemu_ram_is_migratable(block) ||
165 (migrate_ignore_shared() && qemu_ram_is_shared(block));
166 }
167
168 /* Should be holding either ram_list.mutex, or the RCU lock. */
169 #define RAMBLOCK_FOREACH_NOT_IGNORED(block) \
170 INTERNAL_RAMBLOCK_FOREACH(block) \
171 if (ramblock_is_ignored(block)) {} else
172
173 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
174 INTERNAL_RAMBLOCK_FOREACH(block) \
175 if (!qemu_ram_is_migratable(block)) {} else
176
177 #undef RAMBLOCK_FOREACH
178
179 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
180 {
181 RAMBlock *block;
182 int ret = 0;
183
184 rcu_read_lock();
185 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
186 ret = func(block, opaque);
187 if (ret) {
188 break;
189 }
190 }
191 rcu_read_unlock();
192 return ret;
193 }
194
195 static void ramblock_recv_map_init(void)
196 {
197 RAMBlock *rb;
198
199 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
200 assert(!rb->receivedmap);
201 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 }
203 }
204
205 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
206 {
207 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 rb->receivedmap);
209 }
210
211 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
212 {
213 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
217 {
218 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
219 }
220
221 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
222 size_t nr)
223 {
224 bitmap_set_atomic(rb->receivedmap,
225 ramblock_recv_bitmap_offset(host_addr, rb),
226 nr);
227 }
228
229 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
230
231 /*
232 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
233 *
234 * Returns >0 if success with sent bytes, or <0 if error.
235 */
236 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
237 const char *block_name)
238 {
239 RAMBlock *block = qemu_ram_block_by_name(block_name);
240 unsigned long *le_bitmap, nbits;
241 uint64_t size;
242
243 if (!block) {
244 error_report("%s: invalid block name: %s", __func__, block_name);
245 return -1;
246 }
247
248 nbits = block->used_length >> TARGET_PAGE_BITS;
249
250 /*
251 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
252 * machines we may need 4 more bytes for padding (see below
253 * comment), so extend it a bit beforehand.
254 */
255 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
256
257 /*
258 * Always use little endian when sending the bitmap. This is
259 * required when the source and destination VMs are not using the
260 * same endianness. (Note: big endian won't work.)
261 */
262 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
263
264 /* Size of the bitmap, in bytes */
265 size = DIV_ROUND_UP(nbits, 8);
266
267 /*
268 * size is always aligned to 8 bytes for 64bit machines, but it
269 * may not be true for 32bit machines. We need this padding to
270 * make sure the migration can survive even between 32bit and
271 * 64bit machines.
272 */
273 size = ROUND_UP(size, 8);
274
275 qemu_put_be64(file, size);
276 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
277 /*
278 * Mark the end of the stream, in case the middle part is corrupted
279 * due to some mysterious reason.
280 */
281 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
282 qemu_fflush(file);
283
284 g_free(le_bitmap);
285
286 if (qemu_file_get_error(file)) {
287 return qemu_file_get_error(file);
288 }
289
290 return size + sizeof(size);
291 }
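
/*
 * For reference, the stream produced by ramblock_recv_bitmap_send() above
 * looks like this on the wire (a sketch derived from the code, not a
 * separate format definition):
 *
 *   be64  size                      bitmap length in bytes, rounded up to 8
 *   size  bytes of le_bitmap        receivedmap in little-endian bit order
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * The returned byte count (size + sizeof(size)) does not include the
 * trailing ending marker.
 */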
292
293 /*
294 * An outstanding page request, on the source, having been received
295 * and queued
296 */
297 struct RAMSrcPageRequest {
298 RAMBlock *rb;
299 hwaddr offset;
300 hwaddr len;
301
302 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
303 };
304
305 /* State of RAM for migration */
306 struct RAMState {
307 /* QEMUFile used for this migration */
308 QEMUFile *f;
309 /* Last block that we have visited searching for dirty pages */
310 RAMBlock *last_seen_block;
311 /* Last block from where we have sent data */
312 RAMBlock *last_sent_block;
313 /* Last dirty target page we have sent */
314 ram_addr_t last_page;
315 /* last ram version we have seen */
316 uint32_t last_version;
317 /* We are in the first round */
318 bool ram_bulk_stage;
319 /* The free page optimization is enabled */
320 bool fpo_enabled;
321 /* How many times we have dirty too many pages */
322 int dirty_rate_high_cnt;
323 /* these variables are used for bitmap sync */
324 /* last time we did a full bitmap_sync */
325 int64_t time_last_bitmap_sync;
326 /* bytes transferred at start_time */
327 uint64_t bytes_xfer_prev;
328 /* number of dirty pages since start_time */
329 uint64_t num_dirty_pages_period;
330 /* xbzrle misses since the beginning of the period */
331 uint64_t xbzrle_cache_miss_prev;
332
333 /* compression statistics since the beginning of the period */
334 /* number of times there was no free thread to compress data */
335 uint64_t compress_thread_busy_prev;
336 /* number of bytes after compression */
337 uint64_t compressed_size_prev;
338 /* amount of compressed pages */
339 uint64_t compress_pages_prev;
340
341 /* total handled target pages at the beginning of period */
342 uint64_t target_page_count_prev;
343 /* total handled target pages since start */
344 uint64_t target_page_count;
345 /* number of dirty bits in the bitmap */
346 uint64_t migration_dirty_pages;
347 /* Protects modification of the bitmap and migration dirty pages */
348 QemuMutex bitmap_mutex;
349 /* The RAMBlock used in the last src_page_requests */
350 RAMBlock *last_req_rb;
351 /* Queue of outstanding page requests from the destination */
352 QemuMutex src_page_req_mutex;
353 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 };
355 typedef struct RAMState RAMState;
356
357 static RAMState *ram_state;
358
359 static NotifierWithReturnList precopy_notifier_list;
360
361 void precopy_infrastructure_init(void)
362 {
363 notifier_with_return_list_init(&precopy_notifier_list);
364 }
365
366 void precopy_add_notifier(NotifierWithReturn *n)
367 {
368 notifier_with_return_list_add(&precopy_notifier_list, n);
369 }
370
371 void precopy_remove_notifier(NotifierWithReturn *n)
372 {
373 notifier_with_return_remove(n);
374 }
375
376 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 {
378 PrecopyNotifyData pnd;
379 pnd.reason = reason;
380 pnd.errp = errp;
381
382 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
383 }
384
385 void precopy_enable_free_page_optimization(void)
386 {
387 if (!ram_state) {
388 return;
389 }
390
391 ram_state->fpo_enabled = true;
392 }
393
394 uint64_t ram_bytes_remaining(void)
395 {
396 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
397 0;
398 }
399
400 MigrationStats ram_counters;
401
402 /* used by the search for pages to send */
403 struct PageSearchStatus {
404 /* Current block being searched */
405 RAMBlock *block;
406 /* Current page to search from */
407 unsigned long page;
408 /* Set once we wrap around */
409 bool complete_round;
410 };
411 typedef struct PageSearchStatus PageSearchStatus;
412
413 CompressionStats compression_counters;
414
415 struct CompressParam {
416 bool done;
417 bool quit;
418 bool zero_page;
419 QEMUFile *file;
420 QemuMutex mutex;
421 QemuCond cond;
422 RAMBlock *block;
423 ram_addr_t offset;
424
425 /* internally used fields */
426 z_stream stream;
427 uint8_t *originbuf;
428 };
429 typedef struct CompressParam CompressParam;
430
431 struct DecompressParam {
432 bool done;
433 bool quit;
434 QemuMutex mutex;
435 QemuCond cond;
436 void *des;
437 uint8_t *compbuf;
438 int len;
439 z_stream stream;
440 };
441 typedef struct DecompressParam DecompressParam;
442
443 static CompressParam *comp_param;
444 static QemuThread *compress_threads;
445 /* comp_done_cond is used to wake up the migration thread when
446 * one of the compression threads has finished the compression.
447 * comp_done_lock is used together with comp_done_cond.
448 */
449 static QemuMutex comp_done_lock;
450 static QemuCond comp_done_cond;
451 /* The empty QEMUFileOps will be used by file in CompressParam */
452 static const QEMUFileOps empty_ops = { };
453
454 static QEMUFile *decomp_file;
455 static DecompressParam *decomp_param;
456 static QemuThread *decompress_threads;
457 static QemuMutex decomp_done_lock;
458 static QemuCond decomp_done_cond;
459
460 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
461 ram_addr_t offset, uint8_t *source_buf);
462
463 static void *do_data_compress(void *opaque)
464 {
465 CompressParam *param = opaque;
466 RAMBlock *block;
467 ram_addr_t offset;
468 bool zero_page;
469
470 qemu_mutex_lock(&param->mutex);
471 while (!param->quit) {
472 if (param->block) {
473 block = param->block;
474 offset = param->offset;
475 param->block = NULL;
476 qemu_mutex_unlock(&param->mutex);
477
478 zero_page = do_compress_ram_page(param->file, &param->stream,
479 block, offset, param->originbuf);
480
481 qemu_mutex_lock(&comp_done_lock);
482 param->done = true;
483 param->zero_page = zero_page;
484 qemu_cond_signal(&comp_done_cond);
485 qemu_mutex_unlock(&comp_done_lock);
486
487 qemu_mutex_lock(&param->mutex);
488 } else {
489 qemu_cond_wait(&param->cond, &param->mutex);
490 }
491 }
492 qemu_mutex_unlock(&param->mutex);
493
494 return NULL;
495 }
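
/*
 * Rough picture of the handshake between the migration thread and one
 * compression worker, as implemented by do_data_compress() above and
 * flush_compressed_data() further down; the actual dispatch of pages to
 * idle workers happens later in this file:
 *
 *   migration thread                      worker thread
 *   ----------------                      -------------
 *   lock param->mutex                     wait on param->cond
 *   set param->block / param->offset
 *   signal param->cond, unlock            compress the page into param->file
 *                                         lock comp_done_lock
 *                                         param->done = true
 *                                         signal comp_done_cond, unlock
 *   wait on comp_done_cond (e.g. in
 *   flush_compressed_data())
 */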
496
497 static void compress_threads_save_cleanup(void)
498 {
499 int i, thread_count;
500
501 if (!migrate_use_compression() || !comp_param) {
502 return;
503 }
504
505 thread_count = migrate_compress_threads();
506 for (i = 0; i < thread_count; i++) {
507 /*
508 * we use it as an indicator of whether the thread is
509 * properly initialized or not
510 */
511 if (!comp_param[i].file) {
512 break;
513 }
514
515 qemu_mutex_lock(&comp_param[i].mutex);
516 comp_param[i].quit = true;
517 qemu_cond_signal(&comp_param[i].cond);
518 qemu_mutex_unlock(&comp_param[i].mutex);
519
520 qemu_thread_join(compress_threads + i);
521 qemu_mutex_destroy(&comp_param[i].mutex);
522 qemu_cond_destroy(&comp_param[i].cond);
523 deflateEnd(&comp_param[i].stream);
524 g_free(comp_param[i].originbuf);
525 qemu_fclose(comp_param[i].file);
526 comp_param[i].file = NULL;
527 }
528 qemu_mutex_destroy(&comp_done_lock);
529 qemu_cond_destroy(&comp_done_cond);
530 g_free(compress_threads);
531 g_free(comp_param);
532 compress_threads = NULL;
533 comp_param = NULL;
534 }
535
536 static int compress_threads_save_setup(void)
537 {
538 int i, thread_count;
539
540 if (!migrate_use_compression()) {
541 return 0;
542 }
543 thread_count = migrate_compress_threads();
544 compress_threads = g_new0(QemuThread, thread_count);
545 comp_param = g_new0(CompressParam, thread_count);
546 qemu_cond_init(&comp_done_cond);
547 qemu_mutex_init(&comp_done_lock);
548 for (i = 0; i < thread_count; i++) {
549 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
550 if (!comp_param[i].originbuf) {
551 goto exit;
552 }
553
554 if (deflateInit(&comp_param[i].stream,
555 migrate_compress_level()) != Z_OK) {
556 g_free(comp_param[i].originbuf);
557 goto exit;
558 }
559
560 /* comp_param[i].file is just used as a dummy buffer to save data,
561 * set its ops to empty.
562 */
563 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
564 comp_param[i].done = true;
565 comp_param[i].quit = false;
566 qemu_mutex_init(&comp_param[i].mutex);
567 qemu_cond_init(&comp_param[i].cond);
568 qemu_thread_create(compress_threads + i, "compress",
569 do_data_compress, comp_param + i,
570 QEMU_THREAD_JOINABLE);
571 }
572 return 0;
573
574 exit:
575 compress_threads_save_cleanup();
576 return -1;
577 }
578
579 /* Multiple fd's */
580
581 #define MULTIFD_MAGIC 0x11223344U
582 #define MULTIFD_VERSION 1
583
584 #define MULTIFD_FLAG_SYNC (1 << 0)
585
586 typedef struct {
587 uint32_t magic;
588 uint32_t version;
589 unsigned char uuid[16]; /* QemuUUID */
590 uint8_t id;
591 } __attribute__((packed)) MultiFDInit_t;
592
593 typedef struct {
594 uint32_t magic;
595 uint32_t version;
596 uint32_t flags;
597 /* maximum number of allocated pages */
598 uint32_t pages_alloc;
599 uint32_t pages_used;
600 uint64_t packet_num;
601 char ramblock[256];
602 uint64_t offset[];
603 } __attribute__((packed)) MultiFDPacket_t;
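
/*
 * On the wire a MultiFDPacket_t is sent exactly as laid out above, with
 * every multi-byte integer converted to big endian by
 * multifd_send_fill_packet() below: magic, version, flags, pages_alloc and
 * pages_used as be32, packet_num and each offset[] entry as be64.  The
 * ramblock name travels as a fixed 256-byte string (the receiver forces
 * NUL termination).  offset[] carries pages_used entries, while the packet
 * buffer itself is always sized for migrate_multifd_page_count() entries.
 */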
604
605 typedef struct {
606 /* number of used pages */
607 uint32_t used;
608 /* number of allocated pages */
609 uint32_t allocated;
610 /* global number of generated multifd packets */
611 uint64_t packet_num;
612 /* offset of each page */
613 ram_addr_t *offset;
614 /* pointer to each page */
615 struct iovec *iov;
616 RAMBlock *block;
617 } MultiFDPages_t;
618
619 typedef struct {
620 /* these fields are not changed once the thread is created */
621 /* channel number */
622 uint8_t id;
623 /* channel thread name */
624 char *name;
625 /* channel thread id */
626 QemuThread thread;
627 /* communication channel */
628 QIOChannel *c;
629 /* sem where to wait for more work */
630 QemuSemaphore sem;
631 /* this mutex protects the following parameters */
632 QemuMutex mutex;
633 /* is this channel thread running */
634 bool running;
635 /* should this thread finish */
636 bool quit;
637 /* thread has work to do */
638 int pending_job;
639 /* array of pages to send */
640 MultiFDPages_t *pages;
641 /* packet allocated len */
642 uint32_t packet_len;
643 /* pointer to the packet */
644 MultiFDPacket_t *packet;
645 /* multifd flags for each packet */
646 uint32_t flags;
647 /* global number of generated multifd packets */
648 uint64_t packet_num;
649 /* thread local variables */
650 /* packets sent through this channel */
651 uint64_t num_packets;
652 /* pages sent through this channel */
653 uint64_t num_pages;
654 /* syncs main thread and channels */
655 QemuSemaphore sem_sync;
656 } MultiFDSendParams;
657
658 typedef struct {
659 /* these fields are not changed once the thread is created */
660 /* channel number */
661 uint8_t id;
662 /* channel thread name */
663 char *name;
664 /* channel thread id */
665 QemuThread thread;
666 /* communication channel */
667 QIOChannel *c;
668 /* this mutex protects the following parameters */
669 QemuMutex mutex;
670 /* is this channel thread running */
671 bool running;
672 /* array of pages to receive */
673 MultiFDPages_t *pages;
674 /* packet allocated len */
675 uint32_t packet_len;
676 /* pointer to the packet */
677 MultiFDPacket_t *packet;
678 /* multifd flags for each packet */
679 uint32_t flags;
680 /* global number of generated multifd packets */
681 uint64_t packet_num;
682 /* thread local variables */
683 /* packets received through this channel */
684 uint64_t num_packets;
685 /* pages received through this channel */
686 uint64_t num_pages;
687 /* syncs main thread and channels */
688 QemuSemaphore sem_sync;
689 } MultiFDRecvParams;
690
691 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
692 {
693 MultiFDInit_t msg;
694 int ret;
695
696 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
697 msg.version = cpu_to_be32(MULTIFD_VERSION);
698 msg.id = p->id;
699 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
700
701 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
702 if (ret != 0) {
703 return -1;
704 }
705 return 0;
706 }
707
708 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
709 {
710 MultiFDInit_t msg;
711 int ret;
712
713 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
714 if (ret != 0) {
715 return -1;
716 }
717
718 msg.magic = be32_to_cpu(msg.magic);
719 msg.version = be32_to_cpu(msg.version);
720
721 if (msg.magic != MULTIFD_MAGIC) {
722 error_setg(errp, "multifd: received packet magic %x "
723 "expected %x", msg.magic, MULTIFD_MAGIC);
724 return -1;
725 }
726
727 if (msg.version != MULTIFD_VERSION) {
728 error_setg(errp, "multifd: received packet version %d "
729 "expected %d", msg.version, MULTIFD_VERSION);
730 return -1;
731 }
732
733 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
734 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
735 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
736
737 error_setg(errp, "multifd: received uuid '%s' and expected "
738 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
739 g_free(uuid);
740 g_free(msg_uuid);
741 return -1;
742 }
743
744 if (msg.id > migrate_multifd_channels()) {
745 error_setg(errp, "multifd: received channel version %d "
746 "expected %d", msg.version, MULTIFD_VERSION);
747 return -1;
748 }
749
750 return msg.id;
751 }
752
753 static MultiFDPages_t *multifd_pages_init(size_t size)
754 {
755 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
756
757 pages->allocated = size;
758 pages->iov = g_new0(struct iovec, size);
759 pages->offset = g_new0(ram_addr_t, size);
760
761 return pages;
762 }
763
764 static void multifd_pages_clear(MultiFDPages_t *pages)
765 {
766 pages->used = 0;
767 pages->allocated = 0;
768 pages->packet_num = 0;
769 pages->block = NULL;
770 g_free(pages->iov);
771 pages->iov = NULL;
772 g_free(pages->offset);
773 pages->offset = NULL;
774 g_free(pages);
775 }
776
777 static void multifd_send_fill_packet(MultiFDSendParams *p)
778 {
779 MultiFDPacket_t *packet = p->packet;
780 int i;
781
782 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
783 packet->version = cpu_to_be32(MULTIFD_VERSION);
784 packet->flags = cpu_to_be32(p->flags);
785 packet->pages_alloc = cpu_to_be32(migrate_multifd_page_count());
786 packet->pages_used = cpu_to_be32(p->pages->used);
787 packet->packet_num = cpu_to_be64(p->packet_num);
788
789 if (p->pages->block) {
790 strncpy(packet->ramblock, p->pages->block->idstr, 256);
791 }
792
793 for (i = 0; i < p->pages->used; i++) {
794 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
795 }
796 }
797
798 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
799 {
800 MultiFDPacket_t *packet = p->packet;
801 RAMBlock *block;
802 int i;
803
804 packet->magic = be32_to_cpu(packet->magic);
805 if (packet->magic != MULTIFD_MAGIC) {
806 error_setg(errp, "multifd: received packet "
807 "magic %x and expected magic %x",
808 packet->magic, MULTIFD_MAGIC);
809 return -1;
810 }
811
812 packet->version = be32_to_cpu(packet->version);
813 if (packet->version != MULTIFD_VERSION) {
814 error_setg(errp, "multifd: received packet "
815 "version %d and expected version %d",
816 packet->version, MULTIFD_VERSION);
817 return -1;
818 }
819
820 p->flags = be32_to_cpu(packet->flags);
821
822 packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
823 if (packet->pages_alloc > migrate_multifd_page_count()) {
824 error_setg(errp, "multifd: received packet "
825 "with size %d and expected maximum size %d",
826 packet->pages_alloc, migrate_multifd_page_count());
827 return -1;
828 }
829
830 p->pages->used = be32_to_cpu(packet->pages_used);
831 if (p->pages->used > packet->pages_alloc) {
832 error_setg(errp, "multifd: received packet "
833 "with %d pages and expected maximum pages are %d",
834 p->pages->used, packet->pages_alloc) ;
835 return -1;
836 }
837
838 p->packet_num = be64_to_cpu(packet->packet_num);
839
840 if (p->pages->used) {
841 /* make sure that ramblock is 0 terminated */
842 packet->ramblock[255] = 0;
843 block = qemu_ram_block_by_name(packet->ramblock);
844 if (!block) {
845 error_setg(errp, "multifd: unknown ram block %s",
846 packet->ramblock);
847 return -1;
848 }
849 }
850
851 for (i = 0; i < p->pages->used; i++) {
852 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
853
854 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
855 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
856 " (max " RAM_ADDR_FMT ")",
857 offset, block->used_length);
858 return -1;
859 }
860 p->pages->iov[i].iov_base = block->host + offset;
861 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
862 }
863
864 return 0;
865 }
866
867 struct {
868 MultiFDSendParams *params;
869 /* number of created threads */
870 int count;
871 /* array of pages to send */
872 MultiFDPages_t *pages;
873 /* syncs main thread and channels */
874 QemuSemaphore sem_sync;
875 /* global number of generated multifd packets */
876 uint64_t packet_num;
877 /* send channels ready */
878 QemuSemaphore channels_ready;
879 } *multifd_send_state;
880
881 /*
882 * How do we use multifd_send_state->pages and channel->pages?
883 *
884 * We create a "pages" struct for each channel, plus a main one. Each
885 * time we need to send a batch of pages we swap the one in
886 * multifd_send_state with the one of the channel that is sending it.
887 * There are two reasons for that:
888 * - to avoid doing so many mallocs during migration
889 * - to make it easier to know what to free at the end of migration
890 *
891 * This way we always know who the owner of each "pages" struct is,
892 * and we don't need any locking. It belongs to either the migration
893 * thread or the channel thread. Switching is safe because the
894 * migration thread holds the channel mutex when changing it, and the
895 * channel thread must have finished with its own, otherwise
896 * pending_job couldn't be false.
897 */
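
/*
 * Concretely (see multifd_send_pages() below): the migration thread fills
 * multifd_send_state->pages, picks a channel with no pending_job, hands
 * the full "pages" struct to that channel through p->pages, and takes the
 * channel's now-unused struct back as the new multifd_send_state->pages.
 * A "pages" struct is therefore never touched by two threads at once.
 */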
898
899 static void multifd_send_pages(void)
900 {
901 int i;
902 static int next_channel;
903 MultiFDSendParams *p = NULL; /* make gcc happy */
904 MultiFDPages_t *pages = multifd_send_state->pages;
905 uint64_t transferred;
906
907 qemu_sem_wait(&multifd_send_state->channels_ready);
908 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
909 p = &multifd_send_state->params[i];
910
911 qemu_mutex_lock(&p->mutex);
912 if (!p->pending_job) {
913 p->pending_job++;
914 next_channel = (i + 1) % migrate_multifd_channels();
915 break;
916 }
917 qemu_mutex_unlock(&p->mutex);
918 }
919 p->pages->used = 0;
920
921 p->packet_num = multifd_send_state->packet_num++;
922 p->pages->block = NULL;
923 multifd_send_state->pages = p->pages;
924 p->pages = pages;
925 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
926 ram_counters.multifd_bytes += transferred;
927 ram_counters.transferred += transferred;
928 qemu_mutex_unlock(&p->mutex);
929 qemu_sem_post(&p->sem);
930 }
931
932 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
933 {
934 MultiFDPages_t *pages = multifd_send_state->pages;
935
936 if (!pages->block) {
937 pages->block = block;
938 }
939
940 if (pages->block == block) {
941 pages->offset[pages->used] = offset;
942 pages->iov[pages->used].iov_base = block->host + offset;
943 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
944 pages->used++;
945
946 if (pages->used < pages->allocated) {
947 return;
948 }
949 }
950
951 multifd_send_pages();
952
953 if (pages->block != block) {
954 multifd_queue_page(block, offset);
955 }
956 }
957
958 static void multifd_send_terminate_threads(Error *err)
959 {
960 int i;
961
962 if (err) {
963 MigrationState *s = migrate_get_current();
964 migrate_set_error(s, err);
965 if (s->state == MIGRATION_STATUS_SETUP ||
966 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
967 s->state == MIGRATION_STATUS_DEVICE ||
968 s->state == MIGRATION_STATUS_ACTIVE) {
969 migrate_set_state(&s->state, s->state,
970 MIGRATION_STATUS_FAILED);
971 }
972 }
973
974 for (i = 0; i < migrate_multifd_channels(); i++) {
975 MultiFDSendParams *p = &multifd_send_state->params[i];
976
977 qemu_mutex_lock(&p->mutex);
978 p->quit = true;
979 qemu_sem_post(&p->sem);
980 qemu_mutex_unlock(&p->mutex);
981 }
982 }
983
984 void multifd_save_cleanup(void)
985 {
986 int i;
987
988 if (!migrate_use_multifd()) {
989 return;
990 }
991 multifd_send_terminate_threads(NULL);
992 for (i = 0; i < migrate_multifd_channels(); i++) {
993 MultiFDSendParams *p = &multifd_send_state->params[i];
994
995 if (p->running) {
996 qemu_thread_join(&p->thread);
997 }
998 socket_send_channel_destroy(p->c);
999 p->c = NULL;
1000 qemu_mutex_destroy(&p->mutex);
1001 qemu_sem_destroy(&p->sem);
1002 qemu_sem_destroy(&p->sem_sync);
1003 g_free(p->name);
1004 p->name = NULL;
1005 multifd_pages_clear(p->pages);
1006 p->pages = NULL;
1007 p->packet_len = 0;
1008 g_free(p->packet);
1009 p->packet = NULL;
1010 }
1011 qemu_sem_destroy(&multifd_send_state->channels_ready);
1012 qemu_sem_destroy(&multifd_send_state->sem_sync);
1013 g_free(multifd_send_state->params);
1014 multifd_send_state->params = NULL;
1015 multifd_pages_clear(multifd_send_state->pages);
1016 multifd_send_state->pages = NULL;
1017 g_free(multifd_send_state);
1018 multifd_send_state = NULL;
1019 }
1020
1021 static void multifd_send_sync_main(void)
1022 {
1023 int i;
1024
1025 if (!migrate_use_multifd()) {
1026 return;
1027 }
1028 if (multifd_send_state->pages->used) {
1029 multifd_send_pages();
1030 }
1031 for (i = 0; i < migrate_multifd_channels(); i++) {
1032 MultiFDSendParams *p = &multifd_send_state->params[i];
1033
1034 trace_multifd_send_sync_main_signal(p->id);
1035
1036 qemu_mutex_lock(&p->mutex);
1037
1038 p->packet_num = multifd_send_state->packet_num++;
1039 p->flags |= MULTIFD_FLAG_SYNC;
1040 p->pending_job++;
1041 qemu_mutex_unlock(&p->mutex);
1042 qemu_sem_post(&p->sem);
1043 }
1044 for (i = 0; i < migrate_multifd_channels(); i++) {
1045 MultiFDSendParams *p = &multifd_send_state->params[i];
1046
1047 trace_multifd_send_sync_main_wait(p->id);
1048 qemu_sem_wait(&multifd_send_state->sem_sync);
1049 }
1050 trace_multifd_send_sync_main(multifd_send_state->packet_num);
1051 }
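
/*
 * Sketch of the synchronisation driven by MULTIFD_FLAG_SYNC, derived from
 * multifd_send_sync_main() above and the send/recv threads below:
 *
 *   - the main (send) thread flushes any pending pages, then for every
 *     channel bumps pending_job with MULTIFD_FLAG_SYNC set and posts p->sem;
 *   - each send thread emits the (possibly empty) packet and posts
 *     multifd_send_state->sem_sync;
 *   - the main (send) thread waits on sem_sync once per channel.
 *
 * On the destination, each recv thread posts multifd_recv_state->sem_sync
 * when it sees the SYNC flag and then blocks on p->sem_sync until
 * multifd_recv_sync_main() has collected all channels.
 */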
1052
1053 static void *multifd_send_thread(void *opaque)
1054 {
1055 MultiFDSendParams *p = opaque;
1056 Error *local_err = NULL;
1057 int ret;
1058
1059 trace_multifd_send_thread_start(p->id);
1060 rcu_register_thread();
1061
1062 if (multifd_send_initial_packet(p, &local_err) < 0) {
1063 goto out;
1064 }
1065 /* initial packet */
1066 p->num_packets = 1;
1067
1068 while (true) {
1069 qemu_sem_wait(&p->sem);
1070 qemu_mutex_lock(&p->mutex);
1071
1072 if (p->pending_job) {
1073 uint32_t used = p->pages->used;
1074 uint64_t packet_num = p->packet_num;
1075 uint32_t flags = p->flags;
1076
1077 multifd_send_fill_packet(p);
1078 p->flags = 0;
1079 p->num_packets++;
1080 p->num_pages += used;
1081 p->pages->used = 0;
1082 qemu_mutex_unlock(&p->mutex);
1083
1084 trace_multifd_send(p->id, packet_num, used, flags);
1085
1086 ret = qio_channel_write_all(p->c, (void *)p->packet,
1087 p->packet_len, &local_err);
1088 if (ret != 0) {
1089 break;
1090 }
1091
1092 if (used) {
1093 ret = qio_channel_writev_all(p->c, p->pages->iov,
1094 used, &local_err);
1095 if (ret != 0) {
1096 break;
1097 }
1098 }
1099
1100 qemu_mutex_lock(&p->mutex);
1101 p->pending_job--;
1102 qemu_mutex_unlock(&p->mutex);
1103
1104 if (flags & MULTIFD_FLAG_SYNC) {
1105 qemu_sem_post(&multifd_send_state->sem_sync);
1106 }
1107 qemu_sem_post(&multifd_send_state->channels_ready);
1108 } else if (p->quit) {
1109 qemu_mutex_unlock(&p->mutex);
1110 break;
1111 } else {
1112 qemu_mutex_unlock(&p->mutex);
1113 /* sometimes there are spurious wakeups */
1114 }
1115 }
1116
1117 out:
1118 if (local_err) {
1119 multifd_send_terminate_threads(local_err);
1120 }
1121
1122 qemu_mutex_lock(&p->mutex);
1123 p->running = false;
1124 qemu_mutex_unlock(&p->mutex);
1125
1126 rcu_unregister_thread();
1127 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1128
1129 return NULL;
1130 }
1131
1132 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1133 {
1134 MultiFDSendParams *p = opaque;
1135 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1136 Error *local_err = NULL;
1137
1138 if (qio_task_propagate_error(task, &local_err)) {
1139 migrate_set_error(migrate_get_current(), local_err);
1140 multifd_save_cleanup();
1141 } else {
1142 p->c = QIO_CHANNEL(sioc);
1143 qio_channel_set_delay(p->c, false);
1144 p->running = true;
1145 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1146 QEMU_THREAD_JOINABLE);
1147
1148 atomic_inc(&multifd_send_state->count);
1149 }
1150 }
1151
1152 int multifd_save_setup(void)
1153 {
1154 int thread_count;
1155 uint32_t page_count = migrate_multifd_page_count();
1156 uint8_t i;
1157
1158 if (!migrate_use_multifd()) {
1159 return 0;
1160 }
1161 thread_count = migrate_multifd_channels();
1162 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1163 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1164 atomic_set(&multifd_send_state->count, 0);
1165 multifd_send_state->pages = multifd_pages_init(page_count);
1166 qemu_sem_init(&multifd_send_state->sem_sync, 0);
1167 qemu_sem_init(&multifd_send_state->channels_ready, 0);
1168
1169 for (i = 0; i < thread_count; i++) {
1170 MultiFDSendParams *p = &multifd_send_state->params[i];
1171
1172 qemu_mutex_init(&p->mutex);
1173 qemu_sem_init(&p->sem, 0);
1174 qemu_sem_init(&p->sem_sync, 0);
1175 p->quit = false;
1176 p->pending_job = 0;
1177 p->id = i;
1178 p->pages = multifd_pages_init(page_count);
1179 p->packet_len = sizeof(MultiFDPacket_t)
1180 + sizeof(ram_addr_t) * page_count;
1181 p->packet = g_malloc0(p->packet_len);
1182 p->name = g_strdup_printf("multifdsend_%d", i);
1183 socket_send_channel_create(multifd_new_send_channel_async, p);
1184 }
1185 return 0;
1186 }
1187
1188 struct {
1189 MultiFDRecvParams *params;
1190 /* number of created threads */
1191 int count;
1192 /* syncs main thread and channels */
1193 QemuSemaphore sem_sync;
1194 /* global number of generated multifd packets */
1195 uint64_t packet_num;
1196 } *multifd_recv_state;
1197
1198 static void multifd_recv_terminate_threads(Error *err)
1199 {
1200 int i;
1201
1202 if (err) {
1203 MigrationState *s = migrate_get_current();
1204 migrate_set_error(s, err);
1205 if (s->state == MIGRATION_STATUS_SETUP ||
1206 s->state == MIGRATION_STATUS_ACTIVE) {
1207 migrate_set_state(&s->state, s->state,
1208 MIGRATION_STATUS_FAILED);
1209 }
1210 }
1211
1212 for (i = 0; i < migrate_multifd_channels(); i++) {
1213 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1214
1215 qemu_mutex_lock(&p->mutex);
1216 /* We could arrive here for two reasons:
1217 - normal quit, i.e. everything went fine, just finished
1218 - error quit: We close the channels so the channel threads
1219 finish the qio_channel_read_all_eof() */
1220 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1221 qemu_mutex_unlock(&p->mutex);
1222 }
1223 }
1224
1225 int multifd_load_cleanup(Error **errp)
1226 {
1227 int i;
1228 int ret = 0;
1229
1230 if (!migrate_use_multifd()) {
1231 return 0;
1232 }
1233 multifd_recv_terminate_threads(NULL);
1234 for (i = 0; i < migrate_multifd_channels(); i++) {
1235 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1236
1237 if (p->running) {
1238 qemu_thread_join(&p->thread);
1239 }
1240 object_unref(OBJECT(p->c));
1241 p->c = NULL;
1242 qemu_mutex_destroy(&p->mutex);
1243 qemu_sem_destroy(&p->sem_sync);
1244 g_free(p->name);
1245 p->name = NULL;
1246 multifd_pages_clear(p->pages);
1247 p->pages = NULL;
1248 p->packet_len = 0;
1249 g_free(p->packet);
1250 p->packet = NULL;
1251 }
1252 qemu_sem_destroy(&multifd_recv_state->sem_sync);
1253 g_free(multifd_recv_state->params);
1254 multifd_recv_state->params = NULL;
1255 g_free(multifd_recv_state);
1256 multifd_recv_state = NULL;
1257
1258 return ret;
1259 }
1260
1261 static void multifd_recv_sync_main(void)
1262 {
1263 int i;
1264
1265 if (!migrate_use_multifd()) {
1266 return;
1267 }
1268 for (i = 0; i < migrate_multifd_channels(); i++) {
1269 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1270
1271 trace_multifd_recv_sync_main_wait(p->id);
1272 qemu_sem_wait(&multifd_recv_state->sem_sync);
1273 qemu_mutex_lock(&p->mutex);
1274 if (multifd_recv_state->packet_num < p->packet_num) {
1275 multifd_recv_state->packet_num = p->packet_num;
1276 }
1277 qemu_mutex_unlock(&p->mutex);
1278 }
1279 for (i = 0; i < migrate_multifd_channels(); i++) {
1280 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1281
1282 trace_multifd_recv_sync_main_signal(p->id);
1283 qemu_sem_post(&p->sem_sync);
1284 }
1285 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1286 }
1287
1288 static void *multifd_recv_thread(void *opaque)
1289 {
1290 MultiFDRecvParams *p = opaque;
1291 Error *local_err = NULL;
1292 int ret;
1293
1294 trace_multifd_recv_thread_start(p->id);
1295 rcu_register_thread();
1296
1297 while (true) {
1298 uint32_t used;
1299 uint32_t flags;
1300
1301 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1302 p->packet_len, &local_err);
1303 if (ret == 0) { /* EOF */
1304 break;
1305 }
1306 if (ret == -1) { /* Error */
1307 break;
1308 }
1309
1310 qemu_mutex_lock(&p->mutex);
1311 ret = multifd_recv_unfill_packet(p, &local_err);
1312 if (ret) {
1313 qemu_mutex_unlock(&p->mutex);
1314 break;
1315 }
1316
1317 used = p->pages->used;
1318 flags = p->flags;
1319 trace_multifd_recv(p->id, p->packet_num, used, flags);
1320 p->num_packets++;
1321 p->num_pages += used;
1322 qemu_mutex_unlock(&p->mutex);
1323
1324 if (used) {
1325 ret = qio_channel_readv_all(p->c, p->pages->iov,
1326 used, &local_err);
1327 if (ret != 0) {
1328 break;
1329 }
1330 }
1331
1332 if (flags & MULTIFD_FLAG_SYNC) {
1333 qemu_sem_post(&multifd_recv_state->sem_sync);
1334 qemu_sem_wait(&p->sem_sync);
1335 }
1336 }
1337
1338 if (local_err) {
1339 multifd_recv_terminate_threads(local_err);
1340 }
1341 qemu_mutex_lock(&p->mutex);
1342 p->running = false;
1343 qemu_mutex_unlock(&p->mutex);
1344
1345 rcu_unregister_thread();
1346 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1347
1348 return NULL;
1349 }
1350
1351 int multifd_load_setup(void)
1352 {
1353 int thread_count;
1354 uint32_t page_count = migrate_multifd_page_count();
1355 uint8_t i;
1356
1357 if (!migrate_use_multifd()) {
1358 return 0;
1359 }
1360 thread_count = migrate_multifd_channels();
1361 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1362 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1363 atomic_set(&multifd_recv_state->count, 0);
1364 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1365
1366 for (i = 0; i < thread_count; i++) {
1367 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1368
1369 qemu_mutex_init(&p->mutex);
1370 qemu_sem_init(&p->sem_sync, 0);
1371 p->id = i;
1372 p->pages = multifd_pages_init(page_count);
1373 p->packet_len = sizeof(MultiFDPacket_t)
1374 + sizeof(ram_addr_t) * page_count;
1375 p->packet = g_malloc0(p->packet_len);
1376 p->name = g_strdup_printf("multifdrecv_%d", i);
1377 }
1378 return 0;
1379 }
1380
1381 bool multifd_recv_all_channels_created(void)
1382 {
1383 int thread_count = migrate_multifd_channels();
1384
1385 if (!migrate_use_multifd()) {
1386 return true;
1387 }
1388
1389 return thread_count == atomic_read(&multifd_recv_state->count);
1390 }
1391
1392 /*
1393 * Try to receive all multifd channels to get ready for the migration.
1394 * - Return true and do not set @errp when correctly receiving all channels;
1395 * - Return false and do not set @errp when correctly receiving the current one;
1396 * - Return false and set @errp when failing to receive the current channel.
1397 */
1398 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1399 {
1400 MultiFDRecvParams *p;
1401 Error *local_err = NULL;
1402 int id;
1403
1404 id = multifd_recv_initial_packet(ioc, &local_err);
1405 if (id < 0) {
1406 multifd_recv_terminate_threads(local_err);
1407 error_propagate_prepend(errp, local_err,
1408 "failed to receive packet"
1409 " via multifd channel %d: ",
1410 atomic_read(&multifd_recv_state->count));
1411 return false;
1412 }
1413
1414 p = &multifd_recv_state->params[id];
1415 if (p->c != NULL) {
1416 error_setg(&local_err, "multifd: received id '%d' already setup",
1417 id);
1418 multifd_recv_terminate_threads(local_err);
1419 error_propagate(errp, local_err);
1420 return false;
1421 }
1422 p->c = ioc;
1423 object_ref(OBJECT(ioc));
1424 /* initial packet */
1425 p->num_packets = 1;
1426
1427 p->running = true;
1428 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1429 QEMU_THREAD_JOINABLE);
1430 atomic_inc(&multifd_recv_state->count);
1431 return atomic_read(&multifd_recv_state->count) ==
1432 migrate_multifd_channels();
1433 }
1434
1435 /**
1436 * save_page_header: write page header to wire
1437 *
1438 * If this is the 1st block, it also writes the block identification
1439 *
1440 * Returns the number of bytes written
1441 *
1442 * @f: QEMUFile where to send the data
1443 * @block: block that contains the page we want to send
1444 * @offset: offset inside the block for the page
1445 * in the lower bits, it contains flags
1446 */
1447 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1448 ram_addr_t offset)
1449 {
1450 size_t size, len;
1451
1452 if (block == rs->last_sent_block) {
1453 offset |= RAM_SAVE_FLAG_CONTINUE;
1454 }
1455 qemu_put_be64(f, offset);
1456 size = 8;
1457
1458 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1459 len = strlen(block->idstr);
1460 qemu_put_byte(f, len);
1461 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1462 size += 1 + len;
1463 rs->last_sent_block = block;
1464 }
1465 return size;
1466 }
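
/*
 * Illustration of what save_page_header() emits, derived from the code
 * above: a be64 word holding the page offset within its RAMBlock with the
 * RAM_SAVE_FLAG_* bits OR-ed into the low bits, followed - only when
 * RAM_SAVE_FLAG_CONTINUE is clear - by a one-byte length and the RAMBlock
 * idstr:
 *
 *   be64 (offset | flags) [ u8 len, len bytes of idstr ]
 *
 * So a header costs 8 bytes for a page in the same block as the previous
 * one, and 8 + 1 + strlen(idstr) bytes otherwise.
 */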
1467
1468 /**
1469 * mig_throttle_guest_down: throttle down the guest
1470 *
1471 * Reduce amount of guest cpu execution to hopefully slow down memory
1472 * writes. If guest dirty memory rate is reduced below the rate at
1473 * which we can transfer pages to the destination then we should be
1474 * able to complete migration. Some workloads dirty memory way too
1475 * fast and will not effectively converge, even with auto-converge.
1476 */
1477 static void mig_throttle_guest_down(void)
1478 {
1479 MigrationState *s = migrate_get_current();
1480 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1481 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
1482 int pct_max = s->parameters.max_cpu_throttle;
1483
1484 /* We have not started throttling yet. Let's start it. */
1485 if (!cpu_throttle_active()) {
1486 cpu_throttle_set(pct_initial);
1487 } else {
1488 /* Throttling already on, just increase the rate */
1489 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
1490 pct_max));
1491 }
1492 }
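
/*
 * Example with made-up parameter values: if cpu_throttle_initial is 20,
 * cpu_throttle_increment is 10 and max_cpu_throttle is 99, successive
 * calls to mig_throttle_guest_down() throttle the guest at 20%, 30%,
 * 40%, ... and never above 99%.
 */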
1493
1494 /**
1495 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1496 *
1497 * @rs: current RAM state
1498 * @current_addr: address for the zero page
1499 *
1500 * Update the xbzrle cache to reflect a page that's been sent as all 0.
1501 * The important thing is that a stale (not-yet-0'd) page be replaced
1502 * by the new data.
1503 * As a bonus, if the page wasn't in the cache it gets added so that
1504 * when a small write is made into the 0'd page it gets XBZRLE sent.
1505 */
1506 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1507 {
1508 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1509 return;
1510 }
1511
1512 /* We don't care if this fails to allocate a new cache page
1513 * as long as it updated an old one */
1514 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1515 ram_counters.dirty_sync_count);
1516 }
1517
1518 #define ENCODING_FLAG_XBZRLE 0x1
1519
1520 /**
1521 * save_xbzrle_page: compress and send current page
1522 *
1523 * Returns: 1 means that we wrote the page
1524 * 0 means that page is identical to the one already sent
1525 * -1 means that xbzrle would be longer than normal
1526 *
1527 * @rs: current RAM state
1528 * @current_data: pointer to the address of the page contents
1529 * @current_addr: addr of the page
1530 * @block: block that contains the page we want to send
1531 * @offset: offset inside the block for the page
1532 * @last_stage: if we are at the completion stage
1533 */
1534 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1535 ram_addr_t current_addr, RAMBlock *block,
1536 ram_addr_t offset, bool last_stage)
1537 {
1538 int encoded_len = 0, bytes_xbzrle;
1539 uint8_t *prev_cached_page;
1540
1541 if (!cache_is_cached(XBZRLE.cache, current_addr,
1542 ram_counters.dirty_sync_count)) {
1543 xbzrle_counters.cache_miss++;
1544 if (!last_stage) {
1545 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1546 ram_counters.dirty_sync_count) == -1) {
1547 return -1;
1548 } else {
1549 /* update *current_data when the page has been
1550 inserted into cache */
1551 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1552 }
1553 }
1554 return -1;
1555 }
1556
1557 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1558
1559 /* save current buffer into memory */
1560 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1561
1562 /* XBZRLE encoding (if there is no overflow) */
1563 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1564 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1565 TARGET_PAGE_SIZE);
1566 if (encoded_len == 0) {
1567 trace_save_xbzrle_page_skipping();
1568 return 0;
1569 } else if (encoded_len == -1) {
1570 trace_save_xbzrle_page_overflow();
1571 xbzrle_counters.overflow++;
1572 /* update data in the cache */
1573 if (!last_stage) {
1574 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1575 *current_data = prev_cached_page;
1576 }
1577 return -1;
1578 }
1579
1580 /* we need to update the data in the cache, in order to get the same data */
1581 if (!last_stage) {
1582 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1583 }
1584
1585 /* Send XBZRLE based compressed page */
1586 bytes_xbzrle = save_page_header(rs, rs->f, block,
1587 offset | RAM_SAVE_FLAG_XBZRLE);
1588 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1589 qemu_put_be16(rs->f, encoded_len);
1590 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1591 bytes_xbzrle += encoded_len + 1 + 2;
1592 xbzrle_counters.pages++;
1593 xbzrle_counters.bytes += bytes_xbzrle;
1594 ram_counters.transferred += bytes_xbzrle;
1595
1596 return 1;
1597 }
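
/*
 * Wire format of an XBZRLE page, as produced at the end of
 * save_xbzrle_page() above:
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *   u8   ENCODING_FLAG_XBZRLE
 *   be16 encoded_len
 *   encoded_len bytes of XBZRLE-encoded data
 *
 * which is why bytes_xbzrle adds encoded_len + 1 + 2 on top of the header.
 */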
1598
1599 /**
1600 * migration_bitmap_find_dirty: find the next dirty page from start
1601 *
1602 * Called with rcu_read_lock() to protect migration_bitmap
1603 *
1604 * Returns the byte offset within memory region of the start of a dirty page
1605 *
1606 * @rs: current RAM state
1607 * @rb: RAMBlock where to search for dirty pages
1608 * @start: page where we start the search
1609 */
1610 static inline
1611 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1612 unsigned long start)
1613 {
1614 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1615 unsigned long *bitmap = rb->bmap;
1616 unsigned long next;
1617
1618 if (ramblock_is_ignored(rb)) {
1619 return size;
1620 }
1621
1622 /*
1623 * When the free page optimization is enabled, we need to check the bitmap
1624 * to send the non-free pages rather than all the pages in the bulk stage.
1625 */
1626 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1627 next = start + 1;
1628 } else {
1629 next = find_next_bit(bitmap, size, start);
1630 }
1631
1632 return next;
1633 }
1634
1635 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1636 RAMBlock *rb,
1637 unsigned long page)
1638 {
1639 bool ret;
1640
1641 qemu_mutex_lock(&rs->bitmap_mutex);
1642 ret = test_and_clear_bit(page, rb->bmap);
1643
1644 if (ret) {
1645 rs->migration_dirty_pages--;
1646 }
1647 qemu_mutex_unlock(&rs->bitmap_mutex);
1648
1649 return ret;
1650 }
1651
1652 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1653 ram_addr_t start, ram_addr_t length)
1654 {
1655 rs->migration_dirty_pages +=
1656 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1657 &rs->num_dirty_pages_period);
1658 }
1659
1660 /**
1661 * ram_pagesize_summary: calculate all the pagesizes of a VM
1662 *
1663 * Returns a summary bitmap of the page sizes of all RAMBlocks
1664 *
1665 * For VMs with just normal pages this is equivalent to the host page
1666 * size. If it's got some huge pages then it's the OR of all the
1667 * different page sizes.
1668 */
1669 uint64_t ram_pagesize_summary(void)
1670 {
1671 RAMBlock *block;
1672 uint64_t summary = 0;
1673
1674 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1675 summary |= block->page_size;
1676 }
1677
1678 return summary;
1679 }
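
/*
 * Example with made-up sizes: a guest backed by ordinary 4 KiB pages plus
 * one 2 MiB hugepage RAMBlock would report 0x1000 | 0x200000 = 0x201000.
 */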
1680
1681 uint64_t ram_get_total_transferred_pages(void)
1682 {
1683 return ram_counters.normal + ram_counters.duplicate +
1684 compression_counters.pages + xbzrle_counters.pages;
1685 }
1686
1687 static void migration_update_rates(RAMState *rs, int64_t end_time)
1688 {
1689 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1690 double compressed_size;
1691
1692 /* calculate period counters */
1693 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1694 / (end_time - rs->time_last_bitmap_sync);
1695
1696 if (!page_count) {
1697 return;
1698 }
1699
1700 if (migrate_use_xbzrle()) {
1701 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1702 rs->xbzrle_cache_miss_prev) / page_count;
1703 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1704 }
1705
1706 if (migrate_use_compression()) {
1707 compression_counters.busy_rate = (double)(compression_counters.busy -
1708 rs->compress_thread_busy_prev) / page_count;
1709 rs->compress_thread_busy_prev = compression_counters.busy;
1710
1711 compressed_size = compression_counters.compressed_size -
1712 rs->compressed_size_prev;
1713 if (compressed_size) {
1714 double uncompressed_size = (compression_counters.pages -
1715 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1716
1717 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1718 compression_counters.compression_rate =
1719 uncompressed_size / compressed_size;
1720
1721 rs->compress_pages_prev = compression_counters.pages;
1722 rs->compressed_size_prev = compression_counters.compressed_size;
1723 }
1724 }
1725 }
1726
1727 static void migration_bitmap_sync(RAMState *rs)
1728 {
1729 RAMBlock *block;
1730 int64_t end_time;
1731 uint64_t bytes_xfer_now;
1732
1733 ram_counters.dirty_sync_count++;
1734
1735 if (!rs->time_last_bitmap_sync) {
1736 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1737 }
1738
1739 trace_migration_bitmap_sync_start();
1740 memory_global_dirty_log_sync();
1741
1742 qemu_mutex_lock(&rs->bitmap_mutex);
1743 rcu_read_lock();
1744 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1745 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1746 }
1747 ram_counters.remaining = ram_bytes_remaining();
1748 rcu_read_unlock();
1749 qemu_mutex_unlock(&rs->bitmap_mutex);
1750
1751 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1752
1753 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1754
1755 /* more than 1 second = 1000 milliseconds */
1756 if (end_time > rs->time_last_bitmap_sync + 1000) {
1757 bytes_xfer_now = ram_counters.transferred;
1758
1759 /* During block migration the auto-converge logic incorrectly detects
1760 * that ram migration makes no progress. Avoid this by disabling the
1761 * throttling logic during the bulk phase of block migration. */
1762 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1763 /* The following detection logic can be refined later. For now:
1764 Check to see if the dirtied bytes is 50% more than the approx.
1765 amount of bytes that just got transferred since the last time we
1766 were in this routine. If that happens twice, start or increase
1767 throttling */
1768
1769 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1770 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1771 (++rs->dirty_rate_high_cnt >= 2)) {
1772 trace_migration_throttle();
1773 rs->dirty_rate_high_cnt = 0;
1774 mig_throttle_guest_down();
1775 }
1776 }
1777
1778 migration_update_rates(rs, end_time);
1779
1780 rs->target_page_count_prev = rs->target_page_count;
1781
1782 /* reset period counters */
1783 rs->time_last_bitmap_sync = end_time;
1784 rs->num_dirty_pages_period = 0;
1785 rs->bytes_xfer_prev = bytes_xfer_now;
1786 }
1787 if (migrate_use_events()) {
1788 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1789 }
1790 }
1791
1792 static void migration_bitmap_sync_precopy(RAMState *rs)
1793 {
1794 Error *local_err = NULL;
1795
1796 /*
1797 * The current notifier usage is just an optimization to migration, so we
1798 * don't stop the normal migration process in the error case.
1799 */
1800 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1801 error_report_err(local_err);
1802 }
1803
1804 migration_bitmap_sync(rs);
1805
1806 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1807 error_report_err(local_err);
1808 }
1809 }
1810
1811 /**
1812 * save_zero_page_to_file: send the zero page to the file
1813 *
1814 * Returns the size of data written to the file, 0 means the page is not
1815 * a zero page
1816 *
1817 * @rs: current RAM state
1818 * @file: the file where the data is saved
1819 * @block: block that contains the page we want to send
1820 * @offset: offset inside the block for the page
1821 */
1822 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1823 RAMBlock *block, ram_addr_t offset)
1824 {
1825 uint8_t *p = block->host + offset;
1826 int len = 0;
1827
1828 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1829 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1830 qemu_put_byte(file, 0);
1831 len += 1;
1832 }
1833 return len;
1834 }
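
/*
 * A zero page therefore costs only a page header with RAM_SAVE_FLAG_ZERO
 * set plus a single zero byte on the wire; for a non-zero page this
 * function writes nothing and returns 0.
 */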
1835
1836 /**
1837 * save_zero_page: send the zero page to the stream
1838 *
1839 * Returns the number of pages written.
1840 *
1841 * @rs: current RAM state
1842 * @block: block that contains the page we want to send
1843 * @offset: offset inside the block for the page
1844 */
1845 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1846 {
1847 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1848
1849 if (len) {
1850 ram_counters.duplicate++;
1851 ram_counters.transferred += len;
1852 return 1;
1853 }
1854 return -1;
1855 }
1856
1857 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1858 {
1859 if (!migrate_release_ram() || !migration_in_postcopy()) {
1860 return;
1861 }
1862
1863 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1864 }
1865
1866 /*
1867 * @pages: the number of pages written by the control path,
1868 * < 0 - error
1869 * > 0 - number of pages written
1870 *
1871 * Return true if the page has been saved, otherwise return false.
1872 */
1873 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1874 int *pages)
1875 {
1876 uint64_t bytes_xmit = 0;
1877 int ret;
1878
1879 *pages = -1;
1880 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1881 &bytes_xmit);
1882 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1883 return false;
1884 }
1885
1886 if (bytes_xmit) {
1887 ram_counters.transferred += bytes_xmit;
1888 *pages = 1;
1889 }
1890
1891 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1892 return true;
1893 }
1894
1895 if (bytes_xmit > 0) {
1896 ram_counters.normal++;
1897 } else if (bytes_xmit == 0) {
1898 ram_counters.duplicate++;
1899 }
1900
1901 return true;
1902 }
1903
1904 /*
1905 * directly send the page to the stream
1906 *
1907 * Returns the number of pages written.
1908 *
1909 * @rs: current RAM state
1910 * @block: block that contains the page we want to send
1911 * @offset: offset inside the block for the page
1912 * @buf: the page to be sent
1913 * @async: send the page asynchronously
1914 */
1915 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1916 uint8_t *buf, bool async)
1917 {
1918 ram_counters.transferred += save_page_header(rs, rs->f, block,
1919 offset | RAM_SAVE_FLAG_PAGE);
1920 if (async) {
1921 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1922 migrate_release_ram() &&
1923 migration_in_postcopy());
1924 } else {
1925 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1926 }
1927 ram_counters.transferred += TARGET_PAGE_SIZE;
1928 ram_counters.normal++;
1929 return 1;
1930 }
1931
1932 /**
1933 * ram_save_page: send the given page to the stream
1934 *
1935 * Returns the number of pages written.
1936 * < 0 - error
1937 * >=0 - Number of pages written - this might legally be 0
1938 * if xbzrle noticed the page was the same.
1939 *
1940 * @rs: current RAM state
1941 * @pss: data about the page we want to send
1942 * (the block and the offset inside the block)
1943 * @last_stage: if we are at the completion stage
1944 */
1945 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1946 {
1947 int pages = -1;
1948 uint8_t *p;
1949 bool send_async = true;
1950 RAMBlock *block = pss->block;
1951 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1952 ram_addr_t current_addr = block->offset + offset;
1953
1954 p = block->host + offset;
1955 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1956
1957 XBZRLE_cache_lock();
1958 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1959 migrate_use_xbzrle()) {
1960 pages = save_xbzrle_page(rs, &p, current_addr, block,
1961 offset, last_stage);
1962 if (!last_stage) {
1963 /* Can't send this cached data async, since the cache page
1964 * might get updated before it gets to the wire
1965 */
1966 send_async = false;
1967 }
1968 }
1969
1970 /* XBZRLE overflow or normal page */
1971 if (pages == -1) {
1972 pages = save_normal_page(rs, block, offset, p, send_async);
1973 }
1974
1975 XBZRLE_cache_unlock();
1976
1977 return pages;
1978 }
1979
1980 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1981 ram_addr_t offset)
1982 {
1983 multifd_queue_page(block, offset);
1984 ram_counters.normal++;
1985
1986 return 1;
1987 }
1988
1989 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1990 ram_addr_t offset, uint8_t *source_buf)
1991 {
1992 RAMState *rs = ram_state;
1993 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1994 bool zero_page = false;
1995 int ret;
1996
1997 if (save_zero_page_to_file(rs, f, block, offset)) {
1998 zero_page = true;
1999 goto exit;
2000 }
2001
2002 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2003
2004 /*
2005 * copy it to an internal buffer to avoid it being modified by the VM,
2006 * so that we can catch errors during compression and
2007 * decompression
2008 */
2009 memcpy(source_buf, p, TARGET_PAGE_SIZE);
2010 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2011 if (ret < 0) {
2012 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2013 error_report("compressed data failed!");
2014 return false;
2015 }
2016
2017 exit:
2018 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2019 return zero_page;
2020 }
2021
2022 static void
2023 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2024 {
2025 ram_counters.transferred += bytes_xmit;
2026
2027 if (param->zero_page) {
2028 ram_counters.duplicate++;
2029 return;
2030 }
2031
2032 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2033 compression_counters.compressed_size += bytes_xmit - 8;
2034 compression_counters.pages++;
2035 }
2036
2037 static bool save_page_use_compression(RAMState *rs);
2038
2039 static void flush_compressed_data(RAMState *rs)
2040 {
2041 int idx, len, thread_count;
2042
2043 if (!save_page_use_compression(rs)) {
2044 return;
2045 }
2046 thread_count = migrate_compress_threads();
2047
2048 qemu_mutex_lock(&comp_done_lock);
2049 for (idx = 0; idx < thread_count; idx++) {
2050 while (!comp_param[idx].done) {
2051 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2052 }
2053 }
2054 qemu_mutex_unlock(&comp_done_lock);
2055
2056 for (idx = 0; idx < thread_count; idx++) {
2057 qemu_mutex_lock(&comp_param[idx].mutex);
2058 if (!comp_param[idx].quit) {
2059 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2060 /*
2061 * it's safe to fetch zero_page without holding comp_done_lock
2062 * as there is no further request submitted to the thread,
2063 * i.e., the thread should be waiting for a request at this point.
2064 */
2065 update_compress_thread_counts(&comp_param[idx], len);
2066 }
2067 qemu_mutex_unlock(&comp_param[idx].mutex);
2068 }
2069 }
2070
2071 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2072 ram_addr_t offset)
2073 {
2074 param->block = block;
2075 param->offset = offset;
2076 }
2077
2078 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2079 ram_addr_t offset)
2080 {
2081 int idx, thread_count, bytes_xmit = -1, pages = -1;
2082 bool wait = migrate_compress_wait_thread();
2083
2084 thread_count = migrate_compress_threads();
2085 qemu_mutex_lock(&comp_done_lock);
2086 retry:
2087 for (idx = 0; idx < thread_count; idx++) {
2088 if (comp_param[idx].done) {
2089 comp_param[idx].done = false;
2090 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2091 qemu_mutex_lock(&comp_param[idx].mutex);
2092 set_compress_params(&comp_param[idx], block, offset);
2093 qemu_cond_signal(&comp_param[idx].cond);
2094 qemu_mutex_unlock(&comp_param[idx].mutex);
2095 pages = 1;
2096 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2097 break;
2098 }
2099 }
2100
2101 /*
2102 * wait for the free thread if the user specifies 'compress-wait-thread',
2103 * otherwise we will post the page out in the main thread as a normal page.
2104 */
2105 if (pages < 0 && wait) {
2106 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2107 goto retry;
2108 }
2109 qemu_mutex_unlock(&comp_done_lock);
2110
2111 return pages;
2112 }
2113
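/*
 * Editor's illustration (not part of QEMU's ram.c): the hand-off above is a
 * "one slot per worker" pattern. Below is a stripped-down, single-slot
 * sketch of the producer side with hypothetical names (ExampleSlot,
 * example_post_work); it assumes the worker sets slot->done and signals
 * @done_cond when it finishes a page:
 */
#if 0
typedef struct {
    QemuMutex mutex;      /* protects the request fields below       */
    QemuCond cond;        /* worker waits on this for new work       */
    bool done;            /* true while the worker is idle           */
    RAMBlock *block;
    ram_addr_t offset;
} ExampleSlot;

static bool example_post_work(ExampleSlot *slot, QemuMutex *done_lock,
                              QemuCond *done_cond, RAMBlock *block,
                              ram_addr_t offset, bool wait)
{
    bool posted = false;

    qemu_mutex_lock(done_lock);
    while (!posted) {
        if (slot->done) {                 /* worker idle: hand it the page   */
            slot->done = false;
            qemu_mutex_lock(&slot->mutex);
            slot->block = block;
            slot->offset = offset;
            qemu_cond_signal(&slot->cond);
            qemu_mutex_unlock(&slot->mutex);
            posted = true;
        } else if (wait) {                /* block until some worker is idle */
            qemu_cond_wait(done_cond, done_lock);
        } else {
            break;                        /* caller falls back to normal path */
        }
    }
    qemu_mutex_unlock(done_lock);
    return posted;
}
#endif
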
2114 /**
2115 * find_dirty_block: find the next dirty page and update any state
2116 * associated with the search process.
2117 *
2118 * Returns true if a page is found
2119 *
2120 * @rs: current RAM state
2121 * @pss: data about the state of the current dirty page scan
2122 * @again: set to false if the search has scanned the whole of RAM
2123 */
2124 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2125 {
2126 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2127 if (pss->complete_round && pss->block == rs->last_seen_block &&
2128 pss->page >= rs->last_page) {
2129 /*
2130 * We've been once around the RAM and haven't found anything.
2131 * Give up.
2132 */
2133 *again = false;
2134 return false;
2135 }
2136 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2137 /* Didn't find anything in this RAM Block */
2138 pss->page = 0;
2139 pss->block = QLIST_NEXT_RCU(pss->block, next);
2140 if (!pss->block) {
2141 /*
2142 * If memory migration starts over, we will meet a dirtied page
2143 * which may still exist in the compression threads' ring, so we
2144 * should flush the compressed data to make sure the new page
2145 * is not overwritten by the old one in the destination.
2146 *
2147 * Also, if xbzrle is on, stop using the data compression at this
2148 * point. In theory, xbzrle can do better than compression.
2149 */
2150 flush_compressed_data(rs);
2151
2152 /* Hit the end of the list */
2153 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2154 /* Flag that we've looped */
2155 pss->complete_round = true;
2156 rs->ram_bulk_stage = false;
2157 }
2158 /* Didn't find anything this time, but try again on the new block */
2159 *again = true;
2160 return false;
2161 } else {
2162 /* Can go around again, but... */
2163 *again = true;
2164 /* We've found something so probably don't need to */
2165 return true;
2166 }
2167 }
2168
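/*
 * Editor's illustration (not part of QEMU's ram.c): the scan above walks
 * block by block and wraps to the head of the list once, using
 * complete_round to know when a full pass has happened. A stripped-down
 * sketch of that control flow, with a hypothetical fixed-size array
 * standing in for the RCU block list:
 */
#if 0
static int example_round_robin_scan(const bool *dirty, int nblocks,
                                    int start, bool *looped)
{
    int idx = start;

    do {
        if (dirty[idx]) {
            return idx;                  /* found something to send         */
        }
        idx++;
        if (idx == nblocks) {            /* hit the end of the list         */
            idx = 0;
            *looped = true;              /* equivalent of complete_round    */
        }
    } while (!(*looped && idx == start));

    return -1;                           /* whole pass done, nothing dirty  */
}
#endif
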
2169 /**
2170 * unqueue_page: gets a page off the queue
2171 *
2172 * Helper for 'get_queued_page' - gets a page off the queue
2173 *
2174 * Returns the block of the page (or NULL if none available)
2175 *
2176 * @rs: current RAM state
2177 * @offset: used to return the offset within the RAMBlock
2178 */
2179 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2180 {
2181 RAMBlock *block = NULL;
2182
2183 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2184 return NULL;
2185 }
2186
2187 qemu_mutex_lock(&rs->src_page_req_mutex);
2188 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2189 struct RAMSrcPageRequest *entry =
2190 QSIMPLEQ_FIRST(&rs->src_page_requests);
2191 block = entry->rb;
2192 *offset = entry->offset;
2193
2194 if (entry->len > TARGET_PAGE_SIZE) {
2195 entry->len -= TARGET_PAGE_SIZE;
2196 entry->offset += TARGET_PAGE_SIZE;
2197 } else {
2198 memory_region_unref(block->mr);
2199 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2200 g_free(entry);
2201 migration_consume_urgent_request();
2202 }
2203 }
2204 qemu_mutex_unlock(&rs->src_page_req_mutex);
2205
2206 return block;
2207 }
2208
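/*
 * Editor's illustration (not part of QEMU's ram.c): a request queued by
 * ram_save_queue_pages() can cover several target pages; unqueue_page()
 * above peels one page off the front of the request per call. A minimal
 * sketch of that arithmetic, with hypothetical names:
 */
#if 0
static void example_consume_request(ram_addr_t *req_offset,
                                    ram_addr_t *req_len,
                                    ram_addr_t *page_offset)
{
    *page_offset = *req_offset;           /* page handed to the caller      */

    if (*req_len > TARGET_PAGE_SIZE) {    /* more pages remain queued       */
        *req_len -= TARGET_PAGE_SIZE;
        *req_offset += TARGET_PAGE_SIZE;
    } else {
        *req_len = 0;                     /* request fully consumed         */
    }
}
#endif
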
2209 /**
2210 * get_queued_page: unqueue a page from the postcopy requests
2211 *
2212 * Skips pages that are already sent (!dirty)
2213 *
2214 * Returns true if a queued page is found
2215 *
2216 * @rs: current RAM state
2217 * @pss: data about the state of the current dirty page scan
2218 */
2219 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2220 {
2221 RAMBlock *block;
2222 ram_addr_t offset;
2223 bool dirty;
2224
2225 do {
2226 block = unqueue_page(rs, &offset);
2227 /*
2228 * We're sending this page, and since it's postcopy nothing else
2229 * will dirty it, and we must make sure it doesn't get sent again
2230 * even if this queue request was received after the background
2231 * search already sent it.
2232 */
2233 if (block) {
2234 unsigned long page;
2235
2236 page = offset >> TARGET_PAGE_BITS;
2237 dirty = test_bit(page, block->bmap);
2238 if (!dirty) {
2239 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2240 page, test_bit(page, block->unsentmap));
2241 } else {
2242 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2243 }
2244 }
2245
2246 } while (block && !dirty);
2247
2248 if (block) {
2249 /*
2250 * As soon as we start servicing pages out of order, we have
2251 * to kill the bulk stage, since the bulk stage assumes
2252 * (in migration_bitmap_find_and_reset_dirty) that every page is
2253 * dirty, which is no longer true.
2254 */
2255 rs->ram_bulk_stage = false;
2256
2257 /*
2258 * We want the background search to continue from the queued page
2259 * since the guest is likely to want other pages near to the page
2260 * it just requested.
2261 */
2262 pss->block = block;
2263 pss->page = offset >> TARGET_PAGE_BITS;
2264 }
2265
2266 return !!block;
2267 }
2268
2269 /**
2270 * migration_page_queue_free: drop any remaining pages in the ram
2271 * request queue
2272 *
2273 * It should be empty at the end anyway, but in error cases there may
2274 * be some left. If any pages are left, we drop them.
2275 *
2276 */
2277 static void migration_page_queue_free(RAMState *rs)
2278 {
2279 struct RAMSrcPageRequest *mspr, *next_mspr;
2280 /* This queue should generally be empty - but in the case of a failed
2281 * migration it might have some entries left over.
2282 */
2283 rcu_read_lock();
2284 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2285 memory_region_unref(mspr->rb->mr);
2286 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2287 g_free(mspr);
2288 }
2289 rcu_read_unlock();
2290 }
2291
2292 /**
2293 * ram_save_queue_pages: queue the page for transmission
2294 *
2295 * A request from postcopy destination for example.
2296 *
2297 * Returns zero on success or negative on error
2298 *
2299 * @rbname: Name of the RAMBlock of the request. NULL means the
2300 * same as the last one.
2301 * @start: starting address from the start of the RAMBlock
2302 * @len: length (in bytes) to send
2303 */
2304 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2305 {
2306 RAMBlock *ramblock;
2307 RAMState *rs = ram_state;
2308
2309 ram_counters.postcopy_requests++;
2310 rcu_read_lock();
2311 if (!rbname) {
2312 /* Reuse last RAMBlock */
2313 ramblock = rs->last_req_rb;
2314
2315 if (!ramblock) {
2316 /*
2317 * Shouldn't happen, we can't reuse the last RAMBlock if
2318 * it's the 1st request.
2319 */
2320 error_report("ram_save_queue_pages no previous block");
2321 goto err;
2322 }
2323 } else {
2324 ramblock = qemu_ram_block_by_name(rbname);
2325
2326 if (!ramblock) {
2327 /* We shouldn't be asked for a non-existent RAMBlock */
2328 error_report("ram_save_queue_pages no block '%s'", rbname);
2329 goto err;
2330 }
2331 rs->last_req_rb = ramblock;
2332 }
2333 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2334 if (start+len > ramblock->used_length) {
2335 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2336 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2337 __func__, start, len, ramblock->used_length);
2338 goto err;
2339 }
2340
2341 struct RAMSrcPageRequest *new_entry =
2342 g_malloc0(sizeof(struct RAMSrcPageRequest));
2343 new_entry->rb = ramblock;
2344 new_entry->offset = start;
2345 new_entry->len = len;
2346
2347 memory_region_ref(ramblock->mr);
2348 qemu_mutex_lock(&rs->src_page_req_mutex);
2349 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2350 migration_make_urgent_request();
2351 qemu_mutex_unlock(&rs->src_page_req_mutex);
2352 rcu_read_unlock();
2353
2354 return 0;
2355
2356 err:
2357 rcu_read_unlock();
2358 return -1;
2359 }
2360
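/*
 * Editor's note (not part of QEMU's ram.c): ram_save_queue_pages() is the
 * entry point used when the postcopy destination asks the source for a
 * specific range. A hedged usage sketch; the caller and the block name
 * "pc.ram" are hypothetical, only the queueing call itself is real:
 */
#if 0
static void example_service_page_request(void)
{
    /* destination faulted on 3 target pages starting at 0x200000 */
    if (ram_save_queue_pages("pc.ram", 0x200000, 3 * TARGET_PAGE_SIZE) < 0) {
        error_report("failed to queue urgent page request");
    }
}
#endif
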
2361 static bool save_page_use_compression(RAMState *rs)
2362 {
2363 if (!migrate_use_compression()) {
2364 return false;
2365 }
2366
2367 /*
2368 * If xbzrle is on, stop using the data compression after first
2369 * round of migration even if compression is enabled. In theory,
2370 * xbzrle can do better than compression.
2371 */
2372 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2373 return true;
2374 }
2375
2376 return false;
2377 }
2378
2379 /*
2380 * try to compress the page before posting it out, return true if the page
2381 * has been properly handled by compression, otherwise needs other
2382 * paths to handle it
2383 */
2384 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2385 {
2386 if (!save_page_use_compression(rs)) {
2387 return false;
2388 }
2389
2390 /*
2391 * When starting the process of a new block, the first page of
2392 * the block should be sent out before other pages in the same
2393 * block, and all the pages in the last block should have been sent
2394 * out. Keeping this order is important, because the 'cont' flag
2395 * is used to avoid resending the block name.
2396 *
2397 * We post the first page as a normal page because compression
2398 * takes a lot of CPU.
2399 */
2400 if (block != rs->last_sent_block) {
2401 flush_compressed_data(rs);
2402 return false;
2403 }
2404
2405 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2406 return true;
2407 }
2408
2409 compression_counters.busy++;
2410 return false;
2411 }
2412
2413 /**
2414 * ram_save_target_page: save one target page
2415 *
2416 * Returns the number of pages written
2417 *
2418 * @rs: current RAM state
2419 * @pss: data about the page we want to send
2420 * @last_stage: if we are at the completion stage
2421 */
2422 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2423 bool last_stage)
2424 {
2425 RAMBlock *block = pss->block;
2426 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2427 int res;
2428
2429 if (control_save_page(rs, block, offset, &res)) {
2430 return res;
2431 }
2432
2433 if (save_compress_page(rs, block, offset)) {
2434 return 1;
2435 }
2436
2437 res = save_zero_page(rs, block, offset);
2438 if (res > 0) {
2439 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2440 * page would be stale
2441 */
2442 if (!save_page_use_compression(rs)) {
2443 XBZRLE_cache_lock();
2444 xbzrle_cache_zero_page(rs, block->offset + offset);
2445 XBZRLE_cache_unlock();
2446 }
2447 ram_release_pages(block->idstr, offset, res);
2448 return res;
2449 }
2450
2451 /*
2452 * do not use multifd for compression as the first page in the new
2453 * block should be posted out before sending the compressed page
2454 */
2455 if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2456 return ram_save_multifd_page(rs, block, offset);
2457 }
2458
2459 return ram_save_page(rs, pss, last_stage);
2460 }
2461
2462 /**
2463 * ram_save_host_page: save a whole host page
2464 *
2465 * Starting at the page indicated by @pss, send pages up to the end of
2466 * the current host page. It's valid for the starting page to be in the
2467 * middle of a host page, in which case the remainder of the host page is sent.
2468 * Only dirty target pages are sent. Note that the host page size may
2469 * be a huge page for this block.
2470 * The saving stops at the boundary of the used_length of the block
2471 * if the RAMBlock isn't a multiple of the host page size.
2472 *
2473 * Returns the number of pages written or negative on error
2474 *
2475 * @rs: current RAM state
2477 * @pss: data about the page we want to send
2478 * @last_stage: if we are at the completion stage
2479 */
2480 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2481 bool last_stage)
2482 {
2483 int tmppages, pages = 0;
2484 size_t pagesize_bits =
2485 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2486
2487 if (ramblock_is_ignored(pss->block)) {
2488 error_report("block %s should not be migrated !", pss->block->idstr);
2489 return 0;
2490 }
2491
2492 do {
2493 /* Check if the page is dirty and, if it is, send it */
2494 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2495 pss->page++;
2496 continue;
2497 }
2498
2499 tmppages = ram_save_target_page(rs, pss, last_stage);
2500 if (tmppages < 0) {
2501 return tmppages;
2502 }
2503
2504 pages += tmppages;
2505 if (pss->block->unsentmap) {
2506 clear_bit(pss->page, pss->block->unsentmap);
2507 }
2508
2509 pss->page++;
2510 } while ((pss->page & (pagesize_bits - 1)) &&
2511 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2512
2513 /* The offset we leave with is the last one we looked at */
2514 pss->page--;
2515 return pages;
2516 }
2517
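/*
 * Editor's illustration (not part of QEMU's ram.c): the loop above relies
 * on pagesize_bits being a power of two, so "pss->page & (pagesize_bits - 1)"
 * is zero exactly at a host-page boundary. A minimal sketch, assuming 4 KiB
 * target pages inside a 2 MiB hugetlbfs host page; the function name is
 * hypothetical:
 */
#if 0
static void example_host_page_walk(void)
{
    const unsigned long pagesize_bits = (2 * 1024 * 1024) >> 12;  /* 512 */
    unsigned long page = 5;          /* start in the middle of a host page */

    do {
        /* ...send target page 'page' if it is dirty... */
        page++;
    } while (page & (pagesize_bits - 1));

    g_assert(page == 512);           /* stopped at the host-page boundary  */
}
#endif
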
2518 /**
2519 * ram_find_and_save_block: finds a dirty page and sends it to f
2520 *
2521 * Called within an RCU critical section.
2522 *
2523 * Returns the number of pages written where zero means no dirty pages,
2524 * or negative on error
2525 *
2526 * @rs: current RAM state
2527 * @last_stage: if we are at the completion stage
2528 *
2529 * On systems where host-page-size > target-page-size it will send all the
2530 * pages in a host page that are dirty.
2531 */
2532
2533 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2534 {
2535 PageSearchStatus pss;
2536 int pages = 0;
2537 bool again, found;
2538
2539 /* No dirty page as there is zero RAM */
2540 if (!ram_bytes_total()) {
2541 return pages;
2542 }
2543
2544 pss.block = rs->last_seen_block;
2545 pss.page = rs->last_page;
2546 pss.complete_round = false;
2547
2548 if (!pss.block) {
2549 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2550 }
2551
2552 do {
2553 again = true;
2554 found = get_queued_page(rs, &pss);
2555
2556 if (!found) {
2557 /* priority queue empty, so just search for something dirty */
2558 found = find_dirty_block(rs, &pss, &again);
2559 }
2560
2561 if (found) {
2562 pages = ram_save_host_page(rs, &pss, last_stage);
2563 }
2564 } while (!pages && again);
2565
2566 rs->last_seen_block = pss.block;
2567 rs->last_page = pss.page;
2568
2569 return pages;
2570 }
2571
2572 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2573 {
2574 uint64_t pages = size / TARGET_PAGE_SIZE;
2575
2576 if (zero) {
2577 ram_counters.duplicate += pages;
2578 } else {
2579 ram_counters.normal += pages;
2580 ram_counters.transferred += size;
2581 qemu_update_position(f, size);
2582 }
2583 }
2584
2585 static uint64_t ram_bytes_total_common(bool count_ignored)
2586 {
2587 RAMBlock *block;
2588 uint64_t total = 0;
2589
2590 rcu_read_lock();
2591 if (count_ignored) {
2592 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2593 total += block->used_length;
2594 }
2595 } else {
2596 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2597 total += block->used_length;
2598 }
2599 }
2600 rcu_read_unlock();
2601 return total;
2602 }
2603
2604 uint64_t ram_bytes_total(void)
2605 {
2606 return ram_bytes_total_common(false);
2607 }
2608
2609 static void xbzrle_load_setup(void)
2610 {
2611 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2612 }
2613
2614 static void xbzrle_load_cleanup(void)
2615 {
2616 g_free(XBZRLE.decoded_buf);
2617 XBZRLE.decoded_buf = NULL;
2618 }
2619
2620 static void ram_state_cleanup(RAMState **rsp)
2621 {
2622 if (*rsp) {
2623 migration_page_queue_free(*rsp);
2624 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2625 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2626 g_free(*rsp);
2627 *rsp = NULL;
2628 }
2629 }
2630
2631 static void xbzrle_cleanup(void)
2632 {
2633 XBZRLE_cache_lock();
2634 if (XBZRLE.cache) {
2635 cache_fini(XBZRLE.cache);
2636 g_free(XBZRLE.encoded_buf);
2637 g_free(XBZRLE.current_buf);
2638 g_free(XBZRLE.zero_target_page);
2639 XBZRLE.cache = NULL;
2640 XBZRLE.encoded_buf = NULL;
2641 XBZRLE.current_buf = NULL;
2642 XBZRLE.zero_target_page = NULL;
2643 }
2644 XBZRLE_cache_unlock();
2645 }
2646
2647 static void ram_save_cleanup(void *opaque)
2648 {
2649 RAMState **rsp = opaque;
2650 RAMBlock *block;
2651
2652 /* The caller must hold the iothread lock or be in a bh, so there is
2653 * no write race against this migration_bitmap
2654 */
2655 memory_global_dirty_log_stop();
2656
2657 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2658 g_free(block->bmap);
2659 block->bmap = NULL;
2660 g_free(block->unsentmap);
2661 block->unsentmap = NULL;
2662 }
2663
2664 xbzrle_cleanup();
2665 compress_threads_save_cleanup();
2666 ram_state_cleanup(rsp);
2667 }
2668
2669 static void ram_state_reset(RAMState *rs)
2670 {
2671 rs->last_seen_block = NULL;
2672 rs->last_sent_block = NULL;
2673 rs->last_page = 0;
2674 rs->last_version = ram_list.version;
2675 rs->ram_bulk_stage = true;
2676 rs->fpo_enabled = false;
2677 }
2678
2679 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2680
2681 /*
2682 * 'expected' is the value you expect the bitmap mostly to be full
2683 * of; it won't bother printing lines that are all this value.
2685 */
2686 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2687 unsigned long pages)
2688 {
2689 int64_t cur;
2690 int64_t linelen = 128;
2691 char linebuf[129];
2692
2693 for (cur = 0; cur < pages; cur += linelen) {
2694 int64_t curb;
2695 bool found = false;
2696 /*
2697 * Last line; catch the case where the line length
2698 * is longer than remaining ram
2699 */
2700 if (cur + linelen > pages) {
2701 linelen = pages - cur;
2702 }
2703 for (curb = 0; curb < linelen; curb++) {
2704 bool thisbit = test_bit(cur + curb, todump);
2705 linebuf[curb] = thisbit ? '1' : '.';
2706 found = found || (thisbit != expected);
2707 }
2708 if (found) {
2709 linebuf[curb] = '\0';
2710 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2711 }
2712 }
2713 }
2714
2715 /* **** functions for postcopy ***** */
2716
2717 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2718 {
2719 struct RAMBlock *block;
2720
2721 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2722 unsigned long *bitmap = block->bmap;
2723 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2724 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2725
2726 while (run_start < range) {
2727 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2728 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2729 (run_end - run_start) << TARGET_PAGE_BITS);
2730 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2731 }
2732 }
2733 }
2734
2735 /**
2736 * postcopy_send_discard_bm_ram: discard a RAMBlock
2737 *
2738 * Returns zero on success
2739 *
2740 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2741 * Note: At this point the 'unsentmap' is the processed bitmap combined
2742 * with the dirtymap; so a '1' means it's either dirty or unsent.
2743 *
2744 * @ms: current migration state
2745 * @pds: state for postcopy
2746 * @block: RAMBlock to send the discards for
2748 */
2749 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2750 PostcopyDiscardState *pds,
2751 RAMBlock *block)
2752 {
2753 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2754 unsigned long current;
2755 unsigned long *unsentmap = block->unsentmap;
2756
2757 for (current = 0; current < end; ) {
2758 unsigned long one = find_next_bit(unsentmap, end, current);
2759
2760 if (one <= end) {
2761 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2762 unsigned long discard_length;
2763
2764 if (zero >= end) {
2765 discard_length = end - one;
2766 } else {
2767 discard_length = zero - one;
2768 }
2769 if (discard_length) {
2770 postcopy_discard_send_range(ms, pds, one, discard_length);
2771 }
2772 current = one + discard_length;
2773 } else {
2774 current = one;
2775 }
2776 }
2777
2778 return 0;
2779 }
2780
2781 /**
2782 * postcopy_each_ram_send_discard: discard all RAMBlocks
2783 *
2784 * Returns 0 for success or negative for error
2785 *
2786 * Utility for the outgoing postcopy code.
2787 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2788 * passing it bitmap indexes and name.
2789 * (qemu_ram_foreach_block ends up passing unscaled lengths
2790 * which would mean postcopy code would have to deal with target page)
2791 *
2792 * @ms: current migration state
2793 */
2794 static int postcopy_each_ram_send_discard(MigrationState *ms)
2795 {
2796 struct RAMBlock *block;
2797 int ret;
2798
2799 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2800 PostcopyDiscardState *pds =
2801 postcopy_discard_send_init(ms, block->idstr);
2802
2803 /*
2804 * Postcopy sends chunks of bitmap over the wire, but it
2805 * just needs indexes at this point, avoids it having
2806 * target page specific code.
2807 */
2808 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2809 postcopy_discard_send_finish(ms, pds);
2810 if (ret) {
2811 return ret;
2812 }
2813 }
2814
2815 return 0;
2816 }
2817
2818 /**
2819 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2820 *
2821 * Helper for postcopy_chunk_hostpages; it's called twice to
2822 * canonicalize the two bitmaps, which are similar, but one is
2823 * inverted.
2824 *
2825 * Postcopy requires that all target pages in a hostpage are dirty or
2826 * clean, not a mix. This function canonicalizes the bitmaps.
2827 *
2828 * @ms: current migration state
2829 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2830 * otherwise we need to canonicalize partially dirty host pages
2831 * @block: block that contains the page we want to canonicalize
2832 * @pds: state for postcopy
2833 */
2834 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2835 RAMBlock *block,
2836 PostcopyDiscardState *pds)
2837 {
2838 RAMState *rs = ram_state;
2839 unsigned long *bitmap = block->bmap;
2840 unsigned long *unsentmap = block->unsentmap;
2841 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2842 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2843 unsigned long run_start;
2844
2845 if (block->page_size == TARGET_PAGE_SIZE) {
2846 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2847 return;
2848 }
2849
2850 if (unsent_pass) {
2851 /* Find a sent page */
2852 run_start = find_next_zero_bit(unsentmap, pages, 0);
2853 } else {
2854 /* Find a dirty page */
2855 run_start = find_next_bit(bitmap, pages, 0);
2856 }
2857
2858 while (run_start < pages) {
2859 bool do_fixup = false;
2860 unsigned long fixup_start_addr;
2861 unsigned long host_offset;
2862
2863 /*
2864 * If the start of this run of pages is in the middle of a host
2865 * page, then we need to fixup this host page.
2866 */
2867 host_offset = run_start % host_ratio;
2868 if (host_offset) {
2869 do_fixup = true;
2870 run_start -= host_offset;
2871 fixup_start_addr = run_start;
2872 /* For the next pass */
2873 run_start = run_start + host_ratio;
2874 } else {
2875 /* Find the end of this run */
2876 unsigned long run_end;
2877 if (unsent_pass) {
2878 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2879 } else {
2880 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2881 }
2882 /*
2883 * If the end isn't at the start of a host page, then the
2884 * run doesn't finish at the end of a host page
2885 * and we need to discard.
2886 */
2887 host_offset = run_end % host_ratio;
2888 if (host_offset) {
2889 do_fixup = true;
2890 fixup_start_addr = run_end - host_offset;
2891 /*
2892 * This host page has gone, the next loop iteration starts
2893 * from after the fixup
2894 */
2895 run_start = fixup_start_addr + host_ratio;
2896 } else {
2897 /*
2898 * No discards on this iteration, next loop starts from
2899 * next sent/dirty page
2900 */
2901 run_start = run_end + 1;
2902 }
2903 }
2904
2905 if (do_fixup) {
2906 unsigned long page;
2907
2908 /* Tell the destination to discard this page */
2909 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2910 /* For the unsent_pass we:
2911 * discard partially sent pages
2912 * For the !unsent_pass (dirty) we:
2913 * discard partially dirty pages that were sent
2914 * (any partially sent pages were already discarded
2915 * by the previous unsent_pass)
2916 */
2917 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2918 host_ratio);
2919 }
2920
2921 /* Clean up the bitmap */
2922 for (page = fixup_start_addr;
2923 page < fixup_start_addr + host_ratio; page++) {
2924 /* All pages in this host page are now not sent */
2925 set_bit(page, unsentmap);
2926
2927 /*
2928 * Remark them as dirty, updating the count for any pages
2929 * that weren't previously dirty.
2930 */
2931 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2932 }
2933 }
2934
2935 if (unsent_pass) {
2936 /* Find the next sent page for the next iteration */
2937 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2938 } else {
2939 /* Find the next dirty page for the next iteration */
2940 run_start = find_next_bit(bitmap, pages, run_start);
2941 }
2942 }
2943 }
2944
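/*
 * Editor's illustration (not part of QEMU's ram.c): the fixups above are
 * plain round-down-to-host-page arithmetic. A minimal sketch, assuming
 * host_ratio == 512 (2 MiB host pages, 4 KiB target pages); the function
 * name is hypothetical:
 */
#if 0
static void example_fixup_alignment(void)
{
    const unsigned long host_ratio = 512;
    unsigned long run_start = 1000;                    /* mid host page    */
    unsigned long host_offset = run_start % host_ratio;

    if (host_offset) {
        unsigned long fixup_start_addr = run_start - host_offset;

        /* the whole host page [512, 1024) is discarded and re-marked dirty */
        g_assert(fixup_start_addr == 512);
        g_assert(fixup_start_addr + host_ratio == 1024);
    }
}
#endif
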
2945 /**
2946 * postcopy_chunk_hostpages: discard any partially sent host page
2947 *
2948 * Utility for the outgoing postcopy code.
2949 *
2950 * Discard any partially sent host-page size chunks, mark any partially
2951 * dirty host-page size chunks as all dirty. In this case the host-page
2952 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2953 *
2954 * Returns zero on success
2955 *
2956 * @ms: current migration state
2957 * @block: block we want to work with
2958 */
2959 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2960 {
2961 PostcopyDiscardState *pds =
2962 postcopy_discard_send_init(ms, block->idstr);
2963
2964 /* First pass: Discard all partially sent host pages */
2965 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2966 /*
2967 * Second pass: Ensure that all partially dirty host pages are made
2968 * fully dirty.
2969 */
2970 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2971
2972 postcopy_discard_send_finish(ms, pds);
2973 return 0;
2974 }
2975
2976 /**
2977 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2978 *
2979 * Returns zero on success
2980 *
2981 * Transmit the set of pages to be discarded after precopy to the target;
2982 * these are pages that:
2983 * a) have been previously transmitted but are now dirty again
2984 * b) have never been transmitted; this ensures that any pages on the
2985 * destination that have been mapped by background tasks get
2986 * discarded (transparent huge pages are the specific concern)
2987 * Hopefully this is pretty sparse
2988 *
2989 * @ms: current migration state
2990 */
2991 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2992 {
2993 RAMState *rs = ram_state;
2994 RAMBlock *block;
2995 int ret;
2996
2997 rcu_read_lock();
2998
2999 /* This should be our last sync, the src is now paused */
3000 migration_bitmap_sync(rs);
3001
3002 /* Easiest way to make sure we don't resume in the middle of a host-page */
3003 rs->last_seen_block = NULL;
3004 rs->last_sent_block = NULL;
3005 rs->last_page = 0;
3006
3007 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3008 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3009 unsigned long *bitmap = block->bmap;
3010 unsigned long *unsentmap = block->unsentmap;
3011
3012 if (!unsentmap) {
3013 /* We don't have a safe way to resize the unsentmap, so
3014 * if the bitmap was resized it will be NULL at this
3015 * point.
3016 */
3017 error_report("migration ram resized during precopy phase");
3018 rcu_read_unlock();
3019 return -EINVAL;
3020 }
3021 /* Deal with TPS != HPS and huge pages */
3022 ret = postcopy_chunk_hostpages(ms, block);
3023 if (ret) {
3024 rcu_read_unlock();
3025 return ret;
3026 }
3027
3028 /*
3029 * Update the unsentmap to be unsentmap = unsentmap | dirty
3030 */
3031 bitmap_or(unsentmap, unsentmap, bitmap, pages);
3032 #ifdef DEBUG_POSTCOPY
3033 ram_debug_dump_bitmap(unsentmap, true, pages);
3034 #endif
3035 }
3036 trace_ram_postcopy_send_discard_bitmap();
3037
3038 ret = postcopy_each_ram_send_discard(ms);
3039 rcu_read_unlock();
3040
3041 return ret;
3042 }
3043
3044 /**
3045 * ram_discard_range: discard dirtied pages at the beginning of postcopy
3046 *
3047 * Returns zero on success
3048 *
3049 * @rbname: name of the RAMBlock of the request. NULL means the
3050 * same as the last one.
3051 * @start: start address within the RAMBlock
3052 * @length: length (in bytes) to discard
3053 */
3054 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3055 {
3056 int ret = -1;
3057
3058 trace_ram_discard_range(rbname, start, length);
3059
3060 rcu_read_lock();
3061 RAMBlock *rb = qemu_ram_block_by_name(rbname);
3062
3063 if (!rb) {
3064 error_report("ram_discard_range: Failed to find block '%s'", rbname);
3065 goto err;
3066 }
3067
3068 /*
3069 * On the source VM, we don't need to update the received bitmap since
3070 * we don't even have one.
3071 */
3072 if (rb->receivedmap) {
3073 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3074 length >> qemu_target_page_bits());
3075 }
3076
3077 ret = ram_block_discard_range(rb, start, length);
3078
3079 err:
3080 rcu_read_unlock();
3081
3082 return ret;
3083 }
3084
3085 /*
3086 * For every allocation, we try not to crash the VM if the
3087 * allocation fails.
3088 */
3089 static int xbzrle_init(void)
3090 {
3091 Error *local_err = NULL;
3092
3093 if (!migrate_use_xbzrle()) {
3094 return 0;
3095 }
3096
3097 XBZRLE_cache_lock();
3098
3099 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3100 if (!XBZRLE.zero_target_page) {
3101 error_report("%s: Error allocating zero page", __func__);
3102 goto err_out;
3103 }
3104
3105 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3106 TARGET_PAGE_SIZE, &local_err);
3107 if (!XBZRLE.cache) {
3108 error_report_err(local_err);
3109 goto free_zero_page;
3110 }
3111
3112 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3113 if (!XBZRLE.encoded_buf) {
3114 error_report("%s: Error allocating encoded_buf", __func__);
3115 goto free_cache;
3116 }
3117
3118 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3119 if (!XBZRLE.current_buf) {
3120 error_report("%s: Error allocating current_buf", __func__);
3121 goto free_encoded_buf;
3122 }
3123
3124 /* We are all good */
3125 XBZRLE_cache_unlock();
3126 return 0;
3127
3128 free_encoded_buf:
3129 g_free(XBZRLE.encoded_buf);
3130 XBZRLE.encoded_buf = NULL;
3131 free_cache:
3132 cache_fini(XBZRLE.cache);
3133 XBZRLE.cache = NULL;
3134 free_zero_page:
3135 g_free(XBZRLE.zero_target_page);
3136 XBZRLE.zero_target_page = NULL;
3137 err_out:
3138 XBZRLE_cache_unlock();
3139 return -ENOMEM;
3140 }
3141
3142 static int ram_state_init(RAMState **rsp)
3143 {
3144 *rsp = g_try_new0(RAMState, 1);
3145
3146 if (!*rsp) {
3147 error_report("%s: Init ramstate fail", __func__);
3148 return -1;
3149 }
3150
3151 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3152 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3153 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3154
3155 /*
3156 * Count the total number of pages used by ram blocks not including any
3157 * gaps due to alignment or unplugs.
3158 */
3159 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3160
3161 ram_state_reset(*rsp);
3162
3163 return 0;
3164 }
3165
3166 static void ram_list_init_bitmaps(void)
3167 {
3168 RAMBlock *block;
3169 unsigned long pages;
3170
3171 /* Skip setting bitmap if there is no RAM */
3172 if (ram_bytes_total()) {
3173 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3174 pages = block->max_length >> TARGET_PAGE_BITS;
3175 block->bmap = bitmap_new(pages);
3176 bitmap_set(block->bmap, 0, pages);
3177 if (migrate_postcopy_ram()) {
3178 block->unsentmap = bitmap_new(pages);
3179 bitmap_set(block->unsentmap, 0, pages);
3180 }
3181 }
3182 }
3183 }
3184
3185 static void ram_init_bitmaps(RAMState *rs)
3186 {
3187 /* For memory_global_dirty_log_start below. */
3188 qemu_mutex_lock_iothread();
3189 qemu_mutex_lock_ramlist();
3190 rcu_read_lock();
3191
3192 ram_list_init_bitmaps();
3193 memory_global_dirty_log_start();
3194 migration_bitmap_sync_precopy(rs);
3195
3196 rcu_read_unlock();
3197 qemu_mutex_unlock_ramlist();
3198 qemu_mutex_unlock_iothread();
3199 }
3200
3201 static int ram_init_all(RAMState **rsp)
3202 {
3203 if (ram_state_init(rsp)) {
3204 return -1;
3205 }
3206
3207 if (xbzrle_init()) {
3208 ram_state_cleanup(rsp);
3209 return -1;
3210 }
3211
3212 ram_init_bitmaps(*rsp);
3213
3214 return 0;
3215 }
3216
3217 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3218 {
3219 RAMBlock *block;
3220 uint64_t pages = 0;
3221
3222 /*
3223 * Postcopy is not using xbzrle/compression, so no need for that.
3224 * Also, since the source is already halted, we don't need to care
3225 * about dirty page logging either.
3226 */
3227
3228 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3229 pages += bitmap_count_one(block->bmap,
3230 block->used_length >> TARGET_PAGE_BITS);
3231 }
3232
3233 /* This may not be aligned with current bitmaps. Recalculate. */
3234 rs->migration_dirty_pages = pages;
3235
3236 rs->last_seen_block = NULL;
3237 rs->last_sent_block = NULL;
3238 rs->last_page = 0;
3239 rs->last_version = ram_list.version;
3240 /*
3241 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3242 * matter what we have sent.
3243 */
3244 rs->ram_bulk_stage = false;
3245
3246 /* Update RAMState cache of output QEMUFile */
3247 rs->f = out;
3248
3249 trace_ram_state_resume_prepare(pages);
3250 }
3251
3252 /*
3253 * This function clears bits of the free pages reported by the caller from the
3254 * migration dirty bitmap. @addr is the host address corresponding to the
3255 * start of the contiguous guest free pages, and @len is the total size in
3256 * bytes of those pages.
3257 */
3258 void qemu_guest_free_page_hint(void *addr, size_t len)
3259 {
3260 RAMBlock *block;
3261 ram_addr_t offset;
3262 size_t used_len, start, npages;
3263 MigrationState *s = migrate_get_current();
3264
3265 /* This function is currently expected to be used during live migration */
3266 if (!migration_is_setup_or_active(s->state)) {
3267 return;
3268 }
3269
3270 for (; len > 0; len -= used_len, addr += used_len) {
3271 block = qemu_ram_block_from_host(addr, false, &offset);
3272 if (unlikely(!block || offset >= block->used_length)) {
3273 /*
3274 * The implementation might not support RAMBlock resize during
3275 * live migration, but it could happen in theory with future
3276 * updates. So we add a check here to capture that case.
3277 */
3278 error_report_once("%s unexpected error", __func__);
3279 return;
3280 }
3281
3282 if (len <= block->used_length - offset) {
3283 used_len = len;
3284 } else {
3285 used_len = block->used_length - offset;
3286 }
3287
3288 start = offset >> TARGET_PAGE_BITS;
3289 npages = used_len >> TARGET_PAGE_BITS;
3290
3291 qemu_mutex_lock(&ram_state->bitmap_mutex);
3292 ram_state->migration_dirty_pages -=
3293 bitmap_count_one_with_offset(block->bmap, start, npages);
3294 bitmap_clear(block->bmap, start, npages);
3295 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3296 }
3297 }
3298
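/*
 * Editor's illustration (not part of QEMU's ram.c): the accounting above
 * must only subtract bits that were actually set, which is why it counts
 * before clearing. A minimal sketch with a single unsigned long standing in
 * for the bitmap (hypothetical helper name, 64 pages only):
 */
#if 0
static void example_free_page_hint_accounting(void)
{
    unsigned long bmap = 0xffUL;           /* pages 0..7 dirty              */
    unsigned long dirty_pages = 8;
    unsigned long start = 4, npages = 8;   /* hint covers pages 4..11       */
    unsigned long mask = ((1UL << npages) - 1) << start;

    /* only pages 4..7 are both dirty and hinted */
    dirty_pages -= __builtin_popcountl(bmap & mask);    /* 8 - 4 == 4       */
    bmap &= ~mask;

    g_assert(dirty_pages == 4);
    g_assert(bmap == 0x0fUL);
}
#endif
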
3299 /*
3300 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3301 * long-running RCU critical section. When RCU reclaims in the code
3302 * start to become numerous it will be necessary to reduce the
3303 * granularity of these critical sections.
3304 */
3305
3306 /**
3307 * ram_save_setup: Setup RAM for migration
3308 *
3309 * Returns zero to indicate success and negative for error
3310 *
3311 * @f: QEMUFile where to send the data
3312 * @opaque: RAMState pointer
3313 */
3314 static int ram_save_setup(QEMUFile *f, void *opaque)
3315 {
3316 RAMState **rsp = opaque;
3317 RAMBlock *block;
3318
3319 if (compress_threads_save_setup()) {
3320 return -1;
3321 }
3322
3323 /* migration has already set up the bitmap, reuse it. */
3324 if (!migration_in_colo_state()) {
3325 if (ram_init_all(rsp) != 0) {
3326 compress_threads_save_cleanup();
3327 return -1;
3328 }
3329 }
3330 (*rsp)->f = f;
3331
3332 rcu_read_lock();
3333
3334 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3335
3336 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3337 qemu_put_byte(f, strlen(block->idstr));
3338 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3339 qemu_put_be64(f, block->used_length);
3340 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3341 qemu_put_be64(f, block->page_size);
3342 }
3343 if (migrate_ignore_shared()) {
3344 qemu_put_be64(f, block->mr->addr);
3345 qemu_put_byte(f, ramblock_is_ignored(block) ? 1 : 0);
3346 }
3347 }
3348
3349 rcu_read_unlock();
3350
3351 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3352 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3353
3354 multifd_send_sync_main();
3355 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3356 qemu_fflush(f);
3357
3358 return 0;
3359 }
3360
3361 /**
3362 * ram_save_iterate: iterative stage for migration
3363 *
3364 * Returns zero to indicate success and negative for error
3365 *
3366 * @f: QEMUFile where to send the data
3367 * @opaque: RAMState pointer
3368 */
3369 static int ram_save_iterate(QEMUFile *f, void *opaque)
3370 {
3371 RAMState **temp = opaque;
3372 RAMState *rs = *temp;
3373 int ret;
3374 int i;
3375 int64_t t0;
3376 int done = 0;
3377
3378 if (blk_mig_bulk_active()) {
3379 /* Avoid transferring ram during bulk phase of block migration as
3380 * the bulk phase will usually take a long time and transferring
3381 * ram updates during that time is pointless. */
3382 goto out;
3383 }
3384
3385 rcu_read_lock();
3386 if (ram_list.version != rs->last_version) {
3387 ram_state_reset(rs);
3388 }
3389
3390 /* Read version before ram_list.blocks */
3391 smp_rmb();
3392
3393 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3394
3395 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3396 i = 0;
3397 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3398 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3399 int pages;
3400
3401 if (qemu_file_get_error(f)) {
3402 break;
3403 }
3404
3405 pages = ram_find_and_save_block(rs, false);
3406 /* no more pages to send */
3407 if (pages == 0) {
3408 done = 1;
3409 break;
3410 }
3411
3412 if (pages < 0) {
3413 qemu_file_set_error(f, pages);
3414 break;
3415 }
3416
3417 rs->target_page_count += pages;
3418
3419 /* We want to check in the first loop iteration, just in case it was
3420 the first time and we had to sync the dirty bitmap.
3421 qemu_clock_get_ns() is a bit expensive, so we only check every few
3422 iterations
3423 */
3424 if ((i & 63) == 0) {
3425 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3426 if (t1 > MAX_WAIT) {
3427 trace_ram_save_iterate_big_wait(t1, i);
3428 break;
3429 }
3430 }
3431 i++;
3432 }
3433 rcu_read_unlock();
3434
3435 /*
3436 * Must occur before EOS (or any QEMUFile operation)
3437 * because of RDMA protocol.
3438 */
3439 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3440
3441 multifd_send_sync_main();
3442 out:
3443 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3444 qemu_fflush(f);
3445 ram_counters.transferred += 8;
3446
3447 ret = qemu_file_get_error(f);
3448 if (ret < 0) {
3449 return ret;
3450 }
3451
3452 return done;
3453 }
3454
3455 /**
3456 * ram_save_complete: function called to send the remaining amount of ram
3457 *
3458 * Returns zero to indicate success or negative on error
3459 *
3460 * Called with iothread lock
3461 *
3462 * @f: QEMUFile where to send the data
3463 * @opaque: RAMState pointer
3464 */
3465 static int ram_save_complete(QEMUFile *f, void *opaque)
3466 {
3467 RAMState **temp = opaque;
3468 RAMState *rs = *temp;
3469 int ret = 0;
3470
3471 rcu_read_lock();
3472
3473 if (!migration_in_postcopy()) {
3474 migration_bitmap_sync_precopy(rs);
3475 }
3476
3477 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3478
3479 /* try transferring iterative blocks of memory */
3480
3481 /* flush all remaining blocks regardless of rate limiting */
3482 while (true) {
3483 int pages;
3484
3485 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3486 /* no more blocks to send */
3487 if (pages == 0) {
3488 break;
3489 }
3490 if (pages < 0) {
3491 ret = pages;
3492 break;
3493 }
3494 }
3495
3496 flush_compressed_data(rs);
3497 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3498
3499 rcu_read_unlock();
3500
3501 multifd_send_sync_main();
3502 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3503 qemu_fflush(f);
3504
3505 return ret;
3506 }
3507
3508 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3509 uint64_t *res_precopy_only,
3510 uint64_t *res_compatible,
3511 uint64_t *res_postcopy_only)
3512 {
3513 RAMState **temp = opaque;
3514 RAMState *rs = *temp;
3515 uint64_t remaining_size;
3516
3517 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3518
3519 if (!migration_in_postcopy() &&
3520 remaining_size < max_size) {
3521 qemu_mutex_lock_iothread();
3522 rcu_read_lock();
3523 migration_bitmap_sync_precopy(rs);
3524 rcu_read_unlock();
3525 qemu_mutex_unlock_iothread();
3526 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3527 }
3528
3529 if (migrate_postcopy_ram()) {
3530 /* We can do postcopy, and all the data is postcopiable */
3531 *res_compatible += remaining_size;
3532 } else {
3533 *res_precopy_only += remaining_size;
3534 }
3535 }
3536
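/*
 * Editor's illustration (not part of QEMU's ram.c): the pending estimate
 * above is simply "dirty pages times target page size", and it is reported
 * either as precopy-only work or as postcopy-compatible work. A minimal
 * sketch with hypothetical names, assuming 4 KiB target pages:
 */
#if 0
static uint64_t example_pending_bytes(uint64_t dirty_pages, bool postcopy_ok,
                                      uint64_t *precopy, uint64_t *compatible)
{
    uint64_t remaining = dirty_pages * 4096;

    if (postcopy_ok) {
        *compatible += remaining;   /* can still be sent after the switchover */
    } else {
        *precopy += remaining;      /* must be sent before the switchover     */
    }
    return remaining;
}
#endif
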
3537 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3538 {
3539 unsigned int xh_len;
3540 int xh_flags;
3541 uint8_t *loaded_data;
3542
3543 /* extract RLE header */
3544 xh_flags = qemu_get_byte(f);
3545 xh_len = qemu_get_be16(f);
3546
3547 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3548 error_report("Failed to load XBZRLE page - wrong compression!");
3549 return -1;
3550 }
3551
3552 if (xh_len > TARGET_PAGE_SIZE) {
3553 error_report("Failed to load XBZRLE page - len overflow!");
3554 return -1;
3555 }
3556 loaded_data = XBZRLE.decoded_buf;
3557 /* load data and decode */
3558 /* it can change loaded_data to point to an internal buffer */
3559 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3560
3561 /* decode RLE */
3562 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3563 TARGET_PAGE_SIZE) == -1) {
3564 error_report("Failed to load XBZRLE page - decode error!");
3565 return -1;
3566 }
3567
3568 return 0;
3569 }
3570
3571 /**
3572 * ram_block_from_stream: read a RAMBlock id from the migration stream
3573 *
3574 * Must be called from within a rcu critical section.
3575 *
3576 * Returns a pointer from within the RCU-protected ram_list.
3577 *
3578 * @f: QEMUFile where to read the data from
3579 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3580 */
3581 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3582 {
3583 static RAMBlock *block = NULL;
3584 char id[256];
3585 uint8_t len;
3586
3587 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3588 if (!block) {
3589 error_report("Ack, bad migration stream!");
3590 return NULL;
3591 }
3592 return block;
3593 }
3594
3595 len = qemu_get_byte(f);
3596 qemu_get_buffer(f, (uint8_t *)id, len);
3597 id[len] = 0;
3598
3599 block = qemu_ram_block_by_name(id);
3600 if (!block) {
3601 error_report("Can't find block %s", id);
3602 return NULL;
3603 }
3604
3605 if (ramblock_is_ignored(block)) {
3606 error_report("block %s should not be migrated !", id);
3607 return NULL;
3608 }
3609
3610 return block;
3611 }
3612
3613 static inline void *host_from_ram_block_offset(RAMBlock *block,
3614 ram_addr_t offset)
3615 {
3616 if (!offset_in_ramblock(block, offset)) {
3617 return NULL;
3618 }
3619
3620 return block->host + offset;
3621 }
3622
3623 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3624 ram_addr_t offset)
3625 {
3626 if (!offset_in_ramblock(block, offset)) {
3627 return NULL;
3628 }
3629 if (!block->colo_cache) {
3630 error_report("%s: colo_cache is NULL in block :%s",
3631 __func__, block->idstr);
3632 return NULL;
3633 }
3634
3635 /*
3636 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3637 * It helps us decide which pages in the RAM cache should be flushed
3638 * into VM's RAM later.
3639 */
3640 if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3641 ram_state->migration_dirty_pages++;
3642 }
3643 return block->colo_cache + offset;
3644 }
3645
3646 /**
3647 * ram_handle_compressed: handle the zero page case
3648 *
3649 * If a page (or a whole RDMA chunk) has been
3650 * determined to be zero, then zap it.
3651 *
3652 * @host: host address for the zero page
3653 * @ch: what the page is filled from. We only support zero
3654 * @size: size of the zero page
3655 */
3656 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3657 {
3658 if (ch != 0 || !is_zero_range(host, size)) {
3659 memset(host, ch, size);
3660 }
3661 }
3662
3663 /* return the size after decompression, or negative value on error */
3664 static int
3665 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3666 const uint8_t *source, size_t source_len)
3667 {
3668 int err;
3669
3670 err = inflateReset(stream);
3671 if (err != Z_OK) {
3672 return -1;
3673 }
3674
3675 stream->avail_in = source_len;
3676 stream->next_in = (uint8_t *)source;
3677 stream->avail_out = dest_len;
3678 stream->next_out = dest;
3679
3680 err = inflate(stream, Z_NO_FLUSH);
3681 if (err != Z_STREAM_END) {
3682 return -1;
3683 }
3684
3685 return stream->total_out;
3686 }
3687
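/*
 * Editor's illustration (not part of QEMU's ram.c): the helper above reuses
 * one z_stream per thread and only calls inflateReset() between pages,
 * instead of paying for inflateInit()/inflateEnd() every time. A hedged,
 * standalone sketch of that zlib pattern for two compressed buffers; the
 * function name is hypothetical:
 */
#if 0
static int example_reuse_inflate_stream(const uint8_t *in1, size_t len1,
                                        const uint8_t *in2, size_t len2,
                                        uint8_t *out, size_t out_len)
{
    z_stream stream = { 0 };
    const uint8_t *inputs[2] = { in1, in2 };
    size_t lens[2] = { len1, len2 };
    int ret = -1;

    if (inflateInit(&stream) != Z_OK) {       /* expensive, done once       */
        return -1;
    }

    for (int i = 0; i < 2; i++) {
        if (inflateReset(&stream) != Z_OK) {  /* cheap per-buffer reset     */
            goto done;
        }
        stream.next_in = (uint8_t *)inputs[i];
        stream.avail_in = lens[i];
        stream.next_out = out;
        stream.avail_out = out_len;
        if (inflate(&stream, Z_NO_FLUSH) != Z_STREAM_END) {
            goto done;                        /* truncated or corrupt data  */
        }
    }
    ret = 0;
done:
    inflateEnd(&stream);
    return ret;
}
#endif
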
3688 static void *do_data_decompress(void *opaque)
3689 {
3690 DecompressParam *param = opaque;
3691 unsigned long pagesize;
3692 uint8_t *des;
3693 int len, ret;
3694
3695 qemu_mutex_lock(&param->mutex);
3696 while (!param->quit) {
3697 if (param->des) {
3698 des = param->des;
3699 len = param->len;
3700 param->des = 0;
3701 qemu_mutex_unlock(&param->mutex);
3702
3703 pagesize = TARGET_PAGE_SIZE;
3704
3705 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3706 param->compbuf, len);
3707 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3708 error_report("decompress data failed");
3709 qemu_file_set_error(decomp_file, ret);
3710 }
3711
3712 qemu_mutex_lock(&decomp_done_lock);
3713 param->done = true;
3714 qemu_cond_signal(&decomp_done_cond);
3715 qemu_mutex_unlock(&decomp_done_lock);
3716
3717 qemu_mutex_lock(&param->mutex);
3718 } else {
3719 qemu_cond_wait(&param->cond, &param->mutex);
3720 }
3721 }
3722 qemu_mutex_unlock(&param->mutex);
3723
3724 return NULL;
3725 }
3726
3727 static int wait_for_decompress_done(void)
3728 {
3729 int idx, thread_count;
3730
3731 if (!migrate_use_compression()) {
3732 return 0;
3733 }
3734
3735 thread_count = migrate_decompress_threads();
3736 qemu_mutex_lock(&decomp_done_lock);
3737 for (idx = 0; idx < thread_count; idx++) {
3738 while (!decomp_param[idx].done) {
3739 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3740 }
3741 }
3742 qemu_mutex_unlock(&decomp_done_lock);
3743 return qemu_file_get_error(decomp_file);
3744 }
3745
3746 static void compress_threads_load_cleanup(void)
3747 {
3748 int i, thread_count;
3749
3750 if (!migrate_use_compression()) {
3751 return;
3752 }
3753 thread_count = migrate_decompress_threads();
3754 for (i = 0; i < thread_count; i++) {
3755 /*
3756 * we use it as an indicator of whether the thread was
3757 * properly initialized
3758 */
3759 if (!decomp_param[i].compbuf) {
3760 break;
3761 }
3762
3763 qemu_mutex_lock(&decomp_param[i].mutex);
3764 decomp_param[i].quit = true;
3765 qemu_cond_signal(&decomp_param[i].cond);
3766 qemu_mutex_unlock(&decomp_param[i].mutex);
3767 }
3768 for (i = 0; i < thread_count; i++) {
3769 if (!decomp_param[i].compbuf) {
3770 break;
3771 }
3772
3773 qemu_thread_join(decompress_threads + i);
3774 qemu_mutex_destroy(&decomp_param[i].mutex);
3775 qemu_cond_destroy(&decomp_param[i].cond);
3776 inflateEnd(&decomp_param[i].stream);
3777 g_free(decomp_param[i].compbuf);
3778 decomp_param[i].compbuf = NULL;
3779 }
3780 g_free(decompress_threads);
3781 g_free(decomp_param);
3782 decompress_threads = NULL;
3783 decomp_param = NULL;
3784 decomp_file = NULL;
3785 }
3786
3787 static int compress_threads_load_setup(QEMUFile *f)
3788 {
3789 int i, thread_count;
3790
3791 if (!migrate_use_compression()) {
3792 return 0;
3793 }
3794
3795 thread_count = migrate_decompress_threads();
3796 decompress_threads = g_new0(QemuThread, thread_count);
3797 decomp_param = g_new0(DecompressParam, thread_count);
3798 qemu_mutex_init(&decomp_done_lock);
3799 qemu_cond_init(&decomp_done_cond);
3800 decomp_file = f;
3801 for (i = 0; i < thread_count; i++) {
3802 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3803 goto exit;
3804 }
3805
3806 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3807 qemu_mutex_init(&decomp_param[i].mutex);
3808 qemu_cond_init(&decomp_param[i].cond);
3809 decomp_param[i].done = true;
3810 decomp_param[i].quit = false;
3811 qemu_thread_create(decompress_threads + i, "decompress",
3812 do_data_decompress, decomp_param + i,
3813 QEMU_THREAD_JOINABLE);
3814 }
3815 return 0;
3816 exit:
3817 compress_threads_load_cleanup();
3818 return -1;
3819 }
3820
3821 static void decompress_data_with_multi_threads(QEMUFile *f,
3822 void *host, int len)
3823 {
3824 int idx, thread_count;
3825
3826 thread_count = migrate_decompress_threads();
3827 qemu_mutex_lock(&decomp_done_lock);
3828 while (true) {
3829 for (idx = 0; idx < thread_count; idx++) {
3830 if (decomp_param[idx].done) {
3831 decomp_param[idx].done = false;
3832 qemu_mutex_lock(&decomp_param[idx].mutex);
3833 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3834 decomp_param[idx].des = host;
3835 decomp_param[idx].len = len;
3836 qemu_cond_signal(&decomp_param[idx].cond);
3837 qemu_mutex_unlock(&decomp_param[idx].mutex);
3838 break;
3839 }
3840 }
3841 if (idx < thread_count) {
3842 break;
3843 } else {
3844 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3845 }
3846 }
3847 qemu_mutex_unlock(&decomp_done_lock);
3848 }
3849
3850 /*
3851 * colo cache: this is for the secondary VM. We cache the whole
3852 * memory of the secondary VM; the global lock must be held
3853 * to call this helper.
3854 */
3855 int colo_init_ram_cache(void)
3856 {
3857 RAMBlock *block;
3858
3859 rcu_read_lock();
3860 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3861 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3862 NULL,
3863 false);
3864 if (!block->colo_cache) {
3865 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3866 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3867 block->used_length);
3868 goto out_locked;
3869 }
3870 memcpy(block->colo_cache, block->host, block->used_length);
3871 }
3872 rcu_read_unlock();
3873 /*
3874 * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3875 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3876 * we use the same name 'ram_bitmap' as for migration.
3877 */
3878 if (ram_bytes_total()) {
3879 RAMBlock *block;
3880
3881 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3882 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3883
3884 block->bmap = bitmap_new(pages);
3885 bitmap_set(block->bmap, 0, pages);
3886 }
3887 }
3888 ram_state = g_new0(RAMState, 1);
3889 ram_state->migration_dirty_pages = 0;
3890 memory_global_dirty_log_start();
3891
3892 return 0;
3893
3894 out_locked:
3895
3896 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3897 if (block->colo_cache) {
3898 qemu_anon_ram_free(block->colo_cache, block->used_length);
3899 block->colo_cache = NULL;
3900 }
3901 }
3902
3903 rcu_read_unlock();
3904 return -errno;
3905 }
3906
3907 /* The global lock must be held to call this helper */
3908 void colo_release_ram_cache(void)
3909 {
3910 RAMBlock *block;
3911
3912 memory_global_dirty_log_stop();
3913 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3914 g_free(block->bmap);
3915 block->bmap = NULL;
3916 }
3917
3918 rcu_read_lock();
3919
3920 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3921 if (block->colo_cache) {
3922 qemu_anon_ram_free(block->colo_cache, block->used_length);
3923 block->colo_cache = NULL;
3924 }
3925 }
3926
3927 rcu_read_unlock();
3928 g_free(ram_state);
3929 ram_state = NULL;
3930 }
3931
3932 /**
3933 * ram_load_setup: Setup RAM for migration incoming side
3934 *
3935 * Returns zero to indicate success and negative for error
3936 *
3937 * @f: QEMUFile where to receive the data
3938 * @opaque: RAMState pointer
3939 */
3940 static int ram_load_setup(QEMUFile *f, void *opaque)
3941 {
3942 if (compress_threads_load_setup(f)) {
3943 return -1;
3944 }
3945
3946 xbzrle_load_setup();
3947 ramblock_recv_map_init();
3948
3949 return 0;
3950 }
3951
3952 static int ram_load_cleanup(void *opaque)
3953 {
3954 RAMBlock *rb;
3955
3956 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3957 if (ramblock_is_pmem(rb)) {
3958 pmem_persist(rb->host, rb->used_length);
3959 }
3960 }
3961
3962 xbzrle_load_cleanup();
3963 compress_threads_load_cleanup();
3964
3965 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3966 g_free(rb->receivedmap);
3967 rb->receivedmap = NULL;
3968 }
3969
3970 return 0;
3971 }
3972
3973 /**
3974 * ram_postcopy_incoming_init: allocate postcopy data structures
3975 *
3976 * Returns 0 for success and negative if there was an error
3977 *
3978 * @mis: current migration incoming state
3979 *
3980 * Allocate data structures etc. needed by incoming migration with
3981 * postcopy-ram. postcopy-ram's similarly named
3982 * postcopy_ram_incoming_init does the work.
3983 */
3984 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3985 {
3986 return postcopy_ram_incoming_init(mis);
3987 }
3988
3989 /**
3990 * ram_load_postcopy: load a page in postcopy case
3991 *
3992 * Returns 0 for success or -errno in case of error
3993 *
3994 * Called in postcopy mode by ram_load().
3995 * rcu_read_lock is taken prior to this being called.
3996 *
3997 * @f: QEMUFile where to receive the data
3998 */
3999 static int ram_load_postcopy(QEMUFile *f)
4000 {
4001 int flags = 0, ret = 0;
4002 bool place_needed = false;
4003 bool matches_target_page_size = false;
4004 MigrationIncomingState *mis = migration_incoming_get_current();
4005 /* Temporary page that is later 'placed' */
4006 void *postcopy_host_page = postcopy_get_tmp_page(mis);
4007 void *last_host = NULL;
4008 bool all_zero = false;
4009
4010 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4011 ram_addr_t addr;
4012 void *host = NULL;
4013 void *page_buffer = NULL;
4014 void *place_source = NULL;
4015 RAMBlock *block = NULL;
4016 uint8_t ch;
4017
4018 addr = qemu_get_be64(f);
4019
4020 /*
4021 * If the qemu file is in an error state we should stop here;
4022 * "addr" may be invalid in that case
4023 */
4024 ret = qemu_file_get_error(f);
4025 if (ret) {
4026 break;
4027 }
4028
4029 flags = addr & ~TARGET_PAGE_MASK;
4030 addr &= TARGET_PAGE_MASK;
4031
4032 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4033 place_needed = false;
4034 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4035 block = ram_block_from_stream(f, flags);
4036
4037 host = host_from_ram_block_offset(block, addr);
4038 if (!host) {
4039 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4040 ret = -EINVAL;
4041 break;
4042 }
4043 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4044 /*
4045 * Postcopy requires that we place whole host pages atomically;
4046 * these may be huge pages for RAMBlocks that are backed by
4047 * hugetlbfs.
4048 * To make it atomic, the data is read into a temporary page
4049 * that's moved into place later.
4050 * The migration protocol uses, possibly smaller, target pages;
4051 * however, the source ensures it always sends all the components
4052 * of a host page in order.
4053 */
4054 page_buffer = postcopy_host_page +
4055 ((uintptr_t)host & (block->page_size - 1));
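/*
 * Illustrative numbers (hypothetical): with a 2 MiB hugetlbfs host page
 * and 4 KiB target pages, a host address whose low bits are 0x3000 gives
 * page_buffer = postcopy_host_page + 0x3000.  place_needed becomes true
 * for the last 4 KiB chunk (low bits 0x1ff000), and place_dest below is
 * then host + 0x1000 - 0x200000, i.e. the start of the 2 MiB host page.
 */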
4056 /* If all TPs are zero then we can optimise the placement */
4057 if (!((uintptr_t)host & (block->page_size - 1))) {
4058 all_zero = true;
4059 } else {
4060 /* not the 1st TP within the HP */
4061 if (host != (last_host + TARGET_PAGE_SIZE)) {
4062 error_report("Non-sequential target page %p/%p",
4063 host, last_host);
4064 ret = -EINVAL;
4065 break;
4066 }
4067 }
4068
4069
4070 /*
4071 * If it's the last part of a host page then we place the host
4072 * page
4073 */
4074 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4075 (block->page_size - 1)) == 0;
4076 place_source = postcopy_host_page;
4077 }
4078 last_host = host;
4079
4080 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4081 case RAM_SAVE_FLAG_ZERO:
4082 ch = qemu_get_byte(f);
4083 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4084 if (ch) {
4085 all_zero = false;
4086 }
4087 break;
4088
4089 case RAM_SAVE_FLAG_PAGE:
4090 all_zero = false;
4091 if (!matches_target_page_size) {
4092 /* For huge pages, we always use the temporary buffer */
4093 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4094 } else {
4095 /*
4096 * For small pages that match the target page size, we
4097 * avoid the qemu_file copy. Instead we directly use
4098 * the buffer of QEMUFile to place the page. Note: we
4099 * cannot do any QEMUFile operation before using that
4100 * buffer to make sure the buffer is valid when
4101 * placing the page.
4102 */
4103 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4104 TARGET_PAGE_SIZE);
4105 }
4106 break;
4107 case RAM_SAVE_FLAG_EOS:
4108 /* normal exit */
4109 multifd_recv_sync_main();
4110 break;
4111 default:
4112 error_report("Unknown combination of migration flags: %#x"
4113 " (postcopy mode)", flags);
4114 ret = -EINVAL;
4115 break;
4116 }
4117
4118 /* Detect any possible file errors */
4119 if (!ret && qemu_file_get_error(f)) {
4120 ret = qemu_file_get_error(f);
4121 }
4122
4123 if (!ret && place_needed) {
4124 /* This gets called at the last target page in the host page */
4125 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4126
4127 if (all_zero) {
4128 ret = postcopy_place_page_zero(mis, place_dest,
4129 block);
4130 } else {
4131 ret = postcopy_place_page(mis, place_dest,
4132 place_source, block);
4133 }
4134 }
4135 }
4136
4137 return ret;
4138 }
4139
4140 static bool postcopy_is_advised(void)
4141 {
4142 PostcopyState ps = postcopy_state_get();
4143 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4144 }
4145
4146 static bool postcopy_is_running(void)
4147 {
4148 PostcopyState ps = postcopy_state_get();
4149 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4150 }
4151
4152 /*
4153 * Flush the contents of the RAM cache into the SVM's memory.
4154 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4155 */
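/*
 * Walk sketch: after syncing the dirty log into each block's bitmap, the
 * loop below repeatedly asks migration_bitmap_find_dirty() for the next
 * dirty page offset; once the offset runs past the block's used_length it
 * moves on to the next block, otherwise it clears the bit and copies that
 * one TARGET_PAGE from colo_cache into the block's host memory.
 */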
4156 static void colo_flush_ram_cache(void)
4157 {
4158 RAMBlock *block = NULL;
4159 void *dst_host;
4160 void *src_host;
4161 unsigned long offset = 0;
4162
4163 memory_global_dirty_log_sync();
4164 rcu_read_lock();
4165 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4166 migration_bitmap_sync_range(ram_state, block, 0, block->used_length);
4167 }
4168 rcu_read_unlock();
4169
4170 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4171 rcu_read_lock();
4172 block = QLIST_FIRST_RCU(&ram_list.blocks);
4173
4174 while (block) {
4175 offset = migration_bitmap_find_dirty(ram_state, block, offset);
4176
4177 if (offset << TARGET_PAGE_BITS >= block->used_length) {
4178 offset = 0;
4179 block = QLIST_NEXT_RCU(block, next);
4180 } else {
4181 migration_bitmap_clear_dirty(ram_state, block, offset);
4182 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4183 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4184 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4185 }
4186 }
4187
4188 rcu_read_unlock();
4189 trace_colo_flush_ram_cache_end();
4190 }
4191
4192 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4193 {
4194 int flags = 0, ret = 0, invalid_flags = 0;
4195 static uint64_t seq_iter;
4196 int len = 0;
4197 /*
4198 * If the system is running in postcopy mode, page inserts into host memory must
4199 * be atomic
4200 */
4201 bool postcopy_running = postcopy_is_running();
4202 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
4203 bool postcopy_advised = postcopy_is_advised();
4204
4205 seq_iter++;
4206
4207 if (version_id != 4) {
4208 ret = -EINVAL;
4209 }
4210
4211 if (!migrate_use_compression()) {
4212 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4213 }
4214 /* This RCU critical section can be very long running.
4215 * When RCU reclamation in this code starts to become frequent,
4216 * it will be necessary to reduce the granularity of this
4217 * critical section.
4218 */
4219 rcu_read_lock();
4220
4221 if (postcopy_running) {
4222 ret = ram_load_postcopy(f);
4223 }
4224
4225 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4226 ram_addr_t addr, total_ram_bytes;
4227 void *host = NULL;
4228 uint8_t ch;
4229
4230 addr = qemu_get_be64(f);
4231 flags = addr & ~TARGET_PAGE_MASK;
4232 addr &= TARGET_PAGE_MASK;
4233
4234 if (flags & invalid_flags) {
4235 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4236 error_report("Received an unexpected compressed page");
4237 }
4238
4239 ret = -EINVAL;
4240 break;
4241 }
4242
4243 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4244 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4245 RAMBlock *block = ram_block_from_stream(f, flags);
4246
4247 /*
4248 * After going into COLO state, we should load the page into the colo_cache.
4249 */
4250 if (migration_incoming_in_colo_state()) {
4251 host = colo_cache_from_block_offset(block, addr);
4252 } else {
4253 host = host_from_ram_block_offset(block, addr);
4254 }
4255 if (!host) {
4256 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4257 ret = -EINVAL;
4258 break;
4259 }
4260
4261 if (!migration_incoming_in_colo_state()) {
4262 ramblock_recv_bitmap_set(block, host);
4263 }
4264
4265 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4266 }
4267
4268 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4269 case RAM_SAVE_FLAG_MEM_SIZE:
4270 /* Synchronize RAM block list */
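/*
 * Stream layout parsed below (as read by this loop): 'addr' carries the
 * total RAM size; then, per block, a 1-byte idstr length, the idstr bytes
 * and a be64 used_length, optionally followed by a be64 page size (when
 * postcopy was advised and the block's page size differs from the host
 * page size) and, with ignore-shared enabled, a be64 GPA plus a 1-byte
 * "ignored" flag.
 */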
4271 total_ram_bytes = addr;
4272 while (!ret && total_ram_bytes) {
4273 RAMBlock *block;
4274 char id[256];
4275 ram_addr_t length;
4276
4277 len = qemu_get_byte(f);
4278 qemu_get_buffer(f, (uint8_t *)id, len);
4279 id[len] = 0;
4280 length = qemu_get_be64(f);
4281
4282 block = qemu_ram_block_by_name(id);
4283 if (block && !qemu_ram_is_migratable(block)) {
4284 error_report("block %s should not be migrated !", id);
4285 ret = -EINVAL;
4286 } else if (block) {
4287 if (length != block->used_length) {
4288 Error *local_err = NULL;
4289
4290 ret = qemu_ram_resize(block, length,
4291 &local_err);
4292 if (local_err) {
4293 error_report_err(local_err);
4294 }
4295 }
4296 /* For postcopy we need to check hugepage sizes match */
4297 if (postcopy_advised &&
4298 block->page_size != qemu_host_page_size) {
4299 uint64_t remote_page_size = qemu_get_be64(f);
4300 if (remote_page_size != block->page_size) {
4301 error_report("Mismatched RAM page size %s "
4302 "(local) %zd != %" PRId64,
4303 id, block->page_size,
4304 remote_page_size);
4305 ret = -EINVAL;
4306 }
4307 }
4308 if (migrate_ignore_shared()) {
4309 hwaddr addr = qemu_get_be64(f);
4310 bool ignored = qemu_get_byte(f);
4311 if (ignored != ramblock_is_ignored(block)) {
4312 error_report("RAM block %s should %s be migrated",
4313 id, ignored ? "" : "not");
4314 ret = -EINVAL;
4315 }
4316 if (ramblock_is_ignored(block) &&
4317 block->mr->addr != addr) {
4318 error_report("Mismatched GPAs for block %s "
4319 "%" PRId64 "!= %" PRId64,
4320 id, (uint64_t)addr,
4321 (uint64_t)block->mr->addr);
4322 ret = -EINVAL;
4323 }
4324 }
4325 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4326 block->idstr);
4327 } else {
4328 error_report("Unknown ramblock \"%s\", cannot "
4329 "accept migration", id);
4330 ret = -EINVAL;
4331 }
4332
4333 total_ram_bytes -= length;
4334 }
4335 break;
4336
4337 case RAM_SAVE_FLAG_ZERO:
4338 ch = qemu_get_byte(f);
4339 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4340 break;
4341
4342 case RAM_SAVE_FLAG_PAGE:
4343 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4344 break;
4345
4346 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4347 len = qemu_get_be32(f);
4348 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4349 error_report("Invalid compressed data length: %d", len);
4350 ret = -EINVAL;
4351 break;
4352 }
4353 decompress_data_with_multi_threads(f, host, len);
4354 break;
4355
4356 case RAM_SAVE_FLAG_XBZRLE:
4357 if (load_xbzrle(f, addr, host) < 0) {
4358 error_report("Failed to decompress XBZRLE page at "
4359 RAM_ADDR_FMT, addr);
4360 ret = -EINVAL;
4361 break;
4362 }
4363 break;
4364 case RAM_SAVE_FLAG_EOS:
4365 /* normal exit */
4366 multifd_recv_sync_main();
4367 break;
4368 default:
4369 if (flags & RAM_SAVE_FLAG_HOOK) {
4370 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4371 } else {
4372 error_report("Unknown combination of migration flags: %#x",
4373 flags);
4374 ret = -EINVAL;
4375 }
4376 }
4377 if (!ret) {
4378 ret = qemu_file_get_error(f);
4379 }
4380 }
4381
4382 ret |= wait_for_decompress_done();
4383 rcu_read_unlock();
4384 trace_ram_load_complete(ret, seq_iter);
4385
4386 if (!ret && migration_incoming_in_colo_state()) {
4387 colo_flush_ram_cache();
4388 }
4389 return ret;
4390 }
4391
4392 static bool ram_has_postcopy(void *opaque)
4393 {
4394 RAMBlock *rb;
4395 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4396 if (ramblock_is_pmem(rb)) {
4397 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4398 "is not supported now!", rb->idstr, rb->host);
4399 return false;
4400 }
4401 }
4402
4403 return migrate_postcopy_ram();
4404 }
4405
4406 /* Sync all the dirty bitmaps with the destination VM. */
4407 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4408 {
4409 RAMBlock *block;
4410 QEMUFile *file = s->to_dst_file;
4411 int ramblock_count = 0;
4412
4413 trace_ram_dirty_bitmap_sync_start();
4414
4415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4416 qemu_savevm_send_recv_bitmap(file, block->idstr);
4417 trace_ram_dirty_bitmap_request(block->idstr);
4418 ramblock_count++;
4419 }
4420
4421 trace_ram_dirty_bitmap_sync_wait();
4422
4423 /* Wait until all the ramblocks' dirty bitmaps are synced */
4424 while (ramblock_count--) {
4425 qemu_sem_wait(&s->rp_state.rp_sem);
4426 }
4427
4428 trace_ram_dirty_bitmap_sync_complete();
4429
4430 return 0;
4431 }
4432
4433 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4434 {
4435 qemu_sem_post(&s->rp_state.rp_sem);
4436 }
4437
4438 /*
4439 * Read the received bitmap and invert it to form the initial dirty bitmap.
4440 * This is only used when a postcopy migration is paused and we want to
4441 * resume from a middle point.
4442 */
4443 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4444 {
4445 int ret = -EINVAL;
4446 QEMUFile *file = s->rp_state.from_dst_file;
4447 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4448 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4449 uint64_t size, end_mark;
4450
4451 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4452
4453 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4454 error_report("%s: incorrect state %s", __func__,
4455 MigrationStatus_str(s->state));
4456 return -EINVAL;
4457 }
4458
4459 /*
4460 * Note: see comments in ramblock_recv_bitmap_send() on why we
4461 * need the endianness conversion and the padding.
4462 */
4463 local_size = ROUND_UP(local_size, 8);
4464
4465 /* Add padding */
4466 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
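/*
 * Hypothetical example: for nbits = 100, DIV_ROUND_UP(100, 8) = 13 bytes,
 * rounded up to local_size = 16; bitmap_new(100 + BITS_PER_LONG) allocates
 * at least 24 bytes on a 64-bit host, so the padded read below always fits.
 */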
4467
4468 size = qemu_get_be64(file);
4469
4470 /* The size of the bitmap should match our ramblock */
4471 if (size != local_size) {
4472 error_report("%s: ramblock '%s' bitmap size mismatch "
4473 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4474 block->idstr, size, local_size);
4475 ret = -EINVAL;
4476 goto out;
4477 }
4478
4479 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4480 end_mark = qemu_get_be64(file);
4481
4482 ret = qemu_file_get_error(file);
4483 if (ret || size != local_size) {
4484 error_report("%s: read bitmap failed for ramblock '%s': %d"
4485 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4486 __func__, block->idstr, ret, local_size, size);
4487 ret = -EIO;
4488 goto out;
4489 }
4490
4491 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4492 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4493 __func__, block->idstr, end_mark);
4494 ret = -EINVAL;
4495 goto out;
4496 }
4497
4498 /*
4499 * Endianness conversion. We are in postcopy (though paused).
4500 * The dirty bitmap won't change. We can directly modify it.
4501 */
4502 bitmap_from_le(block->bmap, le_bitmap, nbits);
4503
4504 /*
4505 * What we received is the "received bitmap". Invert it to form the
4506 * initial dirty bitmap for this ramblock.
4507 */
4508 bitmap_complement(block->bmap, block->bmap, nbits);
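/*
 * Tiny example (hypothetical): if the received bitmap has bits 0, 1 and 3
 * set (those pages already reached the destination), the complement leaves
 * only bit 2 set, i.e. only page 2 is still dirty and must be resent after
 * the postcopy resume.
 */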
4509
4510 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4511
4512 /*
4513 * We succeeded in syncing the bitmap for the current ramblock. If this is
4514 * the last one to sync, we need to notify the main send thread.
4515 */
4516 ram_dirty_bitmap_reload_notify(s);
4517
4518 ret = 0;
4519 out:
4520 g_free(le_bitmap);
4521 return ret;
4522 }
4523
4524 static int ram_resume_prepare(MigrationState *s, void *opaque)
4525 {
4526 RAMState *rs = *(RAMState **)opaque;
4527 int ret;
4528
4529 ret = ram_dirty_bitmap_sync_all(s, rs);
4530 if (ret) {
4531 return ret;
4532 }
4533
4534 ram_state_resume_prepare(rs, s->to_dst_file);
4535
4536 return 0;
4537 }
4538
4539 static SaveVMHandlers savevm_ram_handlers = {
4540 .save_setup = ram_save_setup,
4541 .save_live_iterate = ram_save_iterate,
4542 .save_live_complete_postcopy = ram_save_complete,
4543 .save_live_complete_precopy = ram_save_complete,
4544 .has_postcopy = ram_has_postcopy,
4545 .save_live_pending = ram_save_pending,
4546 .load_state = ram_load,
4547 .save_cleanup = ram_save_cleanup,
4548 .load_setup = ram_load_setup,
4549 .load_cleanup = ram_load_cleanup,
4550 .resume_prepare = ram_resume_prepare,
4551 };
4552
4553 void ram_mig_init(void)
4554 {
4555 qemu_mutex_init(&XBZRLE.lock);
4556 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4557 }