migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
  63 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
  64  * worked for pages that were filled with the same char.  We switched
  65  * it to only search for the zero value.  To avoid confusion with
  66  * RAM_SAVE_FLAG_COMPRESS_PAGE, it was renamed.
  67  */
  68
  69 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  70 #define RAM_SAVE_FLAG_ZERO     0x02 /* formerly RAM_SAVE_FLAG_COMPRESS */
  71 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  72 #define RAM_SAVE_FLAG_PAGE     0x08
  73 #define RAM_SAVE_FLAG_EOS      0x10
  74 #define RAM_SAVE_FLAG_CONTINUE 0x20
  75 #define RAM_SAVE_FLAG_XBZRLE   0x40
  76 /*
      * Every flag in this list is a distinct single bit.
      * 0x80 is reserved in migration.h, so the list continues at 0x100.
      */
  77 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
  84 XBZRLECacheStats xbzrle_counters; /* not static: has external linkage */
  85
  86 /*
      * File-local XBZRLE state: the page cache, the lock guarding it, and
      * the scratch buffers used while encoding and decoding.
      */
  88 static struct {
  89     /* buffer used for XBZRLE encoding */
  90     uint8_t *encoded_buf;
  91     /* buffer for storing page content */
  92     uint8_t *current_buf;
  93     /* Cache for XBZRLE, protected by lock. */
  94     PageCache *cache;
         /*
          * Taken and released through XBZRLE_cache_lock() and
          * XBZRLE_cache_unlock(), which touch it only while
          * migrate_use_xbzrle() is true.
          */
  95     QemuMutex lock;
  96     /* it will store a page full of zeros */
  97     uint8_t *zero_target_page;
  98     /* buffer used for XBZRLE decoding */
  99     uint8_t *decoded_buf;
 100 } XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
 161 static bool ramblock_is_ignored(RAMBlock *block)
 162 {
 163     return !qemu_ram_is_migratable(block) ||
 164            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 165 }
 166
 167 /*
      * Filtered iteration over RAM blocks, built on INTERNAL_RAMBLOCK_FOREACH.
      * The trailing "if (...) {} else" makes the statement that follows the
      * macro invocation the loop body, run only for blocks passing the filter.
      *
      * RAMBLOCK_FOREACH_NOT_IGNORED skips blocks for which
      * ramblock_is_ignored() is true.
      *
      * Should be holding either ram_list.mutex, or the RCU lock.
      */
 168 #define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
 169     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 170         if (ramblock_is_ignored(block)) {} else
 171
     /* Same construction, but only skips blocks that are not migratable. */
 172 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 173     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 174         if (!qemu_ram_is_migratable(block)) {} else
 175
     /*
      * From here on RAMBLOCK_FOREACH is no longer defined; presumably this
      * forces the rest of the file to use the filtered variants above.
      */
 176 #undef RAMBLOCK_FOREACH
 177
 178 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 179 {
 180     RAMBlock *block;
 181     int ret = 0;
 182
 183     RCU_READ_LOCK_GUARD();
 184
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     return ret;
 192 }
 193
 194 static void ramblock_recv_map_init(void)
 195 {
 196     RAMBlock *rb;
 197
 198     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 199         assert(!rb->receivedmap);
 200         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 201     }
 202 }
 203
 204 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 205 {
 206     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 207                     rb->receivedmap);
 208 }
 209
 210 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 211 {
 212     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 213 }
 214
 215 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 216 {
 217     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 218 }
 219
 220 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 221                                     size_t nr)
 222 {
 223     bitmap_set_atomic(rb->receivedmap,
 224                       ramblock_recv_bitmap_offset(host_addr, rb),
 225                       nr);
 226 }
 227
 228 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 229
 230 /*
 231  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 232  *
 233  * Returns >0 if success with sent bytes, or <0 if error.
 234  */
 235 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 236                                   const char *block_name)
 237 {
 238     RAMBlock *block = qemu_ram_block_by_name(block_name);
 239     unsigned long *le_bitmap, nbits;
 240     uint64_t size;
 241
 242     if (!block) {
 243         error_report("%s: invalid block name: %s", __func__, block_name);
 244         return -1;
 245     }
 246
 247     nbits = block->used_length >> TARGET_PAGE_BITS;
 248
 249     /*
 250      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 251      * machines we may need 4 more bytes for padding (see below
 252      * comment). So extend it a bit before hand.
 253      */
 254     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 255
 256     /*
 257      * Always use little endian when sending the bitmap. This is
 258      * required that when source and destination VMs are not using the
 259      * same endianess. (Note: big endian won't work.)
 260      */
 261     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 262
 263     /* Size of the bitmap, in bytes */
 264     size = DIV_ROUND_UP(nbits, 8);
 265
 266     /*
 267      * size is always aligned to 8 bytes for 64bit machines, but it
 268      * may not be true for 32bit machines. We need this padding to
 269      * make sure the migration can survive even between 32bit and
 270      * 64bit machines.
 271      */
 272     size = ROUND_UP(size, 8);
 273
 274     qemu_put_be64(file, size);
 275     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 276     /*
 277      * Mark as an end, in case the middle part is screwed up due to
 278      * some "misterious" reason.
 279      */
 280     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 281     qemu_fflush(file);
 282
 283     g_free(le_bitmap);
 284
 285     if (qemu_file_get_error(file)) {
 286         return qemu_file_get_error(file);
 287     }
 288
 289     return size + sizeof(size);
 290 }
 291
 292 /*
 293  * An outstanding page request, on the source, having been received
 294  * and queued.
      *
      * NOTE(review): the code that fills in and consumes these requests is
      * not in this part of the file; the field notes below are read off the
      * names and types only -- confirm against the enqueue/dequeue code.
 295  */
 296 struct RAMSrcPageRequest {
 297     RAMBlock *rb;       /* presumably the block the request refers to */
 298     hwaddr    offset;   /* presumably the start of the range within rb */
 299     hwaddr    len;      /* presumably the length of the range */
 300
         /* queue linkage; RAMState.src_page_requests is a QSIMPLEQ of these */
 301     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 302 };
 303
 304 /* State of RAM for migration */
 305 struct RAMState {
 306     /* QEMUFile used for this migration */
 307     QEMUFile *f;
 308     /* Last block that we have visited searching for dirty pages */
 309     RAMBlock *last_seen_block;
 310     /* Last block from where we have sent data */
 311     RAMBlock *last_sent_block;
 312     /* Last dirty target page we have sent */
 313     ram_addr_t last_page;
 314     /* last ram version we have seen */
 315     uint32_t last_version;
 316     /* We are in the first round */
 317     bool ram_bulk_stage;
 318     /*
          * The free page optimization is enabled; set to true by
          * precopy_enable_free_page_optimization().
          */
 319     bool fpo_enabled;
 320     /* How many times we have dirtied too many pages */
 321     int dirty_rate_high_cnt;
 322     /* these variables are used for bitmap sync */
 323     /* last time we did a full bitmap_sync */
 324     int64_t time_last_bitmap_sync;
 325     /* bytes transferred at start_time */
 326     uint64_t bytes_xfer_prev;
 327     /* number of dirty pages since start_time */
 328     uint64_t num_dirty_pages_period;
 329     /* xbzrle misses since the beginning of the period */
 330     uint64_t xbzrle_cache_miss_prev;
 331
 332     /* compression statistics since the beginning of the period */
 333     /* number of times no free thread was available to compress data */
 334     uint64_t compress_thread_busy_prev;
 335     /* amount of bytes after compression */
 336     uint64_t compressed_size_prev;
 337     /* amount of compressed pages */
 338     uint64_t compress_pages_prev;
 339
 340     /* total handled target pages at the beginning of period */
 341     uint64_t target_page_count_prev;
 342     /* total handled target pages since start */
 343     uint64_t target_page_count;
 344     /*
          * number of dirty bits in the bitmap; ram_bytes_remaining() reports
          * this count multiplied by TARGET_PAGE_SIZE
          */
 345     uint64_t migration_dirty_pages;
 346     /* Protects modification of the bitmap and migration dirty pages */
 347     QemuMutex bitmap_mutex;
 348     /* The RAMBlock used in the last src_page_requests */
 349     RAMBlock *last_req_rb;
 350     /* Queue of outstanding page requests from the destination */
 351     QemuMutex src_page_req_mutex;
 352     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 353 };
 354 typedef struct RAMState RAMState;
 355
     /*
      * The single file-local RAMState.  It can be NULL: the accessors in this
      * file (ram_bytes_remaining(), precopy_enable_free_page_optimization())
      * check for that before dereferencing it.
      */
 356 static RAMState *ram_state;
 357
 358 static NotifierWithReturnList precopy_notifier_list; /* used by the precopy_* functions */
 359
 360 void precopy_infrastructure_init(void)
 361 {
         /* Initialize the file-local list of precopy notifiers. */
 362     notifier_with_return_list_init(&precopy_notifier_list);
 363 }
 364
 365 void precopy_add_notifier(NotifierWithReturn *n)
 366 {
         /* Add @n to precopy_notifier_list, the list precopy_notify() walks. */
 367     notifier_with_return_list_add(&precopy_notifier_list, n);
 368 }
 369
 370 void precopy_remove_notifier(NotifierWithReturn *n)
 371 {
         /* Only the notifier itself is passed; the list head is not needed. */
 372     notifier_with_return_remove(n);
 373 }
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 void precopy_enable_free_page_optimization(void)
 385 {
 386     if (!ram_state) {
 387         return;
 388     }
 389
 390     ram_state->fpo_enabled = true;
 391 }
 392
 393 uint64_t ram_bytes_remaining(void)
 394 {
 395     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 396                        0;
 397 }
 398
 399 MigrationStats ram_counters; /* not static: has external linkage */
 400
 401 /* Cursor used by the search for pages to send */
 402 struct PageSearchStatus {
 403     /* Current block being searched */
 404     RAMBlock    *block;
 405     /* Current page to search from */
 406     unsigned long page;
 407     /* Set once we wrap around */
 408     bool         complete_round;
 409 };
 410 typedef struct PageSearchStatus PageSearchStatus;
 411
 412 CompressionStats compression_counters; /* not static: has external linkage */
 413
     /* Per-thread state of one compression worker (see do_data_compress()). */
 414 struct CompressParam {
         /* set to true by the worker, under comp_done_lock, after each page */
 415     bool done;
         /* the worker loop exits once this is true; set by the cleanup code */
 416     bool quit;
         /* result of do_compress_ram_page() for the last page handled */
 417     bool zero_page;
         /*
          * dummy QEMUFile used as a buffer for the compressed data; it is
          * also what the cleanup code checks to tell whether this entry
          * was fully set up
          */
 418     QEMUFile *file;
         /* the worker holds mutex while inspecting quit/block/offset */
 419     QemuMutex mutex;
         /* the worker waits here (with mutex) while block is NULL */
 420     QemuCond cond;
         /* pending work: the worker takes block/offset and resets block to NULL */
 421     RAMBlock *block;
 422     ram_addr_t offset;
 423
 424     /* internally used fields */
         /* zlib stream, set up with deflateInit() and ended with deflateEnd() */
 425     z_stream stream;
         /* TARGET_PAGE_SIZE bytes, passed to do_compress_ram_page() as source_buf */
 426     uint8_t *originbuf;
 427 };
 428 typedef struct CompressParam CompressParam;
 429
     /*
      * Per-thread state for decompression.
      * NOTE(review): the code using this struct is not in this part of the
      * file; the field notes are read off names and types and by analogy
      * with CompressParam -- confirm against the decompression thread.
      */
 430 struct DecompressParam {
 431     bool done;
 432     bool quit;
 433     QemuMutex mutex;
 434     QemuCond cond;
 435     void *des;          /* presumably the destination of the decompressed data */
 436     uint8_t *compbuf;   /* presumably the compressed input */
 437     int len;            /* presumably the length of the data in compbuf */
 438     z_stream stream;
 439 };
 440 typedef struct DecompressParam DecompressParam;
 441
 442 static CompressParam *comp_param; /* array, one entry per compression thread */
 443 static QemuThread *compress_threads; /* array parallel to comp_param */
 444 /* comp_done_cond is used to wake up the migration thread when
 445  * one of the compression threads has finished the compression.
 446  * comp_done_lock is the mutex used together with comp_done_cond.
 447  */
 448 static QemuMutex comp_done_lock;
 449 static QemuCond comp_done_cond;
 450 /* The empty QEMUFileOps will be used by file in CompressParam */
 451 static const QEMUFileOps empty_ops = { };
 452
     /* Decompression counterparts; their users are outside this part of the file. */
 453 static QEMUFile *decomp_file;
 454 static DecompressParam *decomp_param;
 455 static QemuThread *decompress_threads;
 456 static QemuMutex decomp_done_lock;
 457 static QemuCond decomp_done_cond;
 458
     /*
      * Forward declaration, needed by do_data_compress().  The bool result
      * is stored there as CompressParam.zero_page.
      */
 459 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 460                                  ram_addr_t offset, uint8_t *source_buf);
 461
 462 static void *do_data_compress(void *opaque)
 463 {
 464     CompressParam *param = opaque;
 465     RAMBlock *block;
 466     ram_addr_t offset;
 467     bool zero_page;
 468
 469     qemu_mutex_lock(&param->mutex);
 470     while (!param->quit) {
 471         if (param->block) {
 472             block = param->block;
 473             offset = param->offset;
 474             param->block = NULL;
 475             qemu_mutex_unlock(&param->mutex);
 476
 477             zero_page = do_compress_ram_page(param->file, &param->stream,
 478                                              block, offset, param->originbuf);
 479
 480             qemu_mutex_lock(&comp_done_lock);
 481             param->done = true;
 482             param->zero_page = zero_page;
 483             qemu_cond_signal(&comp_done_cond);
 484             qemu_mutex_unlock(&comp_done_lock);
 485
 486             qemu_mutex_lock(&param->mutex);
 487         } else {
 488             qemu_cond_wait(&param->cond, &param->mutex);
 489         }
 490     }
 491     qemu_mutex_unlock(&param->mutex);
 492
 493     return NULL;
 494 }
 495
 496 static void compress_threads_save_cleanup(void)
 497 {
 498     int i, thread_count;
 499
 500     if (!migrate_use_compression() || !comp_param) {
 501         return;
 502     }
 503
 504     thread_count = migrate_compress_threads();
 505     for (i = 0; i < thread_count; i++) {
 506         /*
 507          * we use it as a indicator which shows if the thread is
 508          * properly init'd or not
 509          */
 510         if (!comp_param[i].file) {
 511             break;
 512         }
 513
 514         qemu_mutex_lock(&comp_param[i].mutex);
 515         comp_param[i].quit = true;
 516         qemu_cond_signal(&comp_param[i].cond);
 517         qemu_mutex_unlock(&comp_param[i].mutex);
 518
 519         qemu_thread_join(compress_threads + i);
 520         qemu_mutex_destroy(&comp_param[i].mutex);
 521         qemu_cond_destroy(&comp_param[i].cond);
 522         deflateEnd(&comp_param[i].stream);
 523         g_free(comp_param[i].originbuf);
 524         qemu_fclose(comp_param[i].file);
 525         comp_param[i].file = NULL;
 526     }
 527     qemu_mutex_destroy(&comp_done_lock);
 528     qemu_cond_destroy(&comp_done_cond);
 529     g_free(compress_threads);
 530     g_free(comp_param);
 531     compress_threads = NULL;
 532     comp_param = NULL;
 533 }
 534
 535 static int compress_threads_save_setup(void)
 536 {
 537     int i, thread_count;
 538
 539     if (!migrate_use_compression()) {
 540         return 0;
 541     }
 542     thread_count = migrate_compress_threads();
 543     compress_threads = g_new0(QemuThread, thread_count);
 544     comp_param = g_new0(CompressParam, thread_count);
 545     qemu_cond_init(&comp_done_cond);
 546     qemu_mutex_init(&comp_done_lock);
 547     for (i = 0; i < thread_count; i++) {
 548         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 549         if (!comp_param[i].originbuf) {
 550             goto exit;
 551         }
 552
 553         if (deflateInit(&comp_param[i].stream,
 554                         migrate_compress_level()) != Z_OK) {
 555             g_free(comp_param[i].originbuf);
 556             goto exit;
 557         }
 558
 559         /* comp_param[i].file is just used as a dummy buffer to save data,
 560          * set its ops to empty.
 561          */
 562         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 563         comp_param[i].done = true;
 564         comp_param[i].quit = false;
 565         qemu_mutex_init(&comp_param[i].mutex);
 566         qemu_cond_init(&comp_param[i].cond);
 567         qemu_thread_create(compress_threads + i, "compress",
 568                            do_data_compress, comp_param + i,
 569                            QEMU_THREAD_JOINABLE);
 570     }
 571     return 0;
 572
 573 exit:
 574     compress_threads_save_cleanup();
 575     return -1;
 576 }
 577
 578 /* Multiple fd's */
 579
 580 #define MULTIFD_MAGIC 0x11223344U /* expected in the magic field of init and data packets */
 581 #define MULTIFD_VERSION 1 /* expected in the version field of init and data packets */
 582
 583 #define MULTIFD_FLAG_SYNC (1 << 0)
 584
 585 /*
      * This value needs to be a multiple of qemu_target_page_size().
      * multifd_recv_unfill_packet() divides it by the page size to get the
      * number of pages it expects per packet.
      */
 586 #define MULTIFD_PACKET_SIZE (512 * 1024)
 587
 588 typedef struct {
         /*
          * Initial packet of a multifd channel.  Built by
          * multifd_send_initial_packet() and checked by
          * multifd_recv_initial_packet(); the struct is packed and written
          * to / read from the channel as raw bytes.
          */
 589     uint32_t magic;         /* MULTIFD_MAGIC, big endian on the wire */
 590     uint32_t version;       /* MULTIFD_VERSION, big endian on the wire */
 591     unsigned char uuid[16]; /* QemuUUID */
 592     uint8_t id;             /* channel id of the sender */
 593     uint8_t unused1[7];     /* Reserved for future use */
 594     uint64_t unused2[4];    /* Reserved for future use */
 595 } __attribute__((packed)) MultiFDInit_t;
 596
 597 typedef struct {
         /*
          * Header of a multifd packet.  The fill/unfill helpers convert the
          * multi-byte fields they handle to and from big endian.
          */
 598     uint32_t magic;
 599     uint32_t version;
 600     uint32_t flags;
 601     /* maximum number of allocated pages */
 602     uint32_t pages_alloc;
         /* number of entries of offset[] that are in use */
 603     uint32_t pages_used;
 604     /* size of the next packet that contains pages */
 605     uint32_t next_packet_size;
 606     uint64_t packet_num;
 607     uint64_t unused[4];    /* Reserved for future use */
         /* name of the RAM block; the receiver forces the last byte to 0 */
 608     char ramblock[256];
         /* offset of each page within that RAM block */
 609     uint64_t offset[];
 610 } __attribute__((packed)) MultiFDPacket_t;
 611
 612 typedef struct {
 613     /* number of used pages */
 614     uint32_t used;
 615     /* number of allocated pages, i.e. the length of offset[] and iov[] */
 616     uint32_t allocated;
 617     /* global number of generated multifd packets */
 618     uint64_t packet_num;
 619     /* offset of each page */
 620     ram_addr_t *offset;
 621     /* pointer to each page */
 622     struct iovec *iov;
         /* RAM block the offsets are relative to */
 623     RAMBlock *block;
 624 } MultiFDPages_t;
 625
 626 typedef struct {
 627     /* these fields are not changed once the thread is created */
 628     /* channel number */
 629     uint8_t id;
 630     /* channel thread name */
 631     char *name;
 632     /* channel thread id */
 633     QemuThread thread;
 634     /* communication channel */
 635     QIOChannel *c;
 636     /* sem where to wait for more work */
 637     QemuSemaphore sem;
 638     /* this mutex protects the following parameters */
 639     QemuMutex mutex;
 640     /* is this channel thread running */
 641     bool running;
 642     /* should this thread finish */
 643     bool quit;
 644     /* thread has work to do */
 645     int pending_job;
 646     /* array of pages to send */
 647     MultiFDPages_t *pages;
 648     /* packet allocated len */
 649     uint32_t packet_len;
 650     /* pointer to the packet */
 651     MultiFDPacket_t *packet;
 652     /* multifd flags for each packet */
 653     uint32_t flags;
 654     /* size of the next packet that contains pages */
 655     uint32_t next_packet_size;
 656     /* global number of generated multifd packets */
 657     uint64_t packet_num;
 658     /* thread local variables */
 659     /* packets sent through this channel */
 660     uint64_t num_packets;
 661     /* pages sent through this channel */
 662     uint64_t num_pages;
 663     /* syncs main thread and channels */
 664     QemuSemaphore sem_sync;
 665 }  MultiFDSendParams;
 666
 667 typedef struct {
 668     /* these fields are not changed once the thread is created */
 669     /* channel number */
 670     uint8_t id;
 671     /* channel thread name */
 672     char *name;
 673     /* channel thread id */
 674     QemuThread thread;
 675     /* communication channel */
 676     QIOChannel *c;
 677     /* this mutex protects the following parameters */
 678     QemuMutex mutex;
 679     /* is this channel thread running */
 680     bool running;
 681     /* should this thread finish */
 682     bool quit;
 683     /* array of pages to receive */
 684     MultiFDPages_t *pages;
 685     /* packet allocated len */
 686     uint32_t packet_len;
 687     /* pointer to the packet */
 688     MultiFDPacket_t *packet;
 689     /* multifd flags for each packet */
 690     uint32_t flags;
 691     /* global number of generated multifd packets */
 692     uint64_t packet_num;
 693     /* thread local variables */
 694     /* size of the next packet that contains pages */
 695     uint32_t next_packet_size;
 696     /*
          * packets that went through this channel (the original comment said
          * "sent"; NOTE(review): on the receive side presumably "received")
          */
 697     uint64_t num_packets;
 698     /* pages that went through this channel (same remark as above) */
 699     uint64_t num_pages;
 700     /* syncs main thread and channels */
 701     QemuSemaphore sem_sync;
 702 } MultiFDRecvParams;
 703
 704 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 705 {
 706     MultiFDInit_t msg = {};
 707     int ret;
 708
 709     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 710     msg.version = cpu_to_be32(MULTIFD_VERSION);
 711     msg.id = p->id;
 712     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 713
 714     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 715     if (ret != 0) {
 716         return -1;
 717     }
 718     return 0;
 719 }
 720
 721 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 722 {
 723     MultiFDInit_t msg;
 724     int ret;
 725
 726     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 727     if (ret != 0) {
 728         return -1;
 729     }
 730
 731     msg.magic = be32_to_cpu(msg.magic);
 732     msg.version = be32_to_cpu(msg.version);
 733
 734     if (msg.magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet magic %x "
 736                    "expected %x", msg.magic, MULTIFD_MAGIC);
 737         return -1;
 738     }
 739
 740     if (msg.version != MULTIFD_VERSION) {
 741         error_setg(errp, "multifd: received packet version %d "
 742                    "expected %d", msg.version, MULTIFD_VERSION);
 743         return -1;
 744     }
 745
 746     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 747         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 748         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 749
 750         error_setg(errp, "multifd: received uuid '%s' and expected "
 751                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 752         g_free(uuid);
 753         g_free(msg_uuid);
 754         return -1;
 755     }
 756
 757     if (msg.id > migrate_multifd_channels()) {
 758         error_setg(errp, "multifd: received channel version %d "
 759                    "expected %d", msg.version, MULTIFD_VERSION);
 760         return -1;
 761     }
 762
 763     return msg.id;
 764 }
 765
 766 static MultiFDPages_t *multifd_pages_init(size_t size)
 767 {
 768     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 769
 770     pages->allocated = size;
 771     pages->iov = g_new0(struct iovec, size);
 772     pages->offset = g_new0(ram_addr_t, size);
 773
 774     return pages;
 775 }
 776
 777 static void multifd_pages_clear(MultiFDPages_t *pages)
 778 {
 779     pages->used = 0;
 780     pages->allocated = 0;
 781     pages->packet_num = 0;
 782     pages->block = NULL;
 783     g_free(pages->iov);
 784     pages->iov = NULL;
 785     g_free(pages->offset);
 786     pages->offset = NULL;
 787     g_free(pages);
 788 }
 789
 790 static void multifd_send_fill_packet(MultiFDSendParams *p)
 791 {
 792     MultiFDPacket_t *packet = p->packet;
 793     int i;
 794
 795     packet->flags = cpu_to_be32(p->flags);
 796     packet->pages_alloc = cpu_to_be32(p->pages->allocated);
 797     packet->pages_used = cpu_to_be32(p->pages->used);
 798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 799     packet->packet_num = cpu_to_be64(p->packet_num);
 800
 801     if (p->pages->block) {
 802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 803     }
 804
 805     for (i = 0; i < p->pages->used; i++) {
 806         /* there are architectures where ram_addr_t is 32 bit */
 807         uint64_t temp = p->pages->offset[i];
 808
 809         packet->offset[i] = cpu_to_be64(temp);
 810     }
 811 }
 812
 813 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 814 {
 815     MultiFDPacket_t *packet = p->packet;
 816     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 817     RAMBlock *block;
 818     int i;
 819
 820     packet->magic = be32_to_cpu(packet->magic);
 821     if (packet->magic != MULTIFD_MAGIC) {
 822         error_setg(errp, "multifd: received packet "
 823                    "magic %x and expected magic %x",
 824                    packet->magic, MULTIFD_MAGIC);
 825         return -1;
 826     }
 827
 828     packet->version = be32_to_cpu(packet->version);
 829     if (packet->version != MULTIFD_VERSION) {
 830         error_setg(errp, "multifd: received packet "
 831                    "version %d and expected version %d",
 832                    packet->version, MULTIFD_VERSION);
 833         return -1;
 834     }
 835
 836     p->flags = be32_to_cpu(packet->flags);
 837
 838     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 839     /*
 840      * If we received a packet that is 100 times bigger than expected
 841      * just stop migration.  It is a magic number.
 842      */
 843     if (packet->pages_alloc > pages_max * 100) {
 844         error_setg(errp, "multifd: received packet "
 845                    "with size %d and expected a maximum size of %d",
 846                    packet->pages_alloc, pages_max * 100) ;
 847         return -1;
 848     }
 849     /*
 850      * We received a packet that is bigger than expected but inside
 851      * reasonable limits (see previous comment).  Just reallocate.
 852      */
 853     if (packet->pages_alloc > p->pages->allocated) {
 854         multifd_pages_clear(p->pages);
 855         p->pages = multifd_pages_init(packet->pages_alloc);
 856     }
 857
 858     p->pages->used = be32_to_cpu(packet->pages_used);
 859     if (p->pages->used > packet->pages_alloc) {
 860         error_setg(errp, "multifd: received packet "
 861                    "with %d pages and expected maximum pages are %d",
 862                    p->pages->used, packet->pages_alloc) ;
 863         return -1;
 864     }
 865
 866     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 867     p->packet_num = be64_to_cpu(packet->packet_num);
 868
 869     if (p->pages->used == 0) {
 870         return 0;
 871     }
 872
 873     /* make sure that ramblock is 0 terminated */
 874     packet->ramblock[255] = 0;
 875     block = qemu_ram_block_by_name(packet->ramblock);
 876     if (!block) {
 877         error_setg(errp, "multifd: unknown ram block %s",
 878                    packet->ramblock);
 879         return -1;
 880     }
 881
 882     for (i = 0; i < p->pages->used; i++) {
 883         uint64_t offset = be64_to_cpu(packet->offset[i]);
 884
 885         if (offset > (block->used_length - qemu_target_page_size())) {
 886             error_setg(errp, "multifd: offset too long %" PRIu64
 887                        " (max " RAM_ADDR_FMT ")",
 888                        offset, block->max_length);
 889             return -1;
 890         }
 891         p->pages->iov[i].iov_base = block->host + offset;
 892         p->pages->iov[i].iov_len = qemu_target_page_size();
 893     }
 894
 895     return 0;
 896 }
 897
 898 struct {
         /* per-channel send parameters, indexed by channel number */
 899     MultiFDSendParams *params;
 900     /* array of pages to send */
 901     MultiFDPages_t *pages;
 902     /* global number of generated multifd packets */
 903     uint64_t packet_num;
 904     /* send channels ready; multifd_send_pages() waits on it before picking a channel */
 905     QemuSemaphore channels_ready;
 906     /*
 907      * Have we already run terminate threads.  There is a race when it
 908      * happens that we got one error while we are exiting.
 909      * We will use atomic operations.  Only valid values are 0 and 1.
 910      */
 911     int exiting;
 912 } *multifd_send_state;
 913
 914 /*
 915  * How do we use multifd_send_state->pages and channel->pages?
 916  *
 917  * We create one "pages" struct for each channel, plus a main one.  Each
 918  * time we need to send a batch of pages we swap the main one with the
 919  * one belonging to the channel that is going to send it.  There are
 920  * two reasons for that:
 921  *    - to avoid doing so many mallocs during migration
 922  *    - to make it easier to know what to free at the end of migration
 923  *
 924  * This way we always know who is the owner of each "pages" struct,
 925  * and we don't need any locking.  It belongs to the migration thread
 926  * or to the channel thread.  Switching is safe because the migration
 927  * thread holds the channel mutex while changing it, and the channel
 928  * must have finished with its own, otherwise pending_job would not
 929  * be zero.
 930  */
 931
 932 static int multifd_send_pages(QEMUFile *f)
 933 {
 934     int i;
 935     static int next_channel;
 936     MultiFDSendParams *p = NULL; /* make happy gcc */
 937     MultiFDPages_t *pages = multifd_send_state->pages;
 938     uint64_t transferred;
 939
 940     if (atomic_read(&multifd_send_state->exiting)) {
 941         return -1;
 942     }
 943
 944     qemu_sem_wait(&multifd_send_state->channels_ready);
 945     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 946         p = &multifd_send_state->params[i];
 947
 948         qemu_mutex_lock(&p->mutex);
 949         if (p->quit) {
 950             error_report("%s: channel %d has already quit!", __func__, i);
 951             qemu_mutex_unlock(&p->mutex);
 952             return -1;
 953         }
 954         if (!p->pending_job) {
 955             p->pending_job++;
 956             next_channel = (i + 1) % migrate_multifd_channels();
 957             break;
 958         }
 959         qemu_mutex_unlock(&p->mutex);
 960     }
 961     assert(!p->pages->used);
 962     assert(!p->pages->block);
 963
 964     p->packet_num = multifd_send_state->packet_num++;
 965     multifd_send_state->pages = p->pages;
 966     p->pages = pages;
 967     transferred = ((uint64_t) pages->used) * qemu_target_page_size()
 968                 + p->packet_len;
 969     qemu_file_update_transfer(f, transferred);
 970     ram_counters.multifd_bytes += transferred;
 971     ram_counters.transferred += transferred;;
 972     qemu_mutex_unlock(&p->mutex);
 973     qemu_sem_post(&p->sem);
 974
 975     return 1;
 976 }
 977
 978 static int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 979 {
 980     MultiFDPages_t *pages = multifd_send_state->pages;
 981
 982     if (!pages->block) {
 983         pages->block = block;
 984     }
 985
 986     if (pages->block == block) {
 987         pages->offset[pages->used] = offset;
 988         pages->iov[pages->used].iov_base = block->host + offset;
 989         pages->iov[pages->used].iov_len = qemu_target_page_size();
 990         pages->used++;
 991
 992         if (pages->used < pages->allocated) {
 993             return 1;
 994         }
 995     }
 996
 997     if (multifd_send_pages(f) < 0) {
 998         return -1;
 999     }
1000
1001     if (pages->block != block) {
1002         return  multifd_queue_page(f, block, offset);
1003     }
1004
1005     return 1;
1006 }
1007
1008 static void multifd_send_terminate_threads(Error *err)
1009 {
1010     int i;
1011
1012     trace_multifd_send_terminate_threads(err != NULL);
1013
1014     if (err) {
1015         MigrationState *s = migrate_get_current();
1016         migrate_set_error(s, err);
1017         if (s->state == MIGRATION_STATUS_SETUP ||
1018             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1019             s->state == MIGRATION_STATUS_DEVICE ||
1020             s->state == MIGRATION_STATUS_ACTIVE) {
1021             migrate_set_state(&s->state, s->state,
1022                               MIGRATION_STATUS_FAILED);
1023         }
1024     }
1025
1026     /*
1027      * We don't want to exit each threads twice.  Depending on where
1028      * we get the error, or if there are two independent errors in two
1029      * threads at the same time, we can end calling this function
1030      * twice.
1031      */
1032     if (atomic_xchg(&multifd_send_state->exiting, 1)) {
1033         return;
1034     }
1035
1036     for (i = 0; i < migrate_multifd_channels(); i++) {
1037         MultiFDSendParams *p = &multifd_send_state->params[i];
1038
1039         qemu_mutex_lock(&p->mutex);
1040         p->quit = true;
1041         qemu_sem_post(&p->sem);
1042         qemu_mutex_unlock(&p->mutex);
1043     }
1044 }
1045
1046 void multifd_save_cleanup(void)
1047 {
1048     int i;
1049
1050     if (!migrate_use_multifd()) {
1051         return;
1052     }
1053     multifd_send_terminate_threads(NULL);
1054     for (i = 0; i < migrate_multifd_channels(); i++) {
1055         MultiFDSendParams *p = &multifd_send_state->params[i];
1056
1057         if (p->running) {
1058             qemu_thread_join(&p->thread);
1059         }
1060     }
1061     for (i = 0; i < migrate_multifd_channels(); i++) {
1062         MultiFDSendParams *p = &multifd_send_state->params[i];
1063
1064         socket_send_channel_destroy(p->c);
1065         p->c = NULL;
1066         qemu_mutex_destroy(&p->mutex);
1067         qemu_sem_destroy(&p->sem);
1068         qemu_sem_destroy(&p->sem_sync);
1069         g_free(p->name);
1070         p->name = NULL;
1071         multifd_pages_clear(p->pages);
1072         p->pages = NULL;
1073         p->packet_len = 0;
1074         g_free(p->packet);
1075         p->packet = NULL;
1076     }
1077     qemu_sem_destroy(&multifd_send_state->channels_ready);
1078     g_free(multifd_send_state->params);
1079     multifd_send_state->params = NULL;
1080     multifd_pages_clear(multifd_send_state->pages);
1081     multifd_send_state->pages = NULL;
1082     g_free(multifd_send_state);
1083     multifd_send_state = NULL;
1084 }
1085
1086 static void multifd_send_sync_main(QEMUFile *f)
1087 {
1088     int i;
1089
1090     if (!migrate_use_multifd()) {
1091         return;
1092     }
1093     if (multifd_send_state->pages->used) {
1094         if (multifd_send_pages(f) < 0) {
1095             error_report("%s: multifd_send_pages fail", __func__);
1096             return;
1097         }
1098     }
1099     for (i = 0; i < migrate_multifd_channels(); i++) {
1100         MultiFDSendParams *p = &multifd_send_state->params[i];
1101
1102         trace_multifd_send_sync_main_signal(p->id);
1103
1104         qemu_mutex_lock(&p->mutex);
1105
1106         if (p->quit) {
1107             error_report("%s: channel %d has already quit", __func__, i);
1108             qemu_mutex_unlock(&p->mutex);
1109             return;
1110         }
1111
1112         p->packet_num = multifd_send_state->packet_num++;
1113         p->flags |= MULTIFD_FLAG_SYNC;
1114         p->pending_job++;
1115         qemu_file_update_transfer(f, p->packet_len);
1116         ram_counters.multifd_bytes += p->packet_len;
1117         ram_counters.transferred += p->packet_len;
1118         qemu_mutex_unlock(&p->mutex);
1119         qemu_sem_post(&p->sem);
1120     }
1121     for (i = 0; i < migrate_multifd_channels(); i++) {
1122         MultiFDSendParams *p = &multifd_send_state->params[i];
1123
1124         trace_multifd_send_sync_main_wait(p->id);
1125         qemu_sem_wait(&p->sem_sync);
1126     }
1127     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1128 }
1129
/*
 * multifd_send_thread: body of one multifd send channel thread
 *
 * Sends the initial packet, then loops waiting on p->sem.  For each
 * pending job it fills the packet header under the channel mutex,
 * writes the header and (if any) the pages to the channel, and then
 * signals completion.  The loop ends when multifd is exiting, when the
 * channel is asked to quit, or when a write fails.
 *
 * Always returns NULL.
 *
 * @opaque: the MultiFDSendParams of this channel
 */
static void *multifd_send_thread(void *opaque)
{
    MultiFDSendParams *p = opaque;
    Error *local_err = NULL;
    int ret = 0;
    uint32_t flags = 0;

    trace_multifd_send_thread_start(p->id);
    rcu_register_thread();

    if (multifd_send_initial_packet(p, &local_err) < 0) {
        ret = -1;
        goto out;
    }
    /* initial packet */
    p->num_packets = 1;

    while (true) {
        qemu_sem_wait(&p->sem);

        if (atomic_read(&multifd_send_state->exiting)) {
            break;
        }
        qemu_mutex_lock(&p->mutex);

        if (p->pending_job) {
            /* Snapshot what is needed after the mutex is dropped */
            uint32_t used = p->pages->used;
            uint64_t packet_num = p->packet_num;
            flags = p->flags;

            p->next_packet_size = used * qemu_target_page_size();
            multifd_send_fill_packet(p);
            p->flags = 0;
            p->num_packets++;
            p->num_pages += used;
            p->pages->used = 0;
            p->pages->block = NULL;
            qemu_mutex_unlock(&p->mutex);

            trace_multifd_send(p->id, packet_num, used, flags,
                               p->next_packet_size);

            /* Packet header first, then the page data if there is any */
            ret = qio_channel_write_all(p->c, (void *)p->packet,
                                        p->packet_len, &local_err);
            if (ret != 0) {
                break;
            }

            if (used) {
                ret = qio_channel_writev_all(p->c, p->pages->iov,
                                             used, &local_err);
                if (ret != 0) {
                    break;
                }
            }

            qemu_mutex_lock(&p->mutex);
            p->pending_job--;
            qemu_mutex_unlock(&p->mutex);

            /* Wake up whoever waits for this sync packet to be sent */
            if (flags & MULTIFD_FLAG_SYNC) {
                qemu_sem_post(&p->sem_sync);
            }
            qemu_sem_post(&multifd_send_state->channels_ready);
        } else if (p->quit) {
            qemu_mutex_unlock(&p->mutex);
            break;
        } else {
            qemu_mutex_unlock(&p->mutex);
            /* sometimes there are spurious wakeups */
        }
    }

out:
    if (local_err) {
        trace_multifd_send_error(p->id);
        multifd_send_terminate_threads(local_err);
    }

    /*
     * An error happened and this thread is about to exit.  Post both
     * semaphores so that anybody waiting on this channel is woken up
     * instead of blocking.
     */
    if (ret != 0) {
        qemu_sem_post(&p->sem_sync);
        qemu_sem_post(&multifd_send_state->channels_ready);
    }

    qemu_mutex_lock(&p->mutex);
    p->running = false;
    qemu_mutex_unlock(&p->mutex);

    rcu_unregister_thread();
    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);

    return NULL;
}
1227
1228 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1229 {
1230     MultiFDSendParams *p = opaque;
1231     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1232     Error *local_err = NULL;
1233
1234     trace_multifd_new_send_channel_async(p->id);
1235     if (qio_task_propagate_error(task, &local_err)) {
1236         migrate_set_error(migrate_get_current(), local_err);
1237         /* Error happen, we need to tell who pay attention to me */
1238         qemu_sem_post(&multifd_send_state->channels_ready);
1239         qemu_sem_post(&p->sem_sync);
1240         /*
1241          * Although multifd_send_thread is not created, but main migration
1242          * thread neet to judge whether it is running, so we need to mark
1243          * its status.
1244          */
1245         p->quit = true;
1246     } else {
1247         p->c = QIO_CHANNEL(sioc);
1248         qio_channel_set_delay(p->c, false);
1249         p->running = true;
1250         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1251                            QEMU_THREAD_JOINABLE);
1252     }
1253 }
1254
1255 int multifd_save_setup(Error **errp)
1256 {
1257     int thread_count;
1258     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1259     uint8_t i;
1260
1261     if (!migrate_use_multifd()) {
1262         return 0;
1263     }
1264     thread_count = migrate_multifd_channels();
1265     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1266     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1267     multifd_send_state->pages = multifd_pages_init(page_count);
1268     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1269     atomic_set(&multifd_send_state->exiting, 0);
1270
1271     for (i = 0; i < thread_count; i++) {
1272         MultiFDSendParams *p = &multifd_send_state->params[i];
1273
1274         qemu_mutex_init(&p->mutex);
1275         qemu_sem_init(&p->sem, 0);
1276         qemu_sem_init(&p->sem_sync, 0);
1277         p->quit = false;
1278         p->pending_job = 0;
1279         p->id = i;
1280         p->pages = multifd_pages_init(page_count);
1281         p->packet_len = sizeof(MultiFDPacket_t)
1282                       + sizeof(uint64_t) * page_count;
1283         p->packet = g_malloc0(p->packet_len);
1284         p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1285         p->packet->version = cpu_to_be32(MULTIFD_VERSION);
1286         p->name = g_strdup_printf("multifdsend_%d", i);
1287         socket_send_channel_create(multifd_new_send_channel_async, p);
1288     }
1289     return 0;
1290 }
1291
/*
 * Global state of the multifd receive side.  It is allocated in
 * multifd_load_setup() and released in multifd_load_cleanup().
 */
struct {
    /* per-channel receive parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1301
1302 static void multifd_recv_terminate_threads(Error *err)
1303 {
1304     int i;
1305
1306     trace_multifd_recv_terminate_threads(err != NULL);
1307
1308     if (err) {
1309         MigrationState *s = migrate_get_current();
1310         migrate_set_error(s, err);
1311         if (s->state == MIGRATION_STATUS_SETUP ||
1312             s->state == MIGRATION_STATUS_ACTIVE) {
1313             migrate_set_state(&s->state, s->state,
1314                               MIGRATION_STATUS_FAILED);
1315         }
1316     }
1317
1318     for (i = 0; i < migrate_multifd_channels(); i++) {
1319         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1320
1321         qemu_mutex_lock(&p->mutex);
1322         p->quit = true;
1323         /*
1324          * We could arrive here for two reasons:
1325          *  - normal quit, i.e. everything went fine, just finished
1326          *  - error quit: We close the channels so the channel threads
1327          *    finish the qio_channel_read_all_eof()
1328          */
1329         if (p->c) {
1330             qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1331         }
1332         qemu_mutex_unlock(&p->mutex);
1333     }
1334 }
1335
1336 int multifd_load_cleanup(Error **errp)
1337 {
1338     int i;
1339     int ret = 0;
1340
1341     if (!migrate_use_multifd()) {
1342         return 0;
1343     }
1344     multifd_recv_terminate_threads(NULL);
1345     for (i = 0; i < migrate_multifd_channels(); i++) {
1346         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1347
1348         if (p->running) {
1349             p->quit = true;
1350             /*
1351              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1352              * however try to wakeup it without harm in cleanup phase.
1353              */
1354             qemu_sem_post(&p->sem_sync);
1355             qemu_thread_join(&p->thread);
1356         }
1357     }
1358     for (i = 0; i < migrate_multifd_channels(); i++) {
1359         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1360
1361         object_unref(OBJECT(p->c));
1362         p->c = NULL;
1363         qemu_mutex_destroy(&p->mutex);
1364         qemu_sem_destroy(&p->sem_sync);
1365         g_free(p->name);
1366         p->name = NULL;
1367         multifd_pages_clear(p->pages);
1368         p->pages = NULL;
1369         p->packet_len = 0;
1370         g_free(p->packet);
1371         p->packet = NULL;
1372     }
1373     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1374     g_free(multifd_recv_state->params);
1375     multifd_recv_state->params = NULL;
1376     g_free(multifd_recv_state);
1377     multifd_recv_state = NULL;
1378
1379     return ret;
1380 }
1381
1382 static void multifd_recv_sync_main(void)
1383 {
1384     int i;
1385
1386     if (!migrate_use_multifd()) {
1387         return;
1388     }
1389     for (i = 0; i < migrate_multifd_channels(); i++) {
1390         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1391
1392         trace_multifd_recv_sync_main_wait(p->id);
1393         qemu_sem_wait(&multifd_recv_state->sem_sync);
1394     }
1395     for (i = 0; i < migrate_multifd_channels(); i++) {
1396         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1397
1398         qemu_mutex_lock(&p->mutex);
1399         if (multifd_recv_state->packet_num < p->packet_num) {
1400             multifd_recv_state->packet_num = p->packet_num;
1401         }
1402         qemu_mutex_unlock(&p->mutex);
1403         trace_multifd_recv_sync_main_signal(p->id);
1404         qemu_sem_post(&p->sem_sync);
1405     }
1406     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1407 }
1408
1409 static void *multifd_recv_thread(void *opaque)
1410 {
1411     MultiFDRecvParams *p = opaque;
1412     Error *local_err = NULL;
1413     int ret;
1414
1415     trace_multifd_recv_thread_start(p->id);
1416     rcu_register_thread();
1417
1418     while (true) {
1419         uint32_t used;
1420         uint32_t flags;
1421
1422         if (p->quit) {
1423             break;
1424         }
1425
1426         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1427                                        p->packet_len, &local_err);
1428         if (ret == 0) {   /* EOF */
1429             break;
1430         }
1431         if (ret == -1) {   /* Error */
1432             break;
1433         }
1434
1435         qemu_mutex_lock(&p->mutex);
1436         ret = multifd_recv_unfill_packet(p, &local_err);
1437         if (ret) {
1438             qemu_mutex_unlock(&p->mutex);
1439             break;
1440         }
1441
1442         used = p->pages->used;
1443         flags = p->flags;
1444         trace_multifd_recv(p->id, p->packet_num, used, flags,
1445                            p->next_packet_size);
1446         p->num_packets++;
1447         p->num_pages += used;
1448         qemu_mutex_unlock(&p->mutex);
1449
1450         if (used) {
1451             ret = qio_channel_readv_all(p->c, p->pages->iov,
1452                                         used, &local_err);
1453             if (ret != 0) {
1454                 break;
1455             }
1456         }
1457
1458         if (flags & MULTIFD_FLAG_SYNC) {
1459             qemu_sem_post(&multifd_recv_state->sem_sync);
1460             qemu_sem_wait(&p->sem_sync);
1461         }
1462     }
1463
1464     if (local_err) {
1465         multifd_recv_terminate_threads(local_err);
1466     }
1467     qemu_mutex_lock(&p->mutex);
1468     p->running = false;
1469     qemu_mutex_unlock(&p->mutex);
1470
1471     rcu_unregister_thread();
1472     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1473
1474     return NULL;
1475 }
1476
1477 int multifd_load_setup(Error **errp)
1478 {
1479     int thread_count;
1480     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1481     uint8_t i;
1482
1483     if (!migrate_use_multifd()) {
1484         return 0;
1485     }
1486     thread_count = migrate_multifd_channels();
1487     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1488     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1489     atomic_set(&multifd_recv_state->count, 0);
1490     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1491
1492     for (i = 0; i < thread_count; i++) {
1493         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1494
1495         qemu_mutex_init(&p->mutex);
1496         qemu_sem_init(&p->sem_sync, 0);
1497         p->quit = false;
1498         p->id = i;
1499         p->pages = multifd_pages_init(page_count);
1500         p->packet_len = sizeof(MultiFDPacket_t)
1501                       + sizeof(uint64_t) * page_count;
1502         p->packet = g_malloc0(p->packet_len);
1503         p->name = g_strdup_printf("multifdrecv_%d", i);
1504     }
1505     return 0;
1506 }
1507
1508 bool multifd_recv_all_channels_created(void)
1509 {
1510     int thread_count = migrate_multifd_channels();
1511
1512     if (!migrate_use_multifd()) {
1513         return true;
1514     }
1515
1516     return thread_count == atomic_read(&multifd_recv_state->count);
1517 }
1518
1519 /*
1520  * Try to receive all multifd channels to get ready for the migration.
1521  * - Return true and do not set @errp when correctly receving all channels;
1522  * - Return false and do not set @errp when correctly receiving the current one;
1523  * - Return false and set @errp when failing to receive the current channel.
1524  */
1525 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1526 {
1527     MultiFDRecvParams *p;
1528     Error *local_err = NULL;
1529     int id;
1530
1531     id = multifd_recv_initial_packet(ioc, &local_err);
1532     if (id < 0) {
1533         multifd_recv_terminate_threads(local_err);
1534         error_propagate_prepend(errp, local_err,
1535                                 "failed to receive packet"
1536                                 " via multifd channel %d: ",
1537                                 atomic_read(&multifd_recv_state->count));
1538         return false;
1539     }
1540     trace_multifd_recv_new_channel(id);
1541
1542     p = &multifd_recv_state->params[id];
1543     if (p->c != NULL) {
1544         error_setg(&local_err, "multifd: received id '%d' already setup'",
1545                    id);
1546         multifd_recv_terminate_threads(local_err);
1547         error_propagate(errp, local_err);
1548         return false;
1549     }
1550     p->c = ioc;
1551     object_ref(OBJECT(ioc));
1552     /* initial packet */
1553     p->num_packets = 1;
1554
1555     p->running = true;
1556     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1557                        QEMU_THREAD_JOINABLE);
1558     atomic_inc(&multifd_recv_state->count);
1559     return atomic_read(&multifd_recv_state->count) ==
1560            migrate_multifd_channels();
1561 }
1562
1563 /**
1564  * save_page_header: write page header to wire
1565  *
1566  * If this is the 1st block, it also writes the block identification
1567  *
1568  * Returns the number of bytes written
1569  *
1570  * @f: QEMUFile where to send the data
1571  * @block: block that contains the page we want to send
1572  * @offset: offset inside the block for the page
1573  *          in the lower bits, it contains flags
1574  */
1575 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1576                                ram_addr_t offset)
1577 {
1578     size_t size, len;
1579
1580     if (block == rs->last_sent_block) {
1581         offset |= RAM_SAVE_FLAG_CONTINUE;
1582     }
1583     qemu_put_be64(f, offset);
1584     size = 8;
1585
1586     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1587         len = strlen(block->idstr);
1588         qemu_put_byte(f, len);
1589         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1590         size += 1 + len;
1591         rs->last_sent_block = block;
1592     }
1593     return size;
1594 }
1595
1596 /**
1597  * mig_throttle_guest_down: throotle down the guest
1598  *
1599  * Reduce amount of guest cpu execution to hopefully slow down memory
1600  * writes. If guest dirty memory rate is reduced below the rate at
1601  * which we can transfer pages to the destination then we should be
1602  * able to complete migration. Some workloads dirty memory way too
1603  * fast and will not effectively converge, even with auto-converge.
1604  */
1605 static void mig_throttle_guest_down(void)
1606 {
1607     MigrationState *s = migrate_get_current();
1608     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1609     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1610     int pct_max = s->parameters.max_cpu_throttle;
1611
1612     /* We have not started throttling yet. Let's start it. */
1613     if (!cpu_throttle_active()) {
1614         cpu_throttle_set(pct_initial);
1615     } else {
1616         /* Throttling already on, just increase the rate */
1617         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1618                          pct_max));
1619     }
1620 }
1621
1622 /**
1623  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1624  *
1625  * @rs: current RAM state
1626  * @current_addr: address for the zero page
1627  *
1628  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1629  * The important thing is that a stale (not-yet-0'd) page be replaced
1630  * by the new data.
1631  * As a bonus, if the page wasn't in the cache it gets added so that
1632  * when a small write is made into the 0'd page it gets XBZRLE sent.
1633  */
1634 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1635 {
1636     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1637         return;
1638     }
1639
1640     /* We don't care if this fails to allocate a new cache page
1641      * as long as it updated an old one */
1642     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1643                  ram_counters.dirty_sync_count);
1644 }
1645
/* Byte that save_xbzrle_page() writes right after the page header */
#define ENCODING_FLAG_XBZRLE 0x1
1647
1648 /**
1649  * save_xbzrle_page: compress and send current page
1650  *
1651  * Returns: 1 means that we wrote the page
1652  *          0 means that page is identical to the one already sent
1653  *          -1 means that xbzrle would be longer than normal
1654  *
1655  * @rs: current RAM state
1656  * @current_data: pointer to the address of the page contents
1657  * @current_addr: addr of the page
1658  * @block: block that contains the page we want to send
1659  * @offset: offset inside the block for the page
1660  * @last_stage: if we are at the completion stage
1661  */
1662 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1663                             ram_addr_t current_addr, RAMBlock *block,
1664                             ram_addr_t offset, bool last_stage)
1665 {
1666     int encoded_len = 0, bytes_xbzrle;
1667     uint8_t *prev_cached_page;
1668
1669     if (!cache_is_cached(XBZRLE.cache, current_addr,
1670                          ram_counters.dirty_sync_count)) {
1671         xbzrle_counters.cache_miss++;
1672         if (!last_stage) {
1673             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1674                              ram_counters.dirty_sync_count) == -1) {
1675                 return -1;
1676             } else {
1677                 /* update *current_data when the page has been
1678                    inserted into cache */
1679                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1680             }
1681         }
1682         return -1;
1683     }
1684
1685     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1686
1687     /* save current buffer into memory */
1688     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1689
1690     /* XBZRLE encoding (if there is no overflow) */
1691     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1692                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1693                                        TARGET_PAGE_SIZE);
1694
1695     /*
1696      * Update the cache contents, so that it corresponds to the data
1697      * sent, in all cases except where we skip the page.
1698      */
1699     if (!last_stage && encoded_len != 0) {
1700         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1701         /*
1702          * In the case where we couldn't compress, ensure that the caller
1703          * sends the data from the cache, since the guest might have
1704          * changed the RAM since we copied it.
1705          */
1706         *current_data = prev_cached_page;
1707     }
1708
1709     if (encoded_len == 0) {
1710         trace_save_xbzrle_page_skipping();
1711         return 0;
1712     } else if (encoded_len == -1) {
1713         trace_save_xbzrle_page_overflow();
1714         xbzrle_counters.overflow++;
1715         return -1;
1716     }
1717
1718     /* Send XBZRLE based compressed page */
1719     bytes_xbzrle = save_page_header(rs, rs->f, block,
1720                                     offset | RAM_SAVE_FLAG_XBZRLE);
1721     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1722     qemu_put_be16(rs->f, encoded_len);
1723     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1724     bytes_xbzrle += encoded_len + 1 + 2;
1725     xbzrle_counters.pages++;
1726     xbzrle_counters.bytes += bytes_xbzrle;
1727     ram_counters.transferred += bytes_xbzrle;
1728
1729     return 1;
1730 }
1731
1732 /**
1733  * migration_bitmap_find_dirty: find the next dirty page from start
1734  *
1735  * Returns the page offset within memory region of the start of a dirty page
1736  *
1737  * @rs: current RAM state
1738  * @rb: RAMBlock where to search for dirty pages
1739  * @start: page where we start the search
1740  */
1741 static inline
1742 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1743                                           unsigned long start)
1744 {
1745     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1746     unsigned long *bitmap = rb->bmap;
1747     unsigned long next;
1748
1749     if (ramblock_is_ignored(rb)) {
1750         return size;
1751     }
1752
1753     /*
1754      * When the free page optimization is enabled, we need to check the bitmap
1755      * to send the non-free pages rather than all the pages in the bulk stage.
1756      */
1757     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1758         next = start + 1;
1759     } else {
1760         next = find_next_bit(bitmap, size, start);
1761     }
1762
1763     return next;
1764 }
1765
1766 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1767                                                 RAMBlock *rb,
1768                                                 unsigned long page)
1769 {
1770     bool ret;
1771
1772     qemu_mutex_lock(&rs->bitmap_mutex);
1773
1774     /*
1775      * Clear dirty bitmap if needed.  This _must_ be called before we
1776      * send any of the page in the chunk because we need to make sure
1777      * we can capture further page content changes when we sync dirty
1778      * log the next time.  So as long as we are going to send any of
1779      * the page in the chunk we clear the remote dirty bitmap for all.
1780      * Clearing it earlier won't be a problem, but too late will.
1781      */
1782     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1783         uint8_t shift = rb->clear_bmap_shift;
1784         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1785         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
1786
1787         /*
1788          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1789          * can make things easier sometimes since then start address
1790          * of the small chunk will always be 64 pages aligned so the
1791          * bitmap will always be aligned to unsigned long.  We should
1792          * even be able to remove this restriction but I'm simply
1793          * keeping it.
1794          */
1795         assert(shift >= 6);
1796         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1797         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1798     }
1799
1800     ret = test_and_clear_bit(page, rb->bmap);
1801
1802     if (ret) {
1803         rs->migration_dirty_pages--;
1804     }
1805     qemu_mutex_unlock(&rs->bitmap_mutex);
1806
1807     return ret;
1808 }
1809
1810 /* Called with RCU critical section */
1811 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1812 {
1813     rs->migration_dirty_pages +=
1814         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1815                                               &rs->num_dirty_pages_period);
1816 }
1817
1818 /**
1819  * ram_pagesize_summary: calculate all the pagesizes of a VM
1820  *
1821  * Returns a summary bitmap of the page sizes of all RAMBlocks
1822  *
1823  * For VMs with just normal pages this is equivalent to the host page
1824  * size. If it's got some huge pages then it's the OR of all the
1825  * different page sizes.
1826  */
1827 uint64_t ram_pagesize_summary(void)
1828 {
1829     RAMBlock *block;
1830     uint64_t summary = 0;
1831
1832     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1833         summary |= block->page_size;
1834     }
1835
1836     return summary;
1837 }
1838
1839 uint64_t ram_get_total_transferred_pages(void)
1840 {
1841     return  ram_counters.normal + ram_counters.duplicate +
1842                 compression_counters.pages + xbzrle_counters.pages;
1843 }
1844
1845 static void migration_update_rates(RAMState *rs, int64_t end_time)
1846 {
1847     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1848     double compressed_size;
1849
1850     /* calculate period counters */
1851     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1852                 / (end_time - rs->time_last_bitmap_sync);
1853
1854     if (!page_count) {
1855         return;
1856     }
1857
1858     if (migrate_use_xbzrle()) {
1859         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1860             rs->xbzrle_cache_miss_prev) / page_count;
1861         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1862     }
1863
1864     if (migrate_use_compression()) {
1865         compression_counters.busy_rate = (double)(compression_counters.busy -
1866             rs->compress_thread_busy_prev) / page_count;
1867         rs->compress_thread_busy_prev = compression_counters.busy;
1868
1869         compressed_size = compression_counters.compressed_size -
1870                           rs->compressed_size_prev;
1871         if (compressed_size) {
1872             double uncompressed_size = (compression_counters.pages -
1873                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1874
1875             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1876             compression_counters.compression_rate =
1877                                         uncompressed_size / compressed_size;
1878
1879             rs->compress_pages_prev = compression_counters.pages;
1880             rs->compressed_size_prev = compression_counters.compressed_size;
1881         }
1882     }
1883 }
1884
1885 static void migration_bitmap_sync(RAMState *rs)
1886 {
1887     RAMBlock *block;
1888     int64_t end_time;
1889     uint64_t bytes_xfer_now;
1890
1891     ram_counters.dirty_sync_count++;
1892
1893     if (!rs->time_last_bitmap_sync) {
1894         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1895     }
1896
1897     trace_migration_bitmap_sync_start();
1898     memory_global_dirty_log_sync();
1899
1900     qemu_mutex_lock(&rs->bitmap_mutex);
1901     WITH_RCU_READ_LOCK_GUARD() {
1902         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1903             ramblock_sync_dirty_bitmap(rs, block);
1904         }
1905         ram_counters.remaining = ram_bytes_remaining();
1906     }
1907     qemu_mutex_unlock(&rs->bitmap_mutex);
1908
1909     memory_global_after_dirty_log_sync();
1910     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1911
1912     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1913
1914     /* more than 1 second = 1000 millisecons */
1915     if (end_time > rs->time_last_bitmap_sync + 1000) {
1916         bytes_xfer_now = ram_counters.transferred;
1917
1918         /* During block migration the auto-converge logic incorrectly detects
1919          * that ram migration makes no progress. Avoid this by disabling the
1920          * throttling logic during the bulk phase of block migration. */
1921         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1922             /* The following detection logic can be refined later. For now:
1923                Check to see if the dirtied bytes is 50% more than the approx.
1924                amount of bytes that just got transferred since the last time we
1925                were in this routine. If that happens twice, start or increase
1926                throttling */
1927
1928             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1929                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1930                 (++rs->dirty_rate_high_cnt >= 2)) {
1931                     trace_migration_throttle();
1932                     rs->dirty_rate_high_cnt = 0;
1933                     mig_throttle_guest_down();
1934             }
1935         }
1936
1937         migration_update_rates(rs, end_time);
1938
1939         rs->target_page_count_prev = rs->target_page_count;
1940
1941         /* reset period counters */
1942         rs->time_last_bitmap_sync = end_time;
1943         rs->num_dirty_pages_period = 0;
1944         rs->bytes_xfer_prev = bytes_xfer_now;
1945     }
1946     if (migrate_use_events()) {
1947         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1948     }
1949 }
1950
1951 static void migration_bitmap_sync_precopy(RAMState *rs)
1952 {
1953     Error *local_err = NULL;
1954
1955     /*
1956      * The current notifier usage is just an optimization to migration, so we
1957      * don't stop the normal migration process in the error case.
1958      */
1959     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1960         error_report_err(local_err);
1961     }
1962
1963     migration_bitmap_sync(rs);
1964
1965     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1966         error_report_err(local_err);
1967     }
1968 }
1969
1970 /**
1971  * save_zero_page_to_file: send the zero page to the file
1972  *
1973  * Returns the size of data written to the file, 0 means the page is not
1974  * a zero page
1975  *
1976  * @rs: current RAM state
1977  * @file: the file where the data is saved
1978  * @block: block that contains the page we want to send
1979  * @offset: offset inside the block for the page
1980  */
1981 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1982                                   RAMBlock *block, ram_addr_t offset)
1983 {
1984     uint8_t *p = block->host + offset;
1985     int len = 0;
1986
1987     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1988         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1989         qemu_put_byte(file, 0);
1990         len += 1;
1991     }
1992     return len;
1993 }
1994
1995 /**
1996  * save_zero_page: send the zero page to the stream
1997  *
1998  * Returns the number of pages written.
1999  *
2000  * @rs: current RAM state
2001  * @block: block that contains the page we want to send
2002  * @offset: offset inside the block for the page
2003  */
2004 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2005 {
2006     int len = save_zero_page_to_file(rs, rs->f, block, offset);
2007
2008     if (len) {
2009         ram_counters.duplicate++;
2010         ram_counters.transferred += len;
2011         return 1;
2012     }
2013     return -1;
2014 }
2015
2016 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
2017 {
2018     if (!migrate_release_ram() || !migration_in_postcopy()) {
2019         return;
2020     }
2021
2022     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
2023 }
2024
2025 /*
2026  * @pages: the number of pages written by the control path,
2027  *        < 0 - error
2028  *        > 0 - number of pages written
2029  *
2030  * Return true if the pages has been saved, otherwise false is returned.
2031  */
2032 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2033                               int *pages)
2034 {
2035     uint64_t bytes_xmit = 0;
2036     int ret;
2037
2038     *pages = -1;
2039     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
2040                                 &bytes_xmit);
2041     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
2042         return false;
2043     }
2044
2045     if (bytes_xmit) {
2046         ram_counters.transferred += bytes_xmit;
2047         *pages = 1;
2048     }
2049
2050     if (ret == RAM_SAVE_CONTROL_DELAYED) {
2051         return true;
2052     }
2053
2054     if (bytes_xmit > 0) {
2055         ram_counters.normal++;
2056     } else if (bytes_xmit == 0) {
2057         ram_counters.duplicate++;
2058     }
2059
2060     return true;
2061 }
2062
2063 /*
2064  * directly send the page to the stream
2065  *
2066  * Returns the number of pages written.
2067  *
2068  * @rs: current RAM state
2069  * @block: block that contains the page we want to send
2070  * @offset: offset inside the block for the page
2071  * @buf: the page to be sent
2072  * @async: send to page asyncly
2073  */
2074 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2075                             uint8_t *buf, bool async)
2076 {
2077     ram_counters.transferred += save_page_header(rs, rs->f, block,
2078                                                  offset | RAM_SAVE_FLAG_PAGE);
2079     if (async) {
2080         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2081                               migrate_release_ram() &
2082                               migration_in_postcopy());
2083     } else {
2084         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2085     }
2086     ram_counters.transferred += TARGET_PAGE_SIZE;
2087     ram_counters.normal++;
2088     return 1;
2089 }
2090
2091 /**
2092  * ram_save_page: send the given page to the stream
2093  *
2094  * Returns the number of pages written.
2095  *          < 0 - error
2096  *          >=0 - Number of pages written - this might legally be 0
2097  *                if xbzrle noticed the page was the same.
2098  *
2099  * @rs: current RAM state
2100  * @block: block that contains the page we want to send
2101  * @offset: offset inside the block for the page
2102  * @last_stage: if we are at the completion stage
2103  */
2104 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2105 {
2106     int pages = -1;
2107     uint8_t *p;
2108     bool send_async = true;
2109     RAMBlock *block = pss->block;
2110     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2111     ram_addr_t current_addr = block->offset + offset;
2112
2113     p = block->host + offset;
2114     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2115
2116     XBZRLE_cache_lock();
2117     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2118         migrate_use_xbzrle()) {
2119         pages = save_xbzrle_page(rs, &p, current_addr, block,
2120                                  offset, last_stage);
2121         if (!last_stage) {
2122             /* Can't send this cached data async, since the cache page
2123              * might get updated before it gets to the wire
2124              */
2125             send_async = false;
2126         }
2127     }
2128
2129     /* XBZRLE overflow or normal page */
2130     if (pages == -1) {
2131         pages = save_normal_page(rs, block, offset, p, send_async);
2132     }
2133
2134     XBZRLE_cache_unlock();
2135
2136     return pages;
2137 }
2138
2139 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2140                                  ram_addr_t offset)
2141 {
2142     if (multifd_queue_page(rs->f, block, offset) < 0) {
2143         return -1;
2144     }
2145     ram_counters.normal++;
2146
2147     return 1;
2148 }
2149
2150 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2151                                  ram_addr_t offset, uint8_t *source_buf)
2152 {
2153     RAMState *rs = ram_state;
2154     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2155     bool zero_page = false;
2156     int ret;
2157
2158     if (save_zero_page_to_file(rs, f, block, offset)) {
2159         zero_page = true;
2160         goto exit;
2161     }
2162
2163     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2164
2165     /*
2166      * copy it to a internal buffer to avoid it being modified by VM
2167      * so that we can catch up the error during compression and
2168      * decompression
2169      */
2170     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2171     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2172     if (ret < 0) {
2173         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2174         error_report("compressed data failed!");
2175         return false;
2176     }
2177
2178 exit:
2179     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2180     return zero_page;
2181 }
2182
2183 static void
2184 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2185 {
2186     ram_counters.transferred += bytes_xmit;
2187
2188     if (param->zero_page) {
2189         ram_counters.duplicate++;
2190         return;
2191     }
2192
2193     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2194     compression_counters.compressed_size += bytes_xmit - 8;
2195     compression_counters.pages++;
2196 }
2197
2198 static bool save_page_use_compression(RAMState *rs);
2199
2200 static void flush_compressed_data(RAMState *rs)
2201 {
2202     int idx, len, thread_count;
2203
2204     if (!save_page_use_compression(rs)) {
2205         return;
2206     }
2207     thread_count = migrate_compress_threads();
2208
2209     qemu_mutex_lock(&comp_done_lock);
2210     for (idx = 0; idx < thread_count; idx++) {
2211         while (!comp_param[idx].done) {
2212             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2213         }
2214     }
2215     qemu_mutex_unlock(&comp_done_lock);
2216
2217     for (idx = 0; idx < thread_count; idx++) {
2218         qemu_mutex_lock(&comp_param[idx].mutex);
2219         if (!comp_param[idx].quit) {
2220             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2221             /*
2222              * it's safe to fetch zero_page without holding comp_done_lock
2223              * as there is no further request submitted to the thread,
2224              * i.e, the thread should be waiting for a request at this point.
2225              */
2226             update_compress_thread_counts(&comp_param[idx], len);
2227         }
2228         qemu_mutex_unlock(&comp_param[idx].mutex);
2229     }
2230 }
2231
2232 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2233                                        ram_addr_t offset)
2234 {
2235     param->block = block;
2236     param->offset = offset;
2237 }
2238
2239 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2240                                            ram_addr_t offset)
2241 {
2242     int idx, thread_count, bytes_xmit = -1, pages = -1;
2243     bool wait = migrate_compress_wait_thread();
2244
2245     thread_count = migrate_compress_threads();
2246     qemu_mutex_lock(&comp_done_lock);
2247 retry:
2248     for (idx = 0; idx < thread_count; idx++) {
2249         if (comp_param[idx].done) {
2250             comp_param[idx].done = false;
2251             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2252             qemu_mutex_lock(&comp_param[idx].mutex);
2253             set_compress_params(&comp_param[idx], block, offset);
2254             qemu_cond_signal(&comp_param[idx].cond);
2255             qemu_mutex_unlock(&comp_param[idx].mutex);
2256             pages = 1;
2257             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2258             break;
2259         }
2260     }
2261
2262     /*
2263      * wait for the free thread if the user specifies 'compress-wait-thread',
2264      * otherwise we will post the page out in the main thread as normal page.
2265      */
2266     if (pages < 0 && wait) {
2267         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2268         goto retry;
2269     }
2270     qemu_mutex_unlock(&comp_done_lock);
2271
2272     return pages;
2273 }
2274
2275 /**
2276  * find_dirty_block: find the next dirty page and update any state
2277  * associated with the search process.
2278  *
2279  * Returns true if a page is found
2280  *
2281  * @rs: current RAM state
2282  * @pss: data about the state of the current dirty page scan
2283  * @again: set to false if the search has scanned the whole of RAM
2284  */
2285 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2286 {
2287     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2288     if (pss->complete_round && pss->block == rs->last_seen_block &&
2289         pss->page >= rs->last_page) {
2290         /*
2291          * We've been once around the RAM and haven't found anything.
2292          * Give up.
2293          */
2294         *again = false;
2295         return false;
2296     }
2297     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
2298         >= pss->block->used_length) {
2299         /* Didn't find anything in this RAM Block */
2300         pss->page = 0;
2301         pss->block = QLIST_NEXT_RCU(pss->block, next);
2302         if (!pss->block) {
2303             /*
2304              * If memory migration starts over, we will meet a dirtied page
2305              * which may still exists in compression threads's ring, so we
2306              * should flush the compressed data to make sure the new page
2307              * is not overwritten by the old one in the destination.
2308              *
2309              * Also If xbzrle is on, stop using the data compression at this
2310              * point. In theory, xbzrle can do better than compression.
2311              */
2312             flush_compressed_data(rs);
2313
2314             /* Hit the end of the list */
2315             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2316             /* Flag that we've looped */
2317             pss->complete_round = true;
2318             rs->ram_bulk_stage = false;
2319         }
2320         /* Didn't find anything this time, but try again on the new block */
2321         *again = true;
2322         return false;
2323     } else {
2324         /* Can go around again, but... */
2325         *again = true;
2326         /* We've found something so probably don't need to */
2327         return true;
2328     }
2329 }
2330
2331 /**
2332  * unqueue_page: gets a page of the queue
2333  *
2334  * Helper for 'get_queued_page' - gets a page off the queue
2335  *
2336  * Returns the block of the page (or NULL if none available)
2337  *
2338  * @rs: current RAM state
2339  * @offset: used to return the offset within the RAMBlock
2340  */
2341 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2342 {
2343     RAMBlock *block = NULL;
2344
2345     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2346         return NULL;
2347     }
2348
2349     qemu_mutex_lock(&rs->src_page_req_mutex);
2350     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2351         struct RAMSrcPageRequest *entry =
2352                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2353         block = entry->rb;
2354         *offset = entry->offset;
2355
2356         if (entry->len > TARGET_PAGE_SIZE) {
2357             entry->len -= TARGET_PAGE_SIZE;
2358             entry->offset += TARGET_PAGE_SIZE;
2359         } else {
2360             memory_region_unref(block->mr);
2361             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2362             g_free(entry);
2363             migration_consume_urgent_request();
2364         }
2365     }
2366     qemu_mutex_unlock(&rs->src_page_req_mutex);
2367
2368     return block;
2369 }
2370
2371 /**
2372  * get_queued_page: unqueue a page from the postcopy requests
2373  *
2374  * Skips pages that are already sent (!dirty)
2375  *
2376  * Returns true if a queued page is found
2377  *
2378  * @rs: current RAM state
2379  * @pss: data about the state of the current dirty page scan
2380  */
2381 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2382 {
2383     RAMBlock  *block;
2384     ram_addr_t offset;
2385     bool dirty;
2386
2387     do {
2388         block = unqueue_page(rs, &offset);
2389         /*
2390          * We're sending this page, and since it's postcopy nothing else
2391          * will dirty it, and we must make sure it doesn't get sent again
2392          * even if this queue request was received after the background
2393          * search already sent it.
2394          */
2395         if (block) {
2396             unsigned long page;
2397
2398             page = offset >> TARGET_PAGE_BITS;
2399             dirty = test_bit(page, block->bmap);
2400             if (!dirty) {
2401                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2402                                                 page);
2403             } else {
2404                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2405             }
2406         }
2407
2408     } while (block && !dirty);
2409
2410     if (block) {
2411         /*
2412          * As soon as we start servicing pages out of order, then we have
2413          * to kill the bulk stage, since the bulk stage assumes
2414          * in (migration_bitmap_find_and_reset_dirty) that every page is
2415          * dirty, that's no longer true.
2416          */
2417         rs->ram_bulk_stage = false;
2418
2419         /*
2420          * We want the background search to continue from the queued page
2421          * since the guest is likely to want other pages near to the page
2422          * it just requested.
2423          */
2424         pss->block = block;
2425         pss->page = offset >> TARGET_PAGE_BITS;
2426
2427         /*
2428          * This unqueued page would break the "one round" check, even is
2429          * really rare.
2430          */
2431         pss->complete_round = false;
2432     }
2433
2434     return !!block;
2435 }
2436
2437 /**
2438  * migration_page_queue_free: drop any remaining pages in the ram
2439  * request queue
2440  *
2441  * It should be empty at the end anyway, but in error cases there may
2442  * be some left.  in case that there is any page left, we drop it.
2443  *
2444  */
2445 static void migration_page_queue_free(RAMState *rs)
2446 {
2447     struct RAMSrcPageRequest *mspr, *next_mspr;
2448     /* This queue generally should be empty - but in the case of a failed
2449      * migration might have some droppings in.
2450      */
2451     RCU_READ_LOCK_GUARD();
2452     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2453         memory_region_unref(mspr->rb->mr);
2454         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2455         g_free(mspr);
2456     }
2457 }
2458
2459 /**
2460  * ram_save_queue_pages: queue the page for transmission
2461  *
2462  * A request from postcopy destination for example.
2463  *
2464  * Returns zero on success or negative on error
2465  *
2466  * @rbname: Name of the RAMBLock of the request. NULL means the
2467  *          same that last one.
2468  * @start: starting address from the start of the RAMBlock
2469  * @len: length (in bytes) to send
2470  */
2471 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2472 {
2473     RAMBlock *ramblock;
2474     RAMState *rs = ram_state;
2475
2476     ram_counters.postcopy_requests++;
2477     RCU_READ_LOCK_GUARD();
2478
2479     if (!rbname) {
2480         /* Reuse last RAMBlock */
2481         ramblock = rs->last_req_rb;
2482
2483         if (!ramblock) {
2484             /*
2485              * Shouldn't happen, we can't reuse the last RAMBlock if
2486              * it's the 1st request.
2487              */
2488             error_report("ram_save_queue_pages no previous block");
2489             return -1;
2490         }
2491     } else {
2492         ramblock = qemu_ram_block_by_name(rbname);
2493
2494         if (!ramblock) {
2495             /* We shouldn't be asked for a non-existent RAMBlock */
2496             error_report("ram_save_queue_pages no block '%s'", rbname);
2497             return -1;
2498         }
2499         rs->last_req_rb = ramblock;
2500     }
2501     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2502     if (start+len > ramblock->used_length) {
2503         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2504                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2505                      __func__, start, len, ramblock->used_length);
2506         return -1;
2507     }
2508
2509     struct RAMSrcPageRequest *new_entry =
2510         g_malloc0(sizeof(struct RAMSrcPageRequest));
2511     new_entry->rb = ramblock;
2512     new_entry->offset = start;
2513     new_entry->len = len;
2514
2515     memory_region_ref(ramblock->mr);
2516     qemu_mutex_lock(&rs->src_page_req_mutex);
2517     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2518     migration_make_urgent_request();
2519     qemu_mutex_unlock(&rs->src_page_req_mutex);
2520
2521     return 0;
2522 }
2523
2524 static bool save_page_use_compression(RAMState *rs)
2525 {
2526     if (!migrate_use_compression()) {
2527         return false;
2528     }
2529
2530     /*
2531      * If xbzrle is on, stop using the data compression after first
2532      * round of migration even if compression is enabled. In theory,
2533      * xbzrle can do better than compression.
2534      */
2535     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2536         return true;
2537     }
2538
2539     return false;
2540 }
2541
2542 /*
2543  * try to compress the page before posting it out, return true if the page
2544  * has been properly handled by compression, otherwise needs other
2545  * paths to handle it
2546  */
2547 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2548 {
2549     if (!save_page_use_compression(rs)) {
2550         return false;
2551     }
2552
2553     /*
2554      * When starting the process of a new block, the first page of
2555      * the block should be sent out before other pages in the same
2556      * block, and all the pages in last block should have been sent
2557      * out, keeping this order is important, because the 'cont' flag
2558      * is used to avoid resending the block name.
2559      *
2560      * We post the fist page as normal page as compression will take
2561      * much CPU resource.
2562      */
2563     if (block != rs->last_sent_block) {
2564         flush_compressed_data(rs);
2565         return false;
2566     }
2567
2568     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2569         return true;
2570     }
2571
2572     compression_counters.busy++;
2573     return false;
2574 }
2575
2576 /**
2577  * ram_save_target_page: save one target page
2578  *
2579  * Returns the number of pages written
2580  *
2581  * @rs: current RAM state
2582  * @pss: data about the page we want to send
2583  * @last_stage: if we are at the completion stage
2584  */
2585 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2586                                 bool last_stage)
2587 {
2588     RAMBlock *block = pss->block;
2589     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2590     int res;
2591
2592     if (control_save_page(rs, block, offset, &res)) {
2593         return res;
2594     }
2595
2596     if (save_compress_page(rs, block, offset)) {
2597         return 1;
2598     }
2599
2600     res = save_zero_page(rs, block, offset);
2601     if (res > 0) {
2602         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2603          * page would be stale
2604          */
2605         if (!save_page_use_compression(rs)) {
2606             XBZRLE_cache_lock();
2607             xbzrle_cache_zero_page(rs, block->offset + offset);
2608             XBZRLE_cache_unlock();
2609         }
2610         ram_release_pages(block->idstr, offset, res);
2611         return res;
2612     }
2613
2614     /*
2615      * Do not use multifd for:
2616      * 1. Compression as the first page in the new block should be posted out
2617      *    before sending the compressed page
2618      * 2. In postcopy as one whole host page should be placed
2619      */
2620     if (!save_page_use_compression(rs) && migrate_use_multifd()
2621         && !migration_in_postcopy()) {
2622         return ram_save_multifd_page(rs, block, offset);
2623     }
2624
2625     return ram_save_page(rs, pss, last_stage);
2626 }
2627
2628 /**
2629  * ram_save_host_page: save a whole host page
2630  *
2631  * Starting at *offset send pages up to the end of the current host
2632  * page. It's valid for the initial offset to point into the middle of
2633  * a host page in which case the remainder of the hostpage is sent.
2634  * Only dirty target pages are sent. Note that the host page size may
2635  * be a huge page for this block.
2636  * The saving stops at the boundary of the used_length of the block
2637  * if the RAMBlock isn't a multiple of the host page size.
2638  *
2639  * Returns the number of pages written or negative on error
2640  *
2641  * @rs: current RAM state
2642  * @ms: current migration state
2643  * @pss: data about the page we want to send
2644  * @last_stage: if we are at the completion stage
2645  */
2646 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2647                               bool last_stage)
2648 {
2649     int tmppages, pages = 0;
2650     size_t pagesize_bits =
2651         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2652
2653     if (ramblock_is_ignored(pss->block)) {
2654         error_report("block %s should not be migrated !", pss->block->idstr);
2655         return 0;
2656     }
2657
2658     do {
2659         /* Check the pages is dirty and if it is send it */
2660         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2661             pss->page++;
2662             continue;
2663         }
2664
2665         tmppages = ram_save_target_page(rs, pss, last_stage);
2666         if (tmppages < 0) {
2667             return tmppages;
2668         }
2669
2670         pages += tmppages;
2671         pss->page++;
2672         /* Allow rate limiting to happen in the middle of huge pages */
2673         migration_rate_limit();
2674     } while ((pss->page & (pagesize_bits - 1)) &&
2675              offset_in_ramblock(pss->block,
2676                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2677
2678     /* The offset we leave with is the last one we looked at */
2679     pss->page--;
2680     return pages;
2681 }
2682
2683 /**
2684  * ram_find_and_save_block: finds a dirty page and sends it to f
2685  *
2686  * Called within an RCU critical section.
2687  *
2688  * Returns the number of pages written where zero means no dirty pages,
2689  * or negative on error
2690  *
2691  * @rs: current RAM state
2692  * @last_stage: if we are at the completion stage
2693  *
2694  * On systems where host-page-size > target-page-size it will send all the
2695  * pages in a host page that are dirty.
2696  */
2697
2698 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2699 {
2700     PageSearchStatus pss;
2701     int pages = 0;
2702     bool again, found;
2703
2704     /* No dirty page as there is zero RAM */
2705     if (!ram_bytes_total()) {
2706         return pages;
2707     }
2708
2709     pss.block = rs->last_seen_block;
2710     pss.page = rs->last_page;
2711     pss.complete_round = false;
2712
2713     if (!pss.block) {
2714         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2715     }
2716
2717     do {
2718         again = true;
2719         found = get_queued_page(rs, &pss);
2720
2721         if (!found) {
2722             /* priority queue empty, so just search for something dirty */
2723             found = find_dirty_block(rs, &pss, &again);
2724         }
2725
2726         if (found) {
2727             pages = ram_save_host_page(rs, &pss, last_stage);
2728         }
2729     } while (!pages && again);
2730
2731     rs->last_seen_block = pss.block;
2732     rs->last_page = pss.page;
2733
2734     return pages;
2735 }
2736
2737 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2738 {
2739     uint64_t pages = size / TARGET_PAGE_SIZE;
2740
2741     if (zero) {
2742         ram_counters.duplicate += pages;
2743     } else {
2744         ram_counters.normal += pages;
2745         ram_counters.transferred += size;
2746         qemu_update_position(f, size);
2747     }
2748 }
2749
2750 static uint64_t ram_bytes_total_common(bool count_ignored)
2751 {
2752     RAMBlock *block;
2753     uint64_t total = 0;
2754
2755     RCU_READ_LOCK_GUARD();
2756
2757     if (count_ignored) {
2758         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2759             total += block->used_length;
2760         }
2761     } else {
2762         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2763             total += block->used_length;
2764         }
2765     }
2766     return total;
2767 }
2768
/*
 * ram_bytes_total: total used_length, in bytes, of the RAM blocks walked
 * by ram_bytes_total_common() when ignored blocks are not counted.
 */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
2773
2774 static void xbzrle_load_setup(void)
2775 {
2776     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2777 }
2778
2779 static void xbzrle_load_cleanup(void)
2780 {
2781     g_free(XBZRLE.decoded_buf);
2782     XBZRLE.decoded_buf = NULL;
2783 }
2784
2785 static void ram_state_cleanup(RAMState **rsp)
2786 {
2787     if (*rsp) {
2788         migration_page_queue_free(*rsp);
2789         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2790         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2791         g_free(*rsp);
2792         *rsp = NULL;
2793     }
2794 }
2795
2796 static void xbzrle_cleanup(void)
2797 {
2798     XBZRLE_cache_lock();
2799     if (XBZRLE.cache) {
2800         cache_fini(XBZRLE.cache);
2801         g_free(XBZRLE.encoded_buf);
2802         g_free(XBZRLE.current_buf);
2803         g_free(XBZRLE.zero_target_page);
2804         XBZRLE.cache = NULL;
2805         XBZRLE.encoded_buf = NULL;
2806         XBZRLE.current_buf = NULL;
2807         XBZRLE.zero_target_page = NULL;
2808     }
2809     XBZRLE_cache_unlock();
2810 }
2811
2812 static void ram_save_cleanup(void *opaque)
2813 {
2814     RAMState **rsp = opaque;
2815     RAMBlock *block;
2816
2817     /* caller have hold iothread lock or is in a bh, so there is
2818      * no writing race against the migration bitmap
2819      */
2820     memory_global_dirty_log_stop();
2821
2822     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2823         g_free(block->clear_bmap);
2824         block->clear_bmap = NULL;
2825         g_free(block->bmap);
2826         block->bmap = NULL;
2827     }
2828
2829     xbzrle_cleanup();
2830     compress_threads_save_cleanup();
2831     ram_state_cleanup(rsp);
2832 }
2833
2834 static void ram_state_reset(RAMState *rs)
2835 {
2836     rs->last_seen_block = NULL;
2837     rs->last_sent_block = NULL;
2838     rs->last_page = 0;
2839     rs->last_version = ram_list.version;
2840     rs->ram_bulk_stage = true;
2841     rs->fpo_enabled = false;
2842 }
2843
/*
 * Time budget for one ram_save_iterate() call: once the elapsed time it
 * samples exceeds this many milliseconds, the send loop is left.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
2845
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, up to 128 bits per line
 *
 * Every printed line starts with the index of its first bit in hex,
 * followed by one character per bit: '1' for a set bit, '.' for a clear
 * one.
 *
 * @todump: bitmap to dump.  Nothing is printed when it is NULL (there is
 *          no default bitmap to fall back to in this function).
 * @expected: the value you expect the bitmap mostly to be full of; lines
 *            that consist only of this value are not printed
 * @pages: number of bits of @todump to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    /* Unsigned indices: they are compared against the unsigned @pages */
    unsigned long cur;
    unsigned long linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Avoid dereferencing a NULL bitmap in test_bit() below */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (linelen > pages - cur) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so this stays inside linebuf */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2879
2880 /* **** functions for postcopy ***** */
2881
2882 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2883 {
2884     struct RAMBlock *block;
2885
2886     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2887         unsigned long *bitmap = block->bmap;
2888         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2889         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2890
2891         while (run_start < range) {
2892             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2893             ram_discard_range(block->idstr,
2894                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2895                               ((ram_addr_t)(run_end - run_start))
2896                                 << TARGET_PAGE_BITS);
2897             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2898         }
2899     }
2900 }
2901
2902 /**
2903  * postcopy_send_discard_bm_ram: discard a RAMBlock
2904  *
2905  * Returns zero on success
2906  *
2907  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2908  *
2909  * @ms: current migration state
2910  * @block: RAMBlock to discard
2911  */
2912 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2913 {
2914     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2915     unsigned long current;
2916     unsigned long *bitmap = block->bmap;
2917
2918     for (current = 0; current < end; ) {
2919         unsigned long one = find_next_bit(bitmap, end, current);
2920         unsigned long zero, discard_length;
2921
2922         if (one >= end) {
2923             break;
2924         }
2925
2926         zero = find_next_zero_bit(bitmap, end, one + 1);
2927
2928         if (zero >= end) {
2929             discard_length = end - one;
2930         } else {
2931             discard_length = zero - one;
2932         }
2933         postcopy_discard_send_range(ms, one, discard_length);
2934         current = one + discard_length;
2935     }
2936
2937     return 0;
2938 }
2939
2940 /**
2941  * postcopy_each_ram_send_discard: discard all RAMBlocks
2942  *
2943  * Returns 0 for success or negative for error
2944  *
2945  * Utility for the outgoing postcopy code.
2946  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2947  *   passing it bitmap indexes and name.
2948  * (qemu_ram_foreach_block ends up passing unscaled lengths
2949  *  which would mean postcopy code would have to deal with target page)
2950  *
2951  * @ms: current migration state
2952  */
2953 static int postcopy_each_ram_send_discard(MigrationState *ms)
2954 {
2955     struct RAMBlock *block;
2956     int ret;
2957
2958     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2959         postcopy_discard_send_init(ms, block->idstr);
2960
2961         /*
2962          * Postcopy sends chunks of bitmap over the wire, but it
2963          * just needs indexes at this point, avoids it having
2964          * target page specific code.
2965          */
2966         ret = postcopy_send_discard_bm_ram(ms, block);
2967         postcopy_discard_send_finish(ms);
2968         if (ret) {
2969             return ret;
2970         }
2971     }
2972
2973     return 0;
2974 }
2975
2976 /**
2977  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2978  *
2979  * Helper for postcopy_chunk_hostpages; it's called twice to
2980  * canonicalize the two bitmaps, that are similar, but one is
2981  * inverted.
2982  *
2983  * Postcopy requires that all target pages in a hostpage are dirty or
2984  * clean, not a mix.  This function canonicalizes the bitmaps.
2985  *
2986  * @ms: current migration state
2987  * @block: block that contains the page we want to canonicalize
2988  */
2989 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2990 {
2991     RAMState *rs = ram_state;
2992     unsigned long *bitmap = block->bmap;
2993     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2994     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2995     unsigned long run_start;
2996
2997     if (block->page_size == TARGET_PAGE_SIZE) {
2998         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2999         return;
3000     }
3001
3002     /* Find a dirty page */
3003     run_start = find_next_bit(bitmap, pages, 0);
3004
3005     while (run_start < pages) {
3006
3007         /*
3008          * If the start of this run of pages is in the middle of a host
3009          * page, then we need to fixup this host page.
3010          */
3011         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
3012             /* Find the end of this run */
3013             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
3014             /*
3015              * If the end isn't at the start of a host page, then the
3016              * run doesn't finish at the end of a host page
3017              * and we need to discard.
3018              */
3019         }
3020
3021         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
3022             unsigned long page;
3023             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
3024                                                              host_ratio);
3025             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
3026
3027             /* Clean up the bitmap */
3028             for (page = fixup_start_addr;
3029                  page < fixup_start_addr + host_ratio; page++) {
3030                 /*
3031                  * Remark them as dirty, updating the count for any pages
3032                  * that weren't previously dirty.
3033                  */
3034                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
3035             }
3036         }
3037
3038         /* Find the next dirty page for the next iteration */
3039         run_start = find_next_bit(bitmap, pages, run_start);
3040     }
3041 }
3042
3043 /**
3044  * postcopy_chunk_hostpages: discard any partially sent host page
3045  *
3046  * Utility for the outgoing postcopy code.
3047  *
3048  * Discard any partially sent host-page size chunks, mark any partially
3049  * dirty host-page size chunks as all dirty.  In this case the host-page
3050  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3051  *
3052  * Returns zero on success
3053  *
3054  * @ms: current migration state
3055  * @block: block we want to work with
3056  */
3057 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3058 {
3059     postcopy_discard_send_init(ms, block->idstr);
3060
3061     /*
3062      * Ensure that all partially dirty host pages are made fully dirty.
3063      */
3064     postcopy_chunk_hostpages_pass(ms, block);
3065
3066     postcopy_discard_send_finish(ms);
3067     return 0;
3068 }
3069
3070 /**
3071  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3072  *
3073  * Returns zero on success
3074  *
3075  * Transmit the set of pages to be discarded after precopy to the target
3076  * these are pages that:
3077  *     a) Have been previously transmitted but are now dirty again
3078  *     b) Pages that have never been transmitted, this ensures that
3079  *        any pages on the destination that have been mapped by background
3080  *        tasks get discarded (transparent huge pages is the specific concern)
3081  * Hopefully this is pretty sparse
3082  *
3083  * @ms: current migration state
3084  */
3085 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3086 {
3087     RAMState *rs = ram_state;
3088     RAMBlock *block;
3089     int ret;
3090
3091     RCU_READ_LOCK_GUARD();
3092
3093     /* This should be our last sync, the src is now paused */
3094     migration_bitmap_sync(rs);
3095
3096     /* Easiest way to make sure we don't resume in the middle of a host-page */
3097     rs->last_seen_block = NULL;
3098     rs->last_sent_block = NULL;
3099     rs->last_page = 0;
3100
3101     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3102         /* Deal with TPS != HPS and huge pages */
3103         ret = postcopy_chunk_hostpages(ms, block);
3104         if (ret) {
3105             return ret;
3106         }
3107
3108 #ifdef DEBUG_POSTCOPY
3109         ram_debug_dump_bitmap(block->bmap, true,
3110                               block->used_length >> TARGET_PAGE_BITS);
3111 #endif
3112     }
3113     trace_ram_postcopy_send_discard_bitmap();
3114
3115     ret = postcopy_each_ram_send_discard(ms);
3116
3117     return ret;
3118 }
3119
3120 /**
3121  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3122  *
3123  * Returns zero on success
3124  *
3125  * @rbname: name of the RAMBlock of the request. NULL means the
3126  *          same that last one.
3127  * @start: RAMBlock starting page
3128  * @length: RAMBlock size
3129  */
3130 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3131 {
3132     trace_ram_discard_range(rbname, start, length);
3133
3134     RCU_READ_LOCK_GUARD();
3135     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3136
3137     if (!rb) {
3138         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3139         return -1;
3140     }
3141
3142     /*
3143      * On source VM, we don't need to update the received bitmap since
3144      * we don't even have one.
3145      */
3146     if (rb->receivedmap) {
3147         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3148                      length >> qemu_target_page_bits());
3149     }
3150
3151     return ram_block_discard_range(rb, start, length);
3152 }
3153
3154 /*
3155  * For every allocation, we will try not to crash the VM if the
3156  * allocation failed.
3157  */
3158 static int xbzrle_init(void)
3159 {
3160     Error *local_err = NULL;
3161
3162     if (!migrate_use_xbzrle()) {
3163         return 0;
3164     }
3165
3166     XBZRLE_cache_lock();
3167
3168     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3169     if (!XBZRLE.zero_target_page) {
3170         error_report("%s: Error allocating zero page", __func__);
3171         goto err_out;
3172     }
3173
3174     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3175                               TARGET_PAGE_SIZE, &local_err);
3176     if (!XBZRLE.cache) {
3177         error_report_err(local_err);
3178         goto free_zero_page;
3179     }
3180
3181     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3182     if (!XBZRLE.encoded_buf) {
3183         error_report("%s: Error allocating encoded_buf", __func__);
3184         goto free_cache;
3185     }
3186
3187     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3188     if (!XBZRLE.current_buf) {
3189         error_report("%s: Error allocating current_buf", __func__);
3190         goto free_encoded_buf;
3191     }
3192
3193     /* We are all good */
3194     XBZRLE_cache_unlock();
3195     return 0;
3196
3197 free_encoded_buf:
3198     g_free(XBZRLE.encoded_buf);
3199     XBZRLE.encoded_buf = NULL;
3200 free_cache:
3201     cache_fini(XBZRLE.cache);
3202     XBZRLE.cache = NULL;
3203 free_zero_page:
3204     g_free(XBZRLE.zero_target_page);
3205     XBZRLE.zero_target_page = NULL;
3206 err_out:
3207     XBZRLE_cache_unlock();
3208     return -ENOMEM;
3209 }
3210
3211 static int ram_state_init(RAMState **rsp)
3212 {
3213     *rsp = g_try_new0(RAMState, 1);
3214
3215     if (!*rsp) {
3216         error_report("%s: Init ramstate fail", __func__);
3217         return -1;
3218     }
3219
3220     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3221     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3222     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3223
3224     /*
3225      * Count the total number of pages used by ram blocks not including any
3226      * gaps due to alignment or unplugs.
3227      * This must match with the initial values of dirty bitmap.
3228      */
3229     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3230     ram_state_reset(*rsp);
3231
3232     return 0;
3233 }
3234
3235 static void ram_list_init_bitmaps(void)
3236 {
3237     MigrationState *ms = migrate_get_current();
3238     RAMBlock *block;
3239     unsigned long pages;
3240     uint8_t shift;
3241
3242     /* Skip setting bitmap if there is no RAM */
3243     if (ram_bytes_total()) {
3244         shift = ms->clear_bitmap_shift;
3245         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3246             error_report("clear_bitmap_shift (%u) too big, using "
3247                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3248             shift = CLEAR_BITMAP_SHIFT_MAX;
3249         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3250             error_report("clear_bitmap_shift (%u) too small, using "
3251                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3252             shift = CLEAR_BITMAP_SHIFT_MIN;
3253         }
3254
3255         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3256             pages = block->max_length >> TARGET_PAGE_BITS;
3257             /*
3258              * The initial dirty bitmap for migration must be set with all
3259              * ones to make sure we'll migrate every guest RAM page to
3260              * destination.
3261              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3262              * new migration after a failed migration, ram_list.
3263              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3264              * guest memory.
3265              */
3266             block->bmap = bitmap_new(pages);
3267             bitmap_set(block->bmap, 0, pages);
3268             block->clear_bmap_shift = shift;
3269             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3270         }
3271     }
3272 }
3273
3274 static void ram_init_bitmaps(RAMState *rs)
3275 {
3276     /* For memory_global_dirty_log_start below.  */
3277     qemu_mutex_lock_iothread();
3278     qemu_mutex_lock_ramlist();
3279
3280     WITH_RCU_READ_LOCK_GUARD() {
3281         ram_list_init_bitmaps();
3282         memory_global_dirty_log_start();
3283         migration_bitmap_sync_precopy(rs);
3284     }
3285     qemu_mutex_unlock_ramlist();
3286     qemu_mutex_unlock_iothread();
3287 }
3288
3289 static int ram_init_all(RAMState **rsp)
3290 {
3291     if (ram_state_init(rsp)) {
3292         return -1;
3293     }
3294
3295     if (xbzrle_init()) {
3296         ram_state_cleanup(rsp);
3297         return -1;
3298     }
3299
3300     ram_init_bitmaps(*rsp);
3301
3302     return 0;
3303 }
3304
3305 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3306 {
3307     RAMBlock *block;
3308     uint64_t pages = 0;
3309
3310     /*
3311      * Postcopy is not using xbzrle/compression, so no need for that.
3312      * Also, since source are already halted, we don't need to care
3313      * about dirty page logging as well.
3314      */
3315
3316     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3317         pages += bitmap_count_one(block->bmap,
3318                                   block->used_length >> TARGET_PAGE_BITS);
3319     }
3320
3321     /* This may not be aligned with current bitmaps. Recalculate. */
3322     rs->migration_dirty_pages = pages;
3323
3324     rs->last_seen_block = NULL;
3325     rs->last_sent_block = NULL;
3326     rs->last_page = 0;
3327     rs->last_version = ram_list.version;
3328     /*
3329      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3330      * matter what we have sent.
3331      */
3332     rs->ram_bulk_stage = false;
3333
3334     /* Update RAMState cache of output QEMUFile */
3335     rs->f = out;
3336
3337     trace_ram_state_resume_prepare(pages);
3338 }
3339
3340 /*
3341  * This function clears bits of the free pages reported by the caller from the
3342  * migration dirty bitmap. @addr is the host address corresponding to the
3343  * start of the continuous guest free pages, and @len is the total bytes of
3344  * those pages.
3345  */
3346 void qemu_guest_free_page_hint(void *addr, size_t len)
3347 {
3348     RAMBlock *block;
3349     ram_addr_t offset;
3350     size_t used_len, start, npages;
3351     MigrationState *s = migrate_get_current();
3352
3353     /* This function is currently expected to be used during live migration */
3354     if (!migration_is_setup_or_active(s->state)) {
3355         return;
3356     }
3357
3358     for (; len > 0; len -= used_len, addr += used_len) {
3359         block = qemu_ram_block_from_host(addr, false, &offset);
3360         if (unlikely(!block || offset >= block->used_length)) {
3361             /*
3362              * The implementation might not support RAMBlock resize during
3363              * live migration, but it could happen in theory with future
3364              * updates. So we add a check here to capture that case.
3365              */
3366             error_report_once("%s unexpected error", __func__);
3367             return;
3368         }
3369
3370         if (len <= block->used_length - offset) {
3371             used_len = len;
3372         } else {
3373             used_len = block->used_length - offset;
3374         }
3375
3376         start = offset >> TARGET_PAGE_BITS;
3377         npages = used_len >> TARGET_PAGE_BITS;
3378
3379         qemu_mutex_lock(&ram_state->bitmap_mutex);
3380         ram_state->migration_dirty_pages -=
3381                       bitmap_count_one_with_offset(block->bmap, start, npages);
3382         bitmap_clear(block->bmap, start, npages);
3383         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3384     }
3385 }
3386
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3393
3394 /**
3395  * ram_save_setup: Setup RAM for migration
3396  *
3397  * Returns zero to indicate success and negative for error
3398  *
3399  * @f: QEMUFile where to send the data
3400  * @opaque: RAMState pointer
3401  */
3402 static int ram_save_setup(QEMUFile *f, void *opaque)
3403 {
3404     RAMState **rsp = opaque;
3405     RAMBlock *block;
3406
3407     if (compress_threads_save_setup()) {
3408         return -1;
3409     }
3410
3411     /* migration has already setup the bitmap, reuse it. */
3412     if (!migration_in_colo_state()) {
3413         if (ram_init_all(rsp) != 0) {
3414             compress_threads_save_cleanup();
3415             return -1;
3416         }
3417     }
3418     (*rsp)->f = f;
3419
3420     WITH_RCU_READ_LOCK_GUARD() {
3421         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3422
3423         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3424             qemu_put_byte(f, strlen(block->idstr));
3425             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3426             qemu_put_be64(f, block->used_length);
3427             if (migrate_postcopy_ram() && block->page_size !=
3428                                           qemu_host_page_size) {
3429                 qemu_put_be64(f, block->page_size);
3430             }
3431             if (migrate_ignore_shared()) {
3432                 qemu_put_be64(f, block->mr->addr);
3433             }
3434         }
3435     }
3436
3437     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3438     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3439
3440     multifd_send_sync_main(f);
3441     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3442     qemu_fflush(f);
3443
3444     return 0;
3445 }
3446
3447 /**
3448  * ram_save_iterate: iterative stage for migration
3449  *
3450  * Returns zero to indicate success and negative for error
3451  *
3452  * @f: QEMUFile where to send the data
3453  * @opaque: RAMState pointer
3454  */
3455 static int ram_save_iterate(QEMUFile *f, void *opaque)
3456 {
3457     RAMState **temp = opaque;
3458     RAMState *rs = *temp;
3459     int ret = 0;
3460     int i;
3461     int64_t t0;
3462     int done = 0;
3463
3464     if (blk_mig_bulk_active()) {
3465         /* Avoid transferring ram during bulk phase of block migration as
3466          * the bulk phase will usually take a long time and transferring
3467          * ram updates during that time is pointless. */
3468         goto out;
3469     }
3470
3471     WITH_RCU_READ_LOCK_GUARD() {
3472         if (ram_list.version != rs->last_version) {
3473             ram_state_reset(rs);
3474         }
3475
3476         /* Read version before ram_list.blocks */
3477         smp_rmb();
3478
3479         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3480
3481         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3482         i = 0;
3483         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3484                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3485             int pages;
3486
3487             if (qemu_file_get_error(f)) {
3488                 break;
3489             }
3490
3491             pages = ram_find_and_save_block(rs, false);
3492             /* no more pages to sent */
3493             if (pages == 0) {
3494                 done = 1;
3495                 break;
3496             }
3497
3498             if (pages < 0) {
3499                 qemu_file_set_error(f, pages);
3500                 break;
3501             }
3502
3503             rs->target_page_count += pages;
3504
3505             /*
3506              * During postcopy, it is necessary to make sure one whole host
3507              * page is sent in one chunk.
3508              */
3509             if (migrate_postcopy_ram()) {
3510                 flush_compressed_data(rs);
3511             }
3512
3513             /*
3514              * we want to check in the 1st loop, just in case it was the 1st
3515              * time and we had to sync the dirty bitmap.
3516              * qemu_clock_get_ns() is a bit expensive, so we only check each
3517              * some iterations
3518              */
3519             if ((i & 63) == 0) {
3520                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3521                               1000000;
3522                 if (t1 > MAX_WAIT) {
3523                     trace_ram_save_iterate_big_wait(t1, i);
3524                     break;
3525                 }
3526             }
3527             i++;
3528         }
3529     }
3530
3531     /*
3532      * Must occur before EOS (or any QEMUFile operation)
3533      * because of RDMA protocol.
3534      */
3535     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3536
3537 out:
3538     if (ret >= 0
3539         && migration_is_setup_or_active(migrate_get_current()->state)) {
3540         multifd_send_sync_main(rs->f);
3541         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3542         qemu_fflush(f);
3543         ram_counters.transferred += 8;
3544
3545         ret = qemu_file_get_error(f);
3546     }
3547     if (ret < 0) {
3548         return ret;
3549     }
3550
3551     return done;
3552 }
3553
3554 /**
3555  * ram_save_complete: function called to send the remaining amount of ram
3556  *
3557  * Returns zero to indicate success or negative on error
3558  *
3559  * Called with iothread lock
3560  *
3561  * @f: QEMUFile where to send the data
3562  * @opaque: RAMState pointer
3563  */
3564 static int ram_save_complete(QEMUFile *f, void *opaque)
3565 {
3566     RAMState **temp = opaque;
3567     RAMState *rs = *temp;
3568     int ret = 0;
3569
3570     WITH_RCU_READ_LOCK_GUARD() {
3571         if (!migration_in_postcopy()) {
3572             migration_bitmap_sync_precopy(rs);
3573         }
3574
3575         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3576
3577         /* try transferring iterative blocks of memory */
3578
3579         /* flush all remaining blocks regardless of rate limiting */
3580         while (true) {
3581             int pages;
3582
3583             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3584             /* no more blocks to sent */
3585             if (pages == 0) {
3586                 break;
3587             }
3588             if (pages < 0) {
3589                 ret = pages;
3590                 break;
3591             }
3592         }
3593
3594         flush_compressed_data(rs);
3595         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3596     }
3597
3598     if (ret >= 0) {
3599         multifd_send_sync_main(rs->f);
3600         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3601         qemu_fflush(f);
3602     }
3603
3604     return ret;
3605 }
3606
3607 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3608                              uint64_t *res_precopy_only,
3609                              uint64_t *res_compatible,
3610                              uint64_t *res_postcopy_only)
3611 {
3612     RAMState **temp = opaque;
3613     RAMState *rs = *temp;
3614     uint64_t remaining_size;
3615
3616     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3617
3618     if (!migration_in_postcopy() &&
3619         remaining_size < max_size) {
3620         qemu_mutex_lock_iothread();
3621         WITH_RCU_READ_LOCK_GUARD() {
3622             migration_bitmap_sync_precopy(rs);
3623         }
3624         qemu_mutex_unlock_iothread();
3625         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3626     }
3627
3628     if (migrate_postcopy_ram()) {
3629         /* We can do postcopy, and all the data is postcopiable */
3630         *res_compatible += remaining_size;
3631     } else {
3632         *res_precopy_only += remaining_size;
3633     }
3634 }
3635
3636 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3637 {
3638     unsigned int xh_len;
3639     int xh_flags;
3640     uint8_t *loaded_data;
3641
3642     /* extract RLE header */
3643     xh_flags = qemu_get_byte(f);
3644     xh_len = qemu_get_be16(f);
3645
3646     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3647         error_report("Failed to load XBZRLE page - wrong compression!");
3648         return -1;
3649     }
3650
3651     if (xh_len > TARGET_PAGE_SIZE) {
3652         error_report("Failed to load XBZRLE page - len overflow!");
3653         return -1;
3654     }
3655     loaded_data = XBZRLE.decoded_buf;
3656     /* load data and decode */
3657     /* it can change loaded_data to point to an internal buffer */
3658     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3659
3660     /* decode RLE */
3661     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3662                              TARGET_PAGE_SIZE) == -1) {
3663         error_report("Failed to load XBZRLE page - decode error!");
3664         return -1;
3665     }
3666
3667     return 0;
3668 }
3669
3670 /**
3671  * ram_block_from_stream: read a RAMBlock id from the migration stream
3672  *
3673  * Must be called from within a rcu critical section.
3674  *
3675  * Returns a pointer from within the RCU-protected ram_list.
3676  *
3677  * @f: QEMUFile where to read the data from
3678  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3679  */
3680 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3681 {
3682     static RAMBlock *block = NULL;
3683     char id[256];
3684     uint8_t len;
3685
3686     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3687         if (!block) {
3688             error_report("Ack, bad migration stream!");
3689             return NULL;
3690         }
3691         return block;
3692     }
3693
3694     len = qemu_get_byte(f);
3695     qemu_get_buffer(f, (uint8_t *)id, len);
3696     id[len] = 0;
3697
3698     block = qemu_ram_block_by_name(id);
3699     if (!block) {
3700         error_report("Can't find block %s", id);
3701         return NULL;
3702     }
3703
3704     if (ramblock_is_ignored(block)) {
3705         error_report("block %s should not be migrated !", id);
3706         return NULL;
3707     }
3708
3709     return block;
3710 }
3711
3712 static inline void *host_from_ram_block_offset(RAMBlock *block,
3713                                                ram_addr_t offset)
3714 {
3715     if (!offset_in_ramblock(block, offset)) {
3716         return NULL;
3717     }
3718
3719     return block->host + offset;
3720 }
3721
3722 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3723                                                  ram_addr_t offset)
3724 {
3725     if (!offset_in_ramblock(block, offset)) {
3726         return NULL;
3727     }
3728     if (!block->colo_cache) {
3729         error_report("%s: colo_cache is NULL in block :%s",
3730                      __func__, block->idstr);
3731         return NULL;
3732     }
3733
3734     /*
3735     * During colo checkpoint, we need bitmap of these migrated pages.
3736     * It help us to decide which pages in ram cache should be flushed
3737     * into VM's RAM later.
3738     */
3739     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3740         ram_state->migration_dirty_pages++;
3741     }
3742     return block->colo_cache + offset;
3743 }
3744
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with the byte @ch.  The write is skipped when
 * @ch is zero and the range already reads as zero, so untouched memory
 * is not written to needlessly.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to do: zero fill requested and the range is zero already */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3761
3762 /* return the size after decompression, or negative value on error */
3763 static int
3764 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3765                      const uint8_t *source, size_t source_len)
3766 {
3767     int err;
3768
3769     err = inflateReset(stream);
3770     if (err != Z_OK) {
3771         return -1;
3772     }
3773
3774     stream->avail_in = source_len;
3775     stream->next_in = (uint8_t *)source;
3776     stream->avail_out = dest_len;
3777     stream->next_out = dest;
3778
3779     err = inflate(stream, Z_NO_FLUSH);
3780     if (err != Z_STREAM_END) {
3781         return -1;
3782     }
3783
3784     return stream->total_out;
3785 }
3786
3787 static void *do_data_decompress(void *opaque)
3788 {
3789     DecompressParam *param = opaque;
3790     unsigned long pagesize;
3791     uint8_t *des;
3792     int len, ret;
3793
3794     qemu_mutex_lock(&param->mutex);
3795     while (!param->quit) {
3796         if (param->des) {
3797             des = param->des;
3798             len = param->len;
3799             param->des = 0;
3800             qemu_mutex_unlock(&param->mutex);
3801
3802             pagesize = TARGET_PAGE_SIZE;
3803
3804             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3805                                        param->compbuf, len);
3806             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3807                 error_report("decompress data failed");
3808                 qemu_file_set_error(decomp_file, ret);
3809             }
3810
3811             qemu_mutex_lock(&decomp_done_lock);
3812             param->done = true;
3813             qemu_cond_signal(&decomp_done_cond);
3814             qemu_mutex_unlock(&decomp_done_lock);
3815
3816             qemu_mutex_lock(&param->mutex);
3817         } else {
3818             qemu_cond_wait(&param->cond, &param->mutex);
3819         }
3820     }
3821     qemu_mutex_unlock(&param->mutex);
3822
3823     return NULL;
3824 }
3825
3826 static int wait_for_decompress_done(void)
3827 {
3828     int idx, thread_count;
3829
3830     if (!migrate_use_compression()) {
3831         return 0;
3832     }
3833
3834     thread_count = migrate_decompress_threads();
3835     qemu_mutex_lock(&decomp_done_lock);
3836     for (idx = 0; idx < thread_count; idx++) {
3837         while (!decomp_param[idx].done) {
3838             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3839         }
3840     }
3841     qemu_mutex_unlock(&decomp_done_lock);
3842     return qemu_file_get_error(decomp_file);
3843 }
3844
3845 static void compress_threads_load_cleanup(void)
3846 {
3847     int i, thread_count;
3848
3849     if (!migrate_use_compression()) {
3850         return;
3851     }
3852     thread_count = migrate_decompress_threads();
3853     for (i = 0; i < thread_count; i++) {
3854         /*
3855          * we use it as a indicator which shows if the thread is
3856          * properly init'd or not
3857          */
3858         if (!decomp_param[i].compbuf) {
3859             break;
3860         }
3861
3862         qemu_mutex_lock(&decomp_param[i].mutex);
3863         decomp_param[i].quit = true;
3864         qemu_cond_signal(&decomp_param[i].cond);
3865         qemu_mutex_unlock(&decomp_param[i].mutex);
3866     }
3867     for (i = 0; i < thread_count; i++) {
3868         if (!decomp_param[i].compbuf) {
3869             break;
3870         }
3871
3872         qemu_thread_join(decompress_threads + i);
3873         qemu_mutex_destroy(&decomp_param[i].mutex);
3874         qemu_cond_destroy(&decomp_param[i].cond);
3875         inflateEnd(&decomp_param[i].stream);
3876         g_free(decomp_param[i].compbuf);
3877         decomp_param[i].compbuf = NULL;
3878     }
3879     g_free(decompress_threads);
3880     g_free(decomp_param);
3881     decompress_threads = NULL;
3882     decomp_param = NULL;
3883     decomp_file = NULL;
3884 }
3885
3886 static int compress_threads_load_setup(QEMUFile *f)
3887 {
3888     int i, thread_count;
3889
3890     if (!migrate_use_compression()) {
3891         return 0;
3892     }
3893
3894     thread_count = migrate_decompress_threads();
3895     decompress_threads = g_new0(QemuThread, thread_count);
3896     decomp_param = g_new0(DecompressParam, thread_count);
3897     qemu_mutex_init(&decomp_done_lock);
3898     qemu_cond_init(&decomp_done_cond);
3899     decomp_file = f;
3900     for (i = 0; i < thread_count; i++) {
3901         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3902             goto exit;
3903         }
3904
3905         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3906         qemu_mutex_init(&decomp_param[i].mutex);
3907         qemu_cond_init(&decomp_param[i].cond);
3908         decomp_param[i].done = true;
3909         decomp_param[i].quit = false;
3910         qemu_thread_create(decompress_threads + i, "decompress",
3911                            do_data_decompress, decomp_param + i,
3912                            QEMU_THREAD_JOINABLE);
3913     }
3914     return 0;
3915 exit:
3916     compress_threads_load_cleanup();
3917     return -1;
3918 }
3919
3920 static void decompress_data_with_multi_threads(QEMUFile *f,
3921                                                void *host, int len)
3922 {
3923     int idx, thread_count;
3924
3925     thread_count = migrate_decompress_threads();
3926     qemu_mutex_lock(&decomp_done_lock);
3927     while (true) {
3928         for (idx = 0; idx < thread_count; idx++) {
3929             if (decomp_param[idx].done) {
3930                 decomp_param[idx].done = false;
3931                 qemu_mutex_lock(&decomp_param[idx].mutex);
3932                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3933                 decomp_param[idx].des = host;
3934                 decomp_param[idx].len = len;
3935                 qemu_cond_signal(&decomp_param[idx].cond);
3936                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3937                 break;
3938             }
3939         }
3940         if (idx < thread_count) {
3941             break;
3942         } else {
3943             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3944         }
3945     }
3946     qemu_mutex_unlock(&decomp_done_lock);
3947 }
3948
3949 /*
3950  * colo cache: this is for secondary VM, we cache the whole
3951  * memory of the secondary VM, it is need to hold the global lock
3952  * to call this helper.
3953  */
3954 int colo_init_ram_cache(void)
3955 {
3956     RAMBlock *block;
3957
3958     WITH_RCU_READ_LOCK_GUARD() {
3959         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3960             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3961                                                     NULL,
3962                                                     false);
3963             if (!block->colo_cache) {
3964                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3965                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3966                              block->used_length);
3967                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3968                     if (block->colo_cache) {
3969                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3970                         block->colo_cache = NULL;
3971                     }
3972                 }
3973                 return -errno;
3974             }
3975             memcpy(block->colo_cache, block->host, block->used_length);
3976         }
3977     }
3978
3979     /*
3980     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3981     * with to decide which page in cache should be flushed into SVM's RAM. Here
3982     * we use the same name 'ram_bitmap' as for migration.
3983     */
3984     if (ram_bytes_total()) {
3985         RAMBlock *block;
3986
3987         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3988             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3989
3990             block->bmap = bitmap_new(pages);
3991             bitmap_set(block->bmap, 0, pages);
3992         }
3993     }
3994     ram_state = g_new0(RAMState, 1);
3995     ram_state->migration_dirty_pages = 0;
3996     qemu_mutex_init(&ram_state->bitmap_mutex);
3997     memory_global_dirty_log_start();
3998
3999     return 0;
4000 }
4001
4002 /* It is need to hold the global lock to call this helper */
4003 void colo_release_ram_cache(void)
4004 {
4005     RAMBlock *block;
4006
4007     memory_global_dirty_log_stop();
4008     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4009         g_free(block->bmap);
4010         block->bmap = NULL;
4011     }
4012
4013     WITH_RCU_READ_LOCK_GUARD() {
4014         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4015             if (block->colo_cache) {
4016                 qemu_anon_ram_free(block->colo_cache, block->used_length);
4017                 block->colo_cache = NULL;
4018             }
4019         }
4020     }
4021     qemu_mutex_destroy(&ram_state->bitmap_mutex);
4022     g_free(ram_state);
4023     ram_state = NULL;
4024 }
4025
4026 /**
4027  * ram_load_setup: Setup RAM for migration incoming side
4028  *
4029  * Returns zero to indicate success and negative for error
4030  *
4031  * @f: QEMUFile where to receive the data
4032  * @opaque: RAMState pointer
4033  */
4034 static int ram_load_setup(QEMUFile *f, void *opaque)
4035 {
4036     if (compress_threads_load_setup(f)) {
4037         return -1;
4038     }
4039
4040     xbzrle_load_setup();
4041     ramblock_recv_map_init();
4042
4043     return 0;
4044 }
4045
4046 static int ram_load_cleanup(void *opaque)
4047 {
4048     RAMBlock *rb;
4049
4050     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4051         qemu_ram_block_writeback(rb);
4052     }
4053
4054     xbzrle_load_cleanup();
4055     compress_threads_load_cleanup();
4056
4057     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4058         g_free(rb->receivedmap);
4059         rb->receivedmap = NULL;
4060     }
4061
4062     return 0;
4063 }
4064
4065 /**
4066  * ram_postcopy_incoming_init: allocate postcopy data structures
4067  *
4068  * Returns 0 for success and negative if there was one error
4069  *
4070  * @mis: current migration incoming state
4071  *
4072  * Allocate data structures etc needed by incoming migration with
4073  * postcopy-ram. postcopy-ram's similarly names
4074  * postcopy_ram_incoming_init does the work.
4075  */
4076 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4077 {
4078     return postcopy_ram_incoming_init(mis);
4079 }
4080
4081 /**
4082  * ram_load_postcopy: load a page in postcopy case
4083  *
4084  * Returns 0 for success or -errno in case of error
4085  *
4086  * Called in postcopy mode by ram_load().
4087  * rcu_read_lock is taken prior to this being called.
4088  *
4089  * @f: QEMUFile where to send the data
4090  */
4091 static int ram_load_postcopy(QEMUFile *f)
4092 {
4093     int flags = 0, ret = 0;
4094     bool place_needed = false;
4095     bool matches_target_page_size = false;
4096     MigrationIncomingState *mis = migration_incoming_get_current();
4097     /* Temporary page that is later 'placed' */
4098     void *postcopy_host_page = mis->postcopy_tmp_page;
4099     void *this_host = NULL;
4100     bool all_zero = false;
4101     int target_pages = 0;
4102
4103     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4104         ram_addr_t addr;
4105         void *host = NULL;
4106         void *page_buffer = NULL;
4107         void *place_source = NULL;
4108         RAMBlock *block = NULL;
4109         uint8_t ch;
4110         int len;
4111
4112         addr = qemu_get_be64(f);
4113
4114         /*
4115          * If qemu file error, we should stop here, and then "addr"
4116          * may be invalid
4117          */
4118         ret = qemu_file_get_error(f);
4119         if (ret) {
4120             break;
4121         }
4122
4123         flags = addr & ~TARGET_PAGE_MASK;
4124         addr &= TARGET_PAGE_MASK;
4125
4126         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4127         place_needed = false;
4128         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4129                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4130             block = ram_block_from_stream(f, flags);
4131
4132             host = host_from_ram_block_offset(block, addr);
4133             if (!host) {
4134                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4135                 ret = -EINVAL;
4136                 break;
4137             }
4138             target_pages++;
4139             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4140             /*
4141              * Postcopy requires that we place whole host pages atomically;
4142              * these may be huge pages for RAMBlocks that are backed by
4143              * hugetlbfs.
4144              * To make it atomic, the data is read into a temporary page
4145              * that's moved into place later.
4146              * The migration protocol uses,  possibly smaller, target-pages
4147              * however the source ensures it always sends all the components
4148              * of a host page in one chunk.
4149              */
4150             page_buffer = postcopy_host_page +
4151                           ((uintptr_t)host & (block->page_size - 1));
4152             /* If all TP are zero then we can optimise the place */
4153             if (target_pages == 1) {
4154                 all_zero = true;
4155                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
4156                                                     block->page_size);
4157             } else {
4158                 /* not the 1st TP within the HP */
4159                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
4160                     (uintptr_t)this_host) {
4161                     error_report("Non-same host page %p/%p",
4162                                   host, this_host);
4163                     ret = -EINVAL;
4164                     break;
4165                 }
4166             }
4167
4168             /*
4169              * If it's the last part of a host page then we place the host
4170              * page
4171              */
4172             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
4173                 place_needed = true;
4174                 target_pages = 0;
4175             }
4176             place_source = postcopy_host_page;
4177         }
4178
4179         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4180         case RAM_SAVE_FLAG_ZERO:
4181             ch = qemu_get_byte(f);
4182             /*
4183              * Can skip to set page_buffer when
4184              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4185              */
4186             if (ch || !matches_target_page_size) {
4187                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4188             }
4189             if (ch) {
4190                 all_zero = false;
4191             }
4192             break;
4193
4194         case RAM_SAVE_FLAG_PAGE:
4195             all_zero = false;
4196             if (!matches_target_page_size) {
4197                 /* For huge pages, we always use temporary buffer */
4198                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4199             } else {
4200                 /*
4201                  * For small pages that matches target page size, we
4202                  * avoid the qemu_file copy.  Instead we directly use
4203                  * the buffer of QEMUFile to place the page.  Note: we
4204                  * cannot do any QEMUFile operation before using that
4205                  * buffer to make sure the buffer is valid when
4206                  * placing the page.
4207                  */
4208                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4209                                          TARGET_PAGE_SIZE);
4210             }
4211             break;
4212         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4213             all_zero = false;
4214             len = qemu_get_be32(f);
4215             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4216                 error_report("Invalid compressed data length: %d", len);
4217                 ret = -EINVAL;
4218                 break;
4219             }
4220             decompress_data_with_multi_threads(f, page_buffer, len);
4221             break;
4222
4223         case RAM_SAVE_FLAG_EOS:
4224             /* normal exit */
4225             multifd_recv_sync_main();
4226             break;
4227         default:
4228             error_report("Unknown combination of migration flags: %#x"
4229                          " (postcopy mode)", flags);
4230             ret = -EINVAL;
4231             break;
4232         }
4233
4234         /* Got the whole host page, wait for decompress before placing. */
4235         if (place_needed) {
4236             ret |= wait_for_decompress_done();
4237         }
4238
4239         /* Detect for any possible file errors */
4240         if (!ret && qemu_file_get_error(f)) {
4241             ret = qemu_file_get_error(f);
4242         }
4243
4244         if (!ret && place_needed) {
4245             /* This gets called at the last target page in the host page */
4246             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
4247                                                        block->page_size);
4248
4249             if (all_zero) {
4250                 ret = postcopy_place_page_zero(mis, place_dest,
4251                                                block);
4252             } else {
4253                 ret = postcopy_place_page(mis, place_dest,
4254                                           place_source, block);
4255             }
4256         }
4257     }
4258
4259     return ret;
4260 }
4261
4262 static bool postcopy_is_advised(void)
4263 {
4264     PostcopyState ps = postcopy_state_get();
4265     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4266 }
4267
4268 static bool postcopy_is_running(void)
4269 {
4270     PostcopyState ps = postcopy_state_get();
4271     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4272 }
4273
4274 /*
4275  * Flush content of RAM cache into SVM's memory.
4276  * Only flush the pages that be dirtied by PVM or SVM or both.
4277  */
4278 static void colo_flush_ram_cache(void)
4279 {
4280     RAMBlock *block = NULL;
4281     void *dst_host;
4282     void *src_host;
4283     unsigned long offset = 0;
4284
4285     memory_global_dirty_log_sync();
4286     WITH_RCU_READ_LOCK_GUARD() {
4287         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4288             ramblock_sync_dirty_bitmap(ram_state, block);
4289         }
4290     }
4291
4292     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4293     WITH_RCU_READ_LOCK_GUARD() {
4294         block = QLIST_FIRST_RCU(&ram_list.blocks);
4295
4296         while (block) {
4297             offset = migration_bitmap_find_dirty(ram_state, block, offset);
4298
4299             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
4300                 >= block->used_length) {
4301                 offset = 0;
4302                 block = QLIST_NEXT_RCU(block, next);
4303             } else {
4304                 migration_bitmap_clear_dirty(ram_state, block, offset);
4305                 dst_host = block->host
4306                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4307                 src_host = block->colo_cache
4308                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4309                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4310             }
4311         }
4312     }
4313     trace_colo_flush_ram_cache_end();
4314 }
4315
4316 /**
4317  * ram_load_precopy: load pages in precopy case
4318  *
4319  * Returns 0 for success or -errno in case of error
4320  *
4321  * Called in precopy mode by ram_load().
4322  * rcu_read_lock is taken prior to this being called.
4323  *
4324  * @f: QEMUFile where to send the data
4325  */
4326 static int ram_load_precopy(QEMUFile *f)
4327 {
4328     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4329     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4330     bool postcopy_advised = postcopy_is_advised();
4331     if (!migrate_use_compression()) {
4332         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4333     }
4334
4335     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4336         ram_addr_t addr, total_ram_bytes;
4337         void *host = NULL;
4338         uint8_t ch;
4339
4340         /*
4341          * Yield periodically to let main loop run, but an iteration of
4342          * the main loop is expensive, so do it each some iterations
4343          */
4344         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4345             aio_co_schedule(qemu_get_current_aio_context(),
4346                             qemu_coroutine_self());
4347             qemu_coroutine_yield();
4348         }
4349         i++;
4350
4351         addr = qemu_get_be64(f);
4352         flags = addr & ~TARGET_PAGE_MASK;
4353         addr &= TARGET_PAGE_MASK;
4354
4355         if (flags & invalid_flags) {
4356             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4357                 error_report("Received an unexpected compressed page");
4358             }
4359
4360             ret = -EINVAL;
4361             break;
4362         }
4363
4364         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4365                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4366             RAMBlock *block = ram_block_from_stream(f, flags);
4367
4368             /*
4369              * After going into COLO, we should load the Page into colo_cache.
4370              */
4371             if (migration_incoming_in_colo_state()) {
4372                 host = colo_cache_from_block_offset(block, addr);
4373             } else {
4374                 host = host_from_ram_block_offset(block, addr);
4375             }
4376             if (!host) {
4377                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4378                 ret = -EINVAL;
4379                 break;
4380             }
4381
4382             if (!migration_incoming_in_colo_state()) {
4383                 ramblock_recv_bitmap_set(block, host);
4384             }
4385
4386             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4387         }
4388
4389         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4390         case RAM_SAVE_FLAG_MEM_SIZE:
4391             /* Synchronize RAM block list */
4392             total_ram_bytes = addr;
4393             while (!ret && total_ram_bytes) {
4394                 RAMBlock *block;
4395                 char id[256];
4396                 ram_addr_t length;
4397
4398                 len = qemu_get_byte(f);
4399                 qemu_get_buffer(f, (uint8_t *)id, len);
4400                 id[len] = 0;
4401                 length = qemu_get_be64(f);
4402
4403                 block = qemu_ram_block_by_name(id);
4404                 if (block && !qemu_ram_is_migratable(block)) {
4405                     error_report("block %s should not be migrated !", id);
4406                     ret = -EINVAL;
4407                 } else if (block) {
4408                     if (length != block->used_length) {
4409                         Error *local_err = NULL;
4410
4411                         ret = qemu_ram_resize(block, length,
4412                                               &local_err);
4413                         if (local_err) {
4414                             error_report_err(local_err);
4415                         }
4416                     }
4417                     /* For postcopy we need to check hugepage sizes match */
4418                     if (postcopy_advised &&
4419                         block->page_size != qemu_host_page_size) {
4420                         uint64_t remote_page_size = qemu_get_be64(f);
4421                         if (remote_page_size != block->page_size) {
4422                             error_report("Mismatched RAM page size %s "
4423                                          "(local) %zd != %" PRId64,
4424                                          id, block->page_size,
4425                                          remote_page_size);
4426                             ret = -EINVAL;
4427                         }
4428                     }
4429                     if (migrate_ignore_shared()) {
4430                         hwaddr addr = qemu_get_be64(f);
4431                         if (ramblock_is_ignored(block) &&
4432                             block->mr->addr != addr) {
4433                             error_report("Mismatched GPAs for block %s "
4434                                          "%" PRId64 "!= %" PRId64,
4435                                          id, (uint64_t)addr,
4436                                          (uint64_t)block->mr->addr);
4437                             ret = -EINVAL;
4438                         }
4439                     }
4440                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4441                                           block->idstr);
4442                 } else {
4443                     error_report("Unknown ramblock \"%s\", cannot "
4444                                  "accept migration", id);
4445                     ret = -EINVAL;
4446                 }
4447
4448                 total_ram_bytes -= length;
4449             }
4450             break;
4451
4452         case RAM_SAVE_FLAG_ZERO:
4453             ch = qemu_get_byte(f);
4454             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4455             break;
4456
4457         case RAM_SAVE_FLAG_PAGE:
4458             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4459             break;
4460
4461         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4462             len = qemu_get_be32(f);
4463             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4464                 error_report("Invalid compressed data length: %d", len);
4465                 ret = -EINVAL;
4466                 break;
4467             }
4468             decompress_data_with_multi_threads(f, host, len);
4469             break;
4470
4471         case RAM_SAVE_FLAG_XBZRLE:
4472             if (load_xbzrle(f, addr, host) < 0) {
4473                 error_report("Failed to decompress XBZRLE page at "
4474                              RAM_ADDR_FMT, addr);
4475                 ret = -EINVAL;
4476                 break;
4477             }
4478             break;
4479         case RAM_SAVE_FLAG_EOS:
4480             /* normal exit */
4481             multifd_recv_sync_main();
4482             break;
4483         default:
4484             if (flags & RAM_SAVE_FLAG_HOOK) {
4485                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4486             } else {
4487                 error_report("Unknown combination of migration flags: %#x",
4488                              flags);
4489                 ret = -EINVAL;
4490             }
4491         }
4492         if (!ret) {
4493             ret = qemu_file_get_error(f);
4494         }
4495     }
4496
4497     ret |= wait_for_decompress_done();
4498     return ret;
4499 }
4500
4501 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4502 {
4503     int ret = 0;
4504     static uint64_t seq_iter;
4505     /*
4506      * If system is running in postcopy mode, page inserts to host memory must
4507      * be atomic
4508      */
4509     bool postcopy_running = postcopy_is_running();
4510
4511     seq_iter++;
4512
4513     if (version_id != 4) {
4514         return -EINVAL;
4515     }
4516
4517     /*
4518      * This RCU critical section can be very long running.
4519      * When RCU reclaims in the code start to become numerous,
4520      * it will be necessary to reduce the granularity of this
4521      * critical section.
4522      */
4523     WITH_RCU_READ_LOCK_GUARD() {
4524         if (postcopy_running) {
4525             ret = ram_load_postcopy(f);
4526         } else {
4527             ret = ram_load_precopy(f);
4528         }
4529     }
4530     trace_ram_load_complete(ret, seq_iter);
4531
4532     if (!ret  && migration_incoming_in_colo_state()) {
4533         colo_flush_ram_cache();
4534     }
4535     return ret;
4536 }
4537
4538 static bool ram_has_postcopy(void *opaque)
4539 {
4540     RAMBlock *rb;
4541     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4542         if (ramblock_is_pmem(rb)) {
4543             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4544                          "is not supported now!", rb->idstr, rb->host);
4545             return false;
4546         }
4547     }
4548
4549     return migrate_postcopy_ram();
4550 }
4551
4552 /* Sync all the dirty bitmap with destination VM.  */
4553 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4554 {
4555     RAMBlock *block;
4556     QEMUFile *file = s->to_dst_file;
4557     int ramblock_count = 0;
4558
4559     trace_ram_dirty_bitmap_sync_start();
4560
4561     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4562         qemu_savevm_send_recv_bitmap(file, block->idstr);
4563         trace_ram_dirty_bitmap_request(block->idstr);
4564         ramblock_count++;
4565     }
4566
4567     trace_ram_dirty_bitmap_sync_wait();
4568
4569     /* Wait until all the ramblocks' dirty bitmap synced */
4570     while (ramblock_count--) {
4571         qemu_sem_wait(&s->rp_state.rp_sem);
4572     }
4573
4574     trace_ram_dirty_bitmap_sync_complete();
4575
4576     return 0;
4577 }
4578
4579 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4580 {
4581     qemu_sem_post(&s->rp_state.rp_sem);
4582 }
4583
4584 /*
4585  * Read the received bitmap, revert it as the initial dirty bitmap.
4586  * This is only used when the postcopy migration is paused but wants
4587  * to resume from a middle point.
4588  */
4589 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4590 {
4591     int ret = -EINVAL;
4592     QEMUFile *file = s->rp_state.from_dst_file;
4593     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4594     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4595     uint64_t size, end_mark;
4596
4597     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4598
4599     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4600         error_report("%s: incorrect state %s", __func__,
4601                      MigrationStatus_str(s->state));
4602         return -EINVAL;
4603     }
4604
4605     /*
4606      * Note: see comments in ramblock_recv_bitmap_send() on why we
4607      * need the endianess convertion, and the paddings.
4608      */
4609     local_size = ROUND_UP(local_size, 8);
4610
4611     /* Add paddings */
4612     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4613
4614     size = qemu_get_be64(file);
4615
4616     /* The size of the bitmap should match with our ramblock */
4617     if (size != local_size) {
4618         error_report("%s: ramblock '%s' bitmap size mismatch "
4619                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4620                      block->idstr, size, local_size);
4621         ret = -EINVAL;
4622         goto out;
4623     }
4624
4625     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4626     end_mark = qemu_get_be64(file);
4627
4628     ret = qemu_file_get_error(file);
4629     if (ret || size != local_size) {
4630         error_report("%s: read bitmap failed for ramblock '%s': %d"
4631                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4632                      __func__, block->idstr, ret, local_size, size);
4633         ret = -EIO;
4634         goto out;
4635     }
4636
4637     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4638         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4639                      __func__, block->idstr, end_mark);
4640         ret = -EINVAL;
4641         goto out;
4642     }
4643
4644     /*
4645      * Endianess convertion. We are during postcopy (though paused).
4646      * The dirty bitmap won't change. We can directly modify it.
4647      */
4648     bitmap_from_le(block->bmap, le_bitmap, nbits);
4649
4650     /*
4651      * What we received is "received bitmap". Revert it as the initial
4652      * dirty bitmap for this ramblock.
4653      */
4654     bitmap_complement(block->bmap, block->bmap, nbits);
4655
4656     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4657
4658     /*
4659      * We succeeded to sync bitmap for current ramblock. If this is
4660      * the last one to sync, we need to notify the main send thread.
4661      */
4662     ram_dirty_bitmap_reload_notify(s);
4663
4664     ret = 0;
4665 out:
4666     g_free(le_bitmap);
4667     return ret;
4668 }
4669
4670 static int ram_resume_prepare(MigrationState *s, void *opaque)
4671 {
4672     RAMState *rs = *(RAMState **)opaque;
4673     int ret;
4674
4675     ret = ram_dirty_bitmap_sync_all(s, rs);
4676     if (ret) {
4677         return ret;
4678     }
4679
4680     ram_state_resume_prepare(rs, s->to_dst_file);
4681
4682     return 0;
4683 }
4684
4685 static SaveVMHandlers savevm_ram_handlers = {
4686     .save_setup = ram_save_setup,
4687     .save_live_iterate = ram_save_iterate,
4688     .save_live_complete_postcopy = ram_save_complete,
4689     .save_live_complete_precopy = ram_save_complete,
4690     .has_postcopy = ram_has_postcopy,
4691     .save_live_pending = ram_save_pending,
4692     .load_state = ram_load,
4693     .save_cleanup = ram_save_cleanup,
4694     .load_setup = ram_load_setup,
4695     .load_cleanup = ram_load_cleanup,
4696     .resume_prepare = ram_resume_prepare,
4697 };
4698
4699 void ram_mig_init(void)
4700 {
4701     qemu_mutex_init(&XBZRLE.lock);
4702     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4703 }