migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  68
/*
 * Flags describing RAM stream records.  Every value is a distinct single
 * bit, so several flags can be OR-ed together.
 * NOTE(review): presumably these are part of the migration stream format
 * and must therefore never be renumbered -- confirm against the load side.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
  84 XBZRLECacheStats xbzrle_counters;
  85
/*
 * File-wide XBZRLE state: the page cache plus the scratch buffers used
 * while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /*
     * Guards cache.  XBZRLE_cache_lock()/XBZRLE_cache_unlock() only touch
     * it when migrate_use_xbzrle() is true.
     */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
 161 static bool ramblock_is_ignored(RAMBlock *block)
 162 {
 163     return !qemu_ram_is_migratable(block) ||
 164            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 165 }
 166
/*
 * Filtered RAMBlock iterators.
 *
 * Both wrap INTERNAL_RAMBLOCK_FOREACH and skip unwanted blocks using an
 * "if (skip) {} else" tail, so the statement or braced body written after
 * the macro becomes the else branch and runs only for accepted blocks.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
/* Visit every block for which ramblock_is_ignored() is false. */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Visit every block for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * Remove the RAMBLOCK_FOREACH name from here on.
 * NOTE(review): presumably to force use of the filtered iterators above in
 * the rest of this file -- confirm.
 */
#undef RAMBLOCK_FOREACH
 177
 178 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 179 {
 180     RAMBlock *block;
 181     int ret = 0;
 182
 183     RCU_READ_LOCK_GUARD();
 184
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     return ret;
 192 }
 193
 194 static void ramblock_recv_map_init(void)
 195 {
 196     RAMBlock *rb;
 197
 198     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 199         assert(!rb->receivedmap);
 200         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 201     }
 202 }
 203
 204 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 205 {
 206     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 207                     rb->receivedmap);
 208 }
 209
 210 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 211 {
 212     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 213 }
 214
 215 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 216 {
 217     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 218 }
 219
 220 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 221                                     size_t nr)
 222 {
 223     bitmap_set_atomic(rb->receivedmap,
 224                       ramblock_recv_bitmap_offset(host_addr, rb),
 225                       nr);
 226 }
 227
 228 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 229
 230 /*
 231  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 232  *
 233  * Returns >0 if success with sent bytes, or <0 if error.
 234  */
 235 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 236                                   const char *block_name)
 237 {
 238     RAMBlock *block = qemu_ram_block_by_name(block_name);
 239     unsigned long *le_bitmap, nbits;
 240     uint64_t size;
 241
 242     if (!block) {
 243         error_report("%s: invalid block name: %s", __func__, block_name);
 244         return -1;
 245     }
 246
 247     nbits = block->used_length >> TARGET_PAGE_BITS;
 248
 249     /*
 250      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 251      * machines we may need 4 more bytes for padding (see below
 252      * comment). So extend it a bit before hand.
 253      */
 254     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 255
 256     /*
 257      * Always use little endian when sending the bitmap. This is
 258      * required that when source and destination VMs are not using the
 259      * same endianess. (Note: big endian won't work.)
 260      */
 261     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 262
 263     /* Size of the bitmap, in bytes */
 264     size = DIV_ROUND_UP(nbits, 8);
 265
 266     /*
 267      * size is always aligned to 8 bytes for 64bit machines, but it
 268      * may not be true for 32bit machines. We need this padding to
 269      * make sure the migration can survive even between 32bit and
 270      * 64bit machines.
 271      */
 272     size = ROUND_UP(size, 8);
 273
 274     qemu_put_be64(file, size);
 275     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 276     /*
 277      * Mark as an end, in case the middle part is screwed up due to
 278      * some "misterious" reason.
 279      */
 280     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 281     qemu_fflush(file);
 282
 283     g_free(le_bitmap);
 284
 285     if (qemu_file_get_error(file)) {
 286         return qemu_file_get_error(file);
 287     }
 288
 289     return size + sizeof(size);
 290 }
 291
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked into RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* block the requested range belongs to */
    RAMBlock *rb;
    /* start of the requested range (presumably relative to rb -- confirm) */
    hwaddr    offset;
    /* length of the requested range */
    hwaddr    len;

    /* queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 303
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 355
 356 static RAMState *ram_state;
 357
 358 static NotifierWithReturnList precopy_notifier_list;
 359
 360 void precopy_infrastructure_init(void)
 361 {
 362     notifier_with_return_list_init(&precopy_notifier_list);
 363 }
 364
 365 void precopy_add_notifier(NotifierWithReturn *n)
 366 {
 367     notifier_with_return_list_add(&precopy_notifier_list, n);
 368 }
 369
 370 void precopy_remove_notifier(NotifierWithReturn *n)
 371 {
 372     notifier_with_return_remove(n);
 373 }
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 void precopy_enable_free_page_optimization(void)
 385 {
 386     if (!ram_state) {
 387         return;
 388     }
 389
 390     ram_state->fpo_enabled = true;
 391 }
 392
 393 uint64_t ram_bytes_remaining(void)
 394 {
 395     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 396                        0;
 397 }
 398
 399 MigrationStats ram_counters;
 400
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 411
 412 CompressionStats compression_counters;
 413
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* asks the worker loop to exit; checked with mutex held */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* dummy QEMUFile used only as a buffer for the compressed output */
    QEMUFile *file;
    /* mutex/cond pair the worker sleeps on while waiting for work */
    QemuMutex mutex;
    QemuCond cond;
    /* page to compress; a non-NULL block means work is pending */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    /* zlib deflate stream, set up with deflateInit() */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer handed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 429
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are not visible in this part of
 * the file; the field notes below are inferred from CompressParam and
 * should be confirmed against the decompression code.
 */
struct DecompressParam {
    /* presumably set once the worker has finished its current page */
    bool done;
    /* presumably asks the worker loop to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination of the decompressed data */
    void *des;
    /* presumably the compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 441
/*
 * Compression worker state and thread handles.  Both arrays are allocated
 * in compress_threads_save_setup() with migrate_compress_threads() entries.
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in this file; its result is stored in CompressParam.zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 461
/*
 * do_data_compress: body of one compression worker thread
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL the
 * worker takes that page, compresses it into param->file and reports
 * completion through comp_done_lock/comp_done_cond; otherwise it sleeps on
 * param->cond.
 *
 * Always returns NULL.
 *
 * @opaque: the CompressParam of this worker
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Claim the request, then drop the lock while compressing */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result and wake whoever waits on comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take our own lock before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing queued: sleep until signalled on param->cond */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 495
 496 static void compress_threads_save_cleanup(void)
 497 {
 498     int i, thread_count;
 499
 500     if (!migrate_use_compression() || !comp_param) {
 501         return;
 502     }
 503
 504     thread_count = migrate_compress_threads();
 505     for (i = 0; i < thread_count; i++) {
 506         /*
 507          * we use it as a indicator which shows if the thread is
 508          * properly init'd or not
 509          */
 510         if (!comp_param[i].file) {
 511             break;
 512         }
 513
 514         qemu_mutex_lock(&comp_param[i].mutex);
 515         comp_param[i].quit = true;
 516         qemu_cond_signal(&comp_param[i].cond);
 517         qemu_mutex_unlock(&comp_param[i].mutex);
 518
 519         qemu_thread_join(compress_threads + i);
 520         qemu_mutex_destroy(&comp_param[i].mutex);
 521         qemu_cond_destroy(&comp_param[i].cond);
 522         deflateEnd(&comp_param[i].stream);
 523         g_free(comp_param[i].originbuf);
 524         qemu_fclose(comp_param[i].file);
 525         comp_param[i].file = NULL;
 526     }
 527     qemu_mutex_destroy(&comp_done_lock);
 528     qemu_cond_destroy(&comp_done_cond);
 529     g_free(compress_threads);
 530     g_free(comp_param);
 531     compress_threads = NULL;
 532     comp_param = NULL;
 533 }
 534
 535 static int compress_threads_save_setup(void)
 536 {
 537     int i, thread_count;
 538
 539     if (!migrate_use_compression()) {
 540         return 0;
 541     }
 542     thread_count = migrate_compress_threads();
 543     compress_threads = g_new0(QemuThread, thread_count);
 544     comp_param = g_new0(CompressParam, thread_count);
 545     qemu_cond_init(&comp_done_cond);
 546     qemu_mutex_init(&comp_done_lock);
 547     for (i = 0; i < thread_count; i++) {
 548         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 549         if (!comp_param[i].originbuf) {
 550             goto exit;
 551         }
 552
 553         if (deflateInit(&comp_param[i].stream,
 554                         migrate_compress_level()) != Z_OK) {
 555             g_free(comp_param[i].originbuf);
 556             goto exit;
 557         }
 558
 559         /* comp_param[i].file is just used as a dummy buffer to save data,
 560          * set its ops to empty.
 561          */
 562         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 563         comp_param[i].done = true;
 564         comp_param[i].quit = false;
 565         qemu_mutex_init(&comp_param[i].mutex);
 566         qemu_cond_init(&comp_param[i].cond);
 567         qemu_thread_create(compress_threads + i, "compress",
 568                            do_data_compress, comp_param + i,
 569                            QEMU_THREAD_JOINABLE);
 570     }
 571     return 0;
 572
 573 exit:
 574     compress_threads_save_cleanup();
 575     return -1;
 576 }
 577
/* Multiple fd's */

/* Magic and version carried by every multifd init message and packet */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag bit; NOTE(review): presumably requests a sync point -- confirm */
#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 587
/*
 * First message sent on every multifd channel.  The struct is packed
 * because it is written to and read from the channel as raw bytes; magic
 * and version are sent big endian.
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    /* copy of the sender's qemu_uuid, compared against on receive */
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number */
    uint8_t id;
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 596
/*
 * Header describing one batch of pages.  Packed; the integer fields are
 * converted to and from big endian by multifd_send_fill_packet() and
 * multifd_recv_unfill_packet().
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of entries of offset[] in use */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* sequence number of this packet */
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* name of the RAMBlock all pages belong to */
    char ramblock[256];
    /* offset of each page inside that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 611
/*
 * A batch of pages, all from the same RAMBlock.  offset[] and iov[] are
 * parallel arrays with 'allocated' entries, of which the first 'used' are
 * valid (see multifd_pages_init()).
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 625
/* Per-channel state on the sending side */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 666
/* Per-channel state on the receiving side */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 703
 704 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 705 {
 706     MultiFDInit_t msg = {};
 707     int ret;
 708
 709     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 710     msg.version = cpu_to_be32(MULTIFD_VERSION);
 711     msg.id = p->id;
 712     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 713
 714     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 715     if (ret != 0) {
 716         return -1;
 717     }
 718     return 0;
 719 }
 720
 721 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 722 {
 723     MultiFDInit_t msg;
 724     int ret;
 725
 726     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 727     if (ret != 0) {
 728         return -1;
 729     }
 730
 731     msg.magic = be32_to_cpu(msg.magic);
 732     msg.version = be32_to_cpu(msg.version);
 733
 734     if (msg.magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet magic %x "
 736                    "expected %x", msg.magic, MULTIFD_MAGIC);
 737         return -1;
 738     }
 739
 740     if (msg.version != MULTIFD_VERSION) {
 741         error_setg(errp, "multifd: received packet version %d "
 742                    "expected %d", msg.version, MULTIFD_VERSION);
 743         return -1;
 744     }
 745
 746     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 747         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 748         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 749
 750         error_setg(errp, "multifd: received uuid '%s' and expected "
 751                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 752         g_free(uuid);
 753         g_free(msg_uuid);
 754         return -1;
 755     }
 756
 757     if (msg.id > migrate_multifd_channels()) {
 758         error_setg(errp, "multifd: received channel version %d "
 759                    "expected %d", msg.version, MULTIFD_VERSION);
 760         return -1;
 761     }
 762
 763     return msg.id;
 764 }
 765
 766 static MultiFDPages_t *multifd_pages_init(size_t size)
 767 {
 768     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 769
 770     pages->allocated = size;
 771     pages->iov = g_new0(struct iovec, size);
 772     pages->offset = g_new0(ram_addr_t, size);
 773
 774     return pages;
 775 }
 776
 777 static void multifd_pages_clear(MultiFDPages_t *pages)
 778 {
 779     pages->used = 0;
 780     pages->allocated = 0;
 781     pages->packet_num = 0;
 782     pages->block = NULL;
 783     g_free(pages->iov);
 784     pages->iov = NULL;
 785     g_free(pages->offset);
 786     pages->offset = NULL;
 787     g_free(pages);
 788 }
 789
 790 static void multifd_send_fill_packet(MultiFDSendParams *p)
 791 {
 792     MultiFDPacket_t *packet = p->packet;
 793     int i;
 794
 795     packet->flags = cpu_to_be32(p->flags);
 796     packet->pages_alloc = cpu_to_be32(p->pages->allocated);
 797     packet->pages_used = cpu_to_be32(p->pages->used);
 798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 799     packet->packet_num = cpu_to_be64(p->packet_num);
 800
 801     if (p->pages->block) {
 802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 803     }
 804
 805     for (i = 0; i < p->pages->used; i++) {
 806         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 807     }
 808 }
 809
 810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 811 {
 812     MultiFDPacket_t *packet = p->packet;
 813     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 814     RAMBlock *block;
 815     int i;
 816
 817     packet->magic = be32_to_cpu(packet->magic);
 818     if (packet->magic != MULTIFD_MAGIC) {
 819         error_setg(errp, "multifd: received packet "
 820                    "magic %x and expected magic %x",
 821                    packet->magic, MULTIFD_MAGIC);
 822         return -1;
 823     }
 824
 825     packet->version = be32_to_cpu(packet->version);
 826     if (packet->version != MULTIFD_VERSION) {
 827         error_setg(errp, "multifd: received packet "
 828                    "version %d and expected version %d",
 829                    packet->version, MULTIFD_VERSION);
 830         return -1;
 831     }
 832
 833     p->flags = be32_to_cpu(packet->flags);
 834
 835     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 836     /*
 837      * If we received a packet that is 100 times bigger than expected
 838      * just stop migration.  It is a magic number.
 839      */
 840     if (packet->pages_alloc > pages_max * 100) {
 841         error_setg(errp, "multifd: received packet "
 842                    "with size %d and expected a maximum size of %d",
 843                    packet->pages_alloc, pages_max * 100) ;
 844         return -1;
 845     }
 846     /*
 847      * We received a packet that is bigger than expected but inside
 848      * reasonable limits (see previous comment).  Just reallocate.
 849      */
 850     if (packet->pages_alloc > p->pages->allocated) {
 851         multifd_pages_clear(p->pages);
 852         p->pages = multifd_pages_init(packet->pages_alloc);
 853     }
 854
 855     p->pages->used = be32_to_cpu(packet->pages_used);
 856     if (p->pages->used > packet->pages_alloc) {
 857         error_setg(errp, "multifd: received packet "
 858                    "with %d pages and expected maximum pages are %d",
 859                    p->pages->used, packet->pages_alloc) ;
 860         return -1;
 861     }
 862
 863     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 864     p->packet_num = be64_to_cpu(packet->packet_num);
 865
 866     if (p->pages->used == 0) {
 867         return 0;
 868     }
 869
 870     /* make sure that ramblock is 0 terminated */
 871     packet->ramblock[255] = 0;
 872     block = qemu_ram_block_by_name(packet->ramblock);
 873     if (!block) {
 874         error_setg(errp, "multifd: unknown ram block %s",
 875                    packet->ramblock);
 876         return -1;
 877     }
 878
 879     for (i = 0; i < p->pages->used; i++) {
 880         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 881
 882         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 883             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 884                        " (max " RAM_ADDR_FMT ")",
 885                        offset, block->max_length);
 886             return -1;
 887         }
 888         p->pages->iov[i].iov_base = block->host + offset;
 889         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 890     }
 891
 892     return 0;
 893 }
 894
/* Global state of the multifd send side */
struct {
    /* per-channel parameters, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
    /*
     * Have we already run terminate threads.  There is a race when it
     * happens that we got one error while we are exiting.
     * We will use atomic operations.  Only valid values are 0 and 1.
     */
    int exiting;
} *multifd_send_state;

/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create one "pages" struct for each channel, and a main one.  Each
 * time that we need to send a batch of pages we swap the main one with
 * the one of the channel that is going to send it.  There are two
 * reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex while changing it, and the channel
 * must have finished with its own struct, otherwise pending_job would
 * not be zero.
 */
 928
 929 static int multifd_send_pages(RAMState *rs)
 930 {
 931     int i;
 932     static int next_channel;
 933     MultiFDSendParams *p = NULL; /* make happy gcc */
 934     MultiFDPages_t *pages = multifd_send_state->pages;
 935     uint64_t transferred;
 936
 937     if (atomic_read(&multifd_send_state->exiting)) {
 938         return -1;
 939     }
 940
 941     qemu_sem_wait(&multifd_send_state->channels_ready);
 942     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 943         p = &multifd_send_state->params[i];
 944
 945         qemu_mutex_lock(&p->mutex);
 946         if (p->quit) {
 947             error_report("%s: channel %d has already quit!", __func__, i);
 948             qemu_mutex_unlock(&p->mutex);
 949             return -1;
 950         }
 951         if (!p->pending_job) {
 952             p->pending_job++;
 953             next_channel = (i + 1) % migrate_multifd_channels();
 954             break;
 955         }
 956         qemu_mutex_unlock(&p->mutex);
 957     }
 958     assert(!p->pages->used);
 959     assert(!p->pages->block);
 960
 961     p->packet_num = multifd_send_state->packet_num++;
 962     multifd_send_state->pages = p->pages;
 963     p->pages = pages;
 964     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 965     qemu_file_update_transfer(rs->f, transferred);
 966     ram_counters.multifd_bytes += transferred;
 967     ram_counters.transferred += transferred;;
 968     qemu_mutex_unlock(&p->mutex);
 969     qemu_sem_post(&p->sem);
 970
 971     return 1;
 972 }
 973
 974 static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 975 {
 976     MultiFDPages_t *pages = multifd_send_state->pages;
 977
 978     if (!pages->block) {
 979         pages->block = block;
 980     }
 981
 982     if (pages->block == block) {
 983         pages->offset[pages->used] = offset;
 984         pages->iov[pages->used].iov_base = block->host + offset;
 985         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 986         pages->used++;
 987
 988         if (pages->used < pages->allocated) {
 989             return 1;
 990         }
 991     }
 992
 993     if (multifd_send_pages(rs) < 0) {
 994         return -1;
 995     }
 996
 997     if (pages->block != block) {
 998         return  multifd_queue_page(rs, block, offset);
 999     }
1000
1001     return 1;
1002 }
1003
1004 static void multifd_send_terminate_threads(Error *err)
1005 {
1006     int i;
1007
1008     trace_multifd_send_terminate_threads(err != NULL);
1009
1010     if (err) {
1011         MigrationState *s = migrate_get_current();
1012         migrate_set_error(s, err);
1013         if (s->state == MIGRATION_STATUS_SETUP ||
1014             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1015             s->state == MIGRATION_STATUS_DEVICE ||
1016             s->state == MIGRATION_STATUS_ACTIVE) {
1017             migrate_set_state(&s->state, s->state,
1018                               MIGRATION_STATUS_FAILED);
1019         }
1020     }
1021
1022     /*
1023      * We don't want to exit each threads twice.  Depending on where
1024      * we get the error, or if there are two independent errors in two
1025      * threads at the same time, we can end calling this function
1026      * twice.
1027      */
1028     if (atomic_xchg(&multifd_send_state->exiting, 1)) {
1029         return;
1030     }
1031
1032     for (i = 0; i < migrate_multifd_channels(); i++) {
1033         MultiFDSendParams *p = &multifd_send_state->params[i];
1034
1035         qemu_mutex_lock(&p->mutex);
1036         p->quit = true;
1037         qemu_sem_post(&p->sem);
1038         qemu_mutex_unlock(&p->mutex);
1039     }
1040 }
1041
1042 void multifd_save_cleanup(void)
1043 {
1044     int i;
1045
1046     if (!migrate_use_multifd()) {
1047         return;
1048     }
1049     multifd_send_terminate_threads(NULL);
1050     for (i = 0; i < migrate_multifd_channels(); i++) {
1051         MultiFDSendParams *p = &multifd_send_state->params[i];
1052
1053         if (p->running) {
1054             qemu_thread_join(&p->thread);
1055         }
1056     }
1057     for (i = 0; i < migrate_multifd_channels(); i++) {
1058         MultiFDSendParams *p = &multifd_send_state->params[i];
1059
1060         socket_send_channel_destroy(p->c);
1061         p->c = NULL;
1062         qemu_mutex_destroy(&p->mutex);
1063         qemu_sem_destroy(&p->sem);
1064         qemu_sem_destroy(&p->sem_sync);
1065         g_free(p->name);
1066         p->name = NULL;
1067         multifd_pages_clear(p->pages);
1068         p->pages = NULL;
1069         p->packet_len = 0;
1070         g_free(p->packet);
1071         p->packet = NULL;
1072     }
1073     qemu_sem_destroy(&multifd_send_state->channels_ready);
1074     g_free(multifd_send_state->params);
1075     multifd_send_state->params = NULL;
1076     multifd_pages_clear(multifd_send_state->pages);
1077     multifd_send_state->pages = NULL;
1078     g_free(multifd_send_state);
1079     multifd_send_state = NULL;
1080 }
1081
1082 static void multifd_send_sync_main(RAMState *rs)
1083 {
1084     int i;
1085
1086     if (!migrate_use_multifd()) {
1087         return;
1088     }
1089     if (multifd_send_state->pages->used) {
1090         if (multifd_send_pages(rs) < 0) {
1091             error_report("%s: multifd_send_pages fail", __func__);
1092             return;
1093         }
1094     }
1095     for (i = 0; i < migrate_multifd_channels(); i++) {
1096         MultiFDSendParams *p = &multifd_send_state->params[i];
1097
1098         trace_multifd_send_sync_main_signal(p->id);
1099
1100         qemu_mutex_lock(&p->mutex);
1101
1102         if (p->quit) {
1103             error_report("%s: channel %d has already quit", __func__, i);
1104             qemu_mutex_unlock(&p->mutex);
1105             return;
1106         }
1107
1108         p->packet_num = multifd_send_state->packet_num++;
1109         p->flags |= MULTIFD_FLAG_SYNC;
1110         p->pending_job++;
1111         qemu_file_update_transfer(rs->f, p->packet_len);
1112         ram_counters.multifd_bytes += p->packet_len;
1113         ram_counters.transferred += p->packet_len;
1114         qemu_mutex_unlock(&p->mutex);
1115         qemu_sem_post(&p->sem);
1116     }
1117     for (i = 0; i < migrate_multifd_channels(); i++) {
1118         MultiFDSendParams *p = &multifd_send_state->params[i];
1119
1120         trace_multifd_send_sync_main_wait(p->id);
1121         qemu_sem_wait(&p->sem_sync);
1122     }
1123     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1124 }
1125
1126 static void *multifd_send_thread(void *opaque)
1127 {
1128     MultiFDSendParams *p = opaque;
1129     Error *local_err = NULL;
1130     int ret = 0;
1131     uint32_t flags = 0;
1132
1133     trace_multifd_send_thread_start(p->id);
1134     rcu_register_thread();
1135
1136     if (multifd_send_initial_packet(p, &local_err) < 0) {
1137         ret = -1;
1138         goto out;
1139     }
1140     /* initial packet */
1141     p->num_packets = 1;
1142
1143     while (true) {
1144         qemu_sem_wait(&p->sem);
1145
1146         if (atomic_read(&multifd_send_state->exiting)) {
1147             break;
1148         }
1149         qemu_mutex_lock(&p->mutex);
1150
1151         if (p->pending_job) {
1152             uint32_t used = p->pages->used;
1153             uint64_t packet_num = p->packet_num;
1154             flags = p->flags;
1155
1156             p->next_packet_size = used * qemu_target_page_size();
1157             multifd_send_fill_packet(p);
1158             p->flags = 0;
1159             p->num_packets++;
1160             p->num_pages += used;
1161             p->pages->used = 0;
1162             p->pages->block = NULL;
1163             qemu_mutex_unlock(&p->mutex);
1164
1165             trace_multifd_send(p->id, packet_num, used, flags,
1166                                p->next_packet_size);
1167
1168             ret = qio_channel_write_all(p->c, (void *)p->packet,
1169                                         p->packet_len, &local_err);
1170             if (ret != 0) {
1171                 break;
1172             }
1173
1174             if (used) {
1175                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1176                                              used, &local_err);
1177                 if (ret != 0) {
1178                     break;
1179                 }
1180             }
1181
1182             qemu_mutex_lock(&p->mutex);
1183             p->pending_job--;
1184             qemu_mutex_unlock(&p->mutex);
1185
1186             if (flags & MULTIFD_FLAG_SYNC) {
1187                 qemu_sem_post(&p->sem_sync);
1188             }
1189             qemu_sem_post(&multifd_send_state->channels_ready);
1190         } else if (p->quit) {
1191             qemu_mutex_unlock(&p->mutex);
1192             break;
1193         } else {
1194             qemu_mutex_unlock(&p->mutex);
1195             /* sometimes there are spurious wakeups */
1196         }
1197     }
1198
1199 out:
1200     if (local_err) {
1201         trace_multifd_send_error(p->id);
1202         multifd_send_terminate_threads(local_err);
1203     }
1204
1205     /*
1206      * Error happen, I will exit, but I can't just leave, tell
1207      * who pay attention to me.
1208      */
1209     if (ret != 0) {
1210         qemu_sem_post(&p->sem_sync);
1211         qemu_sem_post(&multifd_send_state->channels_ready);
1212     }
1213
1214     qemu_mutex_lock(&p->mutex);
1215     p->running = false;
1216     qemu_mutex_unlock(&p->mutex);
1217
1218     rcu_unregister_thread();
1219     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1220
1221     return NULL;
1222 }
1223
1224 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1225 {
1226     MultiFDSendParams *p = opaque;
1227     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1228     Error *local_err = NULL;
1229
1230     trace_multifd_new_send_channel_async(p->id);
1231     if (qio_task_propagate_error(task, &local_err)) {
1232         migrate_set_error(migrate_get_current(), local_err);
1233         multifd_save_cleanup();
1234     } else {
1235         p->c = QIO_CHANNEL(sioc);
1236         qio_channel_set_delay(p->c, false);
1237         p->running = true;
1238         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1239                            QEMU_THREAD_JOINABLE);
1240     }
1241 }
1242
1243 int multifd_save_setup(void)
1244 {
1245     int thread_count;
1246     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1247     uint8_t i;
1248
1249     if (!migrate_use_multifd()) {
1250         return 0;
1251     }
1252     thread_count = migrate_multifd_channels();
1253     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1254     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1255     multifd_send_state->pages = multifd_pages_init(page_count);
1256     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1257     atomic_set(&multifd_send_state->exiting, 0);
1258
1259     for (i = 0; i < thread_count; i++) {
1260         MultiFDSendParams *p = &multifd_send_state->params[i];
1261
1262         qemu_mutex_init(&p->mutex);
1263         qemu_sem_init(&p->sem, 0);
1264         qemu_sem_init(&p->sem_sync, 0);
1265         p->quit = false;
1266         p->pending_job = 0;
1267         p->id = i;
1268         p->pages = multifd_pages_init(page_count);
1269         p->packet_len = sizeof(MultiFDPacket_t)
1270                       + sizeof(ram_addr_t) * page_count;
1271         p->packet = g_malloc0(p->packet_len);
1272         p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1273         p->packet->version = cpu_to_be32(MULTIFD_VERSION);
1274         p->name = g_strdup_printf("multifdsend_%d", i);
1275         socket_send_channel_create(multifd_new_send_channel_async, p);
1276     }
1277     return 0;
1278 }
1279
/*
 * Global state of the multifd receiving side.  It is allocated in
 * multifd_load_setup() and released in multifd_load_cleanup().
 */
struct {
    /* per-channel parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1289
1290 static void multifd_recv_terminate_threads(Error *err)
1291 {
1292     int i;
1293
1294     trace_multifd_recv_terminate_threads(err != NULL);
1295
1296     if (err) {
1297         MigrationState *s = migrate_get_current();
1298         migrate_set_error(s, err);
1299         if (s->state == MIGRATION_STATUS_SETUP ||
1300             s->state == MIGRATION_STATUS_ACTIVE) {
1301             migrate_set_state(&s->state, s->state,
1302                               MIGRATION_STATUS_FAILED);
1303         }
1304     }
1305
1306     for (i = 0; i < migrate_multifd_channels(); i++) {
1307         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1308
1309         qemu_mutex_lock(&p->mutex);
1310         p->quit = true;
1311         /* We could arrive here for two reasons:
1312            - normal quit, i.e. everything went fine, just finished
1313            - error quit: We close the channels so the channel threads
1314              finish the qio_channel_read_all_eof() */
1315         if (p->c) {
1316             qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1317         }
1318         qemu_mutex_unlock(&p->mutex);
1319     }
1320 }
1321
1322 int multifd_load_cleanup(Error **errp)
1323 {
1324     int i;
1325     int ret = 0;
1326
1327     if (!migrate_use_multifd()) {
1328         return 0;
1329     }
1330     multifd_recv_terminate_threads(NULL);
1331     for (i = 0; i < migrate_multifd_channels(); i++) {
1332         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1333
1334         if (p->running) {
1335             p->quit = true;
1336             /*
1337              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1338              * however try to wakeup it without harm in cleanup phase.
1339              */
1340             qemu_sem_post(&p->sem_sync);
1341             qemu_thread_join(&p->thread);
1342         }
1343     }
1344     for (i = 0; i < migrate_multifd_channels(); i++) {
1345         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1346
1347         object_unref(OBJECT(p->c));
1348         p->c = NULL;
1349         qemu_mutex_destroy(&p->mutex);
1350         qemu_sem_destroy(&p->sem_sync);
1351         g_free(p->name);
1352         p->name = NULL;
1353         multifd_pages_clear(p->pages);
1354         p->pages = NULL;
1355         p->packet_len = 0;
1356         g_free(p->packet);
1357         p->packet = NULL;
1358     }
1359     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1360     g_free(multifd_recv_state->params);
1361     multifd_recv_state->params = NULL;
1362     g_free(multifd_recv_state);
1363     multifd_recv_state = NULL;
1364
1365     return ret;
1366 }
1367
1368 static void multifd_recv_sync_main(void)
1369 {
1370     int i;
1371
1372     if (!migrate_use_multifd()) {
1373         return;
1374     }
1375     for (i = 0; i < migrate_multifd_channels(); i++) {
1376         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1377
1378         trace_multifd_recv_sync_main_wait(p->id);
1379         qemu_sem_wait(&multifd_recv_state->sem_sync);
1380     }
1381     for (i = 0; i < migrate_multifd_channels(); i++) {
1382         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1383
1384         qemu_mutex_lock(&p->mutex);
1385         if (multifd_recv_state->packet_num < p->packet_num) {
1386             multifd_recv_state->packet_num = p->packet_num;
1387         }
1388         qemu_mutex_unlock(&p->mutex);
1389         trace_multifd_recv_sync_main_signal(p->id);
1390         qemu_sem_post(&p->sem_sync);
1391     }
1392     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1393 }
1394
1395 static void *multifd_recv_thread(void *opaque)
1396 {
1397     MultiFDRecvParams *p = opaque;
1398     Error *local_err = NULL;
1399     int ret;
1400
1401     trace_multifd_recv_thread_start(p->id);
1402     rcu_register_thread();
1403
1404     while (true) {
1405         uint32_t used;
1406         uint32_t flags;
1407
1408         if (p->quit) {
1409             break;
1410         }
1411
1412         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1413                                        p->packet_len, &local_err);
1414         if (ret == 0) {   /* EOF */
1415             break;
1416         }
1417         if (ret == -1) {   /* Error */
1418             break;
1419         }
1420
1421         qemu_mutex_lock(&p->mutex);
1422         ret = multifd_recv_unfill_packet(p, &local_err);
1423         if (ret) {
1424             qemu_mutex_unlock(&p->mutex);
1425             break;
1426         }
1427
1428         used = p->pages->used;
1429         flags = p->flags;
1430         trace_multifd_recv(p->id, p->packet_num, used, flags,
1431                            p->next_packet_size);
1432         p->num_packets++;
1433         p->num_pages += used;
1434         qemu_mutex_unlock(&p->mutex);
1435
1436         if (used) {
1437             ret = qio_channel_readv_all(p->c, p->pages->iov,
1438                                         used, &local_err);
1439             if (ret != 0) {
1440                 break;
1441             }
1442         }
1443
1444         if (flags & MULTIFD_FLAG_SYNC) {
1445             qemu_sem_post(&multifd_recv_state->sem_sync);
1446             qemu_sem_wait(&p->sem_sync);
1447         }
1448     }
1449
1450     if (local_err) {
1451         multifd_recv_terminate_threads(local_err);
1452     }
1453     qemu_mutex_lock(&p->mutex);
1454     p->running = false;
1455     qemu_mutex_unlock(&p->mutex);
1456
1457     rcu_unregister_thread();
1458     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1459
1460     return NULL;
1461 }
1462
1463 int multifd_load_setup(void)
1464 {
1465     int thread_count;
1466     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1467     uint8_t i;
1468
1469     if (!migrate_use_multifd()) {
1470         return 0;
1471     }
1472     thread_count = migrate_multifd_channels();
1473     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1474     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1475     atomic_set(&multifd_recv_state->count, 0);
1476     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1477
1478     for (i = 0; i < thread_count; i++) {
1479         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1480
1481         qemu_mutex_init(&p->mutex);
1482         qemu_sem_init(&p->sem_sync, 0);
1483         p->quit = false;
1484         p->id = i;
1485         p->pages = multifd_pages_init(page_count);
1486         p->packet_len = sizeof(MultiFDPacket_t)
1487                       + sizeof(ram_addr_t) * page_count;
1488         p->packet = g_malloc0(p->packet_len);
1489         p->name = g_strdup_printf("multifdrecv_%d", i);
1490     }
1491     return 0;
1492 }
1493
1494 bool multifd_recv_all_channels_created(void)
1495 {
1496     int thread_count = migrate_multifd_channels();
1497
1498     if (!migrate_use_multifd()) {
1499         return true;
1500     }
1501
1502     return thread_count == atomic_read(&multifd_recv_state->count);
1503 }
1504
1505 /*
1506  * Try to receive all multifd channels to get ready for the migration.
1507  * - Return true and do not set @errp when correctly receving all channels;
1508  * - Return false and do not set @errp when correctly receiving the current one;
1509  * - Return false and set @errp when failing to receive the current channel.
1510  */
1511 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1512 {
1513     MultiFDRecvParams *p;
1514     Error *local_err = NULL;
1515     int id;
1516
1517     id = multifd_recv_initial_packet(ioc, &local_err);
1518     if (id < 0) {
1519         multifd_recv_terminate_threads(local_err);
1520         error_propagate_prepend(errp, local_err,
1521                                 "failed to receive packet"
1522                                 " via multifd channel %d: ",
1523                                 atomic_read(&multifd_recv_state->count));
1524         return false;
1525     }
1526     trace_multifd_recv_new_channel(id);
1527
1528     p = &multifd_recv_state->params[id];
1529     if (p->c != NULL) {
1530         error_setg(&local_err, "multifd: received id '%d' already setup'",
1531                    id);
1532         multifd_recv_terminate_threads(local_err);
1533         error_propagate(errp, local_err);
1534         return false;
1535     }
1536     p->c = ioc;
1537     object_ref(OBJECT(ioc));
1538     /* initial packet */
1539     p->num_packets = 1;
1540
1541     p->running = true;
1542     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1543                        QEMU_THREAD_JOINABLE);
1544     atomic_inc(&multifd_recv_state->count);
1545     return atomic_read(&multifd_recv_state->count) ==
1546            migrate_multifd_channels();
1547 }
1548
1549 /**
1550  * save_page_header: write page header to wire
1551  *
1552  * If this is the 1st block, it also writes the block identification
1553  *
1554  * Returns the number of bytes written
1555  *
1556  * @f: QEMUFile where to send the data
1557  * @block: block that contains the page we want to send
1558  * @offset: offset inside the block for the page
1559  *          in the lower bits, it contains flags
1560  */
1561 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1562                                ram_addr_t offset)
1563 {
1564     size_t size, len;
1565
1566     if (block == rs->last_sent_block) {
1567         offset |= RAM_SAVE_FLAG_CONTINUE;
1568     }
1569     qemu_put_be64(f, offset);
1570     size = 8;
1571
1572     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1573         len = strlen(block->idstr);
1574         qemu_put_byte(f, len);
1575         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1576         size += 1 + len;
1577         rs->last_sent_block = block;
1578     }
1579     return size;
1580 }
1581
1582 /**
1583  * mig_throttle_guest_down: throotle down the guest
1584  *
1585  * Reduce amount of guest cpu execution to hopefully slow down memory
1586  * writes. If guest dirty memory rate is reduced below the rate at
1587  * which we can transfer pages to the destination then we should be
1588  * able to complete migration. Some workloads dirty memory way too
1589  * fast and will not effectively converge, even with auto-converge.
1590  */
1591 static void mig_throttle_guest_down(void)
1592 {
1593     MigrationState *s = migrate_get_current();
1594     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1595     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1596     int pct_max = s->parameters.max_cpu_throttle;
1597
1598     /* We have not started throttling yet. Let's start it. */
1599     if (!cpu_throttle_active()) {
1600         cpu_throttle_set(pct_initial);
1601     } else {
1602         /* Throttling already on, just increase the rate */
1603         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1604                          pct_max));
1605     }
1606 }
1607
1608 /**
1609  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1610  *
1611  * @rs: current RAM state
1612  * @current_addr: address for the zero page
1613  *
1614  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1615  * The important thing is that a stale (not-yet-0'd) page be replaced
1616  * by the new data.
1617  * As a bonus, if the page wasn't in the cache it gets added so that
1618  * when a small write is made into the 0'd page it gets XBZRLE sent.
1619  */
1620 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1621 {
1622     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1623         return;
1624     }
1625
1626     /* We don't care if this fails to allocate a new cache page
1627      * as long as it updated an old one */
1628     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1629                  ram_counters.dirty_sync_count);
1630 }
1631
1632 #define ENCODING_FLAG_XBZRLE 0x1
1633
1634 /**
1635  * save_xbzrle_page: compress and send current page
1636  *
1637  * Returns: 1 means that we wrote the page
1638  *          0 means that page is identical to the one already sent
1639  *          -1 means that xbzrle would be longer than normal
1640  *
1641  * @rs: current RAM state
1642  * @current_data: pointer to the address of the page contents
1643  * @current_addr: addr of the page
1644  * @block: block that contains the page we want to send
1645  * @offset: offset inside the block for the page
1646  * @last_stage: if we are at the completion stage
1647  */
1648 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1649                             ram_addr_t current_addr, RAMBlock *block,
1650                             ram_addr_t offset, bool last_stage)
1651 {
1652     int encoded_len = 0, bytes_xbzrle;
1653     uint8_t *prev_cached_page;
1654
1655     if (!cache_is_cached(XBZRLE.cache, current_addr,
1656                          ram_counters.dirty_sync_count)) {
1657         xbzrle_counters.cache_miss++;
1658         if (!last_stage) {
1659             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1660                              ram_counters.dirty_sync_count) == -1) {
1661                 return -1;
1662             } else {
1663                 /* update *current_data when the page has been
1664                    inserted into cache */
1665                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1666             }
1667         }
1668         return -1;
1669     }
1670
1671     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1672
1673     /* save current buffer into memory */
1674     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1675
1676     /* XBZRLE encoding (if there is no overflow) */
1677     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1678                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1679                                        TARGET_PAGE_SIZE);
1680
1681     /*
1682      * Update the cache contents, so that it corresponds to the data
1683      * sent, in all cases except where we skip the page.
1684      */
1685     if (!last_stage && encoded_len != 0) {
1686         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1687         /*
1688          * In the case where we couldn't compress, ensure that the caller
1689          * sends the data from the cache, since the guest might have
1690          * changed the RAM since we copied it.
1691          */
1692         *current_data = prev_cached_page;
1693     }
1694
1695     if (encoded_len == 0) {
1696         trace_save_xbzrle_page_skipping();
1697         return 0;
1698     } else if (encoded_len == -1) {
1699         trace_save_xbzrle_page_overflow();
1700         xbzrle_counters.overflow++;
1701         return -1;
1702     }
1703
1704     /* Send XBZRLE based compressed page */
1705     bytes_xbzrle = save_page_header(rs, rs->f, block,
1706                                     offset | RAM_SAVE_FLAG_XBZRLE);
1707     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1708     qemu_put_be16(rs->f, encoded_len);
1709     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1710     bytes_xbzrle += encoded_len + 1 + 2;
1711     xbzrle_counters.pages++;
1712     xbzrle_counters.bytes += bytes_xbzrle;
1713     ram_counters.transferred += bytes_xbzrle;
1714
1715     return 1;
1716 }
1717
1718 /**
1719  * migration_bitmap_find_dirty: find the next dirty page from start
1720  *
1721  * Returns the page offset within memory region of the start of a dirty page
1722  *
1723  * @rs: current RAM state
1724  * @rb: RAMBlock where to search for dirty pages
1725  * @start: page where we start the search
1726  */
1727 static inline
1728 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1729                                           unsigned long start)
1730 {
1731     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1732     unsigned long *bitmap = rb->bmap;
1733     unsigned long next;
1734
1735     if (ramblock_is_ignored(rb)) {
1736         return size;
1737     }
1738
1739     /*
1740      * When the free page optimization is enabled, we need to check the bitmap
1741      * to send the non-free pages rather than all the pages in the bulk stage.
1742      */
1743     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1744         next = start + 1;
1745     } else {
1746         next = find_next_bit(bitmap, size, start);
1747     }
1748
1749     return next;
1750 }
1751
1752 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1753                                                 RAMBlock *rb,
1754                                                 unsigned long page)
1755 {
1756     bool ret;
1757
1758     qemu_mutex_lock(&rs->bitmap_mutex);
1759
1760     /*
1761      * Clear dirty bitmap if needed.  This _must_ be called before we
1762      * send any of the page in the chunk because we need to make sure
1763      * we can capture further page content changes when we sync dirty
1764      * log the next time.  So as long as we are going to send any of
1765      * the page in the chunk we clear the remote dirty bitmap for all.
1766      * Clearing it earlier won't be a problem, but too late will.
1767      */
1768     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1769         uint8_t shift = rb->clear_bmap_shift;
1770         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1771         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1772
1773         /*
1774          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1775          * can make things easier sometimes since then start address
1776          * of the small chunk will always be 64 pages aligned so the
1777          * bitmap will always be aligned to unsigned long.  We should
1778          * even be able to remove this restriction but I'm simply
1779          * keeping it.
1780          */
1781         assert(shift >= 6);
1782         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1783         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1784     }
1785
1786     ret = test_and_clear_bit(page, rb->bmap);
1787
1788     if (ret) {
1789         rs->migration_dirty_pages--;
1790     }
1791     qemu_mutex_unlock(&rs->bitmap_mutex);
1792
1793     return ret;
1794 }
1795
1796 /* Called with RCU critical section */
1797 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1798 {
1799     rs->migration_dirty_pages +=
1800         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
1801                                               &rs->num_dirty_pages_period);
1802 }
1803
1804 /**
1805  * ram_pagesize_summary: calculate all the pagesizes of a VM
1806  *
1807  * Returns a summary bitmap of the page sizes of all RAMBlocks
1808  *
1809  * For VMs with just normal pages this is equivalent to the host page
1810  * size. If it's got some huge pages then it's the OR of all the
1811  * different page sizes.
1812  */
1813 uint64_t ram_pagesize_summary(void)
1814 {
1815     RAMBlock *block;
1816     uint64_t summary = 0;
1817
1818     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1819         summary |= block->page_size;
1820     }
1821
1822     return summary;
1823 }
1824
1825 uint64_t ram_get_total_transferred_pages(void)
1826 {
1827     return  ram_counters.normal + ram_counters.duplicate +
1828                 compression_counters.pages + xbzrle_counters.pages;
1829 }
1830
1831 static void migration_update_rates(RAMState *rs, int64_t end_time)
1832 {
1833     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1834     double compressed_size;
1835
1836     /* calculate period counters */
1837     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1838                 / (end_time - rs->time_last_bitmap_sync);
1839
1840     if (!page_count) {
1841         return;
1842     }
1843
1844     if (migrate_use_xbzrle()) {
1845         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1846             rs->xbzrle_cache_miss_prev) / page_count;
1847         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1848     }
1849
1850     if (migrate_use_compression()) {
1851         compression_counters.busy_rate = (double)(compression_counters.busy -
1852             rs->compress_thread_busy_prev) / page_count;
1853         rs->compress_thread_busy_prev = compression_counters.busy;
1854
1855         compressed_size = compression_counters.compressed_size -
1856                           rs->compressed_size_prev;
1857         if (compressed_size) {
1858             double uncompressed_size = (compression_counters.pages -
1859                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1860
1861             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1862             compression_counters.compression_rate =
1863                                         uncompressed_size / compressed_size;
1864
1865             rs->compress_pages_prev = compression_counters.pages;
1866             rs->compressed_size_prev = compression_counters.compressed_size;
1867         }
1868     }
1869 }
1870
1871 static void migration_bitmap_sync(RAMState *rs)
1872 {
1873     RAMBlock *block;
1874     int64_t end_time;
1875     uint64_t bytes_xfer_now;
1876
1877     ram_counters.dirty_sync_count++;
1878
1879     if (!rs->time_last_bitmap_sync) {
1880         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1881     }
1882
1883     trace_migration_bitmap_sync_start();
1884     memory_global_dirty_log_sync();
1885
1886     qemu_mutex_lock(&rs->bitmap_mutex);
1887     WITH_RCU_READ_LOCK_GUARD() {
1888         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1889             ramblock_sync_dirty_bitmap(rs, block);
1890         }
1891         ram_counters.remaining = ram_bytes_remaining();
1892     }
1893     qemu_mutex_unlock(&rs->bitmap_mutex);
1894
1895     memory_global_after_dirty_log_sync();
1896     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1897
1898     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1899
1900     /* more than 1 second = 1000 millisecons */
1901     if (end_time > rs->time_last_bitmap_sync + 1000) {
1902         bytes_xfer_now = ram_counters.transferred;
1903
1904         /* During block migration the auto-converge logic incorrectly detects
1905          * that ram migration makes no progress. Avoid this by disabling the
1906          * throttling logic during the bulk phase of block migration. */
1907         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1908             /* The following detection logic can be refined later. For now:
1909                Check to see if the dirtied bytes is 50% more than the approx.
1910                amount of bytes that just got transferred since the last time we
1911                were in this routine. If that happens twice, start or increase
1912                throttling */
1913
1914             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1915                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1916                 (++rs->dirty_rate_high_cnt >= 2)) {
1917                     trace_migration_throttle();
1918                     rs->dirty_rate_high_cnt = 0;
1919                     mig_throttle_guest_down();
1920             }
1921         }
1922
1923         migration_update_rates(rs, end_time);
1924
1925         rs->target_page_count_prev = rs->target_page_count;
1926
1927         /* reset period counters */
1928         rs->time_last_bitmap_sync = end_time;
1929         rs->num_dirty_pages_period = 0;
1930         rs->bytes_xfer_prev = bytes_xfer_now;
1931     }
1932     if (migrate_use_events()) {
1933         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1934     }
1935 }
1936
1937 static void migration_bitmap_sync_precopy(RAMState *rs)
1938 {
1939     Error *local_err = NULL;
1940
1941     /*
1942      * The current notifier usage is just an optimization to migration, so we
1943      * don't stop the normal migration process in the error case.
1944      */
1945     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1946         error_report_err(local_err);
1947     }
1948
1949     migration_bitmap_sync(rs);
1950
1951     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1952         error_report_err(local_err);
1953     }
1954 }
1955
1956 /**
1957  * save_zero_page_to_file: send the zero page to the file
1958  *
1959  * Returns the size of data written to the file, 0 means the page is not
1960  * a zero page
1961  *
1962  * @rs: current RAM state
1963  * @file: the file where the data is saved
1964  * @block: block that contains the page we want to send
1965  * @offset: offset inside the block for the page
1966  */
1967 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1968                                   RAMBlock *block, ram_addr_t offset)
1969 {
1970     uint8_t *p = block->host + offset;
1971     int len = 0;
1972
1973     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1974         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1975         qemu_put_byte(file, 0);
1976         len += 1;
1977     }
1978     return len;
1979 }
1980
1981 /**
1982  * save_zero_page: send the zero page to the stream
1983  *
1984  * Returns the number of pages written.
1985  *
1986  * @rs: current RAM state
1987  * @block: block that contains the page we want to send
1988  * @offset: offset inside the block for the page
1989  */
1990 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1991 {
1992     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1993
1994     if (len) {
1995         ram_counters.duplicate++;
1996         ram_counters.transferred += len;
1997         return 1;
1998     }
1999     return -1;
2000 }
2001
2002 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
2003 {
2004     if (!migrate_release_ram() || !migration_in_postcopy()) {
2005         return;
2006     }
2007
2008     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
2009 }
2010
2011 /*
2012  * @pages: the number of pages written by the control path,
2013  *        < 0 - error
2014  *        > 0 - number of pages written
2015  *
2016  * Return true if the pages has been saved, otherwise false is returned.
2017  */
2018 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2019                               int *pages)
2020 {
2021     uint64_t bytes_xmit = 0;
2022     int ret;
2023
2024     *pages = -1;
2025     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
2026                                 &bytes_xmit);
2027     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
2028         return false;
2029     }
2030
2031     if (bytes_xmit) {
2032         ram_counters.transferred += bytes_xmit;
2033         *pages = 1;
2034     }
2035
2036     if (ret == RAM_SAVE_CONTROL_DELAYED) {
2037         return true;
2038     }
2039
2040     if (bytes_xmit > 0) {
2041         ram_counters.normal++;
2042     } else if (bytes_xmit == 0) {
2043         ram_counters.duplicate++;
2044     }
2045
2046     return true;
2047 }
2048
2049 /*
2050  * directly send the page to the stream
2051  *
2052  * Returns the number of pages written.
2053  *
2054  * @rs: current RAM state
2055  * @block: block that contains the page we want to send
2056  * @offset: offset inside the block for the page
2057  * @buf: the page to be sent
2058  * @async: send to page asyncly
2059  */
2060 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2061                             uint8_t *buf, bool async)
2062 {
2063     ram_counters.transferred += save_page_header(rs, rs->f, block,
2064                                                  offset | RAM_SAVE_FLAG_PAGE);
2065     if (async) {
2066         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2067                               migrate_release_ram() &
2068                               migration_in_postcopy());
2069     } else {
2070         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2071     }
2072     ram_counters.transferred += TARGET_PAGE_SIZE;
2073     ram_counters.normal++;
2074     return 1;
2075 }
2076
2077 /**
2078  * ram_save_page: send the given page to the stream
2079  *
2080  * Returns the number of pages written.
2081  *          < 0 - error
2082  *          >=0 - Number of pages written - this might legally be 0
2083  *                if xbzrle noticed the page was the same.
2084  *
2085  * @rs: current RAM state
2086  * @block: block that contains the page we want to send
2087  * @offset: offset inside the block for the page
2088  * @last_stage: if we are at the completion stage
2089  */
2090 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2091 {
2092     int pages = -1;
2093     uint8_t *p;
2094     bool send_async = true;
2095     RAMBlock *block = pss->block;
2096     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2097     ram_addr_t current_addr = block->offset + offset;
2098
2099     p = block->host + offset;
2100     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2101
2102     XBZRLE_cache_lock();
2103     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2104         migrate_use_xbzrle()) {
2105         pages = save_xbzrle_page(rs, &p, current_addr, block,
2106                                  offset, last_stage);
2107         if (!last_stage) {
2108             /* Can't send this cached data async, since the cache page
2109              * might get updated before it gets to the wire
2110              */
2111             send_async = false;
2112         }
2113     }
2114
2115     /* XBZRLE overflow or normal page */
2116     if (pages == -1) {
2117         pages = save_normal_page(rs, block, offset, p, send_async);
2118     }
2119
2120     XBZRLE_cache_unlock();
2121
2122     return pages;
2123 }
2124
2125 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2126                                  ram_addr_t offset)
2127 {
2128     if (multifd_queue_page(rs, block, offset) < 0) {
2129         return -1;
2130     }
2131     ram_counters.normal++;
2132
2133     return 1;
2134 }
2135
2136 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2137                                  ram_addr_t offset, uint8_t *source_buf)
2138 {
2139     RAMState *rs = ram_state;
2140     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2141     bool zero_page = false;
2142     int ret;
2143
2144     if (save_zero_page_to_file(rs, f, block, offset)) {
2145         zero_page = true;
2146         goto exit;
2147     }
2148
2149     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2150
2151     /*
2152      * copy it to a internal buffer to avoid it being modified by VM
2153      * so that we can catch up the error during compression and
2154      * decompression
2155      */
2156     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2157     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2158     if (ret < 0) {
2159         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2160         error_report("compressed data failed!");
2161         return false;
2162     }
2163
2164 exit:
2165     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2166     return zero_page;
2167 }
2168
2169 static void
2170 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2171 {
2172     ram_counters.transferred += bytes_xmit;
2173
2174     if (param->zero_page) {
2175         ram_counters.duplicate++;
2176         return;
2177     }
2178
2179     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2180     compression_counters.compressed_size += bytes_xmit - 8;
2181     compression_counters.pages++;
2182 }
2183
2184 static bool save_page_use_compression(RAMState *rs);
2185
2186 static void flush_compressed_data(RAMState *rs)
2187 {
2188     int idx, len, thread_count;
2189
2190     if (!save_page_use_compression(rs)) {
2191         return;
2192     }
2193     thread_count = migrate_compress_threads();
2194
2195     qemu_mutex_lock(&comp_done_lock);
2196     for (idx = 0; idx < thread_count; idx++) {
2197         while (!comp_param[idx].done) {
2198             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2199         }
2200     }
2201     qemu_mutex_unlock(&comp_done_lock);
2202
2203     for (idx = 0; idx < thread_count; idx++) {
2204         qemu_mutex_lock(&comp_param[idx].mutex);
2205         if (!comp_param[idx].quit) {
2206             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2207             /*
2208              * it's safe to fetch zero_page without holding comp_done_lock
2209              * as there is no further request submitted to the thread,
2210              * i.e, the thread should be waiting for a request at this point.
2211              */
2212             update_compress_thread_counts(&comp_param[idx], len);
2213         }
2214         qemu_mutex_unlock(&comp_param[idx].mutex);
2215     }
2216 }
2217
2218 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
2219                                        ram_addr_t offset)
2220 {
2221     param->block = block;
2222     param->offset = offset;
2223 }
2224
2225 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2226                                            ram_addr_t offset)
2227 {
2228     int idx, thread_count, bytes_xmit = -1, pages = -1;
2229     bool wait = migrate_compress_wait_thread();
2230
2231     thread_count = migrate_compress_threads();
2232     qemu_mutex_lock(&comp_done_lock);
2233 retry:
2234     for (idx = 0; idx < thread_count; idx++) {
2235         if (comp_param[idx].done) {
2236             comp_param[idx].done = false;
2237             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2238             qemu_mutex_lock(&comp_param[idx].mutex);
2239             set_compress_params(&comp_param[idx], block, offset);
2240             qemu_cond_signal(&comp_param[idx].cond);
2241             qemu_mutex_unlock(&comp_param[idx].mutex);
2242             pages = 1;
2243             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2244             break;
2245         }
2246     }
2247
2248     /*
2249      * wait for the free thread if the user specifies 'compress-wait-thread',
2250      * otherwise we will post the page out in the main thread as normal page.
2251      */
2252     if (pages < 0 && wait) {
2253         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2254         goto retry;
2255     }
2256     qemu_mutex_unlock(&comp_done_lock);
2257
2258     return pages;
2259 }
2260
2261 /**
2262  * find_dirty_block: find the next dirty page and update any state
2263  * associated with the search process.
2264  *
2265  * Returns true if a page is found
2266  *
2267  * @rs: current RAM state
2268  * @pss: data about the state of the current dirty page scan
2269  * @again: set to false if the search has scanned the whole of RAM
2270  */
2271 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
2272 {
2273     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2274     if (pss->complete_round && pss->block == rs->last_seen_block &&
2275         pss->page >= rs->last_page) {
2276         /*
2277          * We've been once around the RAM and haven't found anything.
2278          * Give up.
2279          */
2280         *again = false;
2281         return false;
2282     }
2283     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
2284         /* Didn't find anything in this RAM Block */
2285         pss->page = 0;
2286         pss->block = QLIST_NEXT_RCU(pss->block, next);
2287         if (!pss->block) {
2288             /*
2289              * If memory migration starts over, we will meet a dirtied page
2290              * which may still exists in compression threads's ring, so we
2291              * should flush the compressed data to make sure the new page
2292              * is not overwritten by the old one in the destination.
2293              *
2294              * Also If xbzrle is on, stop using the data compression at this
2295              * point. In theory, xbzrle can do better than compression.
2296              */
2297             flush_compressed_data(rs);
2298
2299             /* Hit the end of the list */
2300             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2301             /* Flag that we've looped */
2302             pss->complete_round = true;
2303             rs->ram_bulk_stage = false;
2304         }
2305         /* Didn't find anything this time, but try again on the new block */
2306         *again = true;
2307         return false;
2308     } else {
2309         /* Can go around again, but... */
2310         *again = true;
2311         /* We've found something so probably don't need to */
2312         return true;
2313     }
2314 }
2315
2316 /**
2317  * unqueue_page: gets a page of the queue
2318  *
2319  * Helper for 'get_queued_page' - gets a page off the queue
2320  *
2321  * Returns the block of the page (or NULL if none available)
2322  *
2323  * @rs: current RAM state
2324  * @offset: used to return the offset within the RAMBlock
2325  */
2326 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2327 {
2328     RAMBlock *block = NULL;
2329
2330     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2331         return NULL;
2332     }
2333
2334     qemu_mutex_lock(&rs->src_page_req_mutex);
2335     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2336         struct RAMSrcPageRequest *entry =
2337                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2338         block = entry->rb;
2339         *offset = entry->offset;
2340
2341         if (entry->len > TARGET_PAGE_SIZE) {
2342             entry->len -= TARGET_PAGE_SIZE;
2343             entry->offset += TARGET_PAGE_SIZE;
2344         } else {
2345             memory_region_unref(block->mr);
2346             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2347             g_free(entry);
2348             migration_consume_urgent_request();
2349         }
2350     }
2351     qemu_mutex_unlock(&rs->src_page_req_mutex);
2352
2353     return block;
2354 }
2355
2356 /**
2357  * get_queued_page: unqueue a page from the postcopy requests
2358  *
2359  * Skips pages that are already sent (!dirty)
2360  *
2361  * Returns true if a queued page is found
2362  *
2363  * @rs: current RAM state
2364  * @pss: data about the state of the current dirty page scan
2365  */
2366 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2367 {
2368     RAMBlock  *block;
2369     ram_addr_t offset;
2370     bool dirty;
2371
2372     do {
2373         block = unqueue_page(rs, &offset);
2374         /*
2375          * We're sending this page, and since it's postcopy nothing else
2376          * will dirty it, and we must make sure it doesn't get sent again
2377          * even if this queue request was received after the background
2378          * search already sent it.
2379          */
2380         if (block) {
2381             unsigned long page;
2382
2383             page = offset >> TARGET_PAGE_BITS;
2384             dirty = test_bit(page, block->bmap);
2385             if (!dirty) {
2386                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2387                                                 page);
2388             } else {
2389                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2390             }
2391         }
2392
2393     } while (block && !dirty);
2394
2395     if (block) {
2396         /*
2397          * As soon as we start servicing pages out of order, then we have
2398          * to kill the bulk stage, since the bulk stage assumes
2399          * in (migration_bitmap_find_and_reset_dirty) that every page is
2400          * dirty, that's no longer true.
2401          */
2402         rs->ram_bulk_stage = false;
2403
2404         /*
2405          * We want the background search to continue from the queued page
2406          * since the guest is likely to want other pages near to the page
2407          * it just requested.
2408          */
2409         pss->block = block;
2410         pss->page = offset >> TARGET_PAGE_BITS;
2411
2412         /*
2413          * This unqueued page would break the "one round" check, even is
2414          * really rare.
2415          */
2416         pss->complete_round = false;
2417     }
2418
2419     return !!block;
2420 }
2421
2422 /**
2423  * migration_page_queue_free: drop any remaining pages in the ram
2424  * request queue
2425  *
2426  * It should be empty at the end anyway, but in error cases there may
2427  * be some left.  in case that there is any page left, we drop it.
2428  *
2429  */
2430 static void migration_page_queue_free(RAMState *rs)
2431 {
2432     struct RAMSrcPageRequest *mspr, *next_mspr;
2433     /* This queue generally should be empty - but in the case of a failed
2434      * migration might have some droppings in.
2435      */
2436     RCU_READ_LOCK_GUARD();
2437     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2438         memory_region_unref(mspr->rb->mr);
2439         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2440         g_free(mspr);
2441     }
2442 }
2443
2444 /**
2445  * ram_save_queue_pages: queue the page for transmission
2446  *
2447  * A request from postcopy destination for example.
2448  *
2449  * Returns zero on success or negative on error
2450  *
2451  * @rbname: Name of the RAMBLock of the request. NULL means the
2452  *          same that last one.
2453  * @start: starting address from the start of the RAMBlock
2454  * @len: length (in bytes) to send
2455  */
2456 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2457 {
2458     RAMBlock *ramblock;
2459     RAMState *rs = ram_state;
2460
2461     ram_counters.postcopy_requests++;
2462     RCU_READ_LOCK_GUARD();
2463
2464     if (!rbname) {
2465         /* Reuse last RAMBlock */
2466         ramblock = rs->last_req_rb;
2467
2468         if (!ramblock) {
2469             /*
2470              * Shouldn't happen, we can't reuse the last RAMBlock if
2471              * it's the 1st request.
2472              */
2473             error_report("ram_save_queue_pages no previous block");
2474             return -1;
2475         }
2476     } else {
2477         ramblock = qemu_ram_block_by_name(rbname);
2478
2479         if (!ramblock) {
2480             /* We shouldn't be asked for a non-existent RAMBlock */
2481             error_report("ram_save_queue_pages no block '%s'", rbname);
2482             return -1;
2483         }
2484         rs->last_req_rb = ramblock;
2485     }
2486     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2487     if (start+len > ramblock->used_length) {
2488         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2489                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2490                      __func__, start, len, ramblock->used_length);
2491         return -1;
2492     }
2493
2494     struct RAMSrcPageRequest *new_entry =
2495         g_malloc0(sizeof(struct RAMSrcPageRequest));
2496     new_entry->rb = ramblock;
2497     new_entry->offset = start;
2498     new_entry->len = len;
2499
2500     memory_region_ref(ramblock->mr);
2501     qemu_mutex_lock(&rs->src_page_req_mutex);
2502     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2503     migration_make_urgent_request();
2504     qemu_mutex_unlock(&rs->src_page_req_mutex);
2505
2506     return 0;
2507 }
2508
2509 static bool save_page_use_compression(RAMState *rs)
2510 {
2511     if (!migrate_use_compression()) {
2512         return false;
2513     }
2514
2515     /*
2516      * If xbzrle is on, stop using the data compression after first
2517      * round of migration even if compression is enabled. In theory,
2518      * xbzrle can do better than compression.
2519      */
2520     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2521         return true;
2522     }
2523
2524     return false;
2525 }
2526
2527 /*
2528  * try to compress the page before posting it out, return true if the page
2529  * has been properly handled by compression, otherwise needs other
2530  * paths to handle it
2531  */
2532 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2533 {
2534     if (!save_page_use_compression(rs)) {
2535         return false;
2536     }
2537
2538     /*
2539      * When starting the process of a new block, the first page of
2540      * the block should be sent out before other pages in the same
2541      * block, and all the pages in last block should have been sent
2542      * out, keeping this order is important, because the 'cont' flag
2543      * is used to avoid resending the block name.
2544      *
2545      * We post the fist page as normal page as compression will take
2546      * much CPU resource.
2547      */
2548     if (block != rs->last_sent_block) {
2549         flush_compressed_data(rs);
2550         return false;
2551     }
2552
2553     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2554         return true;
2555     }
2556
2557     compression_counters.busy++;
2558     return false;
2559 }
2560
2561 /**
2562  * ram_save_target_page: save one target page
2563  *
2564  * Returns the number of pages written
2565  *
2566  * @rs: current RAM state
2567  * @pss: data about the page we want to send
2568  * @last_stage: if we are at the completion stage
2569  */
2570 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2571                                 bool last_stage)
2572 {
2573     RAMBlock *block = pss->block;
2574     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2575     int res;
2576
2577     if (control_save_page(rs, block, offset, &res)) {
2578         return res;
2579     }
2580
2581     if (save_compress_page(rs, block, offset)) {
2582         return 1;
2583     }
2584
2585     res = save_zero_page(rs, block, offset);
2586     if (res > 0) {
2587         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2588          * page would be stale
2589          */
2590         if (!save_page_use_compression(rs)) {
2591             XBZRLE_cache_lock();
2592             xbzrle_cache_zero_page(rs, block->offset + offset);
2593             XBZRLE_cache_unlock();
2594         }
2595         ram_release_pages(block->idstr, offset, res);
2596         return res;
2597     }
2598
2599     /*
2600      * Do not use multifd for:
2601      * 1. Compression as the first page in the new block should be posted out
2602      *    before sending the compressed page
2603      * 2. In postcopy as one whole host page should be placed
2604      */
2605     if (!save_page_use_compression(rs) && migrate_use_multifd()
2606         && !migration_in_postcopy()) {
2607         return ram_save_multifd_page(rs, block, offset);
2608     }
2609
2610     return ram_save_page(rs, pss, last_stage);
2611 }
2612
2613 /**
2614  * ram_save_host_page: save a whole host page
2615  *
2616  * Starting at *offset send pages up to the end of the current host
2617  * page. It's valid for the initial offset to point into the middle of
2618  * a host page in which case the remainder of the hostpage is sent.
2619  * Only dirty target pages are sent. Note that the host page size may
2620  * be a huge page for this block.
2621  * The saving stops at the boundary of the used_length of the block
2622  * if the RAMBlock isn't a multiple of the host page size.
2623  *
2624  * Returns the number of pages written or negative on error
2625  *
2626  * @rs: current RAM state
2627  * @ms: current migration state
2628  * @pss: data about the page we want to send
2629  * @last_stage: if we are at the completion stage
2630  */
2631 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2632                               bool last_stage)
2633 {
2634     int tmppages, pages = 0;
2635     size_t pagesize_bits =
2636         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2637
2638     if (ramblock_is_ignored(pss->block)) {
2639         error_report("block %s should not be migrated !", pss->block->idstr);
2640         return 0;
2641     }
2642
2643     do {
2644         /* Check the pages is dirty and if it is send it */
2645         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2646             pss->page++;
2647             continue;
2648         }
2649
2650         tmppages = ram_save_target_page(rs, pss, last_stage);
2651         if (tmppages < 0) {
2652             return tmppages;
2653         }
2654
2655         pages += tmppages;
2656         pss->page++;
2657         /* Allow rate limiting to happen in the middle of huge pages */
2658         migration_rate_limit();
2659     } while ((pss->page & (pagesize_bits - 1)) &&
2660              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2661
2662     /* The offset we leave with is the last one we looked at */
2663     pss->page--;
2664     return pages;
2665 }
2666
2667 /**
2668  * ram_find_and_save_block: finds a dirty page and sends it to f
2669  *
2670  * Called within an RCU critical section.
2671  *
2672  * Returns the number of pages written where zero means no dirty pages,
2673  * or negative on error
2674  *
2675  * @rs: current RAM state
2676  * @last_stage: if we are at the completion stage
2677  *
2678  * On systems where host-page-size > target-page-size it will send all the
2679  * pages in a host page that are dirty.
2680  */
2681
2682 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2683 {
2684     PageSearchStatus pss;
2685     int pages = 0;
2686     bool again, found;
2687
2688     /* No dirty page as there is zero RAM */
2689     if (!ram_bytes_total()) {
2690         return pages;
2691     }
2692
2693     pss.block = rs->last_seen_block;
2694     pss.page = rs->last_page;
2695     pss.complete_round = false;
2696
2697     if (!pss.block) {
2698         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2699     }
2700
2701     do {
2702         again = true;
2703         found = get_queued_page(rs, &pss);
2704
2705         if (!found) {
2706             /* priority queue empty, so just search for something dirty */
2707             found = find_dirty_block(rs, &pss, &again);
2708         }
2709
2710         if (found) {
2711             pages = ram_save_host_page(rs, &pss, last_stage);
2712         }
2713     } while (!pages && again);
2714
2715     rs->last_seen_block = pss.block;
2716     rs->last_page = pss.page;
2717
2718     return pages;
2719 }
2720
2721 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2722 {
2723     uint64_t pages = size / TARGET_PAGE_SIZE;
2724
2725     if (zero) {
2726         ram_counters.duplicate += pages;
2727     } else {
2728         ram_counters.normal += pages;
2729         ram_counters.transferred += size;
2730         qemu_update_position(f, size);
2731     }
2732 }
2733
2734 static uint64_t ram_bytes_total_common(bool count_ignored)
2735 {
2736     RAMBlock *block;
2737     uint64_t total = 0;
2738
2739     RCU_READ_LOCK_GUARD();
2740
2741     if (count_ignored) {
2742         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2743             total += block->used_length;
2744         }
2745     } else {
2746         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2747             total += block->used_length;
2748         }
2749     }
2750     return total;
2751 }
2752
2753 uint64_t ram_bytes_total(void)
2754 {
2755     return ram_bytes_total_common(false);
2756 }
2757
2758 static void xbzrle_load_setup(void)
2759 {
2760     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2761 }
2762
2763 static void xbzrle_load_cleanup(void)
2764 {
2765     g_free(XBZRLE.decoded_buf);
2766     XBZRLE.decoded_buf = NULL;
2767 }
2768
2769 static void ram_state_cleanup(RAMState **rsp)
2770 {
2771     if (*rsp) {
2772         migration_page_queue_free(*rsp);
2773         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2774         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2775         g_free(*rsp);
2776         *rsp = NULL;
2777     }
2778 }
2779
2780 static void xbzrle_cleanup(void)
2781 {
2782     XBZRLE_cache_lock();
2783     if (XBZRLE.cache) {
2784         cache_fini(XBZRLE.cache);
2785         g_free(XBZRLE.encoded_buf);
2786         g_free(XBZRLE.current_buf);
2787         g_free(XBZRLE.zero_target_page);
2788         XBZRLE.cache = NULL;
2789         XBZRLE.encoded_buf = NULL;
2790         XBZRLE.current_buf = NULL;
2791         XBZRLE.zero_target_page = NULL;
2792     }
2793     XBZRLE_cache_unlock();
2794 }
2795
2796 static void ram_save_cleanup(void *opaque)
2797 {
2798     RAMState **rsp = opaque;
2799     RAMBlock *block;
2800
2801     /* caller have hold iothread lock or is in a bh, so there is
2802      * no writing race against the migration bitmap
2803      */
2804     memory_global_dirty_log_stop();
2805
2806     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2807         g_free(block->clear_bmap);
2808         block->clear_bmap = NULL;
2809         g_free(block->bmap);
2810         block->bmap = NULL;
2811     }
2812
2813     xbzrle_cleanup();
2814     compress_threads_save_cleanup();
2815     ram_state_cleanup(rsp);
2816 }
2817
2818 static void ram_state_reset(RAMState *rs)
2819 {
2820     rs->last_seen_block = NULL;
2821     rs->last_sent_block = NULL;
2822     rs->last_page = 0;
2823     rs->last_version = ram_list.version;
2824     rs->ram_bulk_stage = true;
2825     rs->fpo_enabled = false;
2826 }
2827
2828 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2829
/*
 * Debug helper: print the first @pages bits of @todump to stderr, up to
 * 128 bits per line ('1' for a set bit, '.' for a clear one), each line
 * prefixed with the index of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * A NULL 'todump' is tolerated: nothing is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* There is no default bitmap to fall back to, so bail out early */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        /* Only print lines containing at least one unexpected bit */
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2863
2864 /* **** functions for postcopy ***** */
2865
2866 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2867 {
2868     struct RAMBlock *block;
2869
2870     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2871         unsigned long *bitmap = block->bmap;
2872         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2873         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2874
2875         while (run_start < range) {
2876             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2877             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2878                               (run_end - run_start) << TARGET_PAGE_BITS);
2879             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2880         }
2881     }
2882 }
2883
2884 /**
2885  * postcopy_send_discard_bm_ram: discard a RAMBlock
2886  *
2887  * Returns zero on success
2888  *
2889  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2890  *
2891  * @ms: current migration state
2892  * @block: RAMBlock to discard
2893  */
2894 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2895 {
2896     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2897     unsigned long current;
2898     unsigned long *bitmap = block->bmap;
2899
2900     for (current = 0; current < end; ) {
2901         unsigned long one = find_next_bit(bitmap, end, current);
2902         unsigned long zero, discard_length;
2903
2904         if (one >= end) {
2905             break;
2906         }
2907
2908         zero = find_next_zero_bit(bitmap, end, one + 1);
2909
2910         if (zero >= end) {
2911             discard_length = end - one;
2912         } else {
2913             discard_length = zero - one;
2914         }
2915         postcopy_discard_send_range(ms, one, discard_length);
2916         current = one + discard_length;
2917     }
2918
2919     return 0;
2920 }
2921
2922 /**
2923  * postcopy_each_ram_send_discard: discard all RAMBlocks
2924  *
2925  * Returns 0 for success or negative for error
2926  *
2927  * Utility for the outgoing postcopy code.
2928  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2929  *   passing it bitmap indexes and name.
2930  * (qemu_ram_foreach_block ends up passing unscaled lengths
2931  *  which would mean postcopy code would have to deal with target page)
2932  *
2933  * @ms: current migration state
2934  */
2935 static int postcopy_each_ram_send_discard(MigrationState *ms)
2936 {
2937     struct RAMBlock *block;
2938     int ret;
2939
2940     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2941         postcopy_discard_send_init(ms, block->idstr);
2942
2943         /*
2944          * Postcopy sends chunks of bitmap over the wire, but it
2945          * just needs indexes at this point, avoids it having
2946          * target page specific code.
2947          */
2948         ret = postcopy_send_discard_bm_ram(ms, block);
2949         postcopy_discard_send_finish(ms);
2950         if (ret) {
2951             return ret;
2952         }
2953     }
2954
2955     return 0;
2956 }
2957
2958 /**
2959  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2960  *
2961  * Helper for postcopy_chunk_hostpages; it's called twice to
2962  * canonicalize the two bitmaps, that are similar, but one is
2963  * inverted.
2964  *
2965  * Postcopy requires that all target pages in a hostpage are dirty or
2966  * clean, not a mix.  This function canonicalizes the bitmaps.
2967  *
2968  * @ms: current migration state
2969  * @block: block that contains the page we want to canonicalize
2970  */
2971 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2972 {
2973     RAMState *rs = ram_state;
2974     unsigned long *bitmap = block->bmap;
2975     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2976     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2977     unsigned long run_start;
2978
2979     if (block->page_size == TARGET_PAGE_SIZE) {
2980         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2981         return;
2982     }
2983
2984     /* Find a dirty page */
2985     run_start = find_next_bit(bitmap, pages, 0);
2986
2987     while (run_start < pages) {
2988
2989         /*
2990          * If the start of this run of pages is in the middle of a host
2991          * page, then we need to fixup this host page.
2992          */
2993         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2994             /* Find the end of this run */
2995             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2996             /*
2997              * If the end isn't at the start of a host page, then the
2998              * run doesn't finish at the end of a host page
2999              * and we need to discard.
3000              */
3001         }
3002
3003         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
3004             unsigned long page;
3005             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
3006                                                              host_ratio);
3007             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
3008
3009             /* Clean up the bitmap */
3010             for (page = fixup_start_addr;
3011                  page < fixup_start_addr + host_ratio; page++) {
3012                 /*
3013                  * Remark them as dirty, updating the count for any pages
3014                  * that weren't previously dirty.
3015                  */
3016                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
3017             }
3018         }
3019
3020         /* Find the next dirty page for the next iteration */
3021         run_start = find_next_bit(bitmap, pages, run_start);
3022     }
3023 }
3024
3025 /**
3026  * postcopy_chunk_hostpages: discard any partially sent host page
3027  *
3028  * Utility for the outgoing postcopy code.
3029  *
3030  * Discard any partially sent host-page size chunks, mark any partially
3031  * dirty host-page size chunks as all dirty.  In this case the host-page
3032  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
3033  *
3034  * Returns zero on success
3035  *
3036  * @ms: current migration state
3037  * @block: block we want to work with
3038  */
3039 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3040 {
3041     postcopy_discard_send_init(ms, block->idstr);
3042
3043     /*
3044      * Ensure that all partially dirty host pages are made fully dirty.
3045      */
3046     postcopy_chunk_hostpages_pass(ms, block);
3047
3048     postcopy_discard_send_finish(ms);
3049     return 0;
3050 }
3051
3052 /**
3053  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3054  *
3055  * Returns zero on success
3056  *
3057  * Transmit the set of pages to be discarded after precopy to the target
3058  * these are pages that:
3059  *     a) Have been previously transmitted but are now dirty again
3060  *     b) Pages that have never been transmitted, this ensures that
3061  *        any pages on the destination that have been mapped by background
3062  *        tasks get discarded (transparent huge pages is the specific concern)
3063  * Hopefully this is pretty sparse
3064  *
3065  * @ms: current migration state
3066  */
3067 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3068 {
3069     RAMState *rs = ram_state;
3070     RAMBlock *block;
3071     int ret;
3072
3073     RCU_READ_LOCK_GUARD();
3074
3075     /* This should be our last sync, the src is now paused */
3076     migration_bitmap_sync(rs);
3077
3078     /* Easiest way to make sure we don't resume in the middle of a host-page */
3079     rs->last_seen_block = NULL;
3080     rs->last_sent_block = NULL;
3081     rs->last_page = 0;
3082
3083     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3084         /* Deal with TPS != HPS and huge pages */
3085         ret = postcopy_chunk_hostpages(ms, block);
3086         if (ret) {
3087             return ret;
3088         }
3089
3090 #ifdef DEBUG_POSTCOPY
3091         ram_debug_dump_bitmap(block->bmap, true,
3092                               block->used_length >> TARGET_PAGE_BITS);
3093 #endif
3094     }
3095     trace_ram_postcopy_send_discard_bitmap();
3096
3097     ret = postcopy_each_ram_send_discard(ms);
3098
3099     return ret;
3100 }
3101
3102 /**
3103  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3104  *
3105  * Returns zero on success
3106  *
3107  * @rbname: name of the RAMBlock of the request. NULL means the
3108  *          same that last one.
3109  * @start: RAMBlock starting page
3110  * @length: RAMBlock size
3111  */
3112 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3113 {
3114     trace_ram_discard_range(rbname, start, length);
3115
3116     RCU_READ_LOCK_GUARD();
3117     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3118
3119     if (!rb) {
3120         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3121         return -1;
3122     }
3123
3124     /*
3125      * On source VM, we don't need to update the received bitmap since
3126      * we don't even have one.
3127      */
3128     if (rb->receivedmap) {
3129         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3130                      length >> qemu_target_page_bits());
3131     }
3132
3133     return ram_block_discard_range(rb, start, length);
3134 }
3135
3136 /*
3137  * For every allocation, we will try not to crash the VM if the
3138  * allocation failed.
3139  */
3140 static int xbzrle_init(void)
3141 {
3142     Error *local_err = NULL;
3143
3144     if (!migrate_use_xbzrle()) {
3145         return 0;
3146     }
3147
3148     XBZRLE_cache_lock();
3149
3150     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3151     if (!XBZRLE.zero_target_page) {
3152         error_report("%s: Error allocating zero page", __func__);
3153         goto err_out;
3154     }
3155
3156     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3157                               TARGET_PAGE_SIZE, &local_err);
3158     if (!XBZRLE.cache) {
3159         error_report_err(local_err);
3160         goto free_zero_page;
3161     }
3162
3163     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3164     if (!XBZRLE.encoded_buf) {
3165         error_report("%s: Error allocating encoded_buf", __func__);
3166         goto free_cache;
3167     }
3168
3169     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3170     if (!XBZRLE.current_buf) {
3171         error_report("%s: Error allocating current_buf", __func__);
3172         goto free_encoded_buf;
3173     }
3174
3175     /* We are all good */
3176     XBZRLE_cache_unlock();
3177     return 0;
3178
3179 free_encoded_buf:
3180     g_free(XBZRLE.encoded_buf);
3181     XBZRLE.encoded_buf = NULL;
3182 free_cache:
3183     cache_fini(XBZRLE.cache);
3184     XBZRLE.cache = NULL;
3185 free_zero_page:
3186     g_free(XBZRLE.zero_target_page);
3187     XBZRLE.zero_target_page = NULL;
3188 err_out:
3189     XBZRLE_cache_unlock();
3190     return -ENOMEM;
3191 }
3192
3193 static int ram_state_init(RAMState **rsp)
3194 {
3195     *rsp = g_try_new0(RAMState, 1);
3196
3197     if (!*rsp) {
3198         error_report("%s: Init ramstate fail", __func__);
3199         return -1;
3200     }
3201
3202     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3203     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3204     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3205
3206     /*
3207      * Count the total number of pages used by ram blocks not including any
3208      * gaps due to alignment or unplugs.
3209      * This must match with the initial values of dirty bitmap.
3210      */
3211     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3212     ram_state_reset(*rsp);
3213
3214     return 0;
3215 }
3216
3217 static void ram_list_init_bitmaps(void)
3218 {
3219     MigrationState *ms = migrate_get_current();
3220     RAMBlock *block;
3221     unsigned long pages;
3222     uint8_t shift;
3223
3224     /* Skip setting bitmap if there is no RAM */
3225     if (ram_bytes_total()) {
3226         shift = ms->clear_bitmap_shift;
3227         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3228             error_report("clear_bitmap_shift (%u) too big, using "
3229                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3230             shift = CLEAR_BITMAP_SHIFT_MAX;
3231         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3232             error_report("clear_bitmap_shift (%u) too small, using "
3233                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3234             shift = CLEAR_BITMAP_SHIFT_MIN;
3235         }
3236
3237         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3238             pages = block->max_length >> TARGET_PAGE_BITS;
3239             /*
3240              * The initial dirty bitmap for migration must be set with all
3241              * ones to make sure we'll migrate every guest RAM page to
3242              * destination.
3243              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3244              * new migration after a failed migration, ram_list.
3245              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3246              * guest memory.
3247              */
3248             block->bmap = bitmap_new(pages);
3249             bitmap_set(block->bmap, 0, pages);
3250             block->clear_bmap_shift = shift;
3251             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3252         }
3253     }
3254 }
3255
3256 static void ram_init_bitmaps(RAMState *rs)
3257 {
3258     /* For memory_global_dirty_log_start below.  */
3259     qemu_mutex_lock_iothread();
3260     qemu_mutex_lock_ramlist();
3261
3262     WITH_RCU_READ_LOCK_GUARD() {
3263         ram_list_init_bitmaps();
3264         memory_global_dirty_log_start();
3265         migration_bitmap_sync_precopy(rs);
3266     }
3267     qemu_mutex_unlock_ramlist();
3268     qemu_mutex_unlock_iothread();
3269 }
3270
3271 static int ram_init_all(RAMState **rsp)
3272 {
3273     if (ram_state_init(rsp)) {
3274         return -1;
3275     }
3276
3277     if (xbzrle_init()) {
3278         ram_state_cleanup(rsp);
3279         return -1;
3280     }
3281
3282     ram_init_bitmaps(*rsp);
3283
3284     return 0;
3285 }
3286
3287 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3288 {
3289     RAMBlock *block;
3290     uint64_t pages = 0;
3291
3292     /*
3293      * Postcopy is not using xbzrle/compression, so no need for that.
3294      * Also, since source are already halted, we don't need to care
3295      * about dirty page logging as well.
3296      */
3297
3298     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3299         pages += bitmap_count_one(block->bmap,
3300                                   block->used_length >> TARGET_PAGE_BITS);
3301     }
3302
3303     /* This may not be aligned with current bitmaps. Recalculate. */
3304     rs->migration_dirty_pages = pages;
3305
3306     rs->last_seen_block = NULL;
3307     rs->last_sent_block = NULL;
3308     rs->last_page = 0;
3309     rs->last_version = ram_list.version;
3310     /*
3311      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3312      * matter what we have sent.
3313      */
3314     rs->ram_bulk_stage = false;
3315
3316     /* Update RAMState cache of output QEMUFile */
3317     rs->f = out;
3318
3319     trace_ram_state_resume_prepare(pages);
3320 }
3321
3322 /*
3323  * This function clears bits of the free pages reported by the caller from the
3324  * migration dirty bitmap. @addr is the host address corresponding to the
3325  * start of the continuous guest free pages, and @len is the total bytes of
3326  * those pages.
3327  */
3328 void qemu_guest_free_page_hint(void *addr, size_t len)
3329 {
3330     RAMBlock *block;
3331     ram_addr_t offset;
3332     size_t used_len, start, npages;
3333     MigrationState *s = migrate_get_current();
3334
3335     /* This function is currently expected to be used during live migration */
3336     if (!migration_is_setup_or_active(s->state)) {
3337         return;
3338     }
3339
3340     for (; len > 0; len -= used_len, addr += used_len) {
3341         block = qemu_ram_block_from_host(addr, false, &offset);
3342         if (unlikely(!block || offset >= block->used_length)) {
3343             /*
3344              * The implementation might not support RAMBlock resize during
3345              * live migration, but it could happen in theory with future
3346              * updates. So we add a check here to capture that case.
3347              */
3348             error_report_once("%s unexpected error", __func__);
3349             return;
3350         }
3351
3352         if (len <= block->used_length - offset) {
3353             used_len = len;
3354         } else {
3355             used_len = block->used_length - offset;
3356         }
3357
3358         start = offset >> TARGET_PAGE_BITS;
3359         npages = used_len >> TARGET_PAGE_BITS;
3360
3361         qemu_mutex_lock(&ram_state->bitmap_mutex);
3362         ram_state->migration_dirty_pages -=
3363                       bitmap_count_one_with_offset(block->bmap, start, npages);
3364         bitmap_clear(block->bmap, start, npages);
3365         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3366     }
3367 }
3368
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
3375
3376 /**
3377  * ram_save_setup: Setup RAM for migration
3378  *
3379  * Returns zero to indicate success and negative for error
3380  *
3381  * @f: QEMUFile where to send the data
3382  * @opaque: RAMState pointer
3383  */
3384 static int ram_save_setup(QEMUFile *f, void *opaque)
3385 {
3386     RAMState **rsp = opaque;
3387     RAMBlock *block;
3388
3389     if (compress_threads_save_setup()) {
3390         return -1;
3391     }
3392
3393     /* migration has already setup the bitmap, reuse it. */
3394     if (!migration_in_colo_state()) {
3395         if (ram_init_all(rsp) != 0) {
3396             compress_threads_save_cleanup();
3397             return -1;
3398         }
3399     }
3400     (*rsp)->f = f;
3401
3402     WITH_RCU_READ_LOCK_GUARD() {
3403         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3404
3405         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3406             qemu_put_byte(f, strlen(block->idstr));
3407             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3408             qemu_put_be64(f, block->used_length);
3409             if (migrate_postcopy_ram() && block->page_size !=
3410                                           qemu_host_page_size) {
3411                 qemu_put_be64(f, block->page_size);
3412             }
3413             if (migrate_ignore_shared()) {
3414                 qemu_put_be64(f, block->mr->addr);
3415             }
3416         }
3417     }
3418
3419     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3420     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3421
3422     multifd_send_sync_main(*rsp);
3423     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3424     qemu_fflush(f);
3425
3426     return 0;
3427 }
3428
3429 /**
3430  * ram_save_iterate: iterative stage for migration
3431  *
3432  * Returns zero to indicate success and negative for error
3433  *
3434  * @f: QEMUFile where to send the data
3435  * @opaque: RAMState pointer
3436  */
3437 static int ram_save_iterate(QEMUFile *f, void *opaque)
3438 {
3439     RAMState **temp = opaque;
3440     RAMState *rs = *temp;
3441     int ret;
3442     int i;
3443     int64_t t0;
3444     int done = 0;
3445
3446     if (blk_mig_bulk_active()) {
3447         /* Avoid transferring ram during bulk phase of block migration as
3448          * the bulk phase will usually take a long time and transferring
3449          * ram updates during that time is pointless. */
3450         goto out;
3451     }
3452
3453     WITH_RCU_READ_LOCK_GUARD() {
3454         if (ram_list.version != rs->last_version) {
3455             ram_state_reset(rs);
3456         }
3457
3458         /* Read version before ram_list.blocks */
3459         smp_rmb();
3460
3461         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3462
3463         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3464         i = 0;
3465         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3466                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3467             int pages;
3468
3469             if (qemu_file_get_error(f)) {
3470                 break;
3471             }
3472
3473             pages = ram_find_and_save_block(rs, false);
3474             /* no more pages to sent */
3475             if (pages == 0) {
3476                 done = 1;
3477                 break;
3478             }
3479
3480             if (pages < 0) {
3481                 qemu_file_set_error(f, pages);
3482                 break;
3483             }
3484
3485             rs->target_page_count += pages;
3486
3487             /*
3488              * During postcopy, it is necessary to make sure one whole host
3489              * page is sent in one chunk.
3490              */
3491             if (migrate_postcopy_ram()) {
3492                 flush_compressed_data(rs);
3493             }
3494
3495             /*
3496              * we want to check in the 1st loop, just in case it was the 1st
3497              * time and we had to sync the dirty bitmap.
3498              * qemu_clock_get_ns() is a bit expensive, so we only check each
3499              * some iterations
3500              */
3501             if ((i & 63) == 0) {
3502                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3503                               1000000;
3504                 if (t1 > MAX_WAIT) {
3505                     trace_ram_save_iterate_big_wait(t1, i);
3506                     break;
3507                 }
3508             }
3509             i++;
3510         }
3511     }
3512
3513     /*
3514      * Must occur before EOS (or any QEMUFile operation)
3515      * because of RDMA protocol.
3516      */
3517     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3518
3519 out:
3520     multifd_send_sync_main(rs);
3521     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3522     qemu_fflush(f);
3523     ram_counters.transferred += 8;
3524
3525     ret = qemu_file_get_error(f);
3526     if (ret < 0) {
3527         return ret;
3528     }
3529
3530     return done;
3531 }
3532
3533 /**
3534  * ram_save_complete: function called to send the remaining amount of ram
3535  *
3536  * Returns zero to indicate success or negative on error
3537  *
3538  * Called with iothread lock
3539  *
3540  * @f: QEMUFile where to send the data
3541  * @opaque: RAMState pointer
3542  */
3543 static int ram_save_complete(QEMUFile *f, void *opaque)
3544 {
3545     RAMState **temp = opaque;
3546     RAMState *rs = *temp;
3547     int ret = 0;
3548
3549     WITH_RCU_READ_LOCK_GUARD() {
3550         if (!migration_in_postcopy()) {
3551             migration_bitmap_sync_precopy(rs);
3552         }
3553
3554         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3555
3556         /* try transferring iterative blocks of memory */
3557
3558         /* flush all remaining blocks regardless of rate limiting */
3559         while (true) {
3560             int pages;
3561
3562             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3563             /* no more blocks to sent */
3564             if (pages == 0) {
3565                 break;
3566             }
3567             if (pages < 0) {
3568                 ret = pages;
3569                 break;
3570             }
3571         }
3572
3573         flush_compressed_data(rs);
3574         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3575     }
3576
3577     multifd_send_sync_main(rs);
3578     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3579     qemu_fflush(f);
3580
3581     return ret;
3582 }
3583
3584 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3585                              uint64_t *res_precopy_only,
3586                              uint64_t *res_compatible,
3587                              uint64_t *res_postcopy_only)
3588 {
3589     RAMState **temp = opaque;
3590     RAMState *rs = *temp;
3591     uint64_t remaining_size;
3592
3593     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3594
3595     if (!migration_in_postcopy() &&
3596         remaining_size < max_size) {
3597         qemu_mutex_lock_iothread();
3598         WITH_RCU_READ_LOCK_GUARD() {
3599             migration_bitmap_sync_precopy(rs);
3600         }
3601         qemu_mutex_unlock_iothread();
3602         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3603     }
3604
3605     if (migrate_postcopy_ram()) {
3606         /* We can do postcopy, and all the data is postcopiable */
3607         *res_compatible += remaining_size;
3608     } else {
3609         *res_precopy_only += remaining_size;
3610     }
3611 }
3612
3613 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3614 {
3615     unsigned int xh_len;
3616     int xh_flags;
3617     uint8_t *loaded_data;
3618
3619     /* extract RLE header */
3620     xh_flags = qemu_get_byte(f);
3621     xh_len = qemu_get_be16(f);
3622
3623     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3624         error_report("Failed to load XBZRLE page - wrong compression!");
3625         return -1;
3626     }
3627
3628     if (xh_len > TARGET_PAGE_SIZE) {
3629         error_report("Failed to load XBZRLE page - len overflow!");
3630         return -1;
3631     }
3632     loaded_data = XBZRLE.decoded_buf;
3633     /* load data and decode */
3634     /* it can change loaded_data to point to an internal buffer */
3635     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3636
3637     /* decode RLE */
3638     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3639                              TARGET_PAGE_SIZE) == -1) {
3640         error_report("Failed to load XBZRLE page - decode error!");
3641         return -1;
3642     }
3643
3644     return 0;
3645 }
3646
3647 /**
3648  * ram_block_from_stream: read a RAMBlock id from the migration stream
3649  *
3650  * Must be called from within a rcu critical section.
3651  *
3652  * Returns a pointer from within the RCU-protected ram_list.
3653  *
3654  * @f: QEMUFile where to read the data from
3655  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3656  */
3657 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3658 {
3659     static RAMBlock *block = NULL;
3660     char id[256];
3661     uint8_t len;
3662
3663     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3664         if (!block) {
3665             error_report("Ack, bad migration stream!");
3666             return NULL;
3667         }
3668         return block;
3669     }
3670
3671     len = qemu_get_byte(f);
3672     qemu_get_buffer(f, (uint8_t *)id, len);
3673     id[len] = 0;
3674
3675     block = qemu_ram_block_by_name(id);
3676     if (!block) {
3677         error_report("Can't find block %s", id);
3678         return NULL;
3679     }
3680
3681     if (ramblock_is_ignored(block)) {
3682         error_report("block %s should not be migrated !", id);
3683         return NULL;
3684     }
3685
3686     return block;
3687 }
3688
3689 static inline void *host_from_ram_block_offset(RAMBlock *block,
3690                                                ram_addr_t offset)
3691 {
3692     if (!offset_in_ramblock(block, offset)) {
3693         return NULL;
3694     }
3695
3696     return block->host + offset;
3697 }
3698
3699 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3700                                                  ram_addr_t offset)
3701 {
3702     if (!offset_in_ramblock(block, offset)) {
3703         return NULL;
3704     }
3705     if (!block->colo_cache) {
3706         error_report("%s: colo_cache is NULL in block :%s",
3707                      __func__, block->idstr);
3708         return NULL;
3709     }
3710
3711     /*
3712     * During colo checkpoint, we need bitmap of these migrated pages.
3713     * It help us to decide which pages in ram cache should be flushed
3714     * into VM's RAM later.
3715     */
3716     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3717         ram_state->migration_dirty_pages++;
3718     }
3719     return block->colo_cache + offset;
3720 }
3721
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill a page (or a whole RDMA chunk) with a single byte value.  When the
 * fill byte is zero and the range already reads as zero, the memory is
 * left untouched.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if we would only overwrite zeroes with zeroes */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3738
3739 /* return the size after decompression, or negative value on error */
3740 static int
3741 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3742                      const uint8_t *source, size_t source_len)
3743 {
3744     int err;
3745
3746     err = inflateReset(stream);
3747     if (err != Z_OK) {
3748         return -1;
3749     }
3750
3751     stream->avail_in = source_len;
3752     stream->next_in = (uint8_t *)source;
3753     stream->avail_out = dest_len;
3754     stream->next_out = dest;
3755
3756     err = inflate(stream, Z_NO_FLUSH);
3757     if (err != Z_STREAM_END) {
3758         return -1;
3759     }
3760
3761     return stream->total_out;
3762 }
3763
/*
 * Thread body of a decompression worker.
 *
 * @opaque: the DecompressParam slot this worker serves.
 *
 * The worker sleeps on param->cond until either param->quit is set or a
 * destination (param->des) is published.  For each request it inflates
 * param->compbuf into the destination, then marks the slot done and
 * signals decomp_done_cond.  Always returns NULL.
 */
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            /* Take the request and clear it so it is processed only once */
            des = param->des;
            len = param->len;
            param->des = 0;
            /* Decompress without holding the per-thread mutex */
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            /* Failures are only reported when error checking is enabled */
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            /* Publish completion under the shared "done" lock */
            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            /* Re-take the per-thread mutex before re-checking quit/des */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
3802
3803 static int wait_for_decompress_done(void)
3804 {
3805     int idx, thread_count;
3806
3807     if (!migrate_use_compression()) {
3808         return 0;
3809     }
3810
3811     thread_count = migrate_decompress_threads();
3812     qemu_mutex_lock(&decomp_done_lock);
3813     for (idx = 0; idx < thread_count; idx++) {
3814         while (!decomp_param[idx].done) {
3815             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3816         }
3817     }
3818     qemu_mutex_unlock(&decomp_done_lock);
3819     return qemu_file_get_error(decomp_file);
3820 }
3821
3822 static void compress_threads_load_cleanup(void)
3823 {
3824     int i, thread_count;
3825
3826     if (!migrate_use_compression()) {
3827         return;
3828     }
3829     thread_count = migrate_decompress_threads();
3830     for (i = 0; i < thread_count; i++) {
3831         /*
3832          * we use it as a indicator which shows if the thread is
3833          * properly init'd or not
3834          */
3835         if (!decomp_param[i].compbuf) {
3836             break;
3837         }
3838
3839         qemu_mutex_lock(&decomp_param[i].mutex);
3840         decomp_param[i].quit = true;
3841         qemu_cond_signal(&decomp_param[i].cond);
3842         qemu_mutex_unlock(&decomp_param[i].mutex);
3843     }
3844     for (i = 0; i < thread_count; i++) {
3845         if (!decomp_param[i].compbuf) {
3846             break;
3847         }
3848
3849         qemu_thread_join(decompress_threads + i);
3850         qemu_mutex_destroy(&decomp_param[i].mutex);
3851         qemu_cond_destroy(&decomp_param[i].cond);
3852         inflateEnd(&decomp_param[i].stream);
3853         g_free(decomp_param[i].compbuf);
3854         decomp_param[i].compbuf = NULL;
3855     }
3856     g_free(decompress_threads);
3857     g_free(decomp_param);
3858     decompress_threads = NULL;
3859     decomp_param = NULL;
3860     decomp_file = NULL;
3861 }
3862
3863 static int compress_threads_load_setup(QEMUFile *f)
3864 {
3865     int i, thread_count;
3866
3867     if (!migrate_use_compression()) {
3868         return 0;
3869     }
3870
3871     thread_count = migrate_decompress_threads();
3872     decompress_threads = g_new0(QemuThread, thread_count);
3873     decomp_param = g_new0(DecompressParam, thread_count);
3874     qemu_mutex_init(&decomp_done_lock);
3875     qemu_cond_init(&decomp_done_cond);
3876     decomp_file = f;
3877     for (i = 0; i < thread_count; i++) {
3878         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3879             goto exit;
3880         }
3881
3882         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3883         qemu_mutex_init(&decomp_param[i].mutex);
3884         qemu_cond_init(&decomp_param[i].cond);
3885         decomp_param[i].done = true;
3886         decomp_param[i].quit = false;
3887         qemu_thread_create(decompress_threads + i, "decompress",
3888                            do_data_decompress, decomp_param + i,
3889                            QEMU_THREAD_JOINABLE);
3890     }
3891     return 0;
3892 exit:
3893     compress_threads_load_cleanup();
3894     return -1;
3895 }
3896
/*
 * Hand one compressed page to an idle decompression worker.
 *
 * @f: QEMUFile to read the compressed bytes from
 * @host: destination the worker will decompress into
 * @len: number of compressed bytes to read from @f
 *
 * Scans the worker slots for one marked done; if none is idle, waits on
 * decomp_done_cond and rescans.  The compressed data is read into the
 * chosen worker's compbuf before the worker is signalled.
 */
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                /* Claim the slot, then publish the request under its mutex */
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        /* idx < thread_count means the inner loop found an idle worker */
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
3925
3926 /*
3927  * colo cache: this is for secondary VM, we cache the whole
3928  * memory of the secondary VM, it is need to hold the global lock
3929  * to call this helper.
3930  */
3931 int colo_init_ram_cache(void)
3932 {
3933     RAMBlock *block;
3934
3935     WITH_RCU_READ_LOCK_GUARD() {
3936         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3937             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3938                                                     NULL,
3939                                                     false);
3940             if (!block->colo_cache) {
3941                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3942                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3943                              block->used_length);
3944                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3945                     if (block->colo_cache) {
3946                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3947                         block->colo_cache = NULL;
3948                     }
3949                 }
3950                 return -errno;
3951             }
3952             memcpy(block->colo_cache, block->host, block->used_length);
3953         }
3954     }
3955
3956     /*
3957     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3958     * with to decide which page in cache should be flushed into SVM's RAM. Here
3959     * we use the same name 'ram_bitmap' as for migration.
3960     */
3961     if (ram_bytes_total()) {
3962         RAMBlock *block;
3963
3964         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3965             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3966
3967             block->bmap = bitmap_new(pages);
3968             bitmap_set(block->bmap, 0, pages);
3969         }
3970     }
3971     ram_state = g_new0(RAMState, 1);
3972     ram_state->migration_dirty_pages = 0;
3973     qemu_mutex_init(&ram_state->bitmap_mutex);
3974     memory_global_dirty_log_start();
3975
3976     return 0;
3977 }
3978
3979 /* It is need to hold the global lock to call this helper */
3980 void colo_release_ram_cache(void)
3981 {
3982     RAMBlock *block;
3983
3984     memory_global_dirty_log_stop();
3985     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3986         g_free(block->bmap);
3987         block->bmap = NULL;
3988     }
3989
3990     WITH_RCU_READ_LOCK_GUARD() {
3991         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3992             if (block->colo_cache) {
3993                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3994                 block->colo_cache = NULL;
3995             }
3996         }
3997     }
3998     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3999     g_free(ram_state);
4000     ram_state = NULL;
4001 }
4002
4003 /**
4004  * ram_load_setup: Setup RAM for migration incoming side
4005  *
4006  * Returns zero to indicate success and negative for error
4007  *
4008  * @f: QEMUFile where to receive the data
4009  * @opaque: RAMState pointer
4010  */
4011 static int ram_load_setup(QEMUFile *f, void *opaque)
4012 {
4013     if (compress_threads_load_setup(f)) {
4014         return -1;
4015     }
4016
4017     xbzrle_load_setup();
4018     ramblock_recv_map_init();
4019
4020     return 0;
4021 }
4022
4023 static int ram_load_cleanup(void *opaque)
4024 {
4025     RAMBlock *rb;
4026
4027     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4028         qemu_ram_block_writeback(rb);
4029     }
4030
4031     xbzrle_load_cleanup();
4032     compress_threads_load_cleanup();
4033
4034     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4035         g_free(rb->receivedmap);
4036         rb->receivedmap = NULL;
4037     }
4038
4039     return 0;
4040 }
4041
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was an error (the value is
 * passed through unchanged from postcopy_ram_incoming_init())
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
4057
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or a negative value in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Target pages are accumulated in a temporary host page and placed into
 * guest memory once all target pages of a host page have been received.
 *
 * @f: QEMUFile where to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    /* Start of the host page currently being assembled */
    void *this_host = NULL;
    bool all_zero = false;
    /* Number of target pages received so far for the current host page */
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* The low bits of the header word carry the flags */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (target_pages == 1) {
                all_zero = true;
                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                    block->page_size);
            } else {
                /* not the 1st TP within the HP */
                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
                    (uintptr_t)this_host) {
                    error_report("Non-same host page %p/%p",
                                  host, this_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
                target_pages = 0;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            /* Reject lengths that cannot be a compressed target page */
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                       block->page_size);

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
4238
4239 static bool postcopy_is_advised(void)
4240 {
4241     PostcopyState ps = postcopy_state_get();
4242     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4243 }
4244
4245 static bool postcopy_is_running(void)
4246 {
4247     PostcopyState ps = postcopy_state_get();
4248     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4249 }
4250
4251 /*
4252  * Flush content of RAM cache into SVM's memory.
4253  * Only flush the pages that be dirtied by PVM or SVM or both.
4254  */
4255 static void colo_flush_ram_cache(void)
4256 {
4257     RAMBlock *block = NULL;
4258     void *dst_host;
4259     void *src_host;
4260     unsigned long offset = 0;
4261
4262     memory_global_dirty_log_sync();
4263     WITH_RCU_READ_LOCK_GUARD() {
4264         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4265             ramblock_sync_dirty_bitmap(ram_state, block);
4266         }
4267     }
4268
4269     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4270     WITH_RCU_READ_LOCK_GUARD() {
4271         block = QLIST_FIRST_RCU(&ram_list.blocks);
4272
4273         while (block) {
4274             offset = migration_bitmap_find_dirty(ram_state, block, offset);
4275
4276             if (offset << TARGET_PAGE_BITS >= block->used_length) {
4277                 offset = 0;
4278                 block = QLIST_NEXT_RCU(block, next);
4279             } else {
4280                 migration_bitmap_clear_dirty(ram_state, block, offset);
4281                 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4282                 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4283                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4284             }
4285         }
4286     }
4287     trace_colo_flush_ram_cache_end();
4288 }
4289
/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or a negative value in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads records from the stream until an EOS record or an error.  When the
 * incoming side is in COLO state, pages are loaded into the block's COLO
 * cache instead of the guest RAM.
 *
 * @f: QEMUFile where to receive the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    /* Compressed pages are only acceptable when compression is enabled */
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let main loop run, but an iteration of
         * the main loop is expensive, so do it each some iterations
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        /* The low bits of the header word carry the flags */
        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        /* Page-carrying records: resolve the destination address first */
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            /*
             * After going into COLO, we should load the Page into colo_cache.
             */
            if (migration_incoming_in_colo_state()) {
                host = colo_cache_from_block_offset(block, addr);
            } else {
                host = host_from_ram_block_offset(block, addr);
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                /* Each entry: 1-byte name length, name, 64-bit length */
                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    /* With ignore-shared, the entry also carries a GPA */
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            /* Reject lengths that cannot be a compressed target page */
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        /* Pick up any stream error raised while handling this record */
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    /* Let outstanding decompression finish and fold in its error state */
    ret |= wait_for_decompress_done();
    return ret;
}
4474
4475 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4476 {
4477     int ret = 0;
4478     static uint64_t seq_iter;
4479     /*
4480      * If system is running in postcopy mode, page inserts to host memory must
4481      * be atomic
4482      */
4483     bool postcopy_running = postcopy_is_running();
4484
4485     seq_iter++;
4486
4487     if (version_id != 4) {
4488         return -EINVAL;
4489     }
4490
4491     /*
4492      * This RCU critical section can be very long running.
4493      * When RCU reclaims in the code start to become numerous,
4494      * it will be necessary to reduce the granularity of this
4495      * critical section.
4496      */
4497     WITH_RCU_READ_LOCK_GUARD() {
4498         if (postcopy_running) {
4499             ret = ram_load_postcopy(f);
4500         } else {
4501             ret = ram_load_precopy(f);
4502         }
4503     }
4504     trace_ram_load_complete(ret, seq_iter);
4505
4506     if (!ret  && migration_incoming_in_colo_state()) {
4507         colo_flush_ram_cache();
4508     }
4509     return ret;
4510 }
4511
4512 static bool ram_has_postcopy(void *opaque)
4513 {
4514     RAMBlock *rb;
4515     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4516         if (ramblock_is_pmem(rb)) {
4517             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4518                          "is not supported now!", rb->idstr, rb->host);
4519             return false;
4520         }
4521     }
4522
4523     return migrate_postcopy_ram();
4524 }
4525
4526 /* Sync all the dirty bitmap with destination VM.  */
4527 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4528 {
4529     RAMBlock *block;
4530     QEMUFile *file = s->to_dst_file;
4531     int ramblock_count = 0;
4532
4533     trace_ram_dirty_bitmap_sync_start();
4534
4535     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4536         qemu_savevm_send_recv_bitmap(file, block->idstr);
4537         trace_ram_dirty_bitmap_request(block->idstr);
4538         ramblock_count++;
4539     }
4540
4541     trace_ram_dirty_bitmap_sync_wait();
4542
4543     /* Wait until all the ramblocks' dirty bitmap synced */
4544     while (ramblock_count--) {
4545         qemu_sem_wait(&s->rp_state.rp_sem);
4546     }
4547
4548     trace_ram_dirty_bitmap_sync_complete();
4549
4550     return 0;
4551 }
4552
/*
 * Signal that one ramblock's dirty bitmap has been reloaded by posting
 * rp_sem once; ram_dirty_bitmap_sync_all() waits on this semaphore once
 * per ramblock it requested.
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
4557
4558 /*
4559  * Read the received bitmap, revert it as the initial dirty bitmap.
4560  * This is only used when the postcopy migration is paused but wants
4561  * to resume from a middle point.
4562  */
4563 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4564 {
4565     int ret = -EINVAL;
4566     QEMUFile *file = s->rp_state.from_dst_file;
4567     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4568     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4569     uint64_t size, end_mark;
4570
4571     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4572
4573     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4574         error_report("%s: incorrect state %s", __func__,
4575                      MigrationStatus_str(s->state));
4576         return -EINVAL;
4577     }
4578
4579     /*
4580      * Note: see comments in ramblock_recv_bitmap_send() on why we
4581      * need the endianess convertion, and the paddings.
4582      */
4583     local_size = ROUND_UP(local_size, 8);
4584
4585     /* Add paddings */
4586     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4587
4588     size = qemu_get_be64(file);
4589
4590     /* The size of the bitmap should match with our ramblock */
4591     if (size != local_size) {
4592         error_report("%s: ramblock '%s' bitmap size mismatch "
4593                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4594                      block->idstr, size, local_size);
4595         ret = -EINVAL;
4596         goto out;
4597     }
4598
4599     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4600     end_mark = qemu_get_be64(file);
4601
4602     ret = qemu_file_get_error(file);
4603     if (ret || size != local_size) {
4604         error_report("%s: read bitmap failed for ramblock '%s': %d"
4605                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4606                      __func__, block->idstr, ret, local_size, size);
4607         ret = -EIO;
4608         goto out;
4609     }
4610
4611     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4612         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4613                      __func__, block->idstr, end_mark);
4614         ret = -EINVAL;
4615         goto out;
4616     }
4617
4618     /*
4619      * Endianess convertion. We are during postcopy (though paused).
4620      * The dirty bitmap won't change. We can directly modify it.
4621      */
4622     bitmap_from_le(block->bmap, le_bitmap, nbits);
4623
4624     /*
4625      * What we received is "received bitmap". Revert it as the initial
4626      * dirty bitmap for this ramblock.
4627      */
4628     bitmap_complement(block->bmap, block->bmap, nbits);
4629
4630     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4631
4632     /*
4633      * We succeeded to sync bitmap for current ramblock. If this is
4634      * the last one to sync, we need to notify the main send thread.
4635      */
4636     ram_dirty_bitmap_reload_notify(s);
4637
4638     ret = 0;
4639 out:
4640     g_free(le_bitmap);
4641     return ret;
4642 }
4643
4644 static int ram_resume_prepare(MigrationState *s, void *opaque)
4645 {
4646     RAMState *rs = *(RAMState **)opaque;
4647     int ret;
4648
4649     ret = ram_dirty_bitmap_sync_all(s, rs);
4650     if (ret) {
4651         return ret;
4652     }
4653
4654     ram_state_resume_prepare(rs, s->to_dst_file);
4655
4656     return 0;
4657 }
4658
4659 static SaveVMHandlers savevm_ram_handlers = {
4660     .save_setup = ram_save_setup,
4661     .save_live_iterate = ram_save_iterate,
4662     .save_live_complete_postcopy = ram_save_complete,
4663     .save_live_complete_precopy = ram_save_complete,
4664     .has_postcopy = ram_has_postcopy,
4665     .save_live_pending = ram_save_pending,
4666     .load_state = ram_load,
4667     .save_cleanup = ram_save_cleanup,
4668     .load_setup = ram_load_setup,
4669     .load_cleanup = ram_load_cleanup,
4670     .resume_prepare = ram_resume_prepare,
4671 };
4672
4673 void ram_mig_init(void)
4674 {
4675     qemu_mutex_init(&XBZRLE.lock);
4676     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4677 }