migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/pmem.h"
37 #include "xbzrle.h"
38 #include "ram.h"
39 #include "migration.h"
40 #include "socket.h"
41 #include "migration/register.h"
42 #include "migration/misc.h"
43 #include "qemu-file.h"
44 #include "postcopy-ram.h"
45 #include "page_cache.h"
46 #include "qemu/error-report.h"
47 #include "qapi/error.h"
48 #include "qapi/qapi-events-migration.h"
49 #include "qapi/qmp/qerror.h"
50 #include "trace.h"
51 #include "exec/ram_addr.h"
52 #include "exec/target_page.h"
53 #include "qemu/rcu_queue.h"
54 #include "migration/colo.h"
55 #include "block.h"
56 #include "sysemu/sysemu.h"
57 #include "qemu/uuid.h"
58 #include "savevm.h"
59 #include "qemu/iov.h"
60
61 /***********************************************************/
62 /* ram save/restore */
63
64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
65 * worked for pages that were filled with the same char. We switched
66 * it to only search for the zero value, and renamed it to avoid
67 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
68 */
69
70 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
71 #define RAM_SAVE_FLAG_ZERO 0x02
72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
73 #define RAM_SAVE_FLAG_PAGE 0x08
74 #define RAM_SAVE_FLAG_EOS 0x10
75 #define RAM_SAVE_FLAG_CONTINUE 0x20
76 #define RAM_SAVE_FLAG_XBZRLE 0x40
77 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
78 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
79
80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
81 {
82 return buffer_is_zero(p, size);
83 }
84
85 XBZRLECacheStats xbzrle_counters;
86
87 /* This struct contains the XBZRLE cache and a static page
88 used by the compression */
89 static struct {
90 /* buffer used for XBZRLE encoding */
91 uint8_t *encoded_buf;
92 /* buffer for storing page content */
93 uint8_t *current_buf;
94 /* Cache for XBZRLE, Protected by lock. */
95 PageCache *cache;
96 QemuMutex lock;
97 /* it will store a page full of zeros */
98 uint8_t *zero_target_page;
99 /* buffer used for XBZRLE decoding */
100 uint8_t *decoded_buf;
101 } XBZRLE;
102
103 static void XBZRLE_cache_lock(void)
104 {
105 if (migrate_use_xbzrle())
106 qemu_mutex_lock(&XBZRLE.lock);
107 }
108
109 static void XBZRLE_cache_unlock(void)
110 {
111 if (migrate_use_xbzrle())
112 qemu_mutex_unlock(&XBZRLE.lock);
113 }
114
115 /**
116 * xbzrle_cache_resize: resize the xbzrle cache
117 *
118 * This function is called from qmp_migrate_set_cache_size in main
119 * thread, possibly while a migration is in progress. A running
120 * migration may be using the cache and might finish during this call,
121 * hence changes to the cache are protected by XBZRLE.lock().
122 *
123 * Returns 0 for success or -1 for error
124 *
125 * @new_size: new cache size
126 * @errp: set *errp if the check failed, with reason
127 */
128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
129 {
130 PageCache *new_cache;
131 int64_t ret = 0;
132
133 /* Check for truncation */
134 if (new_size != (size_t)new_size) {
135 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
136 "exceeding address space");
137 return -1;
138 }
139
140 if (new_size == migrate_xbzrle_cache_size()) {
141 /* nothing to do */
142 return 0;
143 }
144
145 XBZRLE_cache_lock();
146
147 if (XBZRLE.cache != NULL) {
148 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
149 if (!new_cache) {
150 ret = -1;
151 goto out;
152 }
153
154 cache_fini(XBZRLE.cache);
155 XBZRLE.cache = new_cache;
156 }
157 out:
158 XBZRLE_cache_unlock();
159 return ret;
160 }
161
162 /* Should be holding either ram_list.mutex, or the RCU lock. */
163 #define RAMBLOCK_FOREACH_MIGRATABLE(block) \
164 INTERNAL_RAMBLOCK_FOREACH(block) \
165 if (!qemu_ram_is_migratable(block)) {} else
166
167 #undef RAMBLOCK_FOREACH
168
169 static void ramblock_recv_map_init(void)
170 {
171 RAMBlock *rb;
172
173 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
174 assert(!rb->receivedmap);
175 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
176 }
177 }
178
179 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
180 {
181 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
182 rb->receivedmap);
183 }
184
185 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
186 {
187 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
188 }
189
190 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
191 {
192 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
193 }
194
195 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
196 size_t nr)
197 {
198 bitmap_set_atomic(rb->receivedmap,
199 ramblock_recv_bitmap_offset(host_addr, rb),
200 nr);
201 }
202
203 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
204
205 /*
206 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
207 *
208 * Returns >0 if success with sent bytes, or <0 if error.
209 */
210 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
211 const char *block_name)
212 {
213 RAMBlock *block = qemu_ram_block_by_name(block_name);
214 unsigned long *le_bitmap, nbits;
215 uint64_t size;
216
217 if (!block) {
218 error_report("%s: invalid block name: %s", __func__, block_name);
219 return -1;
220 }
221
222 nbits = block->used_length >> TARGET_PAGE_BITS;
223
224 /*
225 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
226 * machines we may need 4 more bytes for padding (see below
227 * comment). So extend it a bit beforehand.
228 */
229 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
230
231 /*
232 * Always use little endian when sending the bitmap. This is
233 * required so that the bitmap is still correct when the source and
234 * destination VMs do not share the same endianness.
235 */
236 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
237
238 /* Size of the bitmap, in bytes */
239 size = DIV_ROUND_UP(nbits, 8);
240
241 /*
242 * size is always aligned to 8 bytes for 64bit machines, but it
243 * may not be true for 32bit machines. We need this padding to
244 * make sure the migration can survive even between 32bit and
245 * 64bit machines.
246 */
247 size = ROUND_UP(size, 8);
248
249 qemu_put_be64(file, size);
250 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
251 /*
252 * Mark the end, in case the middle part gets corrupted for
253 * some mysterious reason.
254 */
255 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
256 qemu_fflush(file);
257
258 g_free(le_bitmap);
259
260 if (qemu_file_get_error(file)) {
261 return qemu_file_get_error(file);
262 }
263
264 return size + sizeof(size);
265 }
266
267 /*
268 * An outstanding page request, on the source, having been received
269 * and queued
270 */
271 struct RAMSrcPageRequest {
272 RAMBlock *rb;
273 hwaddr offset;
274 hwaddr len;
275
276 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
277 };
278
279 /* State of RAM for migration */
280 struct RAMState {
281 /* QEMUFile used for this migration */
282 QEMUFile *f;
283 /* Last block that we have visited searching for dirty pages */
284 RAMBlock *last_seen_block;
285 /* Last block from where we have sent data */
286 RAMBlock *last_sent_block;
287 /* Last dirty target page we have sent */
288 ram_addr_t last_page;
289 /* last ram version we have seen */
290 uint32_t last_version;
291 /* We are in the first round */
292 bool ram_bulk_stage;
293 /* How many times we have dirty too many pages */
294 int dirty_rate_high_cnt;
295 /* these variables are used for bitmap sync */
296 /* last time we did a full bitmap_sync */
297 int64_t time_last_bitmap_sync;
298 /* bytes transferred at start_time */
299 uint64_t bytes_xfer_prev;
300 /* number of dirty pages since start_time */
301 uint64_t num_dirty_pages_period;
302 /* xbzrle misses since the beginning of the period */
303 uint64_t xbzrle_cache_miss_prev;
304 /* number of iterations at the beginning of period */
305 uint64_t iterations_prev;
306 /* Iterations since start */
307 uint64_t iterations;
308 /* number of dirty bits in the bitmap */
309 uint64_t migration_dirty_pages;
310 /* protects modification of the bitmap */
311 QemuMutex bitmap_mutex;
312 /* The RAMBlock used in the last src_page_requests */
313 RAMBlock *last_req_rb;
314 /* Queue of outstanding page requests from the destination */
315 QemuMutex src_page_req_mutex;
316 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
317 };
318 typedef struct RAMState RAMState;
319
320 static RAMState *ram_state;
321
322 uint64_t ram_bytes_remaining(void)
323 {
324 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
325 0;
326 }
327
328 MigrationStats ram_counters;
329
330 /* used by the search for pages to send */
331 struct PageSearchStatus {
332 /* Current block being searched */
333 RAMBlock *block;
334 /* Current page to search from */
335 unsigned long page;
336 /* Set once we wrap around */
337 bool complete_round;
338 };
339 typedef struct PageSearchStatus PageSearchStatus;
340
341 struct CompressParam {
342 bool done;
343 bool quit;
344 QEMUFile *file;
345 QemuMutex mutex;
346 QemuCond cond;
347 RAMBlock *block;
348 ram_addr_t offset;
349
350 /* internally used fields */
351 z_stream stream;
352 uint8_t *originbuf;
353 };
354 typedef struct CompressParam CompressParam;
355
356 struct DecompressParam {
357 bool done;
358 bool quit;
359 QemuMutex mutex;
360 QemuCond cond;
361 void *des;
362 uint8_t *compbuf;
363 int len;
364 z_stream stream;
365 };
366 typedef struct DecompressParam DecompressParam;
367
368 static CompressParam *comp_param;
369 static QemuThread *compress_threads;
370 /* comp_done_cond is used to wake up the migration thread when
371 * one of the compression threads has finished the compression.
372 * comp_done_lock is used to co-work with comp_done_cond.
373 */
374 static QemuMutex comp_done_lock;
375 static QemuCond comp_done_cond;
376 /* The empty QEMUFileOps will be used by file in CompressParam */
377 static const QEMUFileOps empty_ops = { };
378
379 static QEMUFile *decomp_file;
380 static DecompressParam *decomp_param;
381 static QemuThread *decompress_threads;
382 static QemuMutex decomp_done_lock;
383 static QemuCond decomp_done_cond;
384
385 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
386 ram_addr_t offset, uint8_t *source_buf);
387
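/*
 * do_data_compress: body of a compression worker thread
 *
 * Waits on param->cond until the migration thread hands it a page
 * (param->block/offset), compresses that page into param->file and
 * signals comp_done_cond so the migration thread can collect the
 * result.  Exits when param->quit is set.
 */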
388 static void *do_data_compress(void *opaque)
389 {
390 CompressParam *param = opaque;
391 RAMBlock *block;
392 ram_addr_t offset;
393
394 qemu_mutex_lock(&param->mutex);
395 while (!param->quit) {
396 if (param->block) {
397 block = param->block;
398 offset = param->offset;
399 param->block = NULL;
400 qemu_mutex_unlock(&param->mutex);
401
402 do_compress_ram_page(param->file, &param->stream, block, offset,
403 param->originbuf);
404
405 qemu_mutex_lock(&comp_done_lock);
406 param->done = true;
407 qemu_cond_signal(&comp_done_cond);
408 qemu_mutex_unlock(&comp_done_lock);
409
410 qemu_mutex_lock(&param->mutex);
411 } else {
412 qemu_cond_wait(&param->cond, &param->mutex);
413 }
414 }
415 qemu_mutex_unlock(&param->mutex);
416
417 return NULL;
418 }
419
420 static inline void terminate_compression_threads(void)
421 {
422 int idx, thread_count;
423
424 thread_count = migrate_compress_threads();
425
426 for (idx = 0; idx < thread_count; idx++) {
427 qemu_mutex_lock(&comp_param[idx].mutex);
428 comp_param[idx].quit = true;
429 qemu_cond_signal(&comp_param[idx].cond);
430 qemu_mutex_unlock(&comp_param[idx].mutex);
431 }
432 }
433
434 static void compress_threads_save_cleanup(void)
435 {
436 int i, thread_count;
437
438 if (!migrate_use_compression()) {
439 return;
440 }
441 terminate_compression_threads();
442 thread_count = migrate_compress_threads();
443 for (i = 0; i < thread_count; i++) {
444 /*
445 * we use it as an indicator of whether the thread has been
446 * properly initialized
447 */
448 if (!comp_param[i].file) {
449 break;
450 }
451 qemu_thread_join(compress_threads + i);
452 qemu_mutex_destroy(&comp_param[i].mutex);
453 qemu_cond_destroy(&comp_param[i].cond);
454 deflateEnd(&comp_param[i].stream);
455 g_free(comp_param[i].originbuf);
456 qemu_fclose(comp_param[i].file);
457 comp_param[i].file = NULL;
458 }
459 qemu_mutex_destroy(&comp_done_lock);
460 qemu_cond_destroy(&comp_done_cond);
461 g_free(compress_threads);
462 g_free(comp_param);
463 compress_threads = NULL;
464 comp_param = NULL;
465 }
466
467 static int compress_threads_save_setup(void)
468 {
469 int i, thread_count;
470
471 if (!migrate_use_compression()) {
472 return 0;
473 }
474 thread_count = migrate_compress_threads();
475 compress_threads = g_new0(QemuThread, thread_count);
476 comp_param = g_new0(CompressParam, thread_count);
477 qemu_cond_init(&comp_done_cond);
478 qemu_mutex_init(&comp_done_lock);
479 for (i = 0; i < thread_count; i++) {
480 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
481 if (!comp_param[i].originbuf) {
482 goto exit;
483 }
484
485 if (deflateInit(&comp_param[i].stream,
486 migrate_compress_level()) != Z_OK) {
487 g_free(comp_param[i].originbuf);
488 goto exit;
489 }
490
491 /* comp_param[i].file is just used as a dummy buffer to save data,
492 * set its ops to empty.
493 */
494 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
495 comp_param[i].done = true;
496 comp_param[i].quit = false;
497 qemu_mutex_init(&comp_param[i].mutex);
498 qemu_cond_init(&comp_param[i].cond);
499 qemu_thread_create(compress_threads + i, "compress",
500 do_data_compress, comp_param + i,
501 QEMU_THREAD_JOINABLE);
502 }
503 return 0;
504
505 exit:
506 compress_threads_save_cleanup();
507 return -1;
508 }
509
510 /* Multiple fd's */
511
512 #define MULTIFD_MAGIC 0x11223344U
513 #define MULTIFD_VERSION 1
514
515 #define MULTIFD_FLAG_SYNC (1 << 0)
516
517 typedef struct {
518 uint32_t magic;
519 uint32_t version;
520 unsigned char uuid[16]; /* QemuUUID */
521 uint8_t id;
522 } __attribute__((packed)) MultiFDInit_t;
523
524 typedef struct {
525 uint32_t magic;
526 uint32_t version;
527 uint32_t flags;
528 uint32_t size;
529 uint32_t used;
530 uint64_t packet_num;
531 char ramblock[256];
532 uint64_t offset[];
533 } __attribute__((packed)) MultiFDPacket_t;
534
535 typedef struct {
536 /* number of used pages */
537 uint32_t used;
538 /* number of allocated pages */
539 uint32_t allocated;
540 /* global number of generated multifd packets */
541 uint64_t packet_num;
542 /* offset of each page */
543 ram_addr_t *offset;
544 /* pointer to each page */
545 struct iovec *iov;
546 RAMBlock *block;
547 } MultiFDPages_t;
548
549 typedef struct {
550 /* these fields are not changed once the thread is created */
551 /* channel number */
552 uint8_t id;
553 /* channel thread name */
554 char *name;
555 /* channel thread id */
556 QemuThread thread;
557 /* communication channel */
558 QIOChannel *c;
559 /* sem where to wait for more work */
560 QemuSemaphore sem;
561 /* this mutex protects the following parameters */
562 QemuMutex mutex;
563 /* is this channel thread running */
564 bool running;
565 /* should this thread finish */
566 bool quit;
567 /* thread has work to do */
568 int pending_job;
569 /* array of pages to send */
570 MultiFDPages_t *pages;
571 /* packet allocated len */
572 uint32_t packet_len;
573 /* pointer to the packet */
574 MultiFDPacket_t *packet;
575 /* multifd flags for each packet */
576 uint32_t flags;
577 /* global number of generated multifd packets */
578 uint64_t packet_num;
579 /* thread local variables */
580 /* packets sent through this channel */
581 uint64_t num_packets;
582 /* pages sent through this channel */
583 uint64_t num_pages;
584 /* syncs main thread and channels */
585 QemuSemaphore sem_sync;
586 } MultiFDSendParams;
587
588 typedef struct {
589 /* these fields are not changed once the thread is created */
590 /* channel number */
591 uint8_t id;
592 /* channel thread name */
593 char *name;
594 /* channel thread id */
595 QemuThread thread;
596 /* communication channel */
597 QIOChannel *c;
598 /* this mutex protects the following parameters */
599 QemuMutex mutex;
600 /* is this channel thread running */
601 bool running;
602 /* array of pages to receive */
603 MultiFDPages_t *pages;
604 /* packet allocated len */
605 uint32_t packet_len;
606 /* pointer to the packet */
607 MultiFDPacket_t *packet;
608 /* multifd flags for each packet */
609 uint32_t flags;
610 /* global number of generated multifd packets */
611 uint64_t packet_num;
612 /* thread local variables */
613 /* packets sent through this channel */
614 uint64_t num_packets;
615 /* pages sent through this channel */
616 uint64_t num_pages;
617 /* syncs main thread and channels */
618 QemuSemaphore sem_sync;
619 } MultiFDRecvParams;
620
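/*
 * multifd_send_initial_packet: send the per-channel handshake
 *
 * Sends the multifd magic, version, source VM UUID and channel id so
 * that the destination can validate the connection and bind it to the
 * right channel.
 *
 * Returns 0 on success, -1 on error (with @errp set)
 */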
621 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
622 {
623 MultiFDInit_t msg;
624 int ret;
625
626 msg.magic = cpu_to_be32(MULTIFD_MAGIC);
627 msg.version = cpu_to_be32(MULTIFD_VERSION);
628 msg.id = p->id;
629 memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
630
631 ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
632 if (ret != 0) {
633 return -1;
634 }
635 return 0;
636 }
637
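/*
 * multifd_recv_initial_packet: read and validate the handshake
 *
 * Checks the magic, version and VM UUID sent by the source, and that
 * the channel id is within range.
 *
 * Returns the channel id on success, -1 on error (with @errp set)
 */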
638 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
639 {
640 MultiFDInit_t msg;
641 int ret;
642
643 ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
644 if (ret != 0) {
645 return -1;
646 }
647
648 be32_to_cpus(&msg.magic);
649 be32_to_cpus(&msg.version);
650
651 if (msg.magic != MULTIFD_MAGIC) {
652 error_setg(errp, "multifd: received packet magic %x "
653 "expected %x", msg.magic, MULTIFD_MAGIC);
654 return -1;
655 }
656
657 if (msg.version != MULTIFD_VERSION) {
658 error_setg(errp, "multifd: received packet version %d "
659 "expected %d", msg.version, MULTIFD_VERSION);
660 return -1;
661 }
662
663 if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
664 char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
665 char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
666
667 error_setg(errp, "multifd: received uuid '%s' and expected "
668 "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
669 g_free(uuid);
670 g_free(msg_uuid);
671 return -1;
672 }
673
674 if (msg.id > migrate_multifd_channels()) {
675 error_setg(errp, "multifd: received channel id %d is greater than "
676 "number of channels %d", msg.id, migrate_multifd_channels());
677 return -1;
678 }
679
680 return msg.id;
681 }
682
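/* Allocate a MultiFDPages_t able to hold @size pages */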
683 static MultiFDPages_t *multifd_pages_init(size_t size)
684 {
685 MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
686
687 pages->allocated = size;
688 pages->iov = g_new0(struct iovec, size);
689 pages->offset = g_new0(ram_addr_t, size);
690
691 return pages;
692 }
693
694 static void multifd_pages_clear(MultiFDPages_t *pages)
695 {
696 pages->used = 0;
697 pages->allocated = 0;
698 pages->packet_num = 0;
699 pages->block = NULL;
700 g_free(pages->iov);
701 pages->iov = NULL;
702 g_free(pages->offset);
703 pages->offset = NULL;
704 g_free(pages);
705 }
706
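/*
 * multifd_send_fill_packet: build the on-wire packet header
 *
 * Converts the state in p->pages (block name, page offsets, used
 * count) into p->packet, with all multi-byte fields in big endian.
 */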
707 static void multifd_send_fill_packet(MultiFDSendParams *p)
708 {
709 MultiFDPacket_t *packet = p->packet;
710 int i;
711
712 packet->magic = cpu_to_be32(MULTIFD_MAGIC);
713 packet->version = cpu_to_be32(MULTIFD_VERSION);
714 packet->flags = cpu_to_be32(p->flags);
715 packet->size = cpu_to_be32(migrate_multifd_page_count());
716 packet->used = cpu_to_be32(p->pages->used);
717 packet->packet_num = cpu_to_be64(p->packet_num);
718
719 if (p->pages->block) {
720 strncpy(packet->ramblock, p->pages->block->idstr, 256);
721 }
722
723 for (i = 0; i < p->pages->used; i++) {
724 packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
725 }
726 }
727
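/*
 * multifd_recv_unfill_packet: validate and parse a received packet
 *
 * Checks the magic, version, size and ramblock name, then fills
 * p->pages->iov with the host addresses the page data has to be read
 * into.
 *
 * Returns 0 on success, -1 on error (with @errp set)
 */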
728 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
729 {
730 MultiFDPacket_t *packet = p->packet;
731 RAMBlock *block;
732 int i;
733
734 be32_to_cpus(&packet->magic);
735 if (packet->magic != MULTIFD_MAGIC) {
736 error_setg(errp, "multifd: received packet "
737 "magic %x and expected magic %x",
738 packet->magic, MULTIFD_MAGIC);
739 return -1;
740 }
741
742 be32_to_cpus(&packet->version);
743 if (packet->version != MULTIFD_VERSION) {
744 error_setg(errp, "multifd: received packet "
745 "version %d and expected version %d",
746 packet->version, MULTIFD_VERSION);
747 return -1;
748 }
749
750 p->flags = be32_to_cpu(packet->flags);
751
752 be32_to_cpus(&packet->size);
753 if (packet->size > migrate_multifd_page_count()) {
754 error_setg(errp, "multifd: received packet "
755 "with size %d and expected maximum size %d",
756 packet->size, migrate_multifd_page_count());
757 return -1;
758 }
759
760 p->pages->used = be32_to_cpu(packet->used);
761 if (p->pages->used > packet->size) {
762 error_setg(errp, "multifd: received packet "
763 "with %d pages and expected maximum pages %d",
764 p->pages->used, packet->size);
765 return -1;
766 }
767
768 p->packet_num = be64_to_cpu(packet->packet_num);
769
770 if (p->pages->used) {
771 /* make sure that ramblock is 0 terminated */
772 packet->ramblock[255] = 0;
773 block = qemu_ram_block_by_name(packet->ramblock);
774 if (!block) {
775 error_setg(errp, "multifd: unknown ram block %s",
776 packet->ramblock);
777 return -1;
778 }
779 }
780
781 for (i = 0; i < p->pages->used; i++) {
782 ram_addr_t offset = be64_to_cpu(packet->offset[i]);
783
784 if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
785 error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
786 " (max " RAM_ADDR_FMT ")",
787 offset, block->used_length);
788 return -1;
789 }
790 p->pages->iov[i].iov_base = block->host + offset;
791 p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
792 }
793
794 return 0;
795 }
796
797 struct {
798 MultiFDSendParams *params;
799 /* number of created threads */
800 int count;
801 /* array of pages to send */
802 MultiFDPages_t *pages;
803 /* syncs main thread and channels */
804 QemuSemaphore sem_sync;
805 /* global number of generated multifd packets */
806 uint64_t packet_num;
807 /* send channels ready */
808 QemuSemaphore channels_ready;
809 } *multifd_send_state;
810
811 /*
812 * How do we use multifd_send_state->pages and channel->pages?
813 *
814 * We create one pages struct for each channel, plus a main one. Each
815 * time we need to send a batch of pages we exchange the one in
816 * multifd_send_state with the one belonging to the channel that is
817 * sending it. There are two reasons for that:
818 * - to avoid doing so many mallocs during migration
819 * - to make it easier to know what to free at the end of migration
820 *
821 * This way we always know who owns each "pages" struct, and we don't
822 * need any locking. It belongs either to the migration thread or to
823 * the channel thread. Switching is safe because the migration thread
824 * holds the channel mutex while changing it, and the channel thread
825 * has to have finished with its own copy, otherwise pending_job can't
826 * be false.
827 */
828
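/*
 * multifd_send_pages: hand the current batch of pages to a channel
 *
 * Called from the migration thread.  Waits until a channel is ready,
 * picks the next channel with no pending job, swaps
 * multifd_send_state->pages with that channel's pages struct, updates
 * the transfer counters and wakes the channel thread up.
 */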
829 static void multifd_send_pages(void)
830 {
831 int i;
832 static int next_channel;
833 MultiFDSendParams *p = NULL; /* make gcc happy */
834 MultiFDPages_t *pages = multifd_send_state->pages;
835 uint64_t transferred;
836
837 qemu_sem_wait(&multifd_send_state->channels_ready);
838 for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
839 p = &multifd_send_state->params[i];
840
841 qemu_mutex_lock(&p->mutex);
842 if (!p->pending_job) {
843 p->pending_job++;
844 next_channel = (i + 1) % migrate_multifd_channels();
845 break;
846 }
847 qemu_mutex_unlock(&p->mutex);
848 }
849 p->pages->used = 0;
850
851 p->packet_num = multifd_send_state->packet_num++;
852 p->pages->block = NULL;
853 multifd_send_state->pages = p->pages;
854 p->pages = pages;
855 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
856 ram_counters.multifd_bytes += transferred;
857 ram_counters.transferred += transferred;
858 qemu_mutex_unlock(&p->mutex);
859 qemu_sem_post(&p->sem);
860 }
861
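/*
 * multifd_queue_page: add one page to the current multifd batch
 *
 * Pages are accumulated in multifd_send_state->pages until the batch
 * is full or a page from a different RAMBlock arrives, at which point
 * the batch is handed to a channel with multifd_send_pages().
 */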
862 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
863 {
864 MultiFDPages_t *pages = multifd_send_state->pages;
865
866 if (!pages->block) {
867 pages->block = block;
868 }
869
870 if (pages->block == block) {
871 pages->offset[pages->used] = offset;
872 pages->iov[pages->used].iov_base = block->host + offset;
873 pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
874 pages->used++;
875
876 if (pages->used < pages->allocated) {
877 return;
878 }
879 }
880
881 multifd_send_pages();
882
883 if (pages->block != block) {
884 multifd_queue_page(block, offset);
885 }
886 }
887
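/*
 * multifd_send_terminate_threads: ask every send thread to quit
 *
 * If @err is set it is stored in the migration state and, if the
 * migration is still running, the migration is marked as failed.
 * Each channel gets its quit flag set and its semaphore posted so the
 * thread can notice and exit.
 */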
888 static void multifd_send_terminate_threads(Error *err)
889 {
890 int i;
891
892 if (err) {
893 MigrationState *s = migrate_get_current();
894 migrate_set_error(s, err);
895 if (s->state == MIGRATION_STATUS_SETUP ||
896 s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
897 s->state == MIGRATION_STATUS_DEVICE ||
898 s->state == MIGRATION_STATUS_ACTIVE) {
899 migrate_set_state(&s->state, s->state,
900 MIGRATION_STATUS_FAILED);
901 }
902 }
903
904 for (i = 0; i < migrate_multifd_channels(); i++) {
905 MultiFDSendParams *p = &multifd_send_state->params[i];
906
907 qemu_mutex_lock(&p->mutex);
908 p->quit = true;
909 qemu_sem_post(&p->sem);
910 qemu_mutex_unlock(&p->mutex);
911 }
912 }
913
914 int multifd_save_cleanup(Error **errp)
915 {
916 int i;
917 int ret = 0;
918
919 if (!migrate_use_multifd()) {
920 return 0;
921 }
922 multifd_send_terminate_threads(NULL);
923 for (i = 0; i < migrate_multifd_channels(); i++) {
924 MultiFDSendParams *p = &multifd_send_state->params[i];
925
926 if (p->running) {
927 qemu_thread_join(&p->thread);
928 }
929 socket_send_channel_destroy(p->c);
930 p->c = NULL;
931 qemu_mutex_destroy(&p->mutex);
932 qemu_sem_destroy(&p->sem);
933 qemu_sem_destroy(&p->sem_sync);
934 g_free(p->name);
935 p->name = NULL;
936 multifd_pages_clear(p->pages);
937 p->pages = NULL;
938 p->packet_len = 0;
939 g_free(p->packet);
940 p->packet = NULL;
941 }
942 qemu_sem_destroy(&multifd_send_state->channels_ready);
943 qemu_sem_destroy(&multifd_send_state->sem_sync);
944 g_free(multifd_send_state->params);
945 multifd_send_state->params = NULL;
946 multifd_pages_clear(multifd_send_state->pages);
947 multifd_send_state->pages = NULL;
948 g_free(multifd_send_state);
949 multifd_send_state = NULL;
950 return ret;
951 }
952
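/*
 * multifd_send_sync_main: synchronize the migration thread with all
 * send channels
 *
 * Flushes any partially filled batch, then makes every channel send a
 * packet with MULTIFD_FLAG_SYNC set and waits until all of them have
 * done so.
 */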
953 static void multifd_send_sync_main(void)
954 {
955 int i;
956
957 if (!migrate_use_multifd()) {
958 return;
959 }
960 if (multifd_send_state->pages->used) {
961 multifd_send_pages();
962 }
963 for (i = 0; i < migrate_multifd_channels(); i++) {
964 MultiFDSendParams *p = &multifd_send_state->params[i];
965
966 trace_multifd_send_sync_main_signal(p->id);
967
968 qemu_mutex_lock(&p->mutex);
969
970 p->packet_num = multifd_send_state->packet_num++;
971 p->flags |= MULTIFD_FLAG_SYNC;
972 p->pending_job++;
973 qemu_mutex_unlock(&p->mutex);
974 qemu_sem_post(&p->sem);
975 }
976 for (i = 0; i < migrate_multifd_channels(); i++) {
977 MultiFDSendParams *p = &multifd_send_state->params[i];
978
979 trace_multifd_send_sync_main_wait(p->id);
980 qemu_sem_wait(&multifd_send_state->sem_sync);
981 }
982 trace_multifd_send_sync_main(multifd_send_state->packet_num);
983 }
984
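/*
 * multifd_send_thread: body of a multifd send channel thread
 *
 * After sending the initial handshake packet, it loops waiting on
 * p->sem; for each pending job it writes the packet header followed
 * by the page data, and posts sem_sync/channels_ready as needed.
 */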
985 static void *multifd_send_thread(void *opaque)
986 {
987 MultiFDSendParams *p = opaque;
988 Error *local_err = NULL;
989 int ret;
990
991 trace_multifd_send_thread_start(p->id);
992 rcu_register_thread();
993
994 if (multifd_send_initial_packet(p, &local_err) < 0) {
995 goto out;
996 }
997 /* initial packet */
998 p->num_packets = 1;
999
1000 while (true) {
1001 qemu_sem_wait(&p->sem);
1002 qemu_mutex_lock(&p->mutex);
1003
1004 if (p->pending_job) {
1005 uint32_t used = p->pages->used;
1006 uint64_t packet_num = p->packet_num;
1007 uint32_t flags = p->flags;
1008
1009 multifd_send_fill_packet(p);
1010 p->flags = 0;
1011 p->num_packets++;
1012 p->num_pages += used;
1013 p->pages->used = 0;
1014 qemu_mutex_unlock(&p->mutex);
1015
1016 trace_multifd_send(p->id, packet_num, used, flags);
1017
1018 ret = qio_channel_write_all(p->c, (void *)p->packet,
1019 p->packet_len, &local_err);
1020 if (ret != 0) {
1021 break;
1022 }
1023
1024 ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1025 if (ret != 0) {
1026 break;
1027 }
1028
1029 qemu_mutex_lock(&p->mutex);
1030 p->pending_job--;
1031 qemu_mutex_unlock(&p->mutex);
1032
1033 if (flags & MULTIFD_FLAG_SYNC) {
1034 qemu_sem_post(&multifd_send_state->sem_sync);
1035 }
1036 qemu_sem_post(&multifd_send_state->channels_ready);
1037 } else if (p->quit) {
1038 qemu_mutex_unlock(&p->mutex);
1039 break;
1040 } else {
1041 qemu_mutex_unlock(&p->mutex);
1042 /* sometimes there are spurious wakeups */
1043 }
1044 }
1045
1046 out:
1047 if (local_err) {
1048 multifd_send_terminate_threads(local_err);
1049 }
1050
1051 qemu_mutex_lock(&p->mutex);
1052 p->running = false;
1053 qemu_mutex_unlock(&p->mutex);
1054
1055 rcu_unregister_thread();
1056 trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1057
1058 return NULL;
1059 }
1060
1061 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1062 {
1063 MultiFDSendParams *p = opaque;
1064 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1065 Error *local_err = NULL;
1066
1067 if (qio_task_propagate_error(task, &local_err)) {
1068 if (multifd_save_cleanup(&local_err) != 0) {
1069 migrate_set_error(migrate_get_current(), local_err);
1070 }
1071 } else {
1072 p->c = QIO_CHANNEL(sioc);
1073 qio_channel_set_delay(p->c, false);
1074 p->running = true;
1075 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1076 QEMU_THREAD_JOINABLE);
1077
1078 atomic_inc(&multifd_send_state->count);
1079 }
1080 }
1081
1082 int multifd_save_setup(void)
1083 {
1084 int thread_count;
1085 uint32_t page_count = migrate_multifd_page_count();
1086 uint8_t i;
1087
1088 if (!migrate_use_multifd()) {
1089 return 0;
1090 }
1091 thread_count = migrate_multifd_channels();
1092 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1093 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1094 atomic_set(&multifd_send_state->count, 0);
1095 multifd_send_state->pages = multifd_pages_init(page_count);
1096 qemu_sem_init(&multifd_send_state->sem_sync, 0);
1097 qemu_sem_init(&multifd_send_state->channels_ready, 0);
1098
1099 for (i = 0; i < thread_count; i++) {
1100 MultiFDSendParams *p = &multifd_send_state->params[i];
1101
1102 qemu_mutex_init(&p->mutex);
1103 qemu_sem_init(&p->sem, 0);
1104 qemu_sem_init(&p->sem_sync, 0);
1105 p->quit = false;
1106 p->pending_job = 0;
1107 p->id = i;
1108 p->pages = multifd_pages_init(page_count);
1109 p->packet_len = sizeof(MultiFDPacket_t)
1110 + sizeof(ram_addr_t) * page_count;
1111 p->packet = g_malloc0(p->packet_len);
1112 p->name = g_strdup_printf("multifdsend_%d", i);
1113 socket_send_channel_create(multifd_new_send_channel_async, p);
1114 }
1115 return 0;
1116 }
1117
1118 struct {
1119 MultiFDRecvParams *params;
1120 /* number of created threads */
1121 int count;
1122 /* syncs main thread and channels */
1123 QemuSemaphore sem_sync;
1124 /* global number of generated multifd packets */
1125 uint64_t packet_num;
1126 } *multifd_recv_state;
1127
1128 static void multifd_recv_terminate_threads(Error *err)
1129 {
1130 int i;
1131
1132 if (err) {
1133 MigrationState *s = migrate_get_current();
1134 migrate_set_error(s, err);
1135 if (s->state == MIGRATION_STATUS_SETUP ||
1136 s->state == MIGRATION_STATUS_ACTIVE) {
1137 migrate_set_state(&s->state, s->state,
1138 MIGRATION_STATUS_FAILED);
1139 }
1140 }
1141
1142 for (i = 0; i < migrate_multifd_channels(); i++) {
1143 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1144
1145 qemu_mutex_lock(&p->mutex);
1146 /* We could arrive here for two reasons:
1147 - normal quit, i.e. everything went fine, just finished
1148 - error quit: We close the channels so the channel threads
1149 finish the qio_channel_read_all_eof() */
1150 qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1151 qemu_mutex_unlock(&p->mutex);
1152 }
1153 }
1154
1155 int multifd_load_cleanup(Error **errp)
1156 {
1157 int i;
1158 int ret = 0;
1159
1160 if (!migrate_use_multifd()) {
1161 return 0;
1162 }
1163 multifd_recv_terminate_threads(NULL);
1164 for (i = 0; i < migrate_multifd_channels(); i++) {
1165 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1166
1167 if (p->running) {
1168 qemu_thread_join(&p->thread);
1169 }
1170 object_unref(OBJECT(p->c));
1171 p->c = NULL;
1172 qemu_mutex_destroy(&p->mutex);
1173 qemu_sem_destroy(&p->sem_sync);
1174 g_free(p->name);
1175 p->name = NULL;
1176 multifd_pages_clear(p->pages);
1177 p->pages = NULL;
1178 p->packet_len = 0;
1179 g_free(p->packet);
1180 p->packet = NULL;
1181 }
1182 qemu_sem_destroy(&multifd_recv_state->sem_sync);
1183 g_free(multifd_recv_state->params);
1184 multifd_recv_state->params = NULL;
1185 g_free(multifd_recv_state);
1186 multifd_recv_state = NULL;
1187
1188 return ret;
1189 }
1190
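/*
 * multifd_recv_sync_main: wait for all channels to reach a sync point
 *
 * Waits until every receive thread has posted sem_sync (which happens
 * when it sees MULTIFD_FLAG_SYNC), records the highest packet number
 * seen, and then releases all the threads again.
 */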
1191 static void multifd_recv_sync_main(void)
1192 {
1193 int i;
1194
1195 if (!migrate_use_multifd()) {
1196 return;
1197 }
1198 for (i = 0; i < migrate_multifd_channels(); i++) {
1199 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1200
1201 trace_multifd_recv_sync_main_wait(p->id);
1202 qemu_sem_wait(&multifd_recv_state->sem_sync);
1203 qemu_mutex_lock(&p->mutex);
1204 if (multifd_recv_state->packet_num < p->packet_num) {
1205 multifd_recv_state->packet_num = p->packet_num;
1206 }
1207 qemu_mutex_unlock(&p->mutex);
1208 }
1209 for (i = 0; i < migrate_multifd_channels(); i++) {
1210 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1211
1212 trace_multifd_recv_sync_main_signal(p->id);
1213 qemu_sem_post(&p->sem_sync);
1214 }
1215 trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1216 }
1217
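/*
 * multifd_recv_thread: body of a multifd receive channel thread
 *
 * Reads a packet header, validates it with multifd_recv_unfill_packet()
 * and then reads the page data straight into guest memory via the
 * prepared iovecs.  Stops on EOF, error, or channel shutdown.
 */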
1218 static void *multifd_recv_thread(void *opaque)
1219 {
1220 MultiFDRecvParams *p = opaque;
1221 Error *local_err = NULL;
1222 int ret;
1223
1224 trace_multifd_recv_thread_start(p->id);
1225 rcu_register_thread();
1226
1227 while (true) {
1228 uint32_t used;
1229 uint32_t flags;
1230
1231 ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1232 p->packet_len, &local_err);
1233 if (ret == 0) { /* EOF */
1234 break;
1235 }
1236 if (ret == -1) { /* Error */
1237 break;
1238 }
1239
1240 qemu_mutex_lock(&p->mutex);
1241 ret = multifd_recv_unfill_packet(p, &local_err);
1242 if (ret) {
1243 qemu_mutex_unlock(&p->mutex);
1244 break;
1245 }
1246
1247 used = p->pages->used;
1248 flags = p->flags;
1249 trace_multifd_recv(p->id, p->packet_num, used, flags);
1250 p->num_packets++;
1251 p->num_pages += used;
1252 qemu_mutex_unlock(&p->mutex);
1253
1254 ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1255 if (ret != 0) {
1256 break;
1257 }
1258
1259 if (flags & MULTIFD_FLAG_SYNC) {
1260 qemu_sem_post(&multifd_recv_state->sem_sync);
1261 qemu_sem_wait(&p->sem_sync);
1262 }
1263 }
1264
1265 if (local_err) {
1266 multifd_recv_terminate_threads(local_err);
1267 }
1268 qemu_mutex_lock(&p->mutex);
1269 p->running = false;
1270 qemu_mutex_unlock(&p->mutex);
1271
1272 rcu_unregister_thread();
1273 trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1274
1275 return NULL;
1276 }
1277
1278 int multifd_load_setup(void)
1279 {
1280 int thread_count;
1281 uint32_t page_count = migrate_multifd_page_count();
1282 uint8_t i;
1283
1284 if (!migrate_use_multifd()) {
1285 return 0;
1286 }
1287 thread_count = migrate_multifd_channels();
1288 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1289 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1290 atomic_set(&multifd_recv_state->count, 0);
1291 qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1292
1293 for (i = 0; i < thread_count; i++) {
1294 MultiFDRecvParams *p = &multifd_recv_state->params[i];
1295
1296 qemu_mutex_init(&p->mutex);
1297 qemu_sem_init(&p->sem_sync, 0);
1298 p->id = i;
1299 p->pages = multifd_pages_init(page_count);
1300 p->packet_len = sizeof(MultiFDPacket_t)
1301 + sizeof(ram_addr_t) * page_count;
1302 p->packet = g_malloc0(p->packet_len);
1303 p->name = g_strdup_printf("multifdrecv_%d", i);
1304 }
1305 return 0;
1306 }
1307
1308 bool multifd_recv_all_channels_created(void)
1309 {
1310 int thread_count = migrate_multifd_channels();
1311
1312 if (!migrate_use_multifd()) {
1313 return true;
1314 }
1315
1316 return thread_count == atomic_read(&multifd_recv_state->count);
1317 }
1318
1319 /* Return true if multifd is ready for the migration, otherwise false */
1320 bool multifd_recv_new_channel(QIOChannel *ioc)
1321 {
1322 MultiFDRecvParams *p;
1323 Error *local_err = NULL;
1324 int id;
1325
1326 id = multifd_recv_initial_packet(ioc, &local_err);
1327 if (id < 0) {
1328 multifd_recv_terminate_threads(local_err);
1329 return false;
1330 }
1331
1332 p = &multifd_recv_state->params[id];
1333 if (p->c != NULL) {
1334 error_setg(&local_err, "multifd: received id '%d' already setup",
1335 id);
1336 multifd_recv_terminate_threads(local_err);
1337 return false;
1338 }
1339 p->c = ioc;
1340 object_ref(OBJECT(ioc));
1341 /* initial packet */
1342 p->num_packets = 1;
1343
1344 p->running = true;
1345 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1346 QEMU_THREAD_JOINABLE);
1347 atomic_inc(&multifd_recv_state->count);
1348 return multifd_recv_state->count == migrate_multifd_channels();
1349 }
1350
1351 /**
1352 * save_page_header: write page header to wire
1353 *
1354 * If this is the 1st block, it also writes the block identification
1355 *
1356 * Returns the number of bytes written
1357 *
1358 * @f: QEMUFile where to send the data
1359 * @block: block that contains the page we want to send
1360 * @offset: offset inside the block for the page
1361 * in the lower bits, it contains flags
1362 */
1363 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
1364 ram_addr_t offset)
1365 {
1366 size_t size, len;
1367
1368 if (block == rs->last_sent_block) {
1369 offset |= RAM_SAVE_FLAG_CONTINUE;
1370 }
1371 qemu_put_be64(f, offset);
1372 size = 8;
1373
1374 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1375 len = strlen(block->idstr);
1376 qemu_put_byte(f, len);
1377 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1378 size += 1 + len;
1379 rs->last_sent_block = block;
1380 }
1381 return size;
1382 }
1383
1384 /**
1385 * mig_throttle_guest_down: throttle down the guest
1386 *
1387 * Reduce amount of guest cpu execution to hopefully slow down memory
1388 * writes. If guest dirty memory rate is reduced below the rate at
1389 * which we can transfer pages to the destination then we should be
1390 * able to complete migration. Some workloads dirty memory way too
1391 * fast and will not effectively converge, even with auto-converge.
1392 */
1393 static void mig_throttle_guest_down(void)
1394 {
1395 MigrationState *s = migrate_get_current();
1396 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1397 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
1398 int pct_max = s->parameters.max_cpu_throttle;
1399
1400 /* We have not started throttling yet. Let's start it. */
1401 if (!cpu_throttle_active()) {
1402 cpu_throttle_set(pct_initial);
1403 } else {
1404 /* Throttling already on, just increase the rate */
1405 cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
1406 pct_max));
1407 }
1408 }
1409
1410 /**
1411 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1412 *
1413 * @rs: current RAM state
1414 * @current_addr: address for the zero page
1415 *
1416 * Update the xbzrle cache to reflect a page that's been sent as all 0.
1417 * The important thing is that a stale (not-yet-0'd) page be replaced
1418 * by the new data.
1419 * As a bonus, if the page wasn't in the cache it gets added so that
1420 * when a small write is made into the 0'd page it gets XBZRLE sent.
1421 */
1422 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1423 {
1424 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1425 return;
1426 }
1427
1428 /* We don't care if this fails to allocate a new cache page
1429 * as long as it updated an old one */
1430 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1431 ram_counters.dirty_sync_count);
1432 }
1433
1434 #define ENCODING_FLAG_XBZRLE 0x1
1435
1436 /**
1437 * save_xbzrle_page: compress and send current page
1438 *
1439 * Returns: 1 means that we wrote the page
1440 * 0 means that page is identical to the one already sent
1441 * -1 means that xbzrle would be longer than normal
1442 *
1443 * @rs: current RAM state
1444 * @current_data: pointer to the address of the page contents
1445 * @current_addr: addr of the page
1446 * @block: block that contains the page we want to send
1447 * @offset: offset inside the block for the page
1448 * @last_stage: if we are at the completion stage
1449 */
1450 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1451 ram_addr_t current_addr, RAMBlock *block,
1452 ram_addr_t offset, bool last_stage)
1453 {
1454 int encoded_len = 0, bytes_xbzrle;
1455 uint8_t *prev_cached_page;
1456
1457 if (!cache_is_cached(XBZRLE.cache, current_addr,
1458 ram_counters.dirty_sync_count)) {
1459 xbzrle_counters.cache_miss++;
1460 if (!last_stage) {
1461 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1462 ram_counters.dirty_sync_count) == -1) {
1463 return -1;
1464 } else {
1465 /* update *current_data when the page has been
1466 inserted into cache */
1467 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1468 }
1469 }
1470 return -1;
1471 }
1472
1473 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1474
1475 /* save current buffer into memory */
1476 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1477
1478 /* XBZRLE encoding (if there is no overflow) */
1479 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1480 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1481 TARGET_PAGE_SIZE);
1482 if (encoded_len == 0) {
1483 trace_save_xbzrle_page_skipping();
1484 return 0;
1485 } else if (encoded_len == -1) {
1486 trace_save_xbzrle_page_overflow();
1487 xbzrle_counters.overflow++;
1488 /* update data in the cache */
1489 if (!last_stage) {
1490 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1491 *current_data = prev_cached_page;
1492 }
1493 return -1;
1494 }
1495
1496 /* we need to update the data in the cache, in order to get the same data */
1497 if (!last_stage) {
1498 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1499 }
1500
1501 /* Send XBZRLE based compressed page */
1502 bytes_xbzrle = save_page_header(rs, rs->f, block,
1503 offset | RAM_SAVE_FLAG_XBZRLE);
1504 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1505 qemu_put_be16(rs->f, encoded_len);
1506 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1507 bytes_xbzrle += encoded_len + 1 + 2;
1508 xbzrle_counters.pages++;
1509 xbzrle_counters.bytes += bytes_xbzrle;
1510 ram_counters.transferred += bytes_xbzrle;
1511
1512 return 1;
1513 }
1514
1515 /**
1516 * migration_bitmap_find_dirty: find the next dirty page from start
1517 *
1518 * Called with rcu_read_lock() to protect migration_bitmap
1519 *
1520 * Returns the byte offset within memory region of the start of a dirty page
1521 *
1522 * @rs: current RAM state
1523 * @rb: RAMBlock where to search for dirty pages
1524 * @start: page where we start the search
1525 */
1526 static inline
1527 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1528 unsigned long start)
1529 {
1530 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1531 unsigned long *bitmap = rb->bmap;
1532 unsigned long next;
1533
1534 if (!qemu_ram_is_migratable(rb)) {
1535 return size;
1536 }
1537
1538 if (rs->ram_bulk_stage && start > 0) {
1539 next = start + 1;
1540 } else {
1541 next = find_next_bit(bitmap, size, start);
1542 }
1543
1544 return next;
1545 }
1546
1547 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1548 RAMBlock *rb,
1549 unsigned long page)
1550 {
1551 bool ret;
1552
1553 ret = test_and_clear_bit(page, rb->bmap);
1554
1555 if (ret) {
1556 rs->migration_dirty_pages--;
1557 }
1558 return ret;
1559 }
1560
1561 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1562 ram_addr_t start, ram_addr_t length)
1563 {
1564 rs->migration_dirty_pages +=
1565 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1566 &rs->num_dirty_pages_period);
1567 }
1568
1569 /**
1570 * ram_pagesize_summary: calculate all the pagesizes of a VM
1571 *
1572 * Returns a summary bitmap of the page sizes of all RAMBlocks
1573 *
1574 * For VMs with just normal pages this is equivalent to the host page
1575 * size. If it's got some huge pages then it's the OR of all the
1576 * different page sizes.
1577 */
1578 uint64_t ram_pagesize_summary(void)
1579 {
1580 RAMBlock *block;
1581 uint64_t summary = 0;
1582
1583 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1584 summary |= block->page_size;
1585 }
1586
1587 return summary;
1588 }
1589
1590 static void migration_update_rates(RAMState *rs, int64_t end_time)
1591 {
1592 uint64_t iter_count = rs->iterations - rs->iterations_prev;
1593
1594 /* calculate period counters */
1595 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1596 / (end_time - rs->time_last_bitmap_sync);
1597
1598 if (!iter_count) {
1599 return;
1600 }
1601
1602 if (migrate_use_xbzrle()) {
1603 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1604 rs->xbzrle_cache_miss_prev) / iter_count;
1605 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1606 }
1607 }
1608
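/*
 * migration_bitmap_sync: refresh the dirty bitmap from the memory core
 *
 * Pulls the latest dirty log for every migratable RAMBlock into the
 * migration bitmaps, updates the period counters and, when
 * auto-converge is enabled, decides whether the guest needs to be
 * throttled further.
 */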
1609 static void migration_bitmap_sync(RAMState *rs)
1610 {
1611 RAMBlock *block;
1612 int64_t end_time;
1613 uint64_t bytes_xfer_now;
1614
1615 ram_counters.dirty_sync_count++;
1616
1617 if (!rs->time_last_bitmap_sync) {
1618 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1619 }
1620
1621 trace_migration_bitmap_sync_start();
1622 memory_global_dirty_log_sync();
1623
1624 qemu_mutex_lock(&rs->bitmap_mutex);
1625 rcu_read_lock();
1626 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1627 migration_bitmap_sync_range(rs, block, 0, block->used_length);
1628 }
1629 ram_counters.remaining = ram_bytes_remaining();
1630 rcu_read_unlock();
1631 qemu_mutex_unlock(&rs->bitmap_mutex);
1632
1633 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1634
1635 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1636
1637 /* more than 1 second = 1000 milliseconds */
1638 if (end_time > rs->time_last_bitmap_sync + 1000) {
1639 bytes_xfer_now = ram_counters.transferred;
1640
1641 /* During block migration the auto-converge logic incorrectly detects
1642 * that ram migration makes no progress. Avoid this by disabling the
1643 * throttling logic during the bulk phase of block migration. */
1644 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1645 /* The following detection logic can be refined later. For now:
1646 Check to see if the dirtied bytes are 50% more than the approx.
1647 amount of bytes that just got transferred since the last time we
1648 were in this routine. If that happens twice, start or increase
1649 throttling */
1650
1651 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1652 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1653 (++rs->dirty_rate_high_cnt >= 2)) {
1654 trace_migration_throttle();
1655 rs->dirty_rate_high_cnt = 0;
1656 mig_throttle_guest_down();
1657 }
1658 }
1659
1660 migration_update_rates(rs, end_time);
1661
1662 rs->iterations_prev = rs->iterations;
1663
1664 /* reset period counters */
1665 rs->time_last_bitmap_sync = end_time;
1666 rs->num_dirty_pages_period = 0;
1667 rs->bytes_xfer_prev = bytes_xfer_now;
1668 }
1669 if (migrate_use_events()) {
1670 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1671 }
1672 }
1673
1674 /**
1675 * save_zero_page: send the zero page to the stream
1676 *
1677 * Returns the number of pages written.
1678 *
1679 * @rs: current RAM state
1680 * @block: block that contains the page we want to send
1681 * @offset: offset inside the block for the page
1682 */
1683 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1684 {
1685 uint8_t *p = block->host + offset;
1686 int pages = -1;
1687
1688 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1689 ram_counters.duplicate++;
1690 ram_counters.transferred +=
1691 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1692 qemu_put_byte(rs->f, 0);
1693 ram_counters.transferred += 1;
1694 pages = 1;
1695 }
1696
1697 return pages;
1698 }
1699
1700 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1701 {
1702 if (!migrate_release_ram() || !migration_in_postcopy()) {
1703 return;
1704 }
1705
1706 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1707 }
1708
1709 /*
1710 * @pages: the number of pages written by the control path,
1711 * < 0 - error
1712 * > 0 - number of pages written
1713 *
1714 * Return true if the page has been saved, otherwise false is returned.
1715 */
1716 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1717 int *pages)
1718 {
1719 uint64_t bytes_xmit = 0;
1720 int ret;
1721
1722 *pages = -1;
1723 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1724 &bytes_xmit);
1725 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1726 return false;
1727 }
1728
1729 if (bytes_xmit) {
1730 ram_counters.transferred += bytes_xmit;
1731 *pages = 1;
1732 }
1733
1734 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1735 return true;
1736 }
1737
1738 if (bytes_xmit > 0) {
1739 ram_counters.normal++;
1740 } else if (bytes_xmit == 0) {
1741 ram_counters.duplicate++;
1742 }
1743
1744 return true;
1745 }
1746
1747 /*
1748 * directly send the page to the stream
1749 *
1750 * Returns the number of pages written.
1751 *
1752 * @rs: current RAM state
1753 * @block: block that contains the page we want to send
1754 * @offset: offset inside the block for the page
1755 * @buf: the page to be sent
1756 * @async: send the page asynchronously
1757 */
1758 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1759 uint8_t *buf, bool async)
1760 {
1761 ram_counters.transferred += save_page_header(rs, rs->f, block,
1762 offset | RAM_SAVE_FLAG_PAGE);
1763 if (async) {
1764 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1765 migrate_release_ram() &
1766 migration_in_postcopy());
1767 } else {
1768 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1769 }
1770 ram_counters.transferred += TARGET_PAGE_SIZE;
1771 ram_counters.normal++;
1772 return 1;
1773 }
1774
1775 /**
1776 * ram_save_page: send the given page to the stream
1777 *
1778 * Returns the number of pages written.
1779 * < 0 - error
1780 * >=0 - Number of pages written - this might legally be 0
1781 * if xbzrle noticed the page was the same.
1782 *
1783 * @rs: current RAM state
1784 * @block: block that contains the page we want to send
1785 * @offset: offset inside the block for the page
1786 * @last_stage: if we are at the completion stage
1787 */
1788 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1789 {
1790 int pages = -1;
1791 uint8_t *p;
1792 bool send_async = true;
1793 RAMBlock *block = pss->block;
1794 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1795 ram_addr_t current_addr = block->offset + offset;
1796
1797 p = block->host + offset;
1798 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1799
1800 XBZRLE_cache_lock();
1801 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1802 migrate_use_xbzrle()) {
1803 pages = save_xbzrle_page(rs, &p, current_addr, block,
1804 offset, last_stage);
1805 if (!last_stage) {
1806 /* Can't send this cached data async, since the cache page
1807 * might get updated before it gets to the wire
1808 */
1809 send_async = false;
1810 }
1811 }
1812
1813 /* XBZRLE overflow or normal page */
1814 if (pages == -1) {
1815 pages = save_normal_page(rs, block, offset, p, send_async);
1816 }
1817
1818 XBZRLE_cache_unlock();
1819
1820 return pages;
1821 }
1822
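/* Send one page through the multifd channels instead of the main stream */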
1823 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1824 ram_addr_t offset)
1825 {
1826 multifd_queue_page(block, offset);
1827 ram_counters.normal++;
1828
1829 return 1;
1830 }
1831
1832 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1833 ram_addr_t offset, uint8_t *source_buf)
1834 {
1835 RAMState *rs = ram_state;
1836 int bytes_sent, blen;
1837 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1838
1839 bytes_sent = save_page_header(rs, f, block, offset |
1840 RAM_SAVE_FLAG_COMPRESS_PAGE);
1841
1842 /*
1843 * copy it to an internal buffer to avoid it being modified by the VM,
1844 * so that we can catch errors during compression and
1845 * decompression
1846 */
1847 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1848 blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1849 if (blen < 0) {
1850 bytes_sent = 0;
1851 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1852 error_report("compressed data failed!");
1853 } else {
1854 bytes_sent += blen;
1855 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1856 }
1857
1858 return bytes_sent;
1859 }
1860
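/*
 * flush_compressed_data: drain the compression threads
 *
 * Waits until every compression thread has finished its current page
 * and then copies each thread's buffered output into the migration
 * stream.
 */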
1861 static void flush_compressed_data(RAMState *rs)
1862 {
1863 int idx, len, thread_count;
1864
1865 if (!migrate_use_compression()) {
1866 return;
1867 }
1868 thread_count = migrate_compress_threads();
1869
1870 qemu_mutex_lock(&comp_done_lock);
1871 for (idx = 0; idx < thread_count; idx++) {
1872 while (!comp_param[idx].done) {
1873 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1874 }
1875 }
1876 qemu_mutex_unlock(&comp_done_lock);
1877
1878 for (idx = 0; idx < thread_count; idx++) {
1879 qemu_mutex_lock(&comp_param[idx].mutex);
1880 if (!comp_param[idx].quit) {
1881 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1882 ram_counters.transferred += len;
1883 }
1884 qemu_mutex_unlock(&comp_param[idx].mutex);
1885 }
1886 }
1887
1888 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1889 ram_addr_t offset)
1890 {
1891 param->block = block;
1892 param->offset = offset;
1893 }
1894
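/*
 * compress_page_with_multi_thread: queue a page for compression
 *
 * Hands @block/@offset to the first idle compression thread, flushing
 * that thread's previous output into the stream first.  Blocks on
 * comp_done_cond until a thread becomes available.
 *
 * Returns the number of pages queued (always 1)
 */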
1895 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1896 ram_addr_t offset)
1897 {
1898 int idx, thread_count, bytes_xmit = -1, pages = -1;
1899
1900 thread_count = migrate_compress_threads();
1901 qemu_mutex_lock(&comp_done_lock);
1902 while (true) {
1903 for (idx = 0; idx < thread_count; idx++) {
1904 if (comp_param[idx].done) {
1905 comp_param[idx].done = false;
1906 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1907 qemu_mutex_lock(&comp_param[idx].mutex);
1908 set_compress_params(&comp_param[idx], block, offset);
1909 qemu_cond_signal(&comp_param[idx].cond);
1910 qemu_mutex_unlock(&comp_param[idx].mutex);
1911 pages = 1;
1912 ram_counters.normal++;
1913 ram_counters.transferred += bytes_xmit;
1914 break;
1915 }
1916 }
1917 if (pages > 0) {
1918 break;
1919 } else {
1920 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1921 }
1922 }
1923 qemu_mutex_unlock(&comp_done_lock);
1924
1925 return pages;
1926 }
1927
1928 /**
1929 * find_dirty_block: find the next dirty page and update any state
1930 * associated with the search process.
1931 *
1932 * Returns true if a page is found
1933 *
1934 * @rs: current RAM state
1935 * @pss: data about the state of the current dirty page scan
1936 * @again: set to false if the search has scanned the whole of RAM
1937 */
1938 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1939 {
1940 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1941 if (pss->complete_round && pss->block == rs->last_seen_block &&
1942 pss->page >= rs->last_page) {
1943 /*
1944 * We've been once around the RAM and haven't found anything.
1945 * Give up.
1946 */
1947 *again = false;
1948 return false;
1949 }
1950 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1951 /* Didn't find anything in this RAM Block */
1952 pss->page = 0;
1953 pss->block = QLIST_NEXT_RCU(pss->block, next);
1954 if (!pss->block) {
1955 /* Hit the end of the list */
1956 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1957 /* Flag that we've looped */
1958 pss->complete_round = true;
1959 rs->ram_bulk_stage = false;
1960 if (migrate_use_xbzrle()) {
1961 /* If xbzrle is on, stop using the data compression at this
1962 * point. In theory, xbzrle can do better than compression.
1963 */
1964 flush_compressed_data(rs);
1965 }
1966 }
1967 /* Didn't find anything this time, but try again on the new block */
1968 *again = true;
1969 return false;
1970 } else {
1971 /* Can go around again, but... */
1972 *again = true;
1973 /* We've found something so probably don't need to */
1974 return true;
1975 }
1976 }
1977
1978 /**
1979 * unqueue_page: gets a page off the queue
1980 *
1981 * Helper for 'get_queued_page' - gets a page off the queue
1982 *
1983 * Returns the block of the page (or NULL if none available)
1984 *
1985 * @rs: current RAM state
1986 * @offset: used to return the offset within the RAMBlock
1987 */
1988 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1989 {
1990 RAMBlock *block = NULL;
1991
1992 qemu_mutex_lock(&rs->src_page_req_mutex);
1993 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1994 struct RAMSrcPageRequest *entry =
1995 QSIMPLEQ_FIRST(&rs->src_page_requests);
1996 block = entry->rb;
1997 *offset = entry->offset;
1998
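        /*
         * Consume the request one target page at a time: a multi-page
         * request stays at the head of the queue with its offset/len
         * advanced, and is only removed (and its memory-region reference
         * dropped) once the last page has been handed out.
         */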
1999 if (entry->len > TARGET_PAGE_SIZE) {
2000 entry->len -= TARGET_PAGE_SIZE;
2001 entry->offset += TARGET_PAGE_SIZE;
2002 } else {
2003 memory_region_unref(block->mr);
2004 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2005 g_free(entry);
2006 migration_consume_urgent_request();
2007 }
2008 }
2009 qemu_mutex_unlock(&rs->src_page_req_mutex);
2010
2011 return block;
2012 }
2013
2014 /**
2015 * get_queued_page: unqueue a page from the postcopy requests
2016 *
2017 * Skips pages that are already sent (!dirty)
2018 *
2019 * Returns true if a queued page was found
2020 *
2021 * @rs: current RAM state
2022 * @pss: data about the state of the current dirty page scan
2023 */
2024 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2025 {
2026 RAMBlock *block;
2027 ram_addr_t offset;
2028 bool dirty;
2029
2030 do {
2031 block = unqueue_page(rs, &offset);
2032 /*
2033 * We're sending this page, and since it's postcopy nothing else
2034 * will dirty it; we must make sure it doesn't get sent again,
2035 * even if this queue request was received after the background
2036 * search already sent it.
2037 */
2038 if (block) {
2039 unsigned long page;
2040
2041 page = offset >> TARGET_PAGE_BITS;
2042 dirty = test_bit(page, block->bmap);
2043 if (!dirty) {
2044 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2045 page, test_bit(page, block->unsentmap));
2046 } else {
2047 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2048 }
2049 }
2050
2051 } while (block && !dirty);
2052
2053 if (block) {
2054 /*
2055 * As soon as we start servicing pages out of order, we have to
2056 * kill the bulk stage, since the bulk stage assumes (in
2057 * migration_bitmap_find_and_reset_dirty) that every page is
2058 * dirty; that's no longer true.
2059 */
2060 rs->ram_bulk_stage = false;
2061
2062 /*
2063 * We want the background search to continue from the queued page
2064 * since the guest is likely to want other pages near to the page
2065 * it just requested.
2066 */
2067 pss->block = block;
2068 pss->page = offset >> TARGET_PAGE_BITS;
2069 }
2070
2071 return !!block;
2072 }
2073
2074 /**
2075 * migration_page_queue_free: drop any remaining pages in the ram
2076 * request queue
2077 *
2078 * It should be empty at the end anyway, but in error cases there may
2079 * be some left; if any pages are left, we drop them.
2080 *
2081 */
2082 static void migration_page_queue_free(RAMState *rs)
2083 {
2084 struct RAMSrcPageRequest *mspr, *next_mspr;
2085 /* This queue generally should be empty - but in the case of a failed
2086 * migration might have some droppings in.
2087 */
2088 rcu_read_lock();
2089 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2090 memory_region_unref(mspr->rb->mr);
2091 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2092 g_free(mspr);
2093 }
2094 rcu_read_unlock();
2095 }
2096
2097 /**
2098 * ram_save_queue_pages: queue the page for transmission
2099 *
2100 * A request from the postcopy destination, for example.
2101 *
2102 * Returns zero on success or negative on error
2103 *
2104 * @rbname: Name of the RAMBlock of the request. NULL means the
2105 * same as the last one.
2106 * @start: starting address from the start of the RAMBlock
2107 * @len: length (in bytes) to send
2108 */
2109 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2110 {
2111 RAMBlock *ramblock;
2112 RAMState *rs = ram_state;
2113
2114 ram_counters.postcopy_requests++;
2115 rcu_read_lock();
2116 if (!rbname) {
2117 /* Reuse last RAMBlock */
2118 ramblock = rs->last_req_rb;
2119
2120 if (!ramblock) {
2121 /*
2122 * Shouldn't happen, we can't reuse the last RAMBlock if
2123 * it's the 1st request.
2124 */
2125 error_report("ram_save_queue_pages no previous block");
2126 goto err;
2127 }
2128 } else {
2129 ramblock = qemu_ram_block_by_name(rbname);
2130
2131 if (!ramblock) {
2132 /* We shouldn't be asked for a non-existent RAMBlock */
2133 error_report("ram_save_queue_pages no block '%s'", rbname);
2134 goto err;
2135 }
2136 rs->last_req_rb = ramblock;
2137 }
2138 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2139 if (start+len > ramblock->used_length) {
2140 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2141 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2142 __func__, start, len, ramblock->used_length);
2143 goto err;
2144 }
2145
2146 struct RAMSrcPageRequest *new_entry =
2147 g_malloc0(sizeof(struct RAMSrcPageRequest));
2148 new_entry->rb = ramblock;
2149 new_entry->offset = start;
2150 new_entry->len = len;
2151
2152 memory_region_ref(ramblock->mr);
2153 qemu_mutex_lock(&rs->src_page_req_mutex);
2154 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2155 migration_make_urgent_request();
2156 qemu_mutex_unlock(&rs->src_page_req_mutex);
2157 rcu_read_unlock();
2158
2159 return 0;
2160
2161 err:
2162 rcu_read_unlock();
2163 return -1;
2164 }
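/*
 * Illustrative only (hypothetical block name and values): a postcopy page
 * request from the destination for RAMBlock "pc.ram", byte offset 0x200000,
 * length 0x1000 would arrive here roughly as
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, 0x1000);
 *
 * and is then drained one target page at a time by unqueue_page() above.
 */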
2165
2166 static bool save_page_use_compression(RAMState *rs)
2167 {
2168 if (!migrate_use_compression()) {
2169 return false;
2170 }
2171
2172 /*
2173 * If xbzrle is on, stop using the data compression after first
2174 * round of migration even if compression is enabled. In theory,
2175 * xbzrle can do better than compression.
2176 */
2177 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2178 return true;
2179 }
2180
2181 return false;
2182 }
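/*
 * Summary of the policy above:
 *   compression capability off                  -> never compress
 *   compression on, xbzrle off                  -> compress every round
 *   compression on, xbzrle on, bulk stage       -> compress (first pass)
 *   compression on, xbzrle on, after bulk stage -> prefer xbzrle
 */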
2183
2184 /**
2185 * ram_save_target_page: save one target page
2186 *
2187 * Returns the number of pages written
2188 *
2189 * @rs: current RAM state
2190 * @pss: data about the page we want to send
2191 * @last_stage: if we are at the completion stage
2192 */
2193 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2194 bool last_stage)
2195 {
2196 RAMBlock *block = pss->block;
2197 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2198 int res;
2199
2200 if (control_save_page(rs, block, offset, &res)) {
2201 return res;
2202 }
2203
2204 /*
2205 * When starting the process of a new block, the first page of
2206 * the block should be sent out before other pages in the same
2207 * block, and all the pages in the last block should have been sent
2208 * out. Keeping this order is important, because the 'cont' flag
2209 * is used to avoid resending the block name.
2210 */
2211 if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2212 flush_compressed_data(rs);
2213 }
2214
2215 res = save_zero_page(rs, block, offset);
2216 if (res > 0) {
2217 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2218 * page would be stale
2219 */
2220 if (!save_page_use_compression(rs)) {
2221 XBZRLE_cache_lock();
2222 xbzrle_cache_zero_page(rs, block->offset + offset);
2223 XBZRLE_cache_unlock();
2224 }
2225 ram_release_pages(block->idstr, offset, res);
2226 return res;
2227 }
2228
2229 /*
2230 * Make sure the first page is sent out before other pages.
2231 *
2232 * We post it as a normal page, as compression would take too
2233 * much CPU time.
2234 */
2235 if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2236 return compress_page_with_multi_thread(rs, block, offset);
2237 } else if (migrate_use_multifd()) {
2238 return ram_save_multifd_page(rs, block, offset);
2239 }
2240
2241 return ram_save_page(rs, pss, last_stage);
2242 }
2243
2244 /**
2245 * ram_save_host_page: save a whole host page
2246 *
2247 * Starting at *offset send pages up to the end of the current host
2248 * page. It's valid for the initial offset to point into the middle of
2249 * a host page, in which case the remainder of the host page is sent.
2250 * Only dirty target pages are sent. Note that the host page size may
2251 * be a huge page for this block.
2252 * The saving stops at the boundary of the used_length of the block
2253 * if the RAMBlock isn't a multiple of the host page size.
2254 *
2255 * Returns the number of pages written or negative on error
2256 *
2257 * @rs: current RAM state
2259 * @pss: data about the page we want to send
2260 * @last_stage: if we are at the completion stage
2261 */
2262 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2263 bool last_stage)
2264 {
2265 int tmppages, pages = 0;
2266 size_t pagesize_bits =
2267 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
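    /*
     * pagesize_bits is the number of target pages per host page for this
     * block; e.g. a 2 MiB hugepage with 4 KiB target pages gives 512, and
     * the (pagesize_bits - 1) mask in the loop below stops at the
     * host-page boundary.
     */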
2268
2269 if (!qemu_ram_is_migratable(pss->block)) {
2270 error_report("block %s should not be migrated !", pss->block->idstr);
2271 return 0;
2272 }
2273
2274 do {
2275 /* Check if the page is dirty and, if so, send it */
2276 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2277 pss->page++;
2278 continue;
2279 }
2280
2281 tmppages = ram_save_target_page(rs, pss, last_stage);
2282 if (tmppages < 0) {
2283 return tmppages;
2284 }
2285
2286 pages += tmppages;
2287 if (pss->block->unsentmap) {
2288 clear_bit(pss->page, pss->block->unsentmap);
2289 }
2290
2291 pss->page++;
2292 } while ((pss->page & (pagesize_bits - 1)) &&
2293 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2294
2295 /* The offset we leave with is the last one we looked at */
2296 pss->page--;
2297 return pages;
2298 }
2299
2300 /**
2301 * ram_find_and_save_block: finds a dirty page and sends it to f
2302 *
2303 * Called within an RCU critical section.
2304 *
2305 * Returns the number of pages written where zero means no dirty pages
2306 *
2307 * @rs: current RAM state
2308 * @last_stage: if we are at the completion stage
2309 *
2310 * On systems where host-page-size > target-page-size it will send all the
2311 * pages in a host page that are dirty.
2312 */
2313
2314 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2315 {
2316 PageSearchStatus pss;
2317 int pages = 0;
2318 bool again, found;
2319
2320 /* No dirty page as there is zero RAM */
2321 if (!ram_bytes_total()) {
2322 return pages;
2323 }
2324
2325 pss.block = rs->last_seen_block;
2326 pss.page = rs->last_page;
2327 pss.complete_round = false;
2328
2329 if (!pss.block) {
2330 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2331 }
2332
2333 do {
2334 again = true;
2335 found = get_queued_page(rs, &pss);
2336
2337 if (!found) {
2338 /* priority queue empty, so just search for something dirty */
2339 found = find_dirty_block(rs, &pss, &again);
2340 }
2341
2342 if (found) {
2343 pages = ram_save_host_page(rs, &pss, last_stage);
2344 }
2345 } while (!pages && again);
2346
2347 rs->last_seen_block = pss.block;
2348 rs->last_page = pss.page;
2349
2350 return pages;
2351 }
2352
2353 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2354 {
2355 uint64_t pages = size / TARGET_PAGE_SIZE;
2356
2357 if (zero) {
2358 ram_counters.duplicate += pages;
2359 } else {
2360 ram_counters.normal += pages;
2361 ram_counters.transferred += size;
2362 qemu_update_position(f, size);
2363 }
2364 }
2365
2366 uint64_t ram_bytes_total(void)
2367 {
2368 RAMBlock *block;
2369 uint64_t total = 0;
2370
2371 rcu_read_lock();
2372 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2373 total += block->used_length;
2374 }
2375 rcu_read_unlock();
2376 return total;
2377 }
2378
2379 static void xbzrle_load_setup(void)
2380 {
2381 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2382 }
2383
2384 static void xbzrle_load_cleanup(void)
2385 {
2386 g_free(XBZRLE.decoded_buf);
2387 XBZRLE.decoded_buf = NULL;
2388 }
2389
2390 static void ram_state_cleanup(RAMState **rsp)
2391 {
2392 if (*rsp) {
2393 migration_page_queue_free(*rsp);
2394 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2395 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2396 g_free(*rsp);
2397 *rsp = NULL;
2398 }
2399 }
2400
2401 static void xbzrle_cleanup(void)
2402 {
2403 XBZRLE_cache_lock();
2404 if (XBZRLE.cache) {
2405 cache_fini(XBZRLE.cache);
2406 g_free(XBZRLE.encoded_buf);
2407 g_free(XBZRLE.current_buf);
2408 g_free(XBZRLE.zero_target_page);
2409 XBZRLE.cache = NULL;
2410 XBZRLE.encoded_buf = NULL;
2411 XBZRLE.current_buf = NULL;
2412 XBZRLE.zero_target_page = NULL;
2413 }
2414 XBZRLE_cache_unlock();
2415 }
2416
2417 static void ram_save_cleanup(void *opaque)
2418 {
2419 RAMState **rsp = opaque;
2420 RAMBlock *block;
2421
2422 /* The caller must hold the iothread lock or be in a BH, so there is
2423 * no writing race against this migration_bitmap
2424 */
2425 memory_global_dirty_log_stop();
2426
2427 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2428 g_free(block->bmap);
2429 block->bmap = NULL;
2430 g_free(block->unsentmap);
2431 block->unsentmap = NULL;
2432 }
2433
2434 xbzrle_cleanup();
2435 compress_threads_save_cleanup();
2436 ram_state_cleanup(rsp);
2437 }
2438
2439 static void ram_state_reset(RAMState *rs)
2440 {
2441 rs->last_seen_block = NULL;
2442 rs->last_sent_block = NULL;
2443 rs->last_page = 0;
2444 rs->last_version = ram_list.version;
2445 rs->ram_bulk_stage = true;
2446 }
2447
2448 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2449
2450 /*
2451 * 'expected' is the value you expect the bitmap mostly to be full
2452 * of; it won't bother printing lines that are all this value.
2454 */
2455 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2456 unsigned long pages)
2457 {
2458 int64_t cur;
2459 int64_t linelen = 128;
2460 char linebuf[129];
2461
2462 for (cur = 0; cur < pages; cur += linelen) {
2463 int64_t curb;
2464 bool found = false;
2465 /*
2466 * Last line; catch the case where the line length
2467 * is longer than remaining ram
2468 */
2469 if (cur + linelen > pages) {
2470 linelen = pages - cur;
2471 }
2472 for (curb = 0; curb < linelen; curb++) {
2473 bool thisbit = test_bit(cur + curb, todump);
2474 linebuf[curb] = thisbit ? '1' : '.';
2475 found = found || (thisbit != expected);
2476 }
2477 if (found) {
2478 linebuf[curb] = '\0';
2479 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2480 }
2481 }
2482 }
2483
2484 /* **** functions for postcopy ***** */
2485
2486 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2487 {
2488 struct RAMBlock *block;
2489
2490 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2491 unsigned long *bitmap = block->bmap;
2492 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2493 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2494
2495 while (run_start < range) {
2496 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2497 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2498 (run_end - run_start) << TARGET_PAGE_BITS);
2499 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2500 }
2501 }
2502 }
2503
2504 /**
2505 * postcopy_send_discard_bm_ram: discard a RAMBlock
2506 *
2507 * Returns zero on success
2508 *
2509 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2510 * Note: At this point the 'unsentmap' is the processed bitmap combined
2511 * with the dirtymap; so a '1' means it's either dirty or unsent.
2512 *
2513 * @ms: current migration state
2514 * @pds: state for postcopy
2515 * @block: RAMBlock to discard
2517 */
2518 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2519 PostcopyDiscardState *pds,
2520 RAMBlock *block)
2521 {
2522 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2523 unsigned long current;
2524 unsigned long *unsentmap = block->unsentmap;
2525
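    /*
     * Walk the unsentmap as runs of set bits: each run of pages that are
     * still unsent (or dirty, once the bitmaps have been merged) becomes
     * one discard message. E.g. bits set only at pages 3..5 result in a
     * single postcopy_discard_send_range(ms, pds, 3, 3) call.
     */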
2526 for (current = 0; current < end; ) {
2527 unsigned long one = find_next_bit(unsentmap, end, current);
2528
2529 if (one <= end) {
2530 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2531 unsigned long discard_length;
2532
2533 if (zero >= end) {
2534 discard_length = end - one;
2535 } else {
2536 discard_length = zero - one;
2537 }
2538 if (discard_length) {
2539 postcopy_discard_send_range(ms, pds, one, discard_length);
2540 }
2541 current = one + discard_length;
2542 } else {
2543 current = one;
2544 }
2545 }
2546
2547 return 0;
2548 }
2549
2550 /**
2551 * postcopy_each_ram_send_discard: discard all RAMBlocks
2552 *
2553 * Returns 0 for success or negative for error
2554 *
2555 * Utility for the outgoing postcopy code.
2556 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2557 * passing it bitmap indexes and name.
2558 * (qemu_ram_foreach_block ends up passing unscaled lengths
2559 * which would mean postcopy code would have to deal with target page)
2560 *
2561 * @ms: current migration state
2562 */
2563 static int postcopy_each_ram_send_discard(MigrationState *ms)
2564 {
2565 struct RAMBlock *block;
2566 int ret;
2567
2568 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2569 PostcopyDiscardState *pds =
2570 postcopy_discard_send_init(ms, block->idstr);
2571
2572 /*
2573 * Postcopy sends chunks of bitmap over the wire, but it
2574 * just needs indexes at this point, avoids it having
2575 * target page specific code.
2576 */
2577 ret = postcopy_send_discard_bm_ram(ms, pds, block);
2578 postcopy_discard_send_finish(ms, pds);
2579 if (ret) {
2580 return ret;
2581 }
2582 }
2583
2584 return 0;
2585 }
2586
2587 /**
2588 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2589 *
2590 * Helper for postcopy_chunk_hostpages; it's called twice to
2591 * canonicalize the two bitmaps, which are similar but one is
2592 * inverted.
2593 *
2594 * Postcopy requires that all target pages in a hostpage are dirty or
2595 * clean, not a mix. This function canonicalizes the bitmaps.
2596 *
2597 * @ms: current migration state
2598 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2599 * otherwise we need to canonicalize partially dirty host pages
2600 * @block: block that contains the page we want to canonicalize
2601 * @pds: state for postcopy
2602 */
2603 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2604 RAMBlock *block,
2605 PostcopyDiscardState *pds)
2606 {
2607 RAMState *rs = ram_state;
2608 unsigned long *bitmap = block->bmap;
2609 unsigned long *unsentmap = block->unsentmap;
2610 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2611 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2612 unsigned long run_start;
2613
2614 if (block->page_size == TARGET_PAGE_SIZE) {
2615 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2616 return;
2617 }
2618
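    /*
     * Worked example of the fixup below: with 2 MiB hugepages and 4 KiB
     * target pages, host_ratio is 512. A run starting at page 1000 has
     * host_offset = 1000 % 512 = 488, so it is pulled back to page 512 and
     * that whole host page (pages 512..1023) is treated as one unit:
     * discarded where needed and re-marked dirty/unsent.
     */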
2619 if (unsent_pass) {
2620 /* Find a sent page */
2621 run_start = find_next_zero_bit(unsentmap, pages, 0);
2622 } else {
2623 /* Find a dirty page */
2624 run_start = find_next_bit(bitmap, pages, 0);
2625 }
2626
2627 while (run_start < pages) {
2628 bool do_fixup = false;
2629 unsigned long fixup_start_addr;
2630 unsigned long host_offset;
2631
2632 /*
2633 * If the start of this run of pages is in the middle of a host
2634 * page, then we need to fixup this host page.
2635 */
2636 host_offset = run_start % host_ratio;
2637 if (host_offset) {
2638 do_fixup = true;
2639 run_start -= host_offset;
2640 fixup_start_addr = run_start;
2641 /* For the next pass */
2642 run_start = run_start + host_ratio;
2643 } else {
2644 /* Find the end of this run */
2645 unsigned long run_end;
2646 if (unsent_pass) {
2647 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2648 } else {
2649 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2650 }
2651 /*
2652 * If the end isn't at the start of a host page, then the
2653 * run doesn't finish at the end of a host page
2654 * and we need to discard.
2655 */
2656 host_offset = run_end % host_ratio;
2657 if (host_offset) {
2658 do_fixup = true;
2659 fixup_start_addr = run_end - host_offset;
2660 /*
2661 * This host page has gone, the next loop iteration starts
2662 * from after the fixup
2663 */
2664 run_start = fixup_start_addr + host_ratio;
2665 } else {
2666 /*
2667 * No discards on this iteration, next loop starts from
2668 * next sent/dirty page
2669 */
2670 run_start = run_end + 1;
2671 }
2672 }
2673
2674 if (do_fixup) {
2675 unsigned long page;
2676
2677 /* Tell the destination to discard this page */
2678 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2679 /* For the unsent_pass we:
2680 * discard partially sent pages
2681 * For the !unsent_pass (dirty) we:
2682 * discard partially dirty pages that were sent
2683 * (any partially sent pages were already discarded
2684 * by the previous unsent_pass)
2685 */
2686 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2687 host_ratio);
2688 }
2689
2690 /* Clean up the bitmap */
2691 for (page = fixup_start_addr;
2692 page < fixup_start_addr + host_ratio; page++) {
2693 /* All pages in this host page are now not sent */
2694 set_bit(page, unsentmap);
2695
2696 /*
2697 * Remark them as dirty, updating the count for any pages
2698 * that weren't previously dirty.
2699 */
2700 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2701 }
2702 }
2703
2704 if (unsent_pass) {
2705 /* Find the next sent page for the next iteration */
2706 run_start = find_next_zero_bit(unsentmap, pages, run_start);
2707 } else {
2708 /* Find the next dirty page for the next iteration */
2709 run_start = find_next_bit(bitmap, pages, run_start);
2710 }
2711 }
2712 }
2713
2714 /**
2715 * postcopy_chunk_hostpages: discard any partially sent host page
2716 *
2717 * Utility for the outgoing postcopy code.
2718 *
2719 * Discard any partially sent host-page size chunks, mark any partially
2720 * dirty host-page size chunks as all dirty. In this case the host-page
2721 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2722 *
2723 * Returns zero on success
2724 *
2725 * @ms: current migration state
2726 * @block: block we want to work with
2727 */
2728 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2729 {
2730 PostcopyDiscardState *pds =
2731 postcopy_discard_send_init(ms, block->idstr);
2732
2733 /* First pass: Discard all partially sent host pages */
2734 postcopy_chunk_hostpages_pass(ms, true, block, pds);
2735 /*
2736 * Second pass: Ensure that all partially dirty host pages are made
2737 * fully dirty.
2738 */
2739 postcopy_chunk_hostpages_pass(ms, false, block, pds);
2740
2741 postcopy_discard_send_finish(ms, pds);
2742 return 0;
2743 }
2744
2745 /**
2746 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2747 *
2748 * Returns zero on success
2749 *
2750 * Transmit the set of pages to be discarded after precopy to the target;
2751 * these are pages that:
2752 * a) have been previously transmitted but are now dirty again
2753 * b) have never been transmitted; this ensures that any pages on the
2754 * destination that have been mapped by background tasks get
2755 * discarded (transparent huge pages are the specific concern)
2756 * Hopefully this is pretty sparse
2757 *
2758 * @ms: current migration state
2759 */
2760 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2761 {
2762 RAMState *rs = ram_state;
2763 RAMBlock *block;
2764 int ret;
2765
2766 rcu_read_lock();
2767
2768 /* This should be our last sync, the src is now paused */
2769 migration_bitmap_sync(rs);
2770
2771 /* Easiest way to make sure we don't resume in the middle of a host-page */
2772 rs->last_seen_block = NULL;
2773 rs->last_sent_block = NULL;
2774 rs->last_page = 0;
2775
2776 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2777 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2778 unsigned long *bitmap = block->bmap;
2779 unsigned long *unsentmap = block->unsentmap;
2780
2781 if (!unsentmap) {
2782 /* We don't have a safe way to resize the unsentmap, so
2783 * if the bitmap was resized it will be NULL at this
2784 * point.
2785 */
2786 error_report("migration ram resized during precopy phase");
2787 rcu_read_unlock();
2788 return -EINVAL;
2789 }
2790 /* Deal with TPS != HPS and huge pages */
2791 ret = postcopy_chunk_hostpages(ms, block);
2792 if (ret) {
2793 rcu_read_unlock();
2794 return ret;
2795 }
2796
2797 /*
2798 * Update the unsentmap to be unsentmap = unsentmap | dirty
2799 */
2800 bitmap_or(unsentmap, unsentmap, bitmap, pages);
2801 #ifdef DEBUG_POSTCOPY
2802 ram_debug_dump_bitmap(unsentmap, true, pages);
2803 #endif
2804 }
2805 trace_ram_postcopy_send_discard_bitmap();
2806
2807 ret = postcopy_each_ram_send_discard(ms);
2808 rcu_read_unlock();
2809
2810 return ret;
2811 }
2812
2813 /**
2814 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2815 *
2816 * Returns zero on success
2817 *
2818 * @rbname: name of the RAMBlock of the request. NULL means the
2819 * same as the last one.
2820 * @start: start offset within the RAMBlock, in bytes
2821 * @length: length, in bytes, to discard
2822 */
2823 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2824 {
2825 int ret = -1;
2826
2827 trace_ram_discard_range(rbname, start, length);
2828
2829 rcu_read_lock();
2830 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2831
2832 if (!rb) {
2833 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2834 goto err;
2835 }
2836
2837 /*
2838 * On the source VM, we don't need to update the received bitmap since
2839 * we don't even have one.
2840 */
2841 if (rb->receivedmap) {
2842 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2843 length >> qemu_target_page_bits());
2844 }
2845
2846 ret = ram_block_discard_range(rb, start, length);
2847
2848 err:
2849 rcu_read_unlock();
2850
2851 return ret;
2852 }
2853
2854 /*
2855 * For every allocation, we try not to crash the VM if the
2856 * allocation fails.
2857 */
2858 static int xbzrle_init(void)
2859 {
2860 Error *local_err = NULL;
2861
2862 if (!migrate_use_xbzrle()) {
2863 return 0;
2864 }
2865
2866 XBZRLE_cache_lock();
2867
2868 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2869 if (!XBZRLE.zero_target_page) {
2870 error_report("%s: Error allocating zero page", __func__);
2871 goto err_out;
2872 }
2873
2874 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2875 TARGET_PAGE_SIZE, &local_err);
2876 if (!XBZRLE.cache) {
2877 error_report_err(local_err);
2878 goto free_zero_page;
2879 }
2880
2881 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2882 if (!XBZRLE.encoded_buf) {
2883 error_report("%s: Error allocating encoded_buf", __func__);
2884 goto free_cache;
2885 }
2886
2887 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2888 if (!XBZRLE.current_buf) {
2889 error_report("%s: Error allocating current_buf", __func__);
2890 goto free_encoded_buf;
2891 }
2892
2893 /* We are all good */
2894 XBZRLE_cache_unlock();
2895 return 0;
2896
2897 free_encoded_buf:
2898 g_free(XBZRLE.encoded_buf);
2899 XBZRLE.encoded_buf = NULL;
2900 free_cache:
2901 cache_fini(XBZRLE.cache);
2902 XBZRLE.cache = NULL;
2903 free_zero_page:
2904 g_free(XBZRLE.zero_target_page);
2905 XBZRLE.zero_target_page = NULL;
2906 err_out:
2907 XBZRLE_cache_unlock();
2908 return -ENOMEM;
2909 }
2910
2911 static int ram_state_init(RAMState **rsp)
2912 {
2913 *rsp = g_try_new0(RAMState, 1);
2914
2915 if (!*rsp) {
2916 error_report("%s: Init ramstate fail", __func__);
2917 return -1;
2918 }
2919
2920 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2921 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2922 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2923
2924 /*
2925 * Count the total number of pages used by ram blocks not including any
2926 * gaps due to alignment or unplugs.
2927 */
2928 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2929
2930 ram_state_reset(*rsp);
2931
2932 return 0;
2933 }
2934
2935 static void ram_list_init_bitmaps(void)
2936 {
2937 RAMBlock *block;
2938 unsigned long pages;
2939
2940 /* Skip setting bitmap if there is no RAM */
2941 if (ram_bytes_total()) {
2942 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2943 pages = block->max_length >> TARGET_PAGE_BITS;
2944 block->bmap = bitmap_new(pages);
2945 bitmap_set(block->bmap, 0, pages);
2946 if (migrate_postcopy_ram()) {
2947 block->unsentmap = bitmap_new(pages);
2948 bitmap_set(block->unsentmap, 0, pages);
2949 }
2950 }
2951 }
2952 }
2953
2954 static void ram_init_bitmaps(RAMState *rs)
2955 {
2956 /* For memory_global_dirty_log_start below. */
2957 qemu_mutex_lock_iothread();
2958 qemu_mutex_lock_ramlist();
2959 rcu_read_lock();
2960
2961 ram_list_init_bitmaps();
2962 memory_global_dirty_log_start();
2963 migration_bitmap_sync(rs);
2964
2965 rcu_read_unlock();
2966 qemu_mutex_unlock_ramlist();
2967 qemu_mutex_unlock_iothread();
2968 }
2969
2970 static int ram_init_all(RAMState **rsp)
2971 {
2972 if (ram_state_init(rsp)) {
2973 return -1;
2974 }
2975
2976 if (xbzrle_init()) {
2977 ram_state_cleanup(rsp);
2978 return -1;
2979 }
2980
2981 ram_init_bitmaps(*rsp);
2982
2983 return 0;
2984 }
2985
2986 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2987 {
2988 RAMBlock *block;
2989 uint64_t pages = 0;
2990
2991 /*
2992 * Postcopy is not using xbzrle/compression, so no need for that.
2993 * Also, since the source is already halted, we don't need to care
2994 * about dirty page logging either.
2995 */
2996
2997 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2998 pages += bitmap_count_one(block->bmap,
2999 block->used_length >> TARGET_PAGE_BITS);
3000 }
3001
3002 /* This may not be aligned with current bitmaps. Recalculate. */
3003 rs->migration_dirty_pages = pages;
3004
3005 rs->last_seen_block = NULL;
3006 rs->last_sent_block = NULL;
3007 rs->last_page = 0;
3008 rs->last_version = ram_list.version;
3009 /*
3010 * Disable the bulk stage, otherwise we'll resend the whole RAM no
3011 * matter what we have sent.
3012 */
3013 rs->ram_bulk_stage = false;
3014
3015 /* Update RAMState cache of output QEMUFile */
3016 rs->f = out;
3017
3018 trace_ram_state_resume_prepare(pages);
3019 }
3020
3021 /*
3022 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3023 * a long-running RCU critical section. When RCU reclaims in the code
3024 * start to become numerous it will be necessary to reduce the
3025 * granularity of these critical sections.
3026 */
3027
3028 /**
3029 * ram_save_setup: Setup RAM for migration
3030 *
3031 * Returns zero to indicate success and negative for error
3032 *
3033 * @f: QEMUFile where to send the data
3034 * @opaque: RAMState pointer
3035 */
3036 static int ram_save_setup(QEMUFile *f, void *opaque)
3037 {
3038 RAMState **rsp = opaque;
3039 RAMBlock *block;
3040
3041 if (compress_threads_save_setup()) {
3042 return -1;
3043 }
3044
3045 /* migration has already set up the bitmap; reuse it. */
3046 if (!migration_in_colo_state()) {
3047 if (ram_init_all(rsp) != 0) {
3048 compress_threads_save_cleanup();
3049 return -1;
3050 }
3051 }
3052 (*rsp)->f = f;
3053
3054 rcu_read_lock();
3055
3056 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3057
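    /*
     * Stream layout produced by the loop below, one record per migratable
     * block, all parsed by ram_load() under RAM_SAVE_FLAG_MEM_SIZE:
     *   byte : strlen(idstr), followed by the idstr bytes
     *   be64 : used_length
     *   be64 : page_size (only when postcopy is enabled and the block's
     *          page size differs from qemu_host_page_size)
     */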
3058 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3059 qemu_put_byte(f, strlen(block->idstr));
3060 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3061 qemu_put_be64(f, block->used_length);
3062 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3063 qemu_put_be64(f, block->page_size);
3064 }
3065 }
3066
3067 rcu_read_unlock();
3068
3069 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3070 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3071
3072 multifd_send_sync_main();
3073 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3074 qemu_fflush(f);
3075
3076 return 0;
3077 }
3078
3079 /**
3080 * ram_save_iterate: iterative stage for migration
3081 *
3082 * Returns zero to indicate success and negative for error
3083 *
3084 * @f: QEMUFile where to send the data
3085 * @opaque: RAMState pointer
3086 */
3087 static int ram_save_iterate(QEMUFile *f, void *opaque)
3088 {
3089 RAMState **temp = opaque;
3090 RAMState *rs = *temp;
3091 int ret;
3092 int i;
3093 int64_t t0;
3094 int done = 0;
3095
3096 if (blk_mig_bulk_active()) {
3097 /* Avoid transferring RAM during the bulk phase of block migration, as
3098 * the bulk phase usually takes a long time and transferring RAM
3099 * updates during that time is pointless. */
3100 goto out;
3101 }
3102
3103 rcu_read_lock();
3104 if (ram_list.version != rs->last_version) {
3105 ram_state_reset(rs);
3106 }
3107
3108 /* Read version before ram_list.blocks */
3109 smp_rmb();
3110
3111 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3112
3113 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3114 i = 0;
3115 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3116 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3117 int pages;
3118
3119 if (qemu_file_get_error(f)) {
3120 break;
3121 }
3122
3123 pages = ram_find_and_save_block(rs, false);
3124 /* no more pages to send */
3125 if (pages == 0) {
3126 done = 1;
3127 break;
3128 }
3129 rs->iterations++;
3130
3131 /* We want to check in the 1st loop, just in case it was the 1st time
3132 and we had to sync the dirty bitmap.
3133 qemu_clock_get_ns() is a bit expensive, so we only check every
3134 64 iterations.
3135 */
3136 if ((i & 63) == 0) {
3137 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3138 if (t1 > MAX_WAIT) {
3139 trace_ram_save_iterate_big_wait(t1, i);
3140 break;
3141 }
3142 }
3143 i++;
3144 }
3145 flush_compressed_data(rs);
3146 rcu_read_unlock();
3147
3148 /*
3149 * Must occur before EOS (or any QEMUFile operation)
3150 * because of RDMA protocol.
3151 */
3152 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3153
3154 multifd_send_sync_main();
3155 out:
3156 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3157 qemu_fflush(f);
3158 ram_counters.transferred += 8;
3159
3160 ret = qemu_file_get_error(f);
3161 if (ret < 0) {
3162 return ret;
3163 }
3164
3165 return done;
3166 }
3167
3168 /**
3169 * ram_save_complete: function called to send the remaining amount of ram
3170 *
3171 * Returns zero to indicate success
3172 *
3173 * Called with iothread lock
3174 *
3175 * @f: QEMUFile where to send the data
3176 * @opaque: RAMState pointer
3177 */
3178 static int ram_save_complete(QEMUFile *f, void *opaque)
3179 {
3180 RAMState **temp = opaque;
3181 RAMState *rs = *temp;
3182
3183 rcu_read_lock();
3184
3185 if (!migration_in_postcopy()) {
3186 migration_bitmap_sync(rs);
3187 }
3188
3189 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3190
3191 /* try transferring iterative blocks of memory */
3192
3193 /* flush all remaining blocks regardless of rate limiting */
3194 while (true) {
3195 int pages;
3196
3197 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3198 /* no more blocks to send */
3199 if (pages == 0) {
3200 break;
3201 }
3202 }
3203
3204 flush_compressed_data(rs);
3205 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3206
3207 rcu_read_unlock();
3208
3209 multifd_send_sync_main();
3210 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3211 qemu_fflush(f);
3212
3213 return 0;
3214 }
3215
3216 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3217 uint64_t *res_precopy_only,
3218 uint64_t *res_compatible,
3219 uint64_t *res_postcopy_only)
3220 {
3221 RAMState **temp = opaque;
3222 RAMState *rs = *temp;
3223 uint64_t remaining_size;
3224
3225 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3226
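    /*
     * The pending estimate is simply dirty pages * page size; once it
     * drops below max_size (and we are not yet in postcopy), the dirty
     * bitmap is resynced below so the switchover decision is made on a
     * fresh count rather than a stale one.
     */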
3227 if (!migration_in_postcopy() &&
3228 remaining_size < max_size) {
3229 qemu_mutex_lock_iothread();
3230 rcu_read_lock();
3231 migration_bitmap_sync(rs);
3232 rcu_read_unlock();
3233 qemu_mutex_unlock_iothread();
3234 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3235 }
3236
3237 if (migrate_postcopy_ram()) {
3238 /* We can do postcopy, and all the data is postcopiable */
3239 *res_compatible += remaining_size;
3240 } else {
3241 *res_precopy_only += remaining_size;
3242 }
3243 }
3244
3245 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3246 {
3247 unsigned int xh_len;
3248 int xh_flags;
3249 uint8_t *loaded_data;
3250
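    /*
     * On-the-wire format of an XBZRLE page record, as read below:
     *   byte : xh_flags, must be ENCODING_FLAG_XBZRLE
     *   be16 : xh_len, length of the encoded data (<= TARGET_PAGE_SIZE)
     *   bytes: xh_len bytes of encoded delta, decoded on top of *host
     */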
3251 /* extract RLE header */
3252 xh_flags = qemu_get_byte(f);
3253 xh_len = qemu_get_be16(f);
3254
3255 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3256 error_report("Failed to load XBZRLE page - wrong compression!");
3257 return -1;
3258 }
3259
3260 if (xh_len > TARGET_PAGE_SIZE) {
3261 error_report("Failed to load XBZRLE page - len overflow!");
3262 return -1;
3263 }
3264 loaded_data = XBZRLE.decoded_buf;
3265 /* load data and decode */
3266 /* it can change loaded_data to point to an internal buffer */
3267 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3268
3269 /* decode RLE */
3270 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3271 TARGET_PAGE_SIZE) == -1) {
3272 error_report("Failed to load XBZRLE page - decode error!");
3273 return -1;
3274 }
3275
3276 return 0;
3277 }
3278
3279 /**
3280 * ram_block_from_stream: read a RAMBlock id from the migration stream
3281 *
3282 * Must be called from within a rcu critical section.
3283 *
3284 * Returns a pointer from within the RCU-protected ram_list.
3285 *
3286 * @f: QEMUFile where to read the data from
3287 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3288 */
3289 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3290 {
3291 static RAMBlock *block = NULL;
3292 char id[256];
3293 uint8_t len;
3294
3295 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3296 if (!block) {
3297 error_report("Ack, bad migration stream!");
3298 return NULL;
3299 }
3300 return block;
3301 }
3302
3303 len = qemu_get_byte(f);
3304 qemu_get_buffer(f, (uint8_t *)id, len);
3305 id[len] = 0;
3306
3307 block = qemu_ram_block_by_name(id);
3308 if (!block) {
3309 error_report("Can't find block %s", id);
3310 return NULL;
3311 }
3312
3313 if (!qemu_ram_is_migratable(block)) {
3314 error_report("block %s should not be migrated !", id);
3315 return NULL;
3316 }
3317
3318 return block;
3319 }
3320
3321 static inline void *host_from_ram_block_offset(RAMBlock *block,
3322 ram_addr_t offset)
3323 {
3324 if (!offset_in_ramblock(block, offset)) {
3325 return NULL;
3326 }
3327
3328 return block->host + offset;
3329 }
3330
3331 /**
3332 * ram_handle_compressed: handle the zero page case
3333 *
3334 * If a page (or a whole RDMA chunk) has been
3335 * determined to be zero, then zap it.
3336 *
3337 * @host: host address for the zero page
3338 * @ch: what the page is filled from. We only support zero
3339 * @size: size of the zero page
3340 */
3341 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3342 {
3343 if (ch != 0 || !is_zero_range(host, size)) {
3344 memset(host, ch, size);
3345 }
3346 }
3347
3348 /* return the size after decompression, or negative value on error */
3349 static int
3350 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3351 const uint8_t *source, size_t source_len)
3352 {
3353 int err;
3354
3355 err = inflateReset(stream);
3356 if (err != Z_OK) {
3357 return -1;
3358 }
3359
3360 stream->avail_in = source_len;
3361 stream->next_in = (uint8_t *)source;
3362 stream->avail_out = dest_len;
3363 stream->next_out = dest;
3364
3365 err = inflate(stream, Z_NO_FLUSH);
3366 if (err != Z_STREAM_END) {
3367 return -1;
3368 }
3369
3370 return stream->total_out;
3371 }
3372
3373 static void *do_data_decompress(void *opaque)
3374 {
3375 DecompressParam *param = opaque;
3376 unsigned long pagesize;
3377 uint8_t *des;
3378 int len, ret;
3379
3380 qemu_mutex_lock(&param->mutex);
3381 while (!param->quit) {
3382 if (param->des) {
3383 des = param->des;
3384 len = param->len;
3385 param->des = 0;
3386 qemu_mutex_unlock(&param->mutex);
3387
3388 pagesize = TARGET_PAGE_SIZE;
3389
3390 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3391 param->compbuf, len);
3392 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3393 error_report("decompress data failed");
3394 qemu_file_set_error(decomp_file, ret);
3395 }
3396
3397 qemu_mutex_lock(&decomp_done_lock);
3398 param->done = true;
3399 qemu_cond_signal(&decomp_done_cond);
3400 qemu_mutex_unlock(&decomp_done_lock);
3401
3402 qemu_mutex_lock(&param->mutex);
3403 } else {
3404 qemu_cond_wait(&param->cond, &param->mutex);
3405 }
3406 }
3407 qemu_mutex_unlock(&param->mutex);
3408
3409 return NULL;
3410 }
3411
3412 static int wait_for_decompress_done(void)
3413 {
3414 int idx, thread_count;
3415
3416 if (!migrate_use_compression()) {
3417 return 0;
3418 }
3419
3420 thread_count = migrate_decompress_threads();
3421 qemu_mutex_lock(&decomp_done_lock);
3422 for (idx = 0; idx < thread_count; idx++) {
3423 while (!decomp_param[idx].done) {
3424 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3425 }
3426 }
3427 qemu_mutex_unlock(&decomp_done_lock);
3428 return qemu_file_get_error(decomp_file);
3429 }
3430
3431 static void compress_threads_load_cleanup(void)
3432 {
3433 int i, thread_count;
3434
3435 if (!migrate_use_compression()) {
3436 return;
3437 }
3438 thread_count = migrate_decompress_threads();
3439 for (i = 0; i < thread_count; i++) {
3440 /*
3441 * We use it as an indicator of whether the thread has been
3442 * properly initialized
3443 */
3444 if (!decomp_param[i].compbuf) {
3445 break;
3446 }
3447
3448 qemu_mutex_lock(&decomp_param[i].mutex);
3449 decomp_param[i].quit = true;
3450 qemu_cond_signal(&decomp_param[i].cond);
3451 qemu_mutex_unlock(&decomp_param[i].mutex);
3452 }
3453 for (i = 0; i < thread_count; i++) {
3454 if (!decomp_param[i].compbuf) {
3455 break;
3456 }
3457
3458 qemu_thread_join(decompress_threads + i);
3459 qemu_mutex_destroy(&decomp_param[i].mutex);
3460 qemu_cond_destroy(&decomp_param[i].cond);
3461 inflateEnd(&decomp_param[i].stream);
3462 g_free(decomp_param[i].compbuf);
3463 decomp_param[i].compbuf = NULL;
3464 }
3465 g_free(decompress_threads);
3466 g_free(decomp_param);
3467 decompress_threads = NULL;
3468 decomp_param = NULL;
3469 decomp_file = NULL;
3470 }
3471
3472 static int compress_threads_load_setup(QEMUFile *f)
3473 {
3474 int i, thread_count;
3475
3476 if (!migrate_use_compression()) {
3477 return 0;
3478 }
3479
3480 thread_count = migrate_decompress_threads();
3481 decompress_threads = g_new0(QemuThread, thread_count);
3482 decomp_param = g_new0(DecompressParam, thread_count);
3483 qemu_mutex_init(&decomp_done_lock);
3484 qemu_cond_init(&decomp_done_cond);
3485 decomp_file = f;
3486 for (i = 0; i < thread_count; i++) {
3487 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3488 goto exit;
3489 }
3490
3491 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3492 qemu_mutex_init(&decomp_param[i].mutex);
3493 qemu_cond_init(&decomp_param[i].cond);
3494 decomp_param[i].done = true;
3495 decomp_param[i].quit = false;
3496 qemu_thread_create(decompress_threads + i, "decompress",
3497 do_data_decompress, decomp_param + i,
3498 QEMU_THREAD_JOINABLE);
3499 }
3500 return 0;
3501 exit:
3502 compress_threads_load_cleanup();
3503 return -1;
3504 }
3505
3506 static void decompress_data_with_multi_threads(QEMUFile *f,
3507 void *host, int len)
3508 {
3509 int idx, thread_count;
3510
3511 thread_count = migrate_decompress_threads();
3512 qemu_mutex_lock(&decomp_done_lock);
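    /*
     * Mirror image of compress_page_with_multi_thread(): find an idle
     * decompression worker, copy the compressed payload into its compbuf,
     * point it at the destination host page and wake it; otherwise wait
     * on decomp_done_cond until a worker finishes.
     */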
3513 while (true) {
3514 for (idx = 0; idx < thread_count; idx++) {
3515 if (decomp_param[idx].done) {
3516 decomp_param[idx].done = false;
3517 qemu_mutex_lock(&decomp_param[idx].mutex);
3518 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3519 decomp_param[idx].des = host;
3520 decomp_param[idx].len = len;
3521 qemu_cond_signal(&decomp_param[idx].cond);
3522 qemu_mutex_unlock(&decomp_param[idx].mutex);
3523 break;
3524 }
3525 }
3526 if (idx < thread_count) {
3527 break;
3528 } else {
3529 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3530 }
3531 }
3532 qemu_mutex_unlock(&decomp_done_lock);
3533 }
3534
3535 /**
3536 * ram_load_setup: Setup RAM for migration incoming side
3537 *
3538 * Returns zero to indicate success and negative for error
3539 *
3540 * @f: QEMUFile where to receive the data
3541 * @opaque: RAMState pointer
3542 */
3543 static int ram_load_setup(QEMUFile *f, void *opaque)
3544 {
3545 if (compress_threads_load_setup(f)) {
3546 return -1;
3547 }
3548
3549 xbzrle_load_setup();
3550 ramblock_recv_map_init();
3551 return 0;
3552 }
3553
3554 static int ram_load_cleanup(void *opaque)
3555 {
3556 RAMBlock *rb;
3557
3558 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3559 if (ramblock_is_pmem(rb)) {
3560 pmem_persist(rb->host, rb->used_length);
3561 }
3562 }
3563
3564 xbzrle_load_cleanup();
3565 compress_threads_load_cleanup();
3566
3567 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3568 g_free(rb->receivedmap);
3569 rb->receivedmap = NULL;
3570 }
3571 return 0;
3572 }
3573
3574 /**
3575 * ram_postcopy_incoming_init: allocate postcopy data structures
3576 *
3577 * Returns 0 for success and negative if there was one error
3578 *
3579 * @mis: current migration incoming state
3580 *
3581 * Allocate data structures etc needed by incoming migration with
3582 * postcopy-ram. postcopy-ram's similarly named
3583 * postcopy_ram_incoming_init does the work.
3584 */
3585 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3586 {
3587 return postcopy_ram_incoming_init(mis);
3588 }
3589
3590 /**
3591 * ram_load_postcopy: load a page in postcopy case
3592 *
3593 * Returns 0 for success or -errno in case of error
3594 *
3595 * Called in postcopy mode by ram_load().
3596 * rcu_read_lock is taken prior to this being called.
3597 *
3598 * @f: QEMUFile to receive the data from
3599 */
3600 static int ram_load_postcopy(QEMUFile *f)
3601 {
3602 int flags = 0, ret = 0;
3603 bool place_needed = false;
3604 bool matches_target_page_size = false;
3605 MigrationIncomingState *mis = migration_incoming_get_current();
3606 /* Temporary page that is later 'placed' */
3607 void *postcopy_host_page = postcopy_get_tmp_page(mis);
3608 void *last_host = NULL;
3609 bool all_zero = false;
3610
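    /*
     * Example of the host-page assembly below: with 2 MiB hugepages and
     * 4 KiB target pages, 512 consecutive target pages are accumulated in
     * postcopy_host_page and only the last one sets place_needed, so the
     * whole huge page is placed atomically in one go.
     */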
3611 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3612 ram_addr_t addr;
3613 void *host = NULL;
3614 void *page_buffer = NULL;
3615 void *place_source = NULL;
3616 RAMBlock *block = NULL;
3617 uint8_t ch;
3618
3619 addr = qemu_get_be64(f);
3620
3621 /*
3622 * If qemu file error, we should stop here, and then "addr"
3623 * may be invalid
3624 */
3625 ret = qemu_file_get_error(f);
3626 if (ret) {
3627 break;
3628 }
3629
3630 flags = addr & ~TARGET_PAGE_MASK;
3631 addr &= TARGET_PAGE_MASK;
3632
3633 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3634 place_needed = false;
3635 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3636 block = ram_block_from_stream(f, flags);
3637
3638 host = host_from_ram_block_offset(block, addr);
3639 if (!host) {
3640 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3641 ret = -EINVAL;
3642 break;
3643 }
3644 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3645 /*
3646 * Postcopy requires that we place whole host pages atomically;
3647 * these may be huge pages for RAMBlocks that are backed by
3648 * hugetlbfs.
3649 * To make it atomic, the data is read into a temporary page
3650 * that's moved into place later.
3651 * The migration protocol uses, possibly smaller, target pages;
3652 * however, the source ensures it always sends all the components
3653 * of a host page in order.
3654 */
3655 page_buffer = postcopy_host_page +
3656 ((uintptr_t)host & (block->page_size - 1));
3657 /* If all TP are zero then we can optimise the place */
3658 if (!((uintptr_t)host & (block->page_size - 1))) {
3659 all_zero = true;
3660 } else {
3661 /* not the 1st TP within the HP */
3662 if (host != (last_host + TARGET_PAGE_SIZE)) {
3663 error_report("Non-sequential target page %p/%p",
3664 host, last_host);
3665 ret = -EINVAL;
3666 break;
3667 }
3668 }
3669
3670
3671 /*
3672 * If it's the last part of a host page then we place the host
3673 * page
3674 */
3675 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3676 (block->page_size - 1)) == 0;
3677 place_source = postcopy_host_page;
3678 }
3679 last_host = host;
3680
3681 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3682 case RAM_SAVE_FLAG_ZERO:
3683 ch = qemu_get_byte(f);
3684 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3685 if (ch) {
3686 all_zero = false;
3687 }
3688 break;
3689
3690 case RAM_SAVE_FLAG_PAGE:
3691 all_zero = false;
3692 if (!matches_target_page_size) {
3693 /* For huge pages, we always use a temporary buffer */
3694 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3695 } else {
3696 /*
3697 * For small pages that matches target page size, we
3698 * avoid the qemu_file copy. Instead we directly use
3699 * the buffer of QEMUFile to place the page. Note: we
3700 * cannot do any QEMUFile operation before using that
3701 * buffer to make sure the buffer is valid when
3702 * placing the page.
3703 */
3704 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3705 TARGET_PAGE_SIZE);
3706 }
3707 break;
3708 case RAM_SAVE_FLAG_EOS:
3709 /* normal exit */
3710 multifd_recv_sync_main();
3711 break;
3712 default:
3713 error_report("Unknown combination of migration flags: %#x"
3714 " (postcopy mode)", flags);
3715 ret = -EINVAL;
3716 break;
3717 }
3718
3719 /* Detect for any possible file errors */
3720 if (!ret && qemu_file_get_error(f)) {
3721 ret = qemu_file_get_error(f);
3722 }
3723
3724 if (!ret && place_needed) {
3725 /* This gets called at the last target page in the host page */
3726 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3727
3728 if (all_zero) {
3729 ret = postcopy_place_page_zero(mis, place_dest,
3730 block);
3731 } else {
3732 ret = postcopy_place_page(mis, place_dest,
3733 place_source, block);
3734 }
3735 }
3736 }
3737
3738 return ret;
3739 }
3740
3741 static bool postcopy_is_advised(void)
3742 {
3743 PostcopyState ps = postcopy_state_get();
3744 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3745 }
3746
3747 static bool postcopy_is_running(void)
3748 {
3749 PostcopyState ps = postcopy_state_get();
3750 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3751 }
3752
3753 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3754 {
3755 int flags = 0, ret = 0, invalid_flags = 0;
3756 static uint64_t seq_iter;
3757 int len = 0;
3758 /*
3759 * If the system is running in postcopy mode, page inserts into host
3760 * memory must be atomic
3761 */
3762 bool postcopy_running = postcopy_is_running();
3763 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3764 bool postcopy_advised = postcopy_is_advised();
3765
3766 seq_iter++;
3767
3768 if (version_id != 4) {
3769 ret = -EINVAL;
3770 }
3771
3772 if (!migrate_use_compression()) {
3773 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3774 }
3775 /* This RCU critical section can be very long running.
3776 * When RCU reclaims in the code start to become numerous,
3777 * it will be necessary to reduce the granularity of this
3778 * critical section.
3779 */
3780 rcu_read_lock();
3781
3782 if (postcopy_running) {
3783 ret = ram_load_postcopy(f);
3784 }
3785
3786 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3787 ram_addr_t addr, total_ram_bytes;
3788 void *host = NULL;
3789 uint8_t ch;
3790
3791 addr = qemu_get_be64(f);
3792 flags = addr & ~TARGET_PAGE_MASK;
3793 addr &= TARGET_PAGE_MASK;
3794
3795 if (flags & invalid_flags) {
3796 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3797 error_report("Received an unexpected compressed page");
3798 }
3799
3800 ret = -EINVAL;
3801 break;
3802 }
3803
3804 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3805 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3806 RAMBlock *block = ram_block_from_stream(f, flags);
3807
3808 host = host_from_ram_block_offset(block, addr);
3809 if (!host) {
3810 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3811 ret = -EINVAL;
3812 break;
3813 }
3814 ramblock_recv_bitmap_set(block, host);
3815 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3816 }
3817
3818 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3819 case RAM_SAVE_FLAG_MEM_SIZE:
3820 /* Synchronize RAM block list */
3821 total_ram_bytes = addr;
3822 while (!ret && total_ram_bytes) {
3823 RAMBlock *block;
3824 char id[256];
3825 ram_addr_t length;
3826
3827 len = qemu_get_byte(f);
3828 qemu_get_buffer(f, (uint8_t *)id, len);
3829 id[len] = 0;
3830 length = qemu_get_be64(f);
3831
3832 block = qemu_ram_block_by_name(id);
3833 if (block && !qemu_ram_is_migratable(block)) {
3834 error_report("block %s should not be migrated !", id);
3835 ret = -EINVAL;
3836 } else if (block) {
3837 if (length != block->used_length) {
3838 Error *local_err = NULL;
3839
3840 ret = qemu_ram_resize(block, length,
3841 &local_err);
3842 if (local_err) {
3843 error_report_err(local_err);
3844 }
3845 }
3846 /* For postcopy we need to check hugepage sizes match */
3847 if (postcopy_advised &&
3848 block->page_size != qemu_host_page_size) {
3849 uint64_t remote_page_size = qemu_get_be64(f);
3850 if (remote_page_size != block->page_size) {
3851 error_report("Mismatched RAM page size %s "
3852 "(local) %zd != %" PRId64,
3853 id, block->page_size,
3854 remote_page_size);
3855 ret = -EINVAL;
3856 }
3857 }
3858 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3859 block->idstr);
3860 } else {
3861 error_report("Unknown ramblock \"%s\", cannot "
3862 "accept migration", id);
3863 ret = -EINVAL;
3864 }
3865
3866 total_ram_bytes -= length;
3867 }
3868 break;
3869
3870 case RAM_SAVE_FLAG_ZERO:
3871 ch = qemu_get_byte(f);
3872 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3873 break;
3874
3875 case RAM_SAVE_FLAG_PAGE:
3876 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3877 break;
3878
3879 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3880 len = qemu_get_be32(f);
3881 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3882 error_report("Invalid compressed data length: %d", len);
3883 ret = -EINVAL;
3884 break;
3885 }
3886 decompress_data_with_multi_threads(f, host, len);
3887 break;
3888
3889 case RAM_SAVE_FLAG_XBZRLE:
3890 if (load_xbzrle(f, addr, host) < 0) {
3891 error_report("Failed to decompress XBZRLE page at "
3892 RAM_ADDR_FMT, addr);
3893 ret = -EINVAL;
3894 break;
3895 }
3896 break;
3897 case RAM_SAVE_FLAG_EOS:
3898 /* normal exit */
3899 multifd_recv_sync_main();
3900 break;
3901 default:
3902 if (flags & RAM_SAVE_FLAG_HOOK) {
3903 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3904 } else {
3905 error_report("Unknown combination of migration flags: %#x",
3906 flags);
3907 ret = -EINVAL;
3908 }
3909 }
3910 if (!ret) {
3911 ret = qemu_file_get_error(f);
3912 }
3913 }
3914
3915 ret |= wait_for_decompress_done();
3916 rcu_read_unlock();
3917 trace_ram_load_complete(ret, seq_iter);
3918 return ret;
3919 }
3920
3921 static bool ram_has_postcopy(void *opaque)
3922 {
3923 RAMBlock *rb;
3924 RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3925 if (ramblock_is_pmem(rb)) {
3926 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3927 "is not supported now!", rb->idstr, rb->host);
3928 return false;
3929 }
3930 }
3931
3932 return migrate_postcopy_ram();
3933 }
3934
3935 /* Sync all the dirty bitmaps with the destination VM. */
3936 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3937 {
3938 RAMBlock *block;
3939 QEMUFile *file = s->to_dst_file;
3940 int ramblock_count = 0;
3941
3942 trace_ram_dirty_bitmap_sync_start();
3943
3944 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3945 qemu_savevm_send_recv_bitmap(file, block->idstr);
3946 trace_ram_dirty_bitmap_request(block->idstr);
3947 ramblock_count++;
3948 }
3949
3950 trace_ram_dirty_bitmap_sync_wait();
3951
3952 /* Wait until all the ramblocks' dirty bitmaps are synced */
3953 while (ramblock_count--) {
3954 qemu_sem_wait(&s->rp_state.rp_sem);
3955 }
3956
3957 trace_ram_dirty_bitmap_sync_complete();
3958
3959 return 0;
3960 }
3961
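/*
 * Posted once for every ramblock whose received bitmap has been reloaded
 * (see ram_dirty_bitmap_reload() below); ram_dirty_bitmap_sync_all()
 * waits on rp_sem once per migratable ramblock, so the sync only
 * completes after every reply has arrived.
 */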
3962 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3963 {
3964 qemu_sem_post(&s->rp_state.rp_sem);
3965 }
3966
3967 /*
3968 * Read the received bitmap and invert it to form the initial dirty bitmap.
3969 * This is only used when a paused postcopy migration is being resumed
3970 * from a middle point.
3971 */
3972 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3973 {
3974 int ret = -EINVAL;
3975 QEMUFile *file = s->rp_state.from_dst_file;
3976 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3977 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3978 uint64_t size, end_mark;
3979
3980 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3981
3982 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3983 error_report("%s: incorrect state %s", __func__,
3984 MigrationStatus_str(s->state));
3985 return -EINVAL;
3986 }
3987
3988 /*
3989 * Note: see the comments in ramblock_recv_bitmap_send() on why we
3990 * need the endianness conversion and the padding.
3991 */
3992 local_size = ROUND_UP(local_size, 8);
3993
3994 /* Add padding */
3995 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3996
3997 size = qemu_get_be64(file);
3998
3999 /* The bitmap size should match what we computed for our ramblock */
4000 if (size != local_size) {
4001 error_report("%s: ramblock '%s' bitmap size mismatch "
4002 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4003 block->idstr, size, local_size);
4004 ret = -EINVAL;
4005 goto out;
4006 }
4007
4008 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4009 end_mark = qemu_get_be64(file);
4010
4011 ret = qemu_file_get_error(file);
4012 if (ret || size != local_size) {
4013 error_report("%s: read bitmap failed for ramblock '%s': %d"
4014 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4015 __func__, block->idstr, ret, local_size, size);
4016 ret = -EIO;
4017 goto out;
4018 }
4019
4020 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4021 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4022 __func__, block->idstr, end_mark);
4023 ret = -EINVAL;
4024 goto out;
4025 }
4026
4027 /*
4028 * Endianness conversion.  We are in postcopy (though paused), so the
4029 * dirty bitmap won't change and we can modify it in place.
4030 */
4031 bitmap_from_le(block->bmap, le_bitmap, nbits);
4032
4033 /*
4034 * What we received is the "received bitmap".  Invert it to form the
4035 * initial dirty bitmap for this ramblock.
4036 */
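/*
 * Illustration only (hypothetical numbers): for a 3-page ramblock where
 * pages 0 and 2 were received before the pause, the destination sends
 * le_bitmap = 0b101; the complement below leaves block->bmap = 0b010,
 * i.e. only page 1 is still dirty and will be resent on resume.
 */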
4037 bitmap_complement(block->bmap, block->bmap, nbits);
4038
4039 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4040
4041 /*
4042 * We succeeded in syncing the bitmap for the current ramblock.  If this
4043 * is the last one to sync, we need to notify the main send thread.
4044 */
4045 ram_dirty_bitmap_reload_notify(s);
4046
4047 ret = 0;
4048 out:
4049 g_free(le_bitmap);
4050 return ret;
4051 }
4052
4053 static int ram_resume_prepare(MigrationState *s, void *opaque)
4054 {
4055 RAMState *rs = *(RAMState **)opaque;
4056 int ret;
4057
4058 ret = ram_dirty_bitmap_sync_all(s, rs);
4059 if (ret) {
4060 return ret;
4061 }
4062
4063 ram_state_resume_prepare(rs, s->to_dst_file);
4064
4065 return 0;
4066 }
4067
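/*
 * Live-migration hooks for the "ram" section, registered by ram_mig_init()
 * below.  Note that ram_save_complete() serves as both the precopy and the
 * postcopy completion callback.
 */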
4068 static SaveVMHandlers savevm_ram_handlers = {
4069 .save_setup = ram_save_setup,
4070 .save_live_iterate = ram_save_iterate,
4071 .save_live_complete_postcopy = ram_save_complete,
4072 .save_live_complete_precopy = ram_save_complete,
4073 .has_postcopy = ram_has_postcopy,
4074 .save_live_pending = ram_save_pending,
4075 .load_state = ram_load,
4076 .save_cleanup = ram_save_cleanup,
4077 .load_setup = ram_load_setup,
4078 .load_cleanup = ram_load_cleanup,
4079 .resume_prepare = ram_resume_prepare,
4080 };
4081
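/*
 * Initialize the XBZRLE cache lock and register the "ram" live section
 * (instance 0, version 4), passing &ram_state as the opaque pointer that
 * is handed back to the callbacks.
 */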
4082 void ram_mig_init(void)
4083 {
4084 qemu_mutex_init(&XBZRLE.lock);
4085 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4086 }