migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  69
/*
 * Flag values used by the RAM save/restore code.  Each flag is a
 * distinct bit so several can be combined in one value.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* XBZRLE statistics counters */
XBZRLECacheStats xbzrle_counters;

/*
 * File-local XBZRLE state: the page cache plus the scratch buffers used
 * by the compression.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    /* taken through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
/*
 * Iterate over the RAMBlocks for which qemu_ram_is_migratable() is true.
 * Should be holding either ram_list.mutex, or the RCU lock.
 *
 * The empty "if (...) {} else" makes the statement that follows the macro
 * run only for migratable blocks.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably undefined so that code below cannot use the
 * unfiltered iterator by accident -- confirm.
 */
#undef RAMBLOCK_FOREACH
 168
 169 static void ramblock_recv_map_init(void)
 170 {
 171     RAMBlock *rb;
 172
 173     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 174         assert(!rb->receivedmap);
 175         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 176     }
 177 }
 178
 179 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 180 {
 181     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 182                     rb->receivedmap);
 183 }
 184
 185 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 186 {
 187     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 188 }
 189
 190 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 191 {
 192     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 193 }
 194
 195 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 196                                     size_t nr)
 197 {
 198     bitmap_set_atomic(rb->receivedmap,
 199                       ramblock_recv_bitmap_offset(host_addr, rb),
 200                       nr);
 201 }
 202
 203 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 204
 205 /*
 206  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 207  *
 208  * Returns >0 if success with sent bytes, or <0 if error.
 209  */
 210 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 211                                   const char *block_name)
 212 {
 213     RAMBlock *block = qemu_ram_block_by_name(block_name);
 214     unsigned long *le_bitmap, nbits;
 215     uint64_t size;
 216
 217     if (!block) {
 218         error_report("%s: invalid block name: %s", __func__, block_name);
 219         return -1;
 220     }
 221
 222     nbits = block->used_length >> TARGET_PAGE_BITS;
 223
 224     /*
 225      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 226      * machines we may need 4 more bytes for padding (see below
 227      * comment). So extend it a bit before hand.
 228      */
 229     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 230
 231     /*
 232      * Always use little endian when sending the bitmap. This is
 233      * required that when source and destination VMs are not using the
 234      * same endianess. (Note: big endian won't work.)
 235      */
 236     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 237
 238     /* Size of the bitmap, in bytes */
 239     size = DIV_ROUND_UP(nbits, 8);
 240
 241     /*
 242      * size is always aligned to 8 bytes for 64bit machines, but it
 243      * may not be true for 32bit machines. We need this padding to
 244      * make sure the migration can survive even between 32bit and
 245      * 64bit machines.
 246      */
 247     size = ROUND_UP(size, 8);
 248
 249     qemu_put_be64(file, size);
 250     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 251     /*
 252      * Mark as an end, in case the middle part is screwed up due to
 253      * some "misterious" reason.
 254      */
 255     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 256     qemu_fflush(file);
 257
 258     g_free(le_bitmap);
 259
 260     if (qemu_file_get_error(file)) {
 261         return qemu_file_get_error(file);
 262     }
 263
 264     return size + sizeof(size);
 265 }
 266
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start of the range within rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the range in bytes -- confirm */
    hwaddr    len;

    /* Linkage in a QSIMPLEQ of requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 278
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* Last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have seen too many pages being dirtied */
    int dirty_rate_high_cnt;
    /* These variables are used for bitmap sync */
    /* Last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* Bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* Number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* XBZRLE misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Number of iterations at the beginning of period */
    uint64_t iterations_prev;
    /* Iterations since start */
    uint64_t iterations;
    /* Number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state; code in this file treats NULL as "no state" */
static RAMState *ram_state;
 321
 322 uint64_t ram_bytes_remaining(void)
 323 {
 324     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 325                        0;
 326 }
 327
/* RAM migration statistics; the multifd code below adds to its byte counts */
MigrationStats ram_counters;

/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 340
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* set to ask the worker to leave its loop */
    bool quit;
    /* dummy QEMUFile used only as a buffer for the compressed data */
    QEMUFile *file;
    /* the worker reads block/offset/quit while holding this mutex */
    QemuMutex mutex;
    /* the worker sleeps on this while it has no block to compress */
    QemuCond cond;
    /* page to compress; the worker resets block to NULL when it takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    /* zlib stream, set up with deflateInit() and released with deflateEnd() */
    z_stream stream;
    /* TARGET_PAGE_SIZE buffer passed to do_compress_ram_page() as source_buf */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 355
/*
 * Per-thread state of one decompression worker.
 *
 * NOTE(review): the code using this struct is outside this section; the
 * field notes below are inferred from the names and from the parallel
 * CompressParam layout -- confirm.
 */
struct DecompressParam {
    /* presumably: the worker has finished its current job */
    bool done;
    /* presumably: the worker has been asked to exit */
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably: destination of the decompressed data */
    void *des;
    /* presumably: buffer holding the compressed input */
    uint8_t *compbuf;
    /* presumably: number of valid bytes in compbuf */
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 367
/* Arrays with one entry per compression thread; NULL when not set up */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is signalled by a compression thread when it has
 * finished compressing a page.
 * comp_done_lock is the mutex paired with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression counterparts; their users are outside this section */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in the file; needed by do_data_compress() below */
static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                ram_addr_t offset, uint8_t *source_buf);
 387
/*
 * Body of a compression thread.
 *
 * @opaque: the CompressParam of this thread
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL the
 * page (block, offset) is compressed into param->file, after which
 * param->done is set and comp_done_cond is signalled.  Otherwise the
 * thread sleeps on param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the job and clear the slot before dropping the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            do_compress_ram_page(param->file, &param->stream, block, offset,
                                 param->originbuf);

            /* Publish completion under comp_done_lock */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 419
 420 static inline void terminate_compression_threads(void)
 421 {
 422     int idx, thread_count;
 423
 424     thread_count = migrate_compress_threads();
 425
 426     for (idx = 0; idx < thread_count; idx++) {
 427         qemu_mutex_lock(&comp_param[idx].mutex);
 428         comp_param[idx].quit = true;
 429         qemu_cond_signal(&comp_param[idx].cond);
 430         qemu_mutex_unlock(&comp_param[idx].mutex);
 431     }
 432 }
 433
 434 static void compress_threads_save_cleanup(void)
 435 {
 436     int i, thread_count;
 437
 438     if (!migrate_use_compression()) {
 439         return;
 440     }
 441     terminate_compression_threads();
 442     thread_count = migrate_compress_threads();
 443     for (i = 0; i < thread_count; i++) {
 444         /*
 445          * we use it as a indicator which shows if the thread is
 446          * properly init'd or not
 447          */
 448         if (!comp_param[i].file) {
 449             break;
 450         }
 451         qemu_thread_join(compress_threads + i);
 452         qemu_mutex_destroy(&comp_param[i].mutex);
 453         qemu_cond_destroy(&comp_param[i].cond);
 454         deflateEnd(&comp_param[i].stream);
 455         g_free(comp_param[i].originbuf);
 456         qemu_fclose(comp_param[i].file);
 457         comp_param[i].file = NULL;
 458     }
 459     qemu_mutex_destroy(&comp_done_lock);
 460     qemu_cond_destroy(&comp_done_cond);
 461     g_free(compress_threads);
 462     g_free(comp_param);
 463     compress_threads = NULL;
 464     comp_param = NULL;
 465 }
 466
 467 static int compress_threads_save_setup(void)
 468 {
 469     int i, thread_count;
 470
 471     if (!migrate_use_compression()) {
 472         return 0;
 473     }
 474     thread_count = migrate_compress_threads();
 475     compress_threads = g_new0(QemuThread, thread_count);
 476     comp_param = g_new0(CompressParam, thread_count);
 477     qemu_cond_init(&comp_done_cond);
 478     qemu_mutex_init(&comp_done_lock);
 479     for (i = 0; i < thread_count; i++) {
 480         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 481         if (!comp_param[i].originbuf) {
 482             goto exit;
 483         }
 484
 485         if (deflateInit(&comp_param[i].stream,
 486                         migrate_compress_level()) != Z_OK) {
 487             g_free(comp_param[i].originbuf);
 488             goto exit;
 489         }
 490
 491         /* comp_param[i].file is just used as a dummy buffer to save data,
 492          * set its ops to empty.
 493          */
 494         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 495         comp_param[i].done = true;
 496         comp_param[i].quit = false;
 497         qemu_mutex_init(&comp_param[i].mutex);
 498         qemu_cond_init(&comp_param[i].cond);
 499         qemu_thread_create(compress_threads + i, "compress",
 500                            do_data_compress, comp_param + i,
 501                            QEMU_THREAD_JOINABLE);
 502     }
 503     return 0;
 504
 505 exit:
 506     compress_threads_save_cleanup();
 507     return -1;
 508 }
 509
/* Multiple fd's */

/* Magic and version carried by every multifd packet and checked on receipt */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Packet flag set by multifd_send_sync_main() for its sync packets */
#define MULTIFD_FLAG_SYNC (1 << 0)
 516
/*
 * First message sent on each multifd channel.  magic and version are
 * transmitted big endian.
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* id of the sending channel */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 523
/*
 * Header of a multifd packet.  The integer fields are transmitted big
 * endian (see multifd_send_fill_packet()).
 */
typedef struct {
    /* MULTIFD_MAGIC */
    uint32_t magic;
    /* MULTIFD_VERSION */
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* sender's migrate_multifd_page_count() */
    uint32_t size;
    /* number of valid entries in offset[] */
    uint32_t used;
    /* global number of this packet */
    uint64_t packet_num;
    /* idstr of the RAMBlock the offsets refer to */
    char ramblock[256];
    /* offset of each page within that RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 534
/* A batch of pages, all belonging to the same RAMBlock */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to; NULL while the batch has none */
    RAMBlock *block;
} MultiFDPages_t;
 548
/* Per-channel state of the multifd sending side */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 587
/* Per-channel state of the multifd receiving side */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags of the last packet parsed */
    uint32_t flags;
    /* packet number of the last packet parsed */
    uint64_t packet_num;
    /* thread local variables */
    /* packets received through this channel */
    uint64_t num_packets;
    /* pages received through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 620
 621 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 622 {
 623     MultiFDInit_t msg;
 624     int ret;
 625
 626     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 627     msg.version = cpu_to_be32(MULTIFD_VERSION);
 628     msg.id = p->id;
 629     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 630
 631     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 632     if (ret != 0) {
 633         return -1;
 634     }
 635     return 0;
 636 }
 637
 638 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 639 {
 640     MultiFDInit_t msg;
 641     int ret;
 642
 643     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 644     if (ret != 0) {
 645         return -1;
 646     }
 647
 648     be32_to_cpus(&msg.magic);
 649     be32_to_cpus(&msg.version);
 650
 651     if (msg.magic != MULTIFD_MAGIC) {
 652         error_setg(errp, "multifd: received packet magic %x "
 653                    "expected %x", msg.magic, MULTIFD_MAGIC);
 654         return -1;
 655     }
 656
 657     if (msg.version != MULTIFD_VERSION) {
 658         error_setg(errp, "multifd: received packet version %d "
 659                    "expected %d", msg.version, MULTIFD_VERSION);
 660         return -1;
 661     }
 662
 663     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 664         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 665         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 666
 667         error_setg(errp, "multifd: received uuid '%s' and expected "
 668                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 669         g_free(uuid);
 670         g_free(msg_uuid);
 671         return -1;
 672     }
 673
 674     if (msg.id > migrate_multifd_channels()) {
 675         error_setg(errp, "multifd: received channel version %d "
 676                    "expected %d", msg.version, MULTIFD_VERSION);
 677         return -1;
 678     }
 679
 680     return msg.id;
 681 }
 682
 683 static MultiFDPages_t *multifd_pages_init(size_t size)
 684 {
 685     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 686
 687     pages->allocated = size;
 688     pages->iov = g_new0(struct iovec, size);
 689     pages->offset = g_new0(ram_addr_t, size);
 690
 691     return pages;
 692 }
 693
 694 static void multifd_pages_clear(MultiFDPages_t *pages)
 695 {
 696     pages->used = 0;
 697     pages->allocated = 0;
 698     pages->packet_num = 0;
 699     pages->block = NULL;
 700     g_free(pages->iov);
 701     pages->iov = NULL;
 702     g_free(pages->offset);
 703     pages->offset = NULL;
 704     g_free(pages);
 705 }
 706
 707 static void multifd_send_fill_packet(MultiFDSendParams *p)
 708 {
 709     MultiFDPacket_t *packet = p->packet;
 710     int i;
 711
 712     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 713     packet->version = cpu_to_be32(MULTIFD_VERSION);
 714     packet->flags = cpu_to_be32(p->flags);
 715     packet->size = cpu_to_be32(migrate_multifd_page_count());
 716     packet->used = cpu_to_be32(p->pages->used);
 717     packet->packet_num = cpu_to_be64(p->packet_num);
 718
 719     if (p->pages->block) {
 720         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 721     }
 722
 723     for (i = 0; i < p->pages->used; i++) {
 724         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 725     }
 726 }
 727
 728 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 729 {
 730     MultiFDPacket_t *packet = p->packet;
 731     RAMBlock *block;
 732     int i;
 733
 734     be32_to_cpus(&packet->magic);
 735     if (packet->magic != MULTIFD_MAGIC) {
 736         error_setg(errp, "multifd: received packet "
 737                    "magic %x and expected magic %x",
 738                    packet->magic, MULTIFD_MAGIC);
 739         return -1;
 740     }
 741
 742     be32_to_cpus(&packet->version);
 743     if (packet->version != MULTIFD_VERSION) {
 744         error_setg(errp, "multifd: received packet "
 745                    "version %d and expected version %d",
 746                    packet->version, MULTIFD_VERSION);
 747         return -1;
 748     }
 749
 750     p->flags = be32_to_cpu(packet->flags);
 751
 752     be32_to_cpus(&packet->size);
 753     if (packet->size > migrate_multifd_page_count()) {
 754         error_setg(errp, "multifd: received packet "
 755                    "with size %d and expected maximum size %d",
 756                    packet->size, migrate_multifd_page_count()) ;
 757         return -1;
 758     }
 759
 760     p->pages->used = be32_to_cpu(packet->used);
 761     if (p->pages->used > packet->size) {
 762         error_setg(errp, "multifd: received packet "
 763                    "with size %d and expected maximum size %d",
 764                    p->pages->used, packet->size) ;
 765         return -1;
 766     }
 767
 768     p->packet_num = be64_to_cpu(packet->packet_num);
 769
 770     if (p->pages->used) {
 771         /* make sure that ramblock is 0 terminated */
 772         packet->ramblock[255] = 0;
 773         block = qemu_ram_block_by_name(packet->ramblock);
 774         if (!block) {
 775             error_setg(errp, "multifd: unknown ram block %s",
 776                        packet->ramblock);
 777             return -1;
 778         }
 779     }
 780
 781     for (i = 0; i < p->pages->used; i++) {
 782         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 783
 784         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 785             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 786                        " (max " RAM_ADDR_FMT ")",
 787                        offset, block->max_length);
 788             return -1;
 789         }
 790         p->pages->iov[i].iov_base = block->host + offset;
 791         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 792     }
 793
 794     return 0;
 795 }
 796
/* Global state of the multifd sending side; NULL after cleanup */
struct {
    /* one entry per channel */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 810
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each time
 * that we need to send a batch of pages we interchange the ones between
 * multifd_send_state and the channel that is sending it.  There are
 * two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */
 828
/*
 * Hand the batch in multifd_send_state->pages to a channel that has no
 * pending job, and take that channel's (emptied) batch in exchange.
 *
 * Waits on channels_ready first, then scans the channels round-robin,
 * starting after the one used last time, until it finds an idle one.
 */
static void multifd_send_pages(void)
{
    int i;
    /* channel at which the next search starts; kept across calls */
    static int next_channel;
    MultiFDSendParams *p = NULL; /* keeps gcc happy */
    MultiFDPages_t *pages = multifd_send_state->pages;
    uint64_t transferred;

    qemu_sem_wait(&multifd_send_state->channels_ready);
    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
        p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        if (!p->pending_job) {
            p->pending_job++;
            next_channel = (i + 1) % migrate_multifd_channels();
            /* leave the loop with p->mutex still held */
            break;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    /* reset the channel's batch before it becomes the main one */
    p->pages->used = 0;

    p->packet_num = multifd_send_state->packet_num++;
    p->pages->block = NULL;
    /* swap ownership of the two batches */
    multifd_send_state->pages = p->pages;
    p->pages = pages;
    /* account for the page data plus the packet header */
    transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
    ram_counters.multifd_bytes += transferred;
    ram_counters.transferred += transferred;;
    qemu_mutex_unlock(&p->mutex);
    /* wake up the channel thread */
    qemu_sem_post(&p->sem);
}
 861
 862 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 863 {
 864     MultiFDPages_t *pages = multifd_send_state->pages;
 865
 866     if (!pages->block) {
 867         pages->block = block;
 868     }
 869
 870     if (pages->block == block) {
 871         pages->offset[pages->used] = offset;
 872         pages->iov[pages->used].iov_base = block->host + offset;
 873         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 874         pages->used++;
 875
 876         if (pages->used < pages->allocated) {
 877             return;
 878         }
 879     }
 880
 881     multifd_send_pages();
 882
 883     if (pages->block != block) {
 884         multifd_queue_page(block, offset);
 885     }
 886 }
 887
 888 static void multifd_send_terminate_threads(Error *err)
 889 {
 890     int i;
 891
 892     if (err) {
 893         MigrationState *s = migrate_get_current();
 894         migrate_set_error(s, err);
 895         if (s->state == MIGRATION_STATUS_SETUP ||
 896             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 897             s->state == MIGRATION_STATUS_DEVICE ||
 898             s->state == MIGRATION_STATUS_ACTIVE) {
 899             migrate_set_state(&s->state, s->state,
 900                               MIGRATION_STATUS_FAILED);
 901         }
 902     }
 903
 904     for (i = 0; i < migrate_multifd_channels(); i++) {
 905         MultiFDSendParams *p = &multifd_send_state->params[i];
 906
 907         qemu_mutex_lock(&p->mutex);
 908         p->quit = true;
 909         qemu_sem_post(&p->sem);
 910         qemu_mutex_unlock(&p->mutex);
 911     }
 912 }
 913
 914 int multifd_save_cleanup(Error **errp)
 915 {
 916     int i;
 917     int ret = 0;
 918
 919     if (!migrate_use_multifd()) {
 920         return 0;
 921     }
 922     multifd_send_terminate_threads(NULL);
 923     for (i = 0; i < migrate_multifd_channels(); i++) {
 924         MultiFDSendParams *p = &multifd_send_state->params[i];
 925
 926         if (p->running) {
 927             qemu_thread_join(&p->thread);
 928         }
 929         socket_send_channel_destroy(p->c);
 930         p->c = NULL;
 931         qemu_mutex_destroy(&p->mutex);
 932         qemu_sem_destroy(&p->sem);
 933         qemu_sem_destroy(&p->sem_sync);
 934         g_free(p->name);
 935         p->name = NULL;
 936         multifd_pages_clear(p->pages);
 937         p->pages = NULL;
 938         p->packet_len = 0;
 939         g_free(p->packet);
 940         p->packet = NULL;
 941     }
 942     qemu_sem_destroy(&multifd_send_state->channels_ready);
 943     qemu_sem_destroy(&multifd_send_state->sem_sync);
 944     g_free(multifd_send_state->params);
 945     multifd_send_state->params = NULL;
 946     multifd_pages_clear(multifd_send_state->pages);
 947     multifd_send_state->pages = NULL;
 948     g_free(multifd_send_state);
 949     multifd_send_state = NULL;
 950     return ret;
 951 }
 952
 953 static void multifd_send_sync_main(void)
 954 {
 955     int i;
 956
 957     if (!migrate_use_multifd()) {
 958         return;
 959     }
 960     if (multifd_send_state->pages->used) {
 961         multifd_send_pages();
 962     }
 963     for (i = 0; i < migrate_multifd_channels(); i++) {
 964         MultiFDSendParams *p = &multifd_send_state->params[i];
 965
 966         trace_multifd_send_sync_main_signal(p->id);
 967
 968         qemu_mutex_lock(&p->mutex);
 969
 970         p->packet_num = multifd_send_state->packet_num++;
 971         p->flags |= MULTIFD_FLAG_SYNC;
 972         p->pending_job++;
 973         qemu_mutex_unlock(&p->mutex);
 974         qemu_sem_post(&p->sem);
 975     }
 976     for (i = 0; i < migrate_multifd_channels(); i++) {
 977         MultiFDSendParams *p = &multifd_send_state->params[i];
 978
 979         trace_multifd_send_sync_main_wait(p->id);
 980         qemu_sem_wait(&multifd_send_state->sem_sync);
 981     }
 982     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 983 }
 984
 985 static void *multifd_send_thread(void *opaque)
 986 {
 987     MultiFDSendParams *p = opaque;
 988     Error *local_err = NULL;
 989     int ret;
 990
 991     trace_multifd_send_thread_start(p->id);
 992
 993     if (multifd_send_initial_packet(p, &local_err) < 0) {
 994         goto out;
 995     }
 996     /* initial packet */
 997     p->num_packets = 1;
 998
 999     while (true) {
1000         qemu_sem_wait(&p->sem);
1001         qemu_mutex_lock(&p->mutex);
1002
1003         if (p->pending_job) {
1004             uint32_t used = p->pages->used;
1005             uint64_t packet_num = p->packet_num;
1006             uint32_t flags = p->flags;
1007
1008             multifd_send_fill_packet(p);
1009             p->flags = 0;
1010             p->num_packets++;
1011             p->num_pages += used;
1012             p->pages->used = 0;
1013             qemu_mutex_unlock(&p->mutex);
1014
1015             trace_multifd_send(p->id, packet_num, used, flags);
1016
1017             ret = qio_channel_write_all(p->c, (void *)p->packet,
1018                                         p->packet_len, &local_err);
1019             if (ret != 0) {
1020                 break;
1021             }
1022
1023             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1024             if (ret != 0) {
1025                 break;
1026             }
1027
1028             qemu_mutex_lock(&p->mutex);
1029             p->pending_job--;
1030             qemu_mutex_unlock(&p->mutex);
1031
1032             if (flags & MULTIFD_FLAG_SYNC) {
1033                 qemu_sem_post(&multifd_send_state->sem_sync);
1034             }
1035             qemu_sem_post(&multifd_send_state->channels_ready);
1036         } else if (p->quit) {
1037             qemu_mutex_unlock(&p->mutex);
1038             break;
1039         } else {
1040             qemu_mutex_unlock(&p->mutex);
1041             /* sometimes there are spurious wakeups */
1042         }
1043     }
1044
1045 out:
1046     if (local_err) {
1047         multifd_send_terminate_threads(local_err);
1048     }
1049
1050     qemu_mutex_lock(&p->mutex);
1051     p->running = false;
1052     qemu_mutex_unlock(&p->mutex);
1053
1054     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1055
1056     return NULL;
1057 }
1058
1059 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1060 {
1061     MultiFDSendParams *p = opaque;
1062     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1063     Error *local_err = NULL;
1064
1065     if (qio_task_propagate_error(task, &local_err)) {
1066         if (multifd_save_cleanup(&local_err) != 0) {
1067             migrate_set_error(migrate_get_current(), local_err);
1068         }
1069     } else {
1070         p->c = QIO_CHANNEL(sioc);
1071         qio_channel_set_delay(p->c, false);
1072         p->running = true;
1073         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1074                            QEMU_THREAD_JOINABLE);
1075
1076         atomic_inc(&multifd_send_state->count);
1077     }
1078 }
1079
1080 int multifd_save_setup(void)
1081 {
1082     int thread_count;
1083     uint32_t page_count = migrate_multifd_page_count();
1084     uint8_t i;
1085
1086     if (!migrate_use_multifd()) {
1087         return 0;
1088     }
1089     thread_count = migrate_multifd_channels();
1090     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1091     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1092     atomic_set(&multifd_send_state->count, 0);
1093     multifd_send_state->pages = multifd_pages_init(page_count);
1094     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1095     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1096
1097     for (i = 0; i < thread_count; i++) {
1098         MultiFDSendParams *p = &multifd_send_state->params[i];
1099
1100         qemu_mutex_init(&p->mutex);
1101         qemu_sem_init(&p->sem, 0);
1102         qemu_sem_init(&p->sem_sync, 0);
1103         p->quit = false;
1104         p->pending_job = 0;
1105         p->id = i;
1106         p->pages = multifd_pages_init(page_count);
1107         p->packet_len = sizeof(MultiFDPacket_t)
1108                       + sizeof(ram_addr_t) * page_count;
1109         p->packet = g_malloc0(p->packet_len);
1110         p->name = g_strdup_printf("multifdsend_%d", i);
1111         socket_send_channel_create(multifd_new_send_channel_async, p);
1112     }
1113     return 0;
1114 }
1115
1116 struct {
1117     MultiFDRecvParams *params;
1118     /* number of created threads */
1119     int count;
1120     /* syncs main thread and channels */
1121     QemuSemaphore sem_sync;
1122     /* global number of generated multifd packets */
1123     uint64_t packet_num;
1124 } *multifd_recv_state;
1125
1126 static void multifd_recv_terminate_threads(Error *err)
1127 {
1128     int i;
1129
1130     if (err) {
1131         MigrationState *s = migrate_get_current();
1132         migrate_set_error(s, err);
1133         if (s->state == MIGRATION_STATUS_SETUP ||
1134             s->state == MIGRATION_STATUS_ACTIVE) {
1135             migrate_set_state(&s->state, s->state,
1136                               MIGRATION_STATUS_FAILED);
1137         }
1138     }
1139
1140     for (i = 0; i < migrate_multifd_channels(); i++) {
1141         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1142
1143         qemu_mutex_lock(&p->mutex);
1144         /* We could arrive here for two reasons:
1145            - normal quit, i.e. everything went fine, just finished
1146            - error quit: We close the channels so the channel threads
1147              finish the qio_channel_read_all_eof() */
1148         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1149         qemu_mutex_unlock(&p->mutex);
1150     }
1151 }
1152
1153 int multifd_load_cleanup(Error **errp)
1154 {
1155     int i;
1156     int ret = 0;
1157
1158     if (!migrate_use_multifd()) {
1159         return 0;
1160     }
1161     multifd_recv_terminate_threads(NULL);
1162     for (i = 0; i < migrate_multifd_channels(); i++) {
1163         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1164
1165         if (p->running) {
1166             qemu_thread_join(&p->thread);
1167         }
1168         object_unref(OBJECT(p->c));
1169         p->c = NULL;
1170         qemu_mutex_destroy(&p->mutex);
1171         qemu_sem_destroy(&p->sem_sync);
1172         g_free(p->name);
1173         p->name = NULL;
1174         multifd_pages_clear(p->pages);
1175         p->pages = NULL;
1176         p->packet_len = 0;
1177         g_free(p->packet);
1178         p->packet = NULL;
1179     }
1180     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1181     g_free(multifd_recv_state->params);
1182     multifd_recv_state->params = NULL;
1183     g_free(multifd_recv_state);
1184     multifd_recv_state = NULL;
1185
1186     return ret;
1187 }
1188
1189 static void multifd_recv_sync_main(void)
1190 {
1191     int i;
1192
1193     if (!migrate_use_multifd()) {
1194         return;
1195     }
1196     for (i = 0; i < migrate_multifd_channels(); i++) {
1197         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1198
1199         trace_multifd_recv_sync_main_wait(p->id);
1200         qemu_sem_wait(&multifd_recv_state->sem_sync);
1201         qemu_mutex_lock(&p->mutex);
1202         if (multifd_recv_state->packet_num < p->packet_num) {
1203             multifd_recv_state->packet_num = p->packet_num;
1204         }
1205         qemu_mutex_unlock(&p->mutex);
1206     }
1207     for (i = 0; i < migrate_multifd_channels(); i++) {
1208         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1209
1210         trace_multifd_recv_sync_main_signal(p->id);
1211         qemu_sem_post(&p->sem_sync);
1212     }
1213     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1214 }
1215
1216 static void *multifd_recv_thread(void *opaque)
1217 {
1218     MultiFDRecvParams *p = opaque;
1219     Error *local_err = NULL;
1220     int ret;
1221
1222     trace_multifd_recv_thread_start(p->id);
1223
1224     while (true) {
1225         uint32_t used;
1226         uint32_t flags;
1227
1228         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1229                                        p->packet_len, &local_err);
1230         if (ret == 0) {   /* EOF */
1231             break;
1232         }
1233         if (ret == -1) {   /* Error */
1234             break;
1235         }
1236
1237         qemu_mutex_lock(&p->mutex);
1238         ret = multifd_recv_unfill_packet(p, &local_err);
1239         if (ret) {
1240             qemu_mutex_unlock(&p->mutex);
1241             break;
1242         }
1243
1244         used = p->pages->used;
1245         flags = p->flags;
1246         trace_multifd_recv(p->id, p->packet_num, used, flags);
1247         p->num_packets++;
1248         p->num_pages += used;
1249         qemu_mutex_unlock(&p->mutex);
1250
1251         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1252         if (ret != 0) {
1253             break;
1254         }
1255
1256         if (flags & MULTIFD_FLAG_SYNC) {
1257             qemu_sem_post(&multifd_recv_state->sem_sync);
1258             qemu_sem_wait(&p->sem_sync);
1259         }
1260     }
1261
1262     if (local_err) {
1263         multifd_recv_terminate_threads(local_err);
1264     }
1265     qemu_mutex_lock(&p->mutex);
1266     p->running = false;
1267     qemu_mutex_unlock(&p->mutex);
1268
1269     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1270
1271     return NULL;
1272 }
1273
1274 int multifd_load_setup(void)
1275 {
1276     int thread_count;
1277     uint32_t page_count = migrate_multifd_page_count();
1278     uint8_t i;
1279
1280     if (!migrate_use_multifd()) {
1281         return 0;
1282     }
1283     thread_count = migrate_multifd_channels();
1284     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1285     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1286     atomic_set(&multifd_recv_state->count, 0);
1287     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1288
1289     for (i = 0; i < thread_count; i++) {
1290         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1291
1292         qemu_mutex_init(&p->mutex);
1293         qemu_sem_init(&p->sem_sync, 0);
1294         p->id = i;
1295         p->pages = multifd_pages_init(page_count);
1296         p->packet_len = sizeof(MultiFDPacket_t)
1297                       + sizeof(ram_addr_t) * page_count;
1298         p->packet = g_malloc0(p->packet_len);
1299         p->name = g_strdup_printf("multifdrecv_%d", i);
1300     }
1301     return 0;
1302 }
1303
1304 bool multifd_recv_all_channels_created(void)
1305 {
1306     int thread_count = migrate_multifd_channels();
1307
1308     if (!migrate_use_multifd()) {
1309         return true;
1310     }
1311
1312     return thread_count == atomic_read(&multifd_recv_state->count);
1313 }
1314
1315 /* Return true if multifd is ready for the migration, otherwise false */
1316 bool multifd_recv_new_channel(QIOChannel *ioc)
1317 {
1318     MultiFDRecvParams *p;
1319     Error *local_err = NULL;
1320     int id;
1321
1322     id = multifd_recv_initial_packet(ioc, &local_err);
1323     if (id < 0) {
1324         multifd_recv_terminate_threads(local_err);
1325         return false;
1326     }
1327
1328     p = &multifd_recv_state->params[id];
1329     if (p->c != NULL) {
1330         error_setg(&local_err, "multifd: received id '%d' already setup'",
1331                    id);
1332         multifd_recv_terminate_threads(local_err);
1333         return false;
1334     }
1335     p->c = ioc;
1336     object_ref(OBJECT(ioc));
1337     /* initial packet */
1338     p->num_packets = 1;
1339
1340     p->running = true;
1341     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1342                        QEMU_THREAD_JOINABLE);
1343     atomic_inc(&multifd_recv_state->count);
1344     return multifd_recv_state->count == migrate_multifd_channels();
1345 }
1346
1347 /**
1348  * save_page_header: write page header to wire
1349  *
1350  * If this is the 1st block, it also writes the block identification
1351  *
1352  * Returns the number of bytes written
1353  *
1354  * @f: QEMUFile where to send the data
1355  * @block: block that contains the page we want to send
1356  * @offset: offset inside the block for the page
1357  *          in the lower bits, it contains flags
1358  */
1359 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1360                                ram_addr_t offset)
1361 {
1362     size_t size, len;
1363
1364     if (block == rs->last_sent_block) {
1365         offset |= RAM_SAVE_FLAG_CONTINUE;
1366     }
1367     qemu_put_be64(f, offset);
1368     size = 8;
1369
1370     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1371         len = strlen(block->idstr);
1372         qemu_put_byte(f, len);
1373         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1374         size += 1 + len;
1375         rs->last_sent_block = block;
1376     }
1377     return size;
1378 }
1379
1380 /**
1381  * mig_throttle_guest_down: throotle down the guest
1382  *
1383  * Reduce amount of guest cpu execution to hopefully slow down memory
1384  * writes. If guest dirty memory rate is reduced below the rate at
1385  * which we can transfer pages to the destination then we should be
1386  * able to complete migration. Some workloads dirty memory way too
1387  * fast and will not effectively converge, even with auto-converge.
1388  */
1389 static void mig_throttle_guest_down(void)
1390 {
1391     MigrationState *s = migrate_get_current();
1392     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1393     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1394     int pct_max = s->parameters.max_cpu_throttle;
1395
1396     /* We have not started throttling yet. Let's start it. */
1397     if (!cpu_throttle_active()) {
1398         cpu_throttle_set(pct_initial);
1399     } else {
1400         /* Throttling already on, just increase the rate */
1401         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1402                          pct_max));
1403     }
1404 }
1405
1406 /**
1407  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1408  *
1409  * @rs: current RAM state
1410  * @current_addr: address for the zero page
1411  *
1412  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1413  * The important thing is that a stale (not-yet-0'd) page be replaced
1414  * by the new data.
1415  * As a bonus, if the page wasn't in the cache it gets added so that
1416  * when a small write is made into the 0'd page it gets XBZRLE sent.
1417  */
1418 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1419 {
1420     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1421         return;
1422     }
1423
1424     /* We don't care if this fails to allocate a new cache page
1425      * as long as it updated an old one */
1426     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1427                  ram_counters.dirty_sync_count);
1428 }
1429
1430 #define ENCODING_FLAG_XBZRLE 0x1
1431
1432 /**
1433  * save_xbzrle_page: compress and send current page
1434  *
1435  * Returns: 1 means that we wrote the page
1436  *          0 means that page is identical to the one already sent
1437  *          -1 means that xbzrle would be longer than normal
1438  *
1439  * @rs: current RAM state
1440  * @current_data: pointer to the address of the page contents
1441  * @current_addr: addr of the page
1442  * @block: block that contains the page we want to send
1443  * @offset: offset inside the block for the page
1444  * @last_stage: if we are at the completion stage
1445  */
1446 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1447                             ram_addr_t current_addr, RAMBlock *block,
1448                             ram_addr_t offset, bool last_stage)
1449 {
1450     int encoded_len = 0, bytes_xbzrle;
1451     uint8_t *prev_cached_page;
1452
1453     if (!cache_is_cached(XBZRLE.cache, current_addr,
1454                          ram_counters.dirty_sync_count)) {
1455         xbzrle_counters.cache_miss++;
1456         if (!last_stage) {
1457             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1458                              ram_counters.dirty_sync_count) == -1) {
1459                 return -1;
1460             } else {
1461                 /* update *current_data when the page has been
1462                    inserted into cache */
1463                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1464             }
1465         }
1466         return -1;
1467     }
1468
1469     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1470
1471     /* save current buffer into memory */
1472     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1473
1474     /* XBZRLE encoding (if there is no overflow) */
1475     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1476                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1477                                        TARGET_PAGE_SIZE);
1478     if (encoded_len == 0) {
1479         trace_save_xbzrle_page_skipping();
1480         return 0;
1481     } else if (encoded_len == -1) {
1482         trace_save_xbzrle_page_overflow();
1483         xbzrle_counters.overflow++;
1484         /* update data in the cache */
1485         if (!last_stage) {
1486             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1487             *current_data = prev_cached_page;
1488         }
1489         return -1;
1490     }
1491
1492     /* we need to update the data in the cache, in order to get the same data */
1493     if (!last_stage) {
1494         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1495     }
1496
1497     /* Send XBZRLE based compressed page */
1498     bytes_xbzrle = save_page_header(rs, rs->f, block,
1499                                     offset | RAM_SAVE_FLAG_XBZRLE);
1500     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1501     qemu_put_be16(rs->f, encoded_len);
1502     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1503     bytes_xbzrle += encoded_len + 1 + 2;
1504     xbzrle_counters.pages++;
1505     xbzrle_counters.bytes += bytes_xbzrle;
1506     ram_counters.transferred += bytes_xbzrle;
1507
1508     return 1;
1509 }
1510
1511 /**
1512  * migration_bitmap_find_dirty: find the next dirty page from start
1513  *
1514  * Called with rcu_read_lock() to protect migration_bitmap
1515  *
1516  * Returns the byte offset within memory region of the start of a dirty page
1517  *
1518  * @rs: current RAM state
1519  * @rb: RAMBlock where to search for dirty pages
1520  * @start: page where we start the search
1521  */
1522 static inline
1523 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1524                                           unsigned long start)
1525 {
1526     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1527     unsigned long *bitmap = rb->bmap;
1528     unsigned long next;
1529
1530     if (!qemu_ram_is_migratable(rb)) {
1531         return size;
1532     }
1533
1534     if (rs->ram_bulk_stage && start > 0) {
1535         next = start + 1;
1536     } else {
1537         next = find_next_bit(bitmap, size, start);
1538     }
1539
1540     return next;
1541 }
1542
1543 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1544                                                 RAMBlock *rb,
1545                                                 unsigned long page)
1546 {
1547     bool ret;
1548
1549     ret = test_and_clear_bit(page, rb->bmap);
1550
1551     if (ret) {
1552         rs->migration_dirty_pages--;
1553     }
1554     return ret;
1555 }
1556
1557 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1558                                         ram_addr_t start, ram_addr_t length)
1559 {
1560     rs->migration_dirty_pages +=
1561         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1562                                               &rs->num_dirty_pages_period);
1563 }
1564
1565 /**
1566  * ram_pagesize_summary: calculate all the pagesizes of a VM
1567  *
1568  * Returns a summary bitmap of the page sizes of all RAMBlocks
1569  *
1570  * For VMs with just normal pages this is equivalent to the host page
1571  * size. If it's got some huge pages then it's the OR of all the
1572  * different page sizes.
1573  */
1574 uint64_t ram_pagesize_summary(void)
1575 {
1576     RAMBlock *block;
1577     uint64_t summary = 0;
1578
1579     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1580         summary |= block->page_size;
1581     }
1582
1583     return summary;
1584 }
1585
1586 static void migration_update_rates(RAMState *rs, int64_t end_time)
1587 {
1588     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1589
1590     /* calculate period counters */
1591     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1592                 / (end_time - rs->time_last_bitmap_sync);
1593
1594     if (!iter_count) {
1595         return;
1596     }
1597
1598     if (migrate_use_xbzrle()) {
1599         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1600             rs->xbzrle_cache_miss_prev) / iter_count;
1601         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1602     }
1603 }
1604
1605 static void migration_bitmap_sync(RAMState *rs)
1606 {
1607     RAMBlock *block;
1608     int64_t end_time;
1609     uint64_t bytes_xfer_now;
1610
1611     ram_counters.dirty_sync_count++;
1612
1613     if (!rs->time_last_bitmap_sync) {
1614         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1615     }
1616
1617     trace_migration_bitmap_sync_start();
1618     memory_global_dirty_log_sync();
1619
1620     qemu_mutex_lock(&rs->bitmap_mutex);
1621     rcu_read_lock();
1622     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1623         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1624     }
1625     ram_counters.remaining = ram_bytes_remaining();
1626     rcu_read_unlock();
1627     qemu_mutex_unlock(&rs->bitmap_mutex);
1628
1629     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1630
1631     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1632
1633     /* more than 1 second = 1000 millisecons */
1634     if (end_time > rs->time_last_bitmap_sync + 1000) {
1635         bytes_xfer_now = ram_counters.transferred;
1636
1637         /* During block migration the auto-converge logic incorrectly detects
1638          * that ram migration makes no progress. Avoid this by disabling the
1639          * throttling logic during the bulk phase of block migration. */
1640         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1641             /* The following detection logic can be refined later. For now:
1642                Check to see if the dirtied bytes is 50% more than the approx.
1643                amount of bytes that just got transferred since the last time we
1644                were in this routine. If that happens twice, start or increase
1645                throttling */
1646
1647             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1648                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1649                 (++rs->dirty_rate_high_cnt >= 2)) {
1650                     trace_migration_throttle();
1651                     rs->dirty_rate_high_cnt = 0;
1652                     mig_throttle_guest_down();
1653             }
1654         }
1655
1656         migration_update_rates(rs, end_time);
1657
1658         rs->iterations_prev = rs->iterations;
1659
1660         /* reset period counters */
1661         rs->time_last_bitmap_sync = end_time;
1662         rs->num_dirty_pages_period = 0;
1663         rs->bytes_xfer_prev = bytes_xfer_now;
1664     }
1665     if (migrate_use_events()) {
1666         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1667     }
1668 }
1669
1670 /**
1671  * save_zero_page: send the zero page to the stream
1672  *
1673  * Returns the number of pages written.
1674  *
1675  * @rs: current RAM state
1676  * @block: block that contains the page we want to send
1677  * @offset: offset inside the block for the page
1678  */
1679 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1680 {
1681     uint8_t *p = block->host + offset;
1682     int pages = -1;
1683
1684     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1685         ram_counters.duplicate++;
1686         ram_counters.transferred +=
1687             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1688         qemu_put_byte(rs->f, 0);
1689         ram_counters.transferred += 1;
1690         pages = 1;
1691     }
1692
1693     return pages;
1694 }
1695
1696 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1697 {
1698     if (!migrate_release_ram() || !migration_in_postcopy()) {
1699         return;
1700     }
1701
1702     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1703 }
1704
1705 /*
1706  * @pages: the number of pages written by the control path,
1707  *        < 0 - error
1708  *        > 0 - number of pages written
1709  *
1710  * Return true if the pages has been saved, otherwise false is returned.
1711  */
1712 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1713                               int *pages)
1714 {
1715     uint64_t bytes_xmit = 0;
1716     int ret;
1717
1718     *pages = -1;
1719     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1720                                 &bytes_xmit);
1721     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1722         return false;
1723     }
1724
1725     if (bytes_xmit) {
1726         ram_counters.transferred += bytes_xmit;
1727         *pages = 1;
1728     }
1729
1730     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1731         return true;
1732     }
1733
1734     if (bytes_xmit > 0) {
1735         ram_counters.normal++;
1736     } else if (bytes_xmit == 0) {
1737         ram_counters.duplicate++;
1738     }
1739
1740     return true;
1741 }
1742
1743 /*
1744  * directly send the page to the stream
1745  *
1746  * Returns the number of pages written.
1747  *
1748  * @rs: current RAM state
1749  * @block: block that contains the page we want to send
1750  * @offset: offset inside the block for the page
1751  * @buf: the page to be sent
1752  * @async: send to page asyncly
1753  */
1754 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1755                             uint8_t *buf, bool async)
1756 {
1757     ram_counters.transferred += save_page_header(rs, rs->f, block,
1758                                                  offset | RAM_SAVE_FLAG_PAGE);
1759     if (async) {
1760         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1761                               migrate_release_ram() &
1762                               migration_in_postcopy());
1763     } else {
1764         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1765     }
1766     ram_counters.transferred += TARGET_PAGE_SIZE;
1767     ram_counters.normal++;
1768     return 1;
1769 }
1770
1771 /**
1772  * ram_save_page: send the given page to the stream
1773  *
1774  * Returns the number of pages written.
1775  *          < 0 - error
1776  *          >=0 - Number of pages written - this might legally be 0
1777  *                if xbzrle noticed the page was the same.
1778  *
1779  * @rs: current RAM state
1780  * @block: block that contains the page we want to send
1781  * @offset: offset inside the block for the page
1782  * @last_stage: if we are at the completion stage
1783  */
1784 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1785 {
1786     int pages = -1;
1787     uint8_t *p;
1788     bool send_async = true;
1789     RAMBlock *block = pss->block;
1790     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1791     ram_addr_t current_addr = block->offset + offset;
1792
1793     p = block->host + offset;
1794     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1795
1796     XBZRLE_cache_lock();
1797     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1798         migrate_use_xbzrle()) {
1799         pages = save_xbzrle_page(rs, &p, current_addr, block,
1800                                  offset, last_stage);
1801         if (!last_stage) {
1802             /* Can't send this cached data async, since the cache page
1803              * might get updated before it gets to the wire
1804              */
1805             send_async = false;
1806         }
1807     }
1808
1809     /* XBZRLE overflow or normal page */
1810     if (pages == -1) {
1811         pages = save_normal_page(rs, block, offset, p, send_async);
1812     }
1813
1814     XBZRLE_cache_unlock();
1815
1816     return pages;
1817 }
1818
1819 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1820                                  ram_addr_t offset)
1821 {
1822     multifd_queue_page(block, offset);
1823     ram_counters.normal++;
1824
1825     return 1;
1826 }
1827
1828 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1829                                 ram_addr_t offset, uint8_t *source_buf)
1830 {
1831     RAMState *rs = ram_state;
1832     int bytes_sent, blen;
1833     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1834
1835     bytes_sent = save_page_header(rs, f, block, offset |
1836                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1837
1838     /*
1839      * copy it to a internal buffer to avoid it being modified by VM
1840      * so that we can catch up the error during compression and
1841      * decompression
1842      */
1843     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1844     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1845     if (blen < 0) {
1846         bytes_sent = 0;
1847         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1848         error_report("compressed data failed!");
1849     } else {
1850         bytes_sent += blen;
1851         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1852     }
1853
1854     return bytes_sent;
1855 }
1856
1857 static void flush_compressed_data(RAMState *rs)
1858 {
1859     int idx, len, thread_count;
1860
1861     if (!migrate_use_compression()) {
1862         return;
1863     }
1864     thread_count = migrate_compress_threads();
1865
1866     qemu_mutex_lock(&comp_done_lock);
1867     for (idx = 0; idx < thread_count; idx++) {
1868         while (!comp_param[idx].done) {
1869             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1870         }
1871     }
1872     qemu_mutex_unlock(&comp_done_lock);
1873
1874     for (idx = 0; idx < thread_count; idx++) {
1875         qemu_mutex_lock(&comp_param[idx].mutex);
1876         if (!comp_param[idx].quit) {
1877             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1878             ram_counters.transferred += len;
1879         }
1880         qemu_mutex_unlock(&comp_param[idx].mutex);
1881     }
1882 }
1883
1884 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1885                                        ram_addr_t offset)
1886 {
1887     param->block = block;
1888     param->offset = offset;
1889 }
1890
1891 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1892                                            ram_addr_t offset)
1893 {
1894     int idx, thread_count, bytes_xmit = -1, pages = -1;
1895
1896     thread_count = migrate_compress_threads();
1897     qemu_mutex_lock(&comp_done_lock);
1898     while (true) {
1899         for (idx = 0; idx < thread_count; idx++) {
1900             if (comp_param[idx].done) {
1901                 comp_param[idx].done = false;
1902                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1903                 qemu_mutex_lock(&comp_param[idx].mutex);
1904                 set_compress_params(&comp_param[idx], block, offset);
1905                 qemu_cond_signal(&comp_param[idx].cond);
1906                 qemu_mutex_unlock(&comp_param[idx].mutex);
1907                 pages = 1;
1908                 ram_counters.normal++;
1909                 ram_counters.transferred += bytes_xmit;
1910                 break;
1911             }
1912         }
1913         if (pages > 0) {
1914             break;
1915         } else {
1916             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1917         }
1918     }
1919     qemu_mutex_unlock(&comp_done_lock);
1920
1921     return pages;
1922 }
1923
1924 /**
1925  * find_dirty_block: find the next dirty page and update any state
1926  * associated with the search process.
1927  *
1928  * Returns if a page is found
1929  *
1930  * @rs: current RAM state
1931  * @pss: data about the state of the current dirty page scan
1932  * @again: set to false if the search has scanned the whole of RAM
1933  */
1934 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1935 {
1936     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1937     if (pss->complete_round && pss->block == rs->last_seen_block &&
1938         pss->page >= rs->last_page) {
1939         /*
1940          * We've been once around the RAM and haven't found anything.
1941          * Give up.
1942          */
1943         *again = false;
1944         return false;
1945     }
1946     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1947         /* Didn't find anything in this RAM Block */
1948         pss->page = 0;
1949         pss->block = QLIST_NEXT_RCU(pss->block, next);
1950         if (!pss->block) {
1951             /* Hit the end of the list */
1952             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1953             /* Flag that we've looped */
1954             pss->complete_round = true;
1955             rs->ram_bulk_stage = false;
1956             if (migrate_use_xbzrle()) {
1957                 /* If xbzrle is on, stop using the data compression at this
1958                  * point. In theory, xbzrle can do better than compression.
1959                  */
1960                 flush_compressed_data(rs);
1961             }
1962         }
1963         /* Didn't find anything this time, but try again on the new block */
1964         *again = true;
1965         return false;
1966     } else {
1967         /* Can go around again, but... */
1968         *again = true;
1969         /* We've found something so probably don't need to */
1970         return true;
1971     }
1972 }
1973
1974 /**
1975  * unqueue_page: gets a page of the queue
1976  *
1977  * Helper for 'get_queued_page' - gets a page off the queue
1978  *
1979  * Returns the block of the page (or NULL if none available)
1980  *
1981  * @rs: current RAM state
1982  * @offset: used to return the offset within the RAMBlock
1983  */
1984 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1985 {
1986     RAMBlock *block = NULL;
1987
1988     qemu_mutex_lock(&rs->src_page_req_mutex);
1989     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1990         struct RAMSrcPageRequest *entry =
1991                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1992         block = entry->rb;
1993         *offset = entry->offset;
1994
1995         if (entry->len > TARGET_PAGE_SIZE) {
1996             entry->len -= TARGET_PAGE_SIZE;
1997             entry->offset += TARGET_PAGE_SIZE;
1998         } else {
1999             memory_region_unref(block->mr);
2000             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2001             g_free(entry);
2002             migration_consume_urgent_request();
2003         }
2004     }
2005     qemu_mutex_unlock(&rs->src_page_req_mutex);
2006
2007     return block;
2008 }
2009
2010 /**
2011  * get_queued_page: unqueue a page from the postocpy requests
2012  *
2013  * Skips pages that are already sent (!dirty)
2014  *
2015  * Returns if a queued page is found
2016  *
2017  * @rs: current RAM state
2018  * @pss: data about the state of the current dirty page scan
2019  */
2020 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2021 {
2022     RAMBlock  *block;
2023     ram_addr_t offset;
2024     bool dirty;
2025
2026     do {
2027         block = unqueue_page(rs, &offset);
2028         /*
2029          * We're sending this page, and since it's postcopy nothing else
2030          * will dirty it, and we must make sure it doesn't get sent again
2031          * even if this queue request was received after the background
2032          * search already sent it.
2033          */
2034         if (block) {
2035             unsigned long page;
2036
2037             page = offset >> TARGET_PAGE_BITS;
2038             dirty = test_bit(page, block->bmap);
2039             if (!dirty) {
2040                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2041                        page, test_bit(page, block->unsentmap));
2042             } else {
2043                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2044             }
2045         }
2046
2047     } while (block && !dirty);
2048
2049     if (block) {
2050         /*
2051          * As soon as we start servicing pages out of order, then we have
2052          * to kill the bulk stage, since the bulk stage assumes
2053          * in (migration_bitmap_find_and_reset_dirty) that every page is
2054          * dirty, that's no longer true.
2055          */
2056         rs->ram_bulk_stage = false;
2057
2058         /*
2059          * We want the background search to continue from the queued page
2060          * since the guest is likely to want other pages near to the page
2061          * it just requested.
2062          */
2063         pss->block = block;
2064         pss->page = offset >> TARGET_PAGE_BITS;
2065     }
2066
2067     return !!block;
2068 }
2069
2070 /**
2071  * migration_page_queue_free: drop any remaining pages in the ram
2072  * request queue
2073  *
2074  * It should be empty at the end anyway, but in error cases there may
2075  * be some left.  in case that there is any page left, we drop it.
2076  *
2077  */
2078 static void migration_page_queue_free(RAMState *rs)
2079 {
2080     struct RAMSrcPageRequest *mspr, *next_mspr;
2081     /* This queue generally should be empty - but in the case of a failed
2082      * migration might have some droppings in.
2083      */
2084     rcu_read_lock();
2085     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2086         memory_region_unref(mspr->rb->mr);
2087         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2088         g_free(mspr);
2089     }
2090     rcu_read_unlock();
2091 }
2092
2093 /**
2094  * ram_save_queue_pages: queue the page for transmission
2095  *
2096  * A request from postcopy destination for example.
2097  *
2098  * Returns zero on success or negative on error
2099  *
2100  * @rbname: Name of the RAMBLock of the request. NULL means the
2101  *          same that last one.
2102  * @start: starting address from the start of the RAMBlock
2103  * @len: length (in bytes) to send
2104  */
2105 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2106 {
2107     RAMBlock *ramblock;
2108     RAMState *rs = ram_state;
2109
2110     ram_counters.postcopy_requests++;
2111     rcu_read_lock();
2112     if (!rbname) {
2113         /* Reuse last RAMBlock */
2114         ramblock = rs->last_req_rb;
2115
2116         if (!ramblock) {
2117             /*
2118              * Shouldn't happen, we can't reuse the last RAMBlock if
2119              * it's the 1st request.
2120              */
2121             error_report("ram_save_queue_pages no previous block");
2122             goto err;
2123         }
2124     } else {
2125         ramblock = qemu_ram_block_by_name(rbname);
2126
2127         if (!ramblock) {
2128             /* We shouldn't be asked for a non-existent RAMBlock */
2129             error_report("ram_save_queue_pages no block '%s'", rbname);
2130             goto err;
2131         }
2132         rs->last_req_rb = ramblock;
2133     }
2134     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2135     if (start+len > ramblock->used_length) {
2136         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2137                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2138                      __func__, start, len, ramblock->used_length);
2139         goto err;
2140     }
2141
2142     struct RAMSrcPageRequest *new_entry =
2143         g_malloc0(sizeof(struct RAMSrcPageRequest));
2144     new_entry->rb = ramblock;
2145     new_entry->offset = start;
2146     new_entry->len = len;
2147
2148     memory_region_ref(ramblock->mr);
2149     qemu_mutex_lock(&rs->src_page_req_mutex);
2150     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2151     migration_make_urgent_request();
2152     qemu_mutex_unlock(&rs->src_page_req_mutex);
2153     rcu_read_unlock();
2154
2155     return 0;
2156
2157 err:
2158     rcu_read_unlock();
2159     return -1;
2160 }
2161
2162 static bool save_page_use_compression(RAMState *rs)
2163 {
2164     if (!migrate_use_compression()) {
2165         return false;
2166     }
2167
2168     /*
2169      * If xbzrle is on, stop using the data compression after first
2170      * round of migration even if compression is enabled. In theory,
2171      * xbzrle can do better than compression.
2172      */
2173     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2174         return true;
2175     }
2176
2177     return false;
2178 }
2179
2180 /**
2181  * ram_save_target_page: save one target page
2182  *
2183  * Returns the number of pages written
2184  *
2185  * @rs: current RAM state
2186  * @pss: data about the page we want to send
2187  * @last_stage: if we are at the completion stage
2188  */
2189 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2190                                 bool last_stage)
2191 {
2192     RAMBlock *block = pss->block;
2193     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2194     int res;
2195
2196     if (control_save_page(rs, block, offset, &res)) {
2197         return res;
2198     }
2199
2200     /*
2201      * When starting the process of a new block, the first page of
2202      * the block should be sent out before other pages in the same
2203      * block, and all the pages in last block should have been sent
2204      * out, keeping this order is important, because the 'cont' flag
2205      * is used to avoid resending the block name.
2206      */
2207     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2208             flush_compressed_data(rs);
2209     }
2210
2211     res = save_zero_page(rs, block, offset);
2212     if (res > 0) {
2213         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2214          * page would be stale
2215          */
2216         if (!save_page_use_compression(rs)) {
2217             XBZRLE_cache_lock();
2218             xbzrle_cache_zero_page(rs, block->offset + offset);
2219             XBZRLE_cache_unlock();
2220         }
2221         ram_release_pages(block->idstr, offset, res);
2222         return res;
2223     }
2224
2225     /*
2226      * Make sure the first page is sent out before other pages.
2227      *
2228      * we post it as normal page as compression will take much
2229      * CPU resource.
2230      */
2231     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2232         return compress_page_with_multi_thread(rs, block, offset);
2233     } else if (migrate_use_multifd()) {
2234         return ram_save_multifd_page(rs, block, offset);
2235     }
2236
2237     return ram_save_page(rs, pss, last_stage);
2238 }
2239
2240 /**
2241  * ram_save_host_page: save a whole host page
2242  *
2243  * Starting at *offset send pages up to the end of the current host
2244  * page. It's valid for the initial offset to point into the middle of
2245  * a host page in which case the remainder of the hostpage is sent.
2246  * Only dirty target pages are sent. Note that the host page size may
2247  * be a huge page for this block.
2248  * The saving stops at the boundary of the used_length of the block
2249  * if the RAMBlock isn't a multiple of the host page size.
2250  *
2251  * Returns the number of pages written or negative on error
2252  *
2253  * @rs: current RAM state
2254  * @ms: current migration state
2255  * @pss: data about the page we want to send
2256  * @last_stage: if we are at the completion stage
2257  */
2258 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2259                               bool last_stage)
2260 {
2261     int tmppages, pages = 0;
2262     size_t pagesize_bits =
2263         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2264
2265     if (!qemu_ram_is_migratable(pss->block)) {
2266         error_report("block %s should not be migrated !", pss->block->idstr);
2267         return 0;
2268     }
2269
2270     do {
2271         /* Check the pages is dirty and if it is send it */
2272         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2273             pss->page++;
2274             continue;
2275         }
2276
2277         tmppages = ram_save_target_page(rs, pss, last_stage);
2278         if (tmppages < 0) {
2279             return tmppages;
2280         }
2281
2282         pages += tmppages;
2283         if (pss->block->unsentmap) {
2284             clear_bit(pss->page, pss->block->unsentmap);
2285         }
2286
2287         pss->page++;
2288     } while ((pss->page & (pagesize_bits - 1)) &&
2289              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2290
2291     /* The offset we leave with is the last one we looked at */
2292     pss->page--;
2293     return pages;
2294 }
2295
2296 /**
2297  * ram_find_and_save_block: finds a dirty page and sends it to f
2298  *
2299  * Called within an RCU critical section.
2300  *
2301  * Returns the number of pages written where zero means no dirty pages
2302  *
2303  * @rs: current RAM state
2304  * @last_stage: if we are at the completion stage
2305  *
2306  * On systems where host-page-size > target-page-size it will send all the
2307  * pages in a host page that are dirty.
2308  */
2309
2310 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2311 {
2312     PageSearchStatus pss;
2313     int pages = 0;
2314     bool again, found;
2315
2316     /* No dirty page as there is zero RAM */
2317     if (!ram_bytes_total()) {
2318         return pages;
2319     }
2320
2321     pss.block = rs->last_seen_block;
2322     pss.page = rs->last_page;
2323     pss.complete_round = false;
2324
2325     if (!pss.block) {
2326         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2327     }
2328
2329     do {
2330         again = true;
2331         found = get_queued_page(rs, &pss);
2332
2333         if (!found) {
2334             /* priority queue empty, so just search for something dirty */
2335             found = find_dirty_block(rs, &pss, &again);
2336         }
2337
2338         if (found) {
2339             pages = ram_save_host_page(rs, &pss, last_stage);
2340         }
2341     } while (!pages && again);
2342
2343     rs->last_seen_block = pss.block;
2344     rs->last_page = pss.page;
2345
2346     return pages;
2347 }
2348
2349 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2350 {
2351     uint64_t pages = size / TARGET_PAGE_SIZE;
2352
2353     if (zero) {
2354         ram_counters.duplicate += pages;
2355     } else {
2356         ram_counters.normal += pages;
2357         ram_counters.transferred += size;
2358         qemu_update_position(f, size);
2359     }
2360 }
2361
2362 uint64_t ram_bytes_total(void)
2363 {
2364     RAMBlock *block;
2365     uint64_t total = 0;
2366
2367     rcu_read_lock();
2368     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2369         total += block->used_length;
2370     }
2371     rcu_read_unlock();
2372     return total;
2373 }
2374
2375 static void xbzrle_load_setup(void)
2376 {
2377     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2378 }
2379
2380 static void xbzrle_load_cleanup(void)
2381 {
2382     g_free(XBZRLE.decoded_buf);
2383     XBZRLE.decoded_buf = NULL;
2384 }
2385
2386 static void ram_state_cleanup(RAMState **rsp)
2387 {
2388     if (*rsp) {
2389         migration_page_queue_free(*rsp);
2390         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2391         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2392         g_free(*rsp);
2393         *rsp = NULL;
2394     }
2395 }
2396
2397 static void xbzrle_cleanup(void)
2398 {
2399     XBZRLE_cache_lock();
2400     if (XBZRLE.cache) {
2401         cache_fini(XBZRLE.cache);
2402         g_free(XBZRLE.encoded_buf);
2403         g_free(XBZRLE.current_buf);
2404         g_free(XBZRLE.zero_target_page);
2405         XBZRLE.cache = NULL;
2406         XBZRLE.encoded_buf = NULL;
2407         XBZRLE.current_buf = NULL;
2408         XBZRLE.zero_target_page = NULL;
2409     }
2410     XBZRLE_cache_unlock();
2411 }
2412
2413 static void ram_save_cleanup(void *opaque)
2414 {
2415     RAMState **rsp = opaque;
2416     RAMBlock *block;
2417
2418     /* caller have hold iothread lock or is in a bh, so there is
2419      * no writing race against this migration_bitmap
2420      */
2421     memory_global_dirty_log_stop();
2422
2423     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2424         g_free(block->bmap);
2425         block->bmap = NULL;
2426         g_free(block->unsentmap);
2427         block->unsentmap = NULL;
2428     }
2429
2430     xbzrle_cleanup();
2431     compress_threads_save_cleanup();
2432     ram_state_cleanup(rsp);
2433 }
2434
2435 static void ram_state_reset(RAMState *rs)
2436 {
2437     rs->last_seen_block = NULL;
2438     rs->last_sent_block = NULL;
2439     rs->last_page = 0;
2440     rs->last_version = ram_list.version;
2441     rs->ram_bulk_stage = true;
2442 }
2443
2444 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2445
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr (debugging aid)
 *
 * The bitmap is printed in lines of up to 128 bits, '1' for a set bit
 * and '.' for a clear one, each line prefixed by the index of its first
 * bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' is the bitmap to print and 'pages' the number of bits in it.
 * The dirty bitmaps used in this file live in each RAMBlock
 * (block->bmap), so there is no single default bitmap to fall back to:
 * if 'todump' is NULL nothing is printed.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        /* Nothing to dump; don't hand a NULL bitmap to test_bit() */
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen <= 128 here, so this stays inside linebuf */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2479
2480 /* **** functions for postcopy ***** */
2481
2482 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2483 {
2484     struct RAMBlock *block;
2485
2486     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2487         unsigned long *bitmap = block->bmap;
2488         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2489         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2490
2491         while (run_start < range) {
2492             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2493             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2494                               (run_end - run_start) << TARGET_PAGE_BITS);
2495             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2496         }
2497     }
2498 }
2499
2500 /**
2501  * postcopy_send_discard_bm_ram: discard a RAMBlock
2502  *
2503  * Returns zero on success
2504  *
2505  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2506  * Note: At this point the 'unsentmap' is the processed bitmap combined
2507  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2508  *
2509  * @ms: current migration state
2510  * @pds: state for postcopy
2511  * @start: RAMBlock starting page
2512  * @length: RAMBlock size
2513  */
2514 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2515                                         PostcopyDiscardState *pds,
2516                                         RAMBlock *block)
2517 {
2518     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2519     unsigned long current;
2520     unsigned long *unsentmap = block->unsentmap;
2521
2522     for (current = 0; current < end; ) {
2523         unsigned long one = find_next_bit(unsentmap, end, current);
2524
2525         if (one <= end) {
2526             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2527             unsigned long discard_length;
2528
2529             if (zero >= end) {
2530                 discard_length = end - one;
2531             } else {
2532                 discard_length = zero - one;
2533             }
2534             if (discard_length) {
2535                 postcopy_discard_send_range(ms, pds, one, discard_length);
2536             }
2537             current = one + discard_length;
2538         } else {
2539             current = one;
2540         }
2541     }
2542
2543     return 0;
2544 }
2545
2546 /**
2547  * postcopy_each_ram_send_discard: discard all RAMBlocks
2548  *
2549  * Returns 0 for success or negative for error
2550  *
2551  * Utility for the outgoing postcopy code.
2552  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2553  *   passing it bitmap indexes and name.
2554  * (qemu_ram_foreach_block ends up passing unscaled lengths
2555  *  which would mean postcopy code would have to deal with target page)
2556  *
2557  * @ms: current migration state
2558  */
2559 static int postcopy_each_ram_send_discard(MigrationState *ms)
2560 {
2561     struct RAMBlock *block;
2562     int ret;
2563
2564     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2565         PostcopyDiscardState *pds =
2566             postcopy_discard_send_init(ms, block->idstr);
2567
2568         /*
2569          * Postcopy sends chunks of bitmap over the wire, but it
2570          * just needs indexes at this point, avoids it having
2571          * target page specific code.
2572          */
2573         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2574         postcopy_discard_send_finish(ms, pds);
2575         if (ret) {
2576             return ret;
2577         }
2578     }
2579
2580     return 0;
2581 }
2582
2583 /**
2584  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2585  *
2586  * Helper for postcopy_chunk_hostpages; it's called twice to
2587  * canonicalize the two bitmaps, that are similar, but one is
2588  * inverted.
2589  *
2590  * Postcopy requires that all target pages in a hostpage are dirty or
2591  * clean, not a mix.  This function canonicalizes the bitmaps.
2592  *
2593  * @ms: current migration state
2594  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2595  *               otherwise we need to canonicalize partially dirty host pages
2596  * @block: block that contains the page we want to canonicalize
2597  * @pds: state for postcopy
2598  */
2599 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2600                                           RAMBlock *block,
2601                                           PostcopyDiscardState *pds)
2602 {
2603     RAMState *rs = ram_state;
2604     unsigned long *bitmap = block->bmap;
2605     unsigned long *unsentmap = block->unsentmap;
2606     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2607     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2608     unsigned long run_start;
2609
2610     if (block->page_size == TARGET_PAGE_SIZE) {
2611         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2612         return;
2613     }
2614
2615     if (unsent_pass) {
2616         /* Find a sent page */
2617         run_start = find_next_zero_bit(unsentmap, pages, 0);
2618     } else {
2619         /* Find a dirty page */
2620         run_start = find_next_bit(bitmap, pages, 0);
2621     }
2622
2623     while (run_start < pages) {
2624         bool do_fixup = false;
2625         unsigned long fixup_start_addr;
2626         unsigned long host_offset;
2627
2628         /*
2629          * If the start of this run of pages is in the middle of a host
2630          * page, then we need to fixup this host page.
2631          */
2632         host_offset = run_start % host_ratio;
2633         if (host_offset) {
2634             do_fixup = true;
2635             run_start -= host_offset;
2636             fixup_start_addr = run_start;
2637             /* For the next pass */
2638             run_start = run_start + host_ratio;
2639         } else {
2640             /* Find the end of this run */
2641             unsigned long run_end;
2642             if (unsent_pass) {
2643                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2644             } else {
2645                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2646             }
2647             /*
2648              * If the end isn't at the start of a host page, then the
2649              * run doesn't finish at the end of a host page
2650              * and we need to discard.
2651              */
2652             host_offset = run_end % host_ratio;
2653             if (host_offset) {
2654                 do_fixup = true;
2655                 fixup_start_addr = run_end - host_offset;
2656                 /*
2657                  * This host page has gone, the next loop iteration starts
2658                  * from after the fixup
2659                  */
2660                 run_start = fixup_start_addr + host_ratio;
2661             } else {
2662                 /*
2663                  * No discards on this iteration, next loop starts from
2664                  * next sent/dirty page
2665                  */
2666                 run_start = run_end + 1;
2667             }
2668         }
2669
2670         if (do_fixup) {
2671             unsigned long page;
2672
2673             /* Tell the destination to discard this page */
2674             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2675                 /* For the unsent_pass we:
2676                  *     discard partially sent pages
2677                  * For the !unsent_pass (dirty) we:
2678                  *     discard partially dirty pages that were sent
2679                  *     (any partially sent pages were already discarded
2680                  *     by the previous unsent_pass)
2681                  */
2682                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2683                                             host_ratio);
2684             }
2685
2686             /* Clean up the bitmap */
2687             for (page = fixup_start_addr;
2688                  page < fixup_start_addr + host_ratio; page++) {
2689                 /* All pages in this host page are now not sent */
2690                 set_bit(page, unsentmap);
2691
2692                 /*
2693                  * Remark them as dirty, updating the count for any pages
2694                  * that weren't previously dirty.
2695                  */
2696                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2697             }
2698         }
2699
2700         if (unsent_pass) {
2701             /* Find the next sent page for the next iteration */
2702             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2703         } else {
2704             /* Find the next dirty page for the next iteration */
2705             run_start = find_next_bit(bitmap, pages, run_start);
2706         }
2707     }
2708 }
2709
2710 /**
2711  * postcopy_chuck_hostpages: discrad any partially sent host page
2712  *
2713  * Utility for the outgoing postcopy code.
2714  *
2715  * Discard any partially sent host-page size chunks, mark any partially
2716  * dirty host-page size chunks as all dirty.  In this case the host-page
2717  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2718  *
2719  * Returns zero on success
2720  *
2721  * @ms: current migration state
2722  * @block: block we want to work with
2723  */
2724 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2725 {
2726     PostcopyDiscardState *pds =
2727         postcopy_discard_send_init(ms, block->idstr);
2728
2729     /* First pass: Discard all partially sent host pages */
2730     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2731     /*
2732      * Second pass: Ensure that all partially dirty host pages are made
2733      * fully dirty.
2734      */
2735     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2736
2737     postcopy_discard_send_finish(ms, pds);
2738     return 0;
2739 }
2740
2741 /**
2742  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2743  *
2744  * Returns zero on success
2745  *
2746  * Transmit the set of pages to be discarded after precopy to the target
2747  * these are pages that:
2748  *     a) Have been previously transmitted but are now dirty again
2749  *     b) Pages that have never been transmitted, this ensures that
2750  *        any pages on the destination that have been mapped by background
2751  *        tasks get discarded (transparent huge pages is the specific concern)
2752  * Hopefully this is pretty sparse
2753  *
2754  * @ms: current migration state
2755  */
2756 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2757 {
2758     RAMState *rs = ram_state;
2759     RAMBlock *block;
2760     int ret;
2761
2762     rcu_read_lock();
2763
2764     /* This should be our last sync, the src is now paused */
2765     migration_bitmap_sync(rs);
2766
2767     /* Easiest way to make sure we don't resume in the middle of a host-page */
2768     rs->last_seen_block = NULL;
2769     rs->last_sent_block = NULL;
2770     rs->last_page = 0;
2771
2772     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2773         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2774         unsigned long *bitmap = block->bmap;
2775         unsigned long *unsentmap = block->unsentmap;
2776
2777         if (!unsentmap) {
2778             /* We don't have a safe way to resize the sentmap, so
2779              * if the bitmap was resized it will be NULL at this
2780              * point.
2781              */
2782             error_report("migration ram resized during precopy phase");
2783             rcu_read_unlock();
2784             return -EINVAL;
2785         }
2786         /* Deal with TPS != HPS and huge pages */
2787         ret = postcopy_chunk_hostpages(ms, block);
2788         if (ret) {
2789             rcu_read_unlock();
2790             return ret;
2791         }
2792
2793         /*
2794          * Update the unsentmap to be unsentmap = unsentmap | dirty
2795          */
2796         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2797 #ifdef DEBUG_POSTCOPY
2798         ram_debug_dump_bitmap(unsentmap, true, pages);
2799 #endif
2800     }
2801     trace_ram_postcopy_send_discard_bitmap();
2802
2803     ret = postcopy_each_ram_send_discard(ms);
2804     rcu_read_unlock();
2805
2806     return ret;
2807 }
2808
2809 /**
2810  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2811  *
2812  * Returns zero on success
2813  *
2814  * @rbname: name of the RAMBlock of the request. NULL means the
2815  *          same that last one.
2816  * @start: RAMBlock starting page
2817  * @length: RAMBlock size
2818  */
2819 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2820 {
2821     int ret = -1;
2822
2823     trace_ram_discard_range(rbname, start, length);
2824
2825     rcu_read_lock();
2826     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2827
2828     if (!rb) {
2829         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2830         goto err;
2831     }
2832
2833     /*
2834      * On source VM, we don't need to update the received bitmap since
2835      * we don't even have one.
2836      */
2837     if (rb->receivedmap) {
2838         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2839                      length >> qemu_target_page_bits());
2840     }
2841
2842     ret = ram_block_discard_range(rb, start, length);
2843
2844 err:
2845     rcu_read_unlock();
2846
2847     return ret;
2848 }
2849
2850 /*
2851  * For every allocation, we will try not to crash the VM if the
2852  * allocation failed.
2853  */
2854 static int xbzrle_init(void)
2855 {
2856     Error *local_err = NULL;
2857
2858     if (!migrate_use_xbzrle()) {
2859         return 0;
2860     }
2861
2862     XBZRLE_cache_lock();
2863
2864     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2865     if (!XBZRLE.zero_target_page) {
2866         error_report("%s: Error allocating zero page", __func__);
2867         goto err_out;
2868     }
2869
2870     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2871                               TARGET_PAGE_SIZE, &local_err);
2872     if (!XBZRLE.cache) {
2873         error_report_err(local_err);
2874         goto free_zero_page;
2875     }
2876
2877     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2878     if (!XBZRLE.encoded_buf) {
2879         error_report("%s: Error allocating encoded_buf", __func__);
2880         goto free_cache;
2881     }
2882
2883     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2884     if (!XBZRLE.current_buf) {
2885         error_report("%s: Error allocating current_buf", __func__);
2886         goto free_encoded_buf;
2887     }
2888
2889     /* We are all good */
2890     XBZRLE_cache_unlock();
2891     return 0;
2892
2893 free_encoded_buf:
2894     g_free(XBZRLE.encoded_buf);
2895     XBZRLE.encoded_buf = NULL;
2896 free_cache:
2897     cache_fini(XBZRLE.cache);
2898     XBZRLE.cache = NULL;
2899 free_zero_page:
2900     g_free(XBZRLE.zero_target_page);
2901     XBZRLE.zero_target_page = NULL;
2902 err_out:
2903     XBZRLE_cache_unlock();
2904     return -ENOMEM;
2905 }
2906
2907 static int ram_state_init(RAMState **rsp)
2908 {
2909     *rsp = g_try_new0(RAMState, 1);
2910
2911     if (!*rsp) {
2912         error_report("%s: Init ramstate fail", __func__);
2913         return -1;
2914     }
2915
2916     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2917     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2918     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2919
2920     /*
2921      * Count the total number of pages used by ram blocks not including any
2922      * gaps due to alignment or unplugs.
2923      */
2924     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2925
2926     ram_state_reset(*rsp);
2927
2928     return 0;
2929 }
2930
2931 static void ram_list_init_bitmaps(void)
2932 {
2933     RAMBlock *block;
2934     unsigned long pages;
2935
2936     /* Skip setting bitmap if there is no RAM */
2937     if (ram_bytes_total()) {
2938         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2939             pages = block->max_length >> TARGET_PAGE_BITS;
2940             block->bmap = bitmap_new(pages);
2941             bitmap_set(block->bmap, 0, pages);
2942             if (migrate_postcopy_ram()) {
2943                 block->unsentmap = bitmap_new(pages);
2944                 bitmap_set(block->unsentmap, 0, pages);
2945             }
2946         }
2947     }
2948 }
2949
2950 static void ram_init_bitmaps(RAMState *rs)
2951 {
2952     /* For memory_global_dirty_log_start below.  */
2953     qemu_mutex_lock_iothread();
2954     qemu_mutex_lock_ramlist();
2955     rcu_read_lock();
2956
2957     ram_list_init_bitmaps();
2958     memory_global_dirty_log_start();
2959     migration_bitmap_sync(rs);
2960
2961     rcu_read_unlock();
2962     qemu_mutex_unlock_ramlist();
2963     qemu_mutex_unlock_iothread();
2964 }
2965
2966 static int ram_init_all(RAMState **rsp)
2967 {
2968     if (ram_state_init(rsp)) {
2969         return -1;
2970     }
2971
2972     if (xbzrle_init()) {
2973         ram_state_cleanup(rsp);
2974         return -1;
2975     }
2976
2977     ram_init_bitmaps(*rsp);
2978
2979     return 0;
2980 }
2981
2982 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2983 {
2984     RAMBlock *block;
2985     uint64_t pages = 0;
2986
2987     /*
2988      * Postcopy is not using xbzrle/compression, so no need for that.
2989      * Also, since source are already halted, we don't need to care
2990      * about dirty page logging as well.
2991      */
2992
2993     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2994         pages += bitmap_count_one(block->bmap,
2995                                   block->used_length >> TARGET_PAGE_BITS);
2996     }
2997
2998     /* This may not be aligned with current bitmaps. Recalculate. */
2999     rs->migration_dirty_pages = pages;
3000
3001     rs->last_seen_block = NULL;
3002     rs->last_sent_block = NULL;
3003     rs->last_page = 0;
3004     rs->last_version = ram_list.version;
3005     /*
3006      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3007      * matter what we have sent.
3008      */
3009     rs->ram_bulk_stage = false;
3010
3011     /* Update RAMState cache of output QEMUFile */
3012     rs->f = out;
3013
3014     trace_ram_state_resume_prepare(pages);
3015 }
3016
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3023
3024 /**
3025  * ram_save_setup: Setup RAM for migration
3026  *
3027  * Returns zero to indicate success and negative for error
3028  *
3029  * @f: QEMUFile where to send the data
3030  * @opaque: RAMState pointer
3031  */
3032 static int ram_save_setup(QEMUFile *f, void *opaque)
3033 {
3034     RAMState **rsp = opaque;
3035     RAMBlock *block;
3036
3037     if (compress_threads_save_setup()) {
3038         return -1;
3039     }
3040
3041     /* migration has already setup the bitmap, reuse it. */
3042     if (!migration_in_colo_state()) {
3043         if (ram_init_all(rsp) != 0) {
3044             compress_threads_save_cleanup();
3045             return -1;
3046         }
3047     }
3048     (*rsp)->f = f;
3049
3050     rcu_read_lock();
3051
3052     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3053
3054     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3055         qemu_put_byte(f, strlen(block->idstr));
3056         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3057         qemu_put_be64(f, block->used_length);
3058         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3059             qemu_put_be64(f, block->page_size);
3060         }
3061     }
3062
3063     rcu_read_unlock();
3064
3065     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3066     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3067
3068     multifd_send_sync_main();
3069     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3070     qemu_fflush(f);
3071
3072     return 0;
3073 }
3074
3075 /**
3076  * ram_save_iterate: iterative stage for migration
3077  *
3078  * Returns zero to indicate success and negative for error
3079  *
3080  * @f: QEMUFile where to send the data
3081  * @opaque: RAMState pointer
3082  */
3083 static int ram_save_iterate(QEMUFile *f, void *opaque)
3084 {
3085     RAMState **temp = opaque;
3086     RAMState *rs = *temp;
3087     int ret;
3088     int i;
3089     int64_t t0;
3090     int done = 0;
3091
3092     if (blk_mig_bulk_active()) {
3093         /* Avoid transferring ram during bulk phase of block migration as
3094          * the bulk phase will usually take a long time and transferring
3095          * ram updates during that time is pointless. */
3096         goto out;
3097     }
3098
3099     rcu_read_lock();
3100     if (ram_list.version != rs->last_version) {
3101         ram_state_reset(rs);
3102     }
3103
3104     /* Read version before ram_list.blocks */
3105     smp_rmb();
3106
3107     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3108
3109     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3110     i = 0;
3111     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3112             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3113         int pages;
3114
3115         if (qemu_file_get_error(f)) {
3116             break;
3117         }
3118
3119         pages = ram_find_and_save_block(rs, false);
3120         /* no more pages to sent */
3121         if (pages == 0) {
3122             done = 1;
3123             break;
3124         }
3125         rs->iterations++;
3126
3127         /* we want to check in the 1st loop, just in case it was the 1st time
3128            and we had to sync the dirty bitmap.
3129            qemu_get_clock_ns() is a bit expensive, so we only check each some
3130            iterations
3131         */
3132         if ((i & 63) == 0) {
3133             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3134             if (t1 > MAX_WAIT) {
3135                 trace_ram_save_iterate_big_wait(t1, i);
3136                 break;
3137             }
3138         }
3139         i++;
3140     }
3141     flush_compressed_data(rs);
3142     rcu_read_unlock();
3143
3144     /*
3145      * Must occur before EOS (or any QEMUFile operation)
3146      * because of RDMA protocol.
3147      */
3148     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3149
3150     multifd_send_sync_main();
3151 out:
3152     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3153     qemu_fflush(f);
3154     ram_counters.transferred += 8;
3155
3156     ret = qemu_file_get_error(f);
3157     if (ret < 0) {
3158         return ret;
3159     }
3160
3161     return done;
3162 }
3163
3164 /**
3165  * ram_save_complete: function called to send the remaining amount of ram
3166  *
3167  * Returns zero to indicate success
3168  *
3169  * Called with iothread lock
3170  *
3171  * @f: QEMUFile where to send the data
3172  * @opaque: RAMState pointer
3173  */
3174 static int ram_save_complete(QEMUFile *f, void *opaque)
3175 {
3176     RAMState **temp = opaque;
3177     RAMState *rs = *temp;
3178
3179     rcu_read_lock();
3180
3181     if (!migration_in_postcopy()) {
3182         migration_bitmap_sync(rs);
3183     }
3184
3185     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3186
3187     /* try transferring iterative blocks of memory */
3188
3189     /* flush all remaining blocks regardless of rate limiting */
3190     while (true) {
3191         int pages;
3192
3193         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3194         /* no more blocks to sent */
3195         if (pages == 0) {
3196             break;
3197         }
3198     }
3199
3200     flush_compressed_data(rs);
3201     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3202
3203     rcu_read_unlock();
3204
3205     multifd_send_sync_main();
3206     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3207     qemu_fflush(f);
3208
3209     return 0;
3210 }
3211
3212 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3213                              uint64_t *res_precopy_only,
3214                              uint64_t *res_compatible,
3215                              uint64_t *res_postcopy_only)
3216 {
3217     RAMState **temp = opaque;
3218     RAMState *rs = *temp;
3219     uint64_t remaining_size;
3220
3221     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3222
3223     if (!migration_in_postcopy() &&
3224         remaining_size < max_size) {
3225         qemu_mutex_lock_iothread();
3226         rcu_read_lock();
3227         migration_bitmap_sync(rs);
3228         rcu_read_unlock();
3229         qemu_mutex_unlock_iothread();
3230         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3231     }
3232
3233     if (migrate_postcopy_ram()) {
3234         /* We can do postcopy, and all the data is postcopiable */
3235         *res_compatible += remaining_size;
3236     } else {
3237         *res_precopy_only += remaining_size;
3238     }
3239 }
3240
3241 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3242 {
3243     unsigned int xh_len;
3244     int xh_flags;
3245     uint8_t *loaded_data;
3246
3247     /* extract RLE header */
3248     xh_flags = qemu_get_byte(f);
3249     xh_len = qemu_get_be16(f);
3250
3251     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3252         error_report("Failed to load XBZRLE page - wrong compression!");
3253         return -1;
3254     }
3255
3256     if (xh_len > TARGET_PAGE_SIZE) {
3257         error_report("Failed to load XBZRLE page - len overflow!");
3258         return -1;
3259     }
3260     loaded_data = XBZRLE.decoded_buf;
3261     /* load data and decode */
3262     /* it can change loaded_data to point to an internal buffer */
3263     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3264
3265     /* decode RLE */
3266     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3267                              TARGET_PAGE_SIZE) == -1) {
3268         error_report("Failed to load XBZRLE page - decode error!");
3269         return -1;
3270     }
3271
3272     return 0;
3273 }
3274
3275 /**
3276  * ram_block_from_stream: read a RAMBlock id from the migration stream
3277  *
3278  * Must be called from within a rcu critical section.
3279  *
3280  * Returns a pointer from within the RCU-protected ram_list.
3281  *
3282  * @f: QEMUFile where to read the data from
3283  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3284  */
3285 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3286 {
3287     static RAMBlock *block = NULL;
3288     char id[256];
3289     uint8_t len;
3290
3291     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3292         if (!block) {
3293             error_report("Ack, bad migration stream!");
3294             return NULL;
3295         }
3296         return block;
3297     }
3298
3299     len = qemu_get_byte(f);
3300     qemu_get_buffer(f, (uint8_t *)id, len);
3301     id[len] = 0;
3302
3303     block = qemu_ram_block_by_name(id);
3304     if (!block) {
3305         error_report("Can't find block %s", id);
3306         return NULL;
3307     }
3308
3309     if (!qemu_ram_is_migratable(block)) {
3310         error_report("block %s should not be migrated !", id);
3311         return NULL;
3312     }
3313
3314     return block;
3315 }
3316
3317 static inline void *host_from_ram_block_offset(RAMBlock *block,
3318                                                ram_addr_t offset)
3319 {
3320     if (!offset_in_ramblock(block, offset)) {
3321         return NULL;
3322     }
3323
3324     return block->host + offset;
3325 }
3326
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The destination is left untouched when @ch is zero and the range
 * already reads as all zeroes; in every other case it is filled with @ch.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if we would only store zeroes over zeroes */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3343
3344 /* return the size after decompression, or negative value on error */
3345 static int
3346 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3347                      const uint8_t *source, size_t source_len)
3348 {
3349     int err;
3350
3351     err = inflateReset(stream);
3352     if (err != Z_OK) {
3353         return -1;
3354     }
3355
3356     stream->avail_in = source_len;
3357     stream->next_in = (uint8_t *)source;
3358     stream->avail_out = dest_len;
3359     stream->next_out = dest;
3360
3361     err = inflate(stream, Z_NO_FLUSH);
3362     if (err != Z_STREAM_END) {
3363         return -1;
3364     }
3365
3366     return stream->total_out;
3367 }
3368
3369 static void *do_data_decompress(void *opaque)
3370 {
3371     DecompressParam *param = opaque;
3372     unsigned long pagesize;
3373     uint8_t *des;
3374     int len, ret;
3375
3376     qemu_mutex_lock(&param->mutex);
3377     while (!param->quit) {
3378         if (param->des) {
3379             des = param->des;
3380             len = param->len;
3381             param->des = 0;
3382             qemu_mutex_unlock(&param->mutex);
3383
3384             pagesize = TARGET_PAGE_SIZE;
3385
3386             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3387                                        param->compbuf, len);
3388             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3389                 error_report("decompress data failed");
3390                 qemu_file_set_error(decomp_file, ret);
3391             }
3392
3393             qemu_mutex_lock(&decomp_done_lock);
3394             param->done = true;
3395             qemu_cond_signal(&decomp_done_cond);
3396             qemu_mutex_unlock(&decomp_done_lock);
3397
3398             qemu_mutex_lock(&param->mutex);
3399         } else {
3400             qemu_cond_wait(&param->cond, &param->mutex);
3401         }
3402     }
3403     qemu_mutex_unlock(&param->mutex);
3404
3405     return NULL;
3406 }
3407
3408 static int wait_for_decompress_done(void)
3409 {
3410     int idx, thread_count;
3411
3412     if (!migrate_use_compression()) {
3413         return 0;
3414     }
3415
3416     thread_count = migrate_decompress_threads();
3417     qemu_mutex_lock(&decomp_done_lock);
3418     for (idx = 0; idx < thread_count; idx++) {
3419         while (!decomp_param[idx].done) {
3420             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3421         }
3422     }
3423     qemu_mutex_unlock(&decomp_done_lock);
3424     return qemu_file_get_error(decomp_file);
3425 }
3426
3427 static void compress_threads_load_cleanup(void)
3428 {
3429     int i, thread_count;
3430
3431     if (!migrate_use_compression()) {
3432         return;
3433     }
3434     thread_count = migrate_decompress_threads();
3435     for (i = 0; i < thread_count; i++) {
3436         /*
3437          * we use it as a indicator which shows if the thread is
3438          * properly init'd or not
3439          */
3440         if (!decomp_param[i].compbuf) {
3441             break;
3442         }
3443
3444         qemu_mutex_lock(&decomp_param[i].mutex);
3445         decomp_param[i].quit = true;
3446         qemu_cond_signal(&decomp_param[i].cond);
3447         qemu_mutex_unlock(&decomp_param[i].mutex);
3448     }
3449     for (i = 0; i < thread_count; i++) {
3450         if (!decomp_param[i].compbuf) {
3451             break;
3452         }
3453
3454         qemu_thread_join(decompress_threads + i);
3455         qemu_mutex_destroy(&decomp_param[i].mutex);
3456         qemu_cond_destroy(&decomp_param[i].cond);
3457         inflateEnd(&decomp_param[i].stream);
3458         g_free(decomp_param[i].compbuf);
3459         decomp_param[i].compbuf = NULL;
3460     }
3461     g_free(decompress_threads);
3462     g_free(decomp_param);
3463     decompress_threads = NULL;
3464     decomp_param = NULL;
3465     decomp_file = NULL;
3466 }
3467
3468 static int compress_threads_load_setup(QEMUFile *f)
3469 {
3470     int i, thread_count;
3471
3472     if (!migrate_use_compression()) {
3473         return 0;
3474     }
3475
3476     thread_count = migrate_decompress_threads();
3477     decompress_threads = g_new0(QemuThread, thread_count);
3478     decomp_param = g_new0(DecompressParam, thread_count);
3479     qemu_mutex_init(&decomp_done_lock);
3480     qemu_cond_init(&decomp_done_cond);
3481     decomp_file = f;
3482     for (i = 0; i < thread_count; i++) {
3483         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3484             goto exit;
3485         }
3486
3487         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3488         qemu_mutex_init(&decomp_param[i].mutex);
3489         qemu_cond_init(&decomp_param[i].cond);
3490         decomp_param[i].done = true;
3491         decomp_param[i].quit = false;
3492         qemu_thread_create(decompress_threads + i, "decompress",
3493                            do_data_decompress, decomp_param + i,
3494                            QEMU_THREAD_JOINABLE);
3495     }
3496     return 0;
3497 exit:
3498     compress_threads_load_cleanup();
3499     return -1;
3500 }
3501
3502 static void decompress_data_with_multi_threads(QEMUFile *f,
3503                                                void *host, int len)
3504 {
3505     int idx, thread_count;
3506
3507     thread_count = migrate_decompress_threads();
3508     qemu_mutex_lock(&decomp_done_lock);
3509     while (true) {
3510         for (idx = 0; idx < thread_count; idx++) {
3511             if (decomp_param[idx].done) {
3512                 decomp_param[idx].done = false;
3513                 qemu_mutex_lock(&decomp_param[idx].mutex);
3514                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3515                 decomp_param[idx].des = host;
3516                 decomp_param[idx].len = len;
3517                 qemu_cond_signal(&decomp_param[idx].cond);
3518                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3519                 break;
3520             }
3521         }
3522         if (idx < thread_count) {
3523             break;
3524         } else {
3525             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3526         }
3527     }
3528     qemu_mutex_unlock(&decomp_done_lock);
3529 }
3530
3531 /**
3532  * ram_load_setup: Setup RAM for migration incoming side
3533  *
3534  * Returns zero to indicate success and negative for error
3535  *
3536  * @f: QEMUFile where to receive the data
3537  * @opaque: RAMState pointer
3538  */
3539 static int ram_load_setup(QEMUFile *f, void *opaque)
3540 {
3541     if (compress_threads_load_setup(f)) {
3542         return -1;
3543     }
3544
3545     xbzrle_load_setup();
3546     ramblock_recv_map_init();
3547     return 0;
3548 }
3549
3550 static int ram_load_cleanup(void *opaque)
3551 {
3552     RAMBlock *rb;
3553
3554     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3555         if (ramblock_is_pmem(rb)) {
3556             pmem_persist(rb->host, rb->used_length);
3557         }
3558     }
3559
3560     xbzrle_load_cleanup();
3561     compress_threads_load_cleanup();
3562
3563     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3564         g_free(rb->receivedmap);
3565         rb->receivedmap = NULL;
3566     }
3567     return 0;
3568 }
3569
3570 /**
3571  * ram_postcopy_incoming_init: allocate postcopy data structures
3572  *
3573  * Returns 0 for success and negative if there was one error
3574  *
3575  * @mis: current migration incoming state
3576  *
3577  * Allocate data structures etc needed by incoming migration with
3578  * postcopy-ram. postcopy-ram's similarly names
3579  * postcopy_ram_incoming_init does the work.
3580  */
3581 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3582 {
3583     return postcopy_ram_incoming_init(mis);
3584 }
3585
3586 /**
3587  * ram_load_postcopy: load a page in postcopy case
3588  *
3589  * Returns 0 for success or -errno in case of error
3590  *
3591  * Called in postcopy mode by ram_load().
3592  * rcu_read_lock is taken prior to this being called.
3593  *
3594  * @f: QEMUFile where to send the data
3595  */
3596 static int ram_load_postcopy(QEMUFile *f)
3597 {
3598     int flags = 0, ret = 0;
3599     bool place_needed = false;
3600     bool matches_target_page_size = false;
3601     MigrationIncomingState *mis = migration_incoming_get_current();
3602     /* Temporary page that is later 'placed' */
3603     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3604     void *last_host = NULL;
3605     bool all_zero = false;
3606
3607     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3608         ram_addr_t addr;
3609         void *host = NULL;
3610         void *page_buffer = NULL;
3611         void *place_source = NULL;
3612         RAMBlock *block = NULL;
3613         uint8_t ch;
3614
3615         addr = qemu_get_be64(f);
3616
3617         /*
3618          * If qemu file error, we should stop here, and then "addr"
3619          * may be invalid
3620          */
3621         ret = qemu_file_get_error(f);
3622         if (ret) {
3623             break;
3624         }
3625
3626         flags = addr & ~TARGET_PAGE_MASK;
3627         addr &= TARGET_PAGE_MASK;
3628
3629         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3630         place_needed = false;
3631         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3632             block = ram_block_from_stream(f, flags);
3633
3634             host = host_from_ram_block_offset(block, addr);
3635             if (!host) {
3636                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3637                 ret = -EINVAL;
3638                 break;
3639             }
3640             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3641             /*
3642              * Postcopy requires that we place whole host pages atomically;
3643              * these may be huge pages for RAMBlocks that are backed by
3644              * hugetlbfs.
3645              * To make it atomic, the data is read into a temporary page
3646              * that's moved into place later.
3647              * The migration protocol uses,  possibly smaller, target-pages
3648              * however the source ensures it always sends all the components
3649              * of a host page in order.
3650              */
3651             page_buffer = postcopy_host_page +
3652                           ((uintptr_t)host & (block->page_size - 1));
3653             /* If all TP are zero then we can optimise the place */
3654             if (!((uintptr_t)host & (block->page_size - 1))) {
3655                 all_zero = true;
3656             } else {
3657                 /* not the 1st TP within the HP */
3658                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3659                     error_report("Non-sequential target page %p/%p",
3660                                   host, last_host);
3661                     ret = -EINVAL;
3662                     break;
3663                 }
3664             }
3665
3666
3667             /*
3668              * If it's the last part of a host page then we place the host
3669              * page
3670              */
3671             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3672                                      (block->page_size - 1)) == 0;
3673             place_source = postcopy_host_page;
3674         }
3675         last_host = host;
3676
3677         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3678         case RAM_SAVE_FLAG_ZERO:
3679             ch = qemu_get_byte(f);
3680             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3681             if (ch) {
3682                 all_zero = false;
3683             }
3684             break;
3685
3686         case RAM_SAVE_FLAG_PAGE:
3687             all_zero = false;
3688             if (!matches_target_page_size) {
3689                 /* For huge pages, we always use temporary buffer */
3690                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3691             } else {
3692                 /*
3693                  * For small pages that matches target page size, we
3694                  * avoid the qemu_file copy.  Instead we directly use
3695                  * the buffer of QEMUFile to place the page.  Note: we
3696                  * cannot do any QEMUFile operation before using that
3697                  * buffer to make sure the buffer is valid when
3698                  * placing the page.
3699                  */
3700                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3701                                          TARGET_PAGE_SIZE);
3702             }
3703             break;
3704         case RAM_SAVE_FLAG_EOS:
3705             /* normal exit */
3706             multifd_recv_sync_main();
3707             break;
3708         default:
3709             error_report("Unknown combination of migration flags: %#x"
3710                          " (postcopy mode)", flags);
3711             ret = -EINVAL;
3712             break;
3713         }
3714
3715         /* Detect for any possible file errors */
3716         if (!ret && qemu_file_get_error(f)) {
3717             ret = qemu_file_get_error(f);
3718         }
3719
3720         if (!ret && place_needed) {
3721             /* This gets called at the last target page in the host page */
3722             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3723
3724             if (all_zero) {
3725                 ret = postcopy_place_page_zero(mis, place_dest,
3726                                                block);
3727             } else {
3728                 ret = postcopy_place_page(mis, place_dest,
3729                                           place_source, block);
3730             }
3731         }
3732     }
3733
3734     return ret;
3735 }
3736
3737 static bool postcopy_is_advised(void)
3738 {
3739     PostcopyState ps = postcopy_state_get();
3740     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3741 }
3742
3743 static bool postcopy_is_running(void)
3744 {
3745     PostcopyState ps = postcopy_state_get();
3746     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3747 }
3748
3749 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3750 {
3751     int flags = 0, ret = 0, invalid_flags = 0;
3752     static uint64_t seq_iter;
3753     int len = 0;
3754     /*
3755      * If system is running in postcopy mode, page inserts to host memory must
3756      * be atomic
3757      */
3758     bool postcopy_running = postcopy_is_running();
3759     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3760     bool postcopy_advised = postcopy_is_advised();
3761
3762     seq_iter++;
3763
3764     if (version_id != 4) {
3765         ret = -EINVAL;
3766     }
3767
3768     if (!migrate_use_compression()) {
3769         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3770     }
3771     /* This RCU critical section can be very long running.
3772      * When RCU reclaims in the code start to become numerous,
3773      * it will be necessary to reduce the granularity of this
3774      * critical section.
3775      */
3776     rcu_read_lock();
3777
3778     if (postcopy_running) {
3779         ret = ram_load_postcopy(f);
3780     }
3781
3782     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3783         ram_addr_t addr, total_ram_bytes;
3784         void *host = NULL;
3785         uint8_t ch;
3786
3787         addr = qemu_get_be64(f);
3788         flags = addr & ~TARGET_PAGE_MASK;
3789         addr &= TARGET_PAGE_MASK;
3790
3791         if (flags & invalid_flags) {
3792             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3793                 error_report("Received an unexpected compressed page");
3794             }
3795
3796             ret = -EINVAL;
3797             break;
3798         }
3799
3800         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3801                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3802             RAMBlock *block = ram_block_from_stream(f, flags);
3803
3804             host = host_from_ram_block_offset(block, addr);
3805             if (!host) {
3806                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3807                 ret = -EINVAL;
3808                 break;
3809             }
3810             ramblock_recv_bitmap_set(block, host);
3811             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3812         }
3813
3814         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3815         case RAM_SAVE_FLAG_MEM_SIZE:
3816             /* Synchronize RAM block list */
3817             total_ram_bytes = addr;
3818             while (!ret && total_ram_bytes) {
3819                 RAMBlock *block;
3820                 char id[256];
3821                 ram_addr_t length;
3822
3823                 len = qemu_get_byte(f);
3824                 qemu_get_buffer(f, (uint8_t *)id, len);
3825                 id[len] = 0;
3826                 length = qemu_get_be64(f);
3827
3828                 block = qemu_ram_block_by_name(id);
3829                 if (block && !qemu_ram_is_migratable(block)) {
3830                     error_report("block %s should not be migrated !", id);
3831                     ret = -EINVAL;
3832                 } else if (block) {
3833                     if (length != block->used_length) {
3834                         Error *local_err = NULL;
3835
3836                         ret = qemu_ram_resize(block, length,
3837                                               &local_err);
3838                         if (local_err) {
3839                             error_report_err(local_err);
3840                         }
3841                     }
3842                     /* For postcopy we need to check hugepage sizes match */
3843                     if (postcopy_advised &&
3844                         block->page_size != qemu_host_page_size) {
3845                         uint64_t remote_page_size = qemu_get_be64(f);
3846                         if (remote_page_size != block->page_size) {
3847                             error_report("Mismatched RAM page size %s "
3848                                          "(local) %zd != %" PRId64,
3849                                          id, block->page_size,
3850                                          remote_page_size);
3851                             ret = -EINVAL;
3852                         }
3853                     }
3854                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3855                                           block->idstr);
3856                 } else {
3857                     error_report("Unknown ramblock \"%s\", cannot "
3858                                  "accept migration", id);
3859                     ret = -EINVAL;
3860                 }
3861
3862                 total_ram_bytes -= length;
3863             }
3864             break;
3865
3866         case RAM_SAVE_FLAG_ZERO:
3867             ch = qemu_get_byte(f);
3868             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3869             break;
3870
3871         case RAM_SAVE_FLAG_PAGE:
3872             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3873             break;
3874
3875         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3876             len = qemu_get_be32(f);
3877             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3878                 error_report("Invalid compressed data length: %d", len);
3879                 ret = -EINVAL;
3880                 break;
3881             }
3882             decompress_data_with_multi_threads(f, host, len);
3883             break;
3884
3885         case RAM_SAVE_FLAG_XBZRLE:
3886             if (load_xbzrle(f, addr, host) < 0) {
3887                 error_report("Failed to decompress XBZRLE page at "
3888                              RAM_ADDR_FMT, addr);
3889                 ret = -EINVAL;
3890                 break;
3891             }
3892             break;
3893         case RAM_SAVE_FLAG_EOS:
3894             /* normal exit */
3895             multifd_recv_sync_main();
3896             break;
3897         default:
3898             if (flags & RAM_SAVE_FLAG_HOOK) {
3899                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3900             } else {
3901                 error_report("Unknown combination of migration flags: %#x",
3902                              flags);
3903                 ret = -EINVAL;
3904             }
3905         }
3906         if (!ret) {
3907             ret = qemu_file_get_error(f);
3908         }
3909     }
3910
3911     ret |= wait_for_decompress_done();
3912     rcu_read_unlock();
3913     trace_ram_load_complete(ret, seq_iter);
3914     return ret;
3915 }
3916
3917 static bool ram_has_postcopy(void *opaque)
3918 {
3919     RAMBlock *rb;
3920     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3921         if (ramblock_is_pmem(rb)) {
3922             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3923                          "is not supported now!", rb->idstr, rb->host);
3924             return false;
3925         }
3926     }
3927
3928     return migrate_postcopy_ram();
3929 }
3930
3931 /* Sync all the dirty bitmap with destination VM.  */
3932 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3933 {
3934     RAMBlock *block;
3935     QEMUFile *file = s->to_dst_file;
3936     int ramblock_count = 0;
3937
3938     trace_ram_dirty_bitmap_sync_start();
3939
3940     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3941         qemu_savevm_send_recv_bitmap(file, block->idstr);
3942         trace_ram_dirty_bitmap_request(block->idstr);
3943         ramblock_count++;
3944     }
3945
3946     trace_ram_dirty_bitmap_sync_wait();
3947
3948     /* Wait until all the ramblocks' dirty bitmap synced */
3949     while (ramblock_count--) {
3950         qemu_sem_wait(&s->rp_state.rp_sem);
3951     }
3952
3953     trace_ram_dirty_bitmap_sync_complete();
3954
3955     return 0;
3956 }
3957
3958 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3959 {
3960     qemu_sem_post(&s->rp_state.rp_sem);
3961 }
3962
3963 /*
3964  * Read the received bitmap, revert it as the initial dirty bitmap.
3965  * This is only used when the postcopy migration is paused but wants
3966  * to resume from a middle point.
3967  */
3968 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3969 {
3970     int ret = -EINVAL;
3971     QEMUFile *file = s->rp_state.from_dst_file;
3972     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3973     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3974     uint64_t size, end_mark;
3975
3976     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3977
3978     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3979         error_report("%s: incorrect state %s", __func__,
3980                      MigrationStatus_str(s->state));
3981         return -EINVAL;
3982     }
3983
3984     /*
3985      * Note: see comments in ramblock_recv_bitmap_send() on why we
3986      * need the endianess convertion, and the paddings.
3987      */
3988     local_size = ROUND_UP(local_size, 8);
3989
3990     /* Add paddings */
3991     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3992
3993     size = qemu_get_be64(file);
3994
3995     /* The size of the bitmap should match with our ramblock */
3996     if (size != local_size) {
3997         error_report("%s: ramblock '%s' bitmap size mismatch "
3998                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3999                      block->idstr, size, local_size);
4000         ret = -EINVAL;
4001         goto out;
4002     }
4003
4004     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4005     end_mark = qemu_get_be64(file);
4006
4007     ret = qemu_file_get_error(file);
4008     if (ret || size != local_size) {
4009         error_report("%s: read bitmap failed for ramblock '%s': %d"
4010                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4011                      __func__, block->idstr, ret, local_size, size);
4012         ret = -EIO;
4013         goto out;
4014     }
4015
4016     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4017         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4018                      __func__, block->idstr, end_mark);
4019         ret = -EINVAL;
4020         goto out;
4021     }
4022
4023     /*
4024      * Endianess convertion. We are during postcopy (though paused).
4025      * The dirty bitmap won't change. We can directly modify it.
4026      */
4027     bitmap_from_le(block->bmap, le_bitmap, nbits);
4028
4029     /*
4030      * What we received is "received bitmap". Revert it as the initial
4031      * dirty bitmap for this ramblock.
4032      */
4033     bitmap_complement(block->bmap, block->bmap, nbits);
4034
4035     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4036
4037     /*
4038      * We succeeded to sync bitmap for current ramblock. If this is
4039      * the last one to sync, we need to notify the main send thread.
4040      */
4041     ram_dirty_bitmap_reload_notify(s);
4042
4043     ret = 0;
4044 out:
4045     g_free(le_bitmap);
4046     return ret;
4047 }
4048
4049 static int ram_resume_prepare(MigrationState *s, void *opaque)
4050 {
4051     RAMState *rs = *(RAMState **)opaque;
4052     int ret;
4053
4054     ret = ram_dirty_bitmap_sync_all(s, rs);
4055     if (ret) {
4056         return ret;
4057     }
4058
4059     ram_state_resume_prepare(rs, s->to_dst_file);
4060
4061     return 0;
4062 }
4063
4064 static SaveVMHandlers savevm_ram_handlers = {
4065     .save_setup = ram_save_setup,
4066     .save_live_iterate = ram_save_iterate,
4067     .save_live_complete_postcopy = ram_save_complete,
4068     .save_live_complete_precopy = ram_save_complete,
4069     .has_postcopy = ram_has_postcopy,
4070     .save_live_pending = ram_save_pending,
4071     .load_state = ram_load,
4072     .save_cleanup = ram_save_cleanup,
4073     .load_setup = ram_load_setup,
4074     .load_cleanup = ram_load_cleanup,
4075     .resume_prepare = ram_resume_prepare,
4076 };
4077
4078 void ram_mig_init(void)
4079 {
4080     qemu_mutex_init(&XBZRLE.lock);
4081     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4082 }