migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE we just renamed it.
 */
  69
/*
 * RAM_SAVE_FLAG_* bit values used by the ram save/restore code.
 * NOTE(review): these look like stream-format markers, so existing
 * values presumably must not be renumbered -- confirm before changing.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
/* XBZRLE statistics; defined without 'static', so it has external linkage */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken by XBZRLE_cache_lock() / released by XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
/*
 * Iterate over the RAMBlocks for which qemu_ram_is_migratable() is true.
 * The empty-if/else form makes the statement that follows the macro the
 * 'else' body, so non-migratable blocks are skipped.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/* NOTE(review): presumably removed so code below must use the
 * MIGRATABLE variant -- confirm */
#undef RAMBLOCK_FOREACH
 168
 169 static void ramblock_recv_map_init(void)
 170 {
 171     RAMBlock *rb;
 172
 173     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 174         assert(!rb->receivedmap);
 175         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 176     }
 177 }
 178
 179 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 180 {
 181     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 182                     rb->receivedmap);
 183 }
 184
 185 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 186 {
 187     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 188 }
 189
 190 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 191 {
 192     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 193 }
 194
 195 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 196                                     size_t nr)
 197 {
 198     bitmap_set_atomic(rb->receivedmap,
 199                       ramblock_recv_bitmap_offset(host_addr, rb),
 200                       nr);
 201 }
 202
 203 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 204
 205 /*
 206  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 207  *
 208  * Returns >0 if success with sent bytes, or <0 if error.
 209  */
 210 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 211                                   const char *block_name)
 212 {
 213     RAMBlock *block = qemu_ram_block_by_name(block_name);
 214     unsigned long *le_bitmap, nbits;
 215     uint64_t size;
 216
 217     if (!block) {
 218         error_report("%s: invalid block name: %s", __func__, block_name);
 219         return -1;
 220     }
 221
 222     nbits = block->used_length >> TARGET_PAGE_BITS;
 223
 224     /*
 225      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 226      * machines we may need 4 more bytes for padding (see below
 227      * comment). So extend it a bit before hand.
 228      */
 229     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 230
 231     /*
 232      * Always use little endian when sending the bitmap. This is
 233      * required that when source and destination VMs are not using the
 234      * same endianess. (Note: big endian won't work.)
 235      */
 236     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 237
 238     /* Size of the bitmap, in bytes */
 239     size = DIV_ROUND_UP(nbits, 8);
 240
 241     /*
 242      * size is always aligned to 8 bytes for 64bit machines, but it
 243      * may not be true for 32bit machines. We need this padding to
 244      * make sure the migration can survive even between 32bit and
 245      * 64bit machines.
 246      */
 247     size = ROUND_UP(size, 8);
 248
 249     qemu_put_be64(file, size);
 250     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 251     /*
 252      * Mark as an end, in case the middle part is screwed up due to
 253      * some "misterious" reason.
 254      */
 255     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 256     qemu_fflush(file);
 257
 258     g_free(le_bitmap);
 259
 260     if (qemu_file_get_error(file)) {
 261         return qemu_file_get_error(file);
 262     }
 263
 264     return size + sizeof(size);
 265 }
 266
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* block the requested range belongs to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start offset within @rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the range in bytes -- confirm */
    hwaddr    len;

    /* linkage for the RAMState src_page_requests queue */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 278
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap; read by ram_bytes_remaining() */
    uint64_t migration_dirty_pages;
    /* protects modification of the bitmap */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* File-wide RAM migration state; ram_bytes_remaining() treats NULL as
 * "nothing remaining" */
static RAMState *ram_state;
 321
 322 uint64_t ram_bytes_remaining(void)
 323 {
 324     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 325                        0;
 326 }
 327
/* RAM migration counters; multifd_send_pages() adds to its
 * multifd_bytes and transferred fields */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 340
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* tells the worker loop to exit */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* dummy QEMUFile the compressed data is written into */
    QEMUFile *file;
    /* protects block/offset/quit; paired with cond */
    QemuMutex mutex;
    QemuCond cond;
    /* pending request; the worker clears block when it takes the request */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 356
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the code using this struct is outside this view; the
 * field roles are presumably analogous to CompressParam -- confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* NOTE(review): presumably the destination of the decompressed data */
    void *des;
    /* NOTE(review): presumably the compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 368
/* Parameter blocks and thread handles of the compression workers, both
 * allocated by compress_threads_save_setup() with one entry per thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side state.
 * NOTE(review): the code that sets these up is outside this view. */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: called by do_data_compress() below; its result
 * is stored in CompressParam.zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 388
/*
 * do_data_compress: thread function of one compression worker
 *
 * @opaque: the CompressParam belonging to this worker
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL the
 * worker takes the request (clearing param->block while holding
 * param->mutex), compresses the page with do_compress_ram_page() with
 * the mutex dropped, then records done/zero_page under comp_done_lock
 * and signals comp_done_cond.  With no request pending it waits on
 * param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* copy the request out and mark the slot empty before unlocking */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* publish the result under the lock paired with comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 422
 423 static inline void terminate_compression_threads(void)
 424 {
 425     int idx, thread_count;
 426
 427     thread_count = migrate_compress_threads();
 428
 429     for (idx = 0; idx < thread_count; idx++) {
 430         qemu_mutex_lock(&comp_param[idx].mutex);
 431         comp_param[idx].quit = true;
 432         qemu_cond_signal(&comp_param[idx].cond);
 433         qemu_mutex_unlock(&comp_param[idx].mutex);
 434     }
 435 }
 436
 437 static void compress_threads_save_cleanup(void)
 438 {
 439     int i, thread_count;
 440
 441     if (!migrate_use_compression()) {
 442         return;
 443     }
 444     terminate_compression_threads();
 445     thread_count = migrate_compress_threads();
 446     for (i = 0; i < thread_count; i++) {
 447         /*
 448          * we use it as a indicator which shows if the thread is
 449          * properly init'd or not
 450          */
 451         if (!comp_param[i].file) {
 452             break;
 453         }
 454         qemu_thread_join(compress_threads + i);
 455         qemu_mutex_destroy(&comp_param[i].mutex);
 456         qemu_cond_destroy(&comp_param[i].cond);
 457         deflateEnd(&comp_param[i].stream);
 458         g_free(comp_param[i].originbuf);
 459         qemu_fclose(comp_param[i].file);
 460         comp_param[i].file = NULL;
 461     }
 462     qemu_mutex_destroy(&comp_done_lock);
 463     qemu_cond_destroy(&comp_done_cond);
 464     g_free(compress_threads);
 465     g_free(comp_param);
 466     compress_threads = NULL;
 467     comp_param = NULL;
 468 }
 469
 470 static int compress_threads_save_setup(void)
 471 {
 472     int i, thread_count;
 473
 474     if (!migrate_use_compression()) {
 475         return 0;
 476     }
 477     thread_count = migrate_compress_threads();
 478     compress_threads = g_new0(QemuThread, thread_count);
 479     comp_param = g_new0(CompressParam, thread_count);
 480     qemu_cond_init(&comp_done_cond);
 481     qemu_mutex_init(&comp_done_lock);
 482     for (i = 0; i < thread_count; i++) {
 483         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 484         if (!comp_param[i].originbuf) {
 485             goto exit;
 486         }
 487
 488         if (deflateInit(&comp_param[i].stream,
 489                         migrate_compress_level()) != Z_OK) {
 490             g_free(comp_param[i].originbuf);
 491             goto exit;
 492         }
 493
 494         /* comp_param[i].file is just used as a dummy buffer to save data,
 495          * set its ops to empty.
 496          */
 497         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 498         comp_param[i].done = true;
 499         comp_param[i].quit = false;
 500         qemu_mutex_init(&comp_param[i].mutex);
 501         qemu_cond_init(&comp_param[i].cond);
 502         qemu_thread_create(compress_threads + i, "compress",
 503                            do_data_compress, comp_param + i,
 504                            QEMU_THREAD_JOINABLE);
 505     }
 506     return 0;
 507
 508 exit:
 509     compress_threads_save_cleanup();
 510     return -1;
 511 }
 512
/* Multiple fd's */

/* magic and version written (big endian) into the initial message and
 * every packet header, and checked on receive */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* packet flag set on each channel by multifd_send_sync_main() */
#define MULTIFD_FLAG_SYNC (1 << 0)
 519
/*
 * Initial message of a multifd channel, written by
 * multifd_send_initial_packet() and checked by
 * multifd_recv_initial_packet().  magic and version are big endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
} __attribute__((packed)) MultiFDInit_t;
 526
/*
 * Header of a multifd packet, filled by multifd_send_fill_packet() and
 * parsed by multifd_recv_unfill_packet().  Integer fields are big endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    /* MULTIFD_FLAG_* bits */
    uint32_t flags;
    /* sender's migrate_multifd_page_count() */
    uint32_t size;
    /* number of valid entries in offset[] */
    uint32_t used;
    uint64_t packet_num;
    /* idstr of the RAMBlock the offsets refer to */
    char ramblock[256];
    /* offset of each page within the block */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 537
/*
 * A batch of pages of a single RAMBlock; created by multifd_pages_init()
 * and released by multifd_pages_clear().
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* block all queued pages belong to; NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 551
/* State of one multifd send channel */
typedef struct {
    /* this fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to sent */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 590
/* State of one multifd receive channel */
typedef struct {
    /* this fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel
     * NOTE(review): presumably "received" on this side -- confirm */
    uint64_t num_packets;
    /* pages sent through this channel
     * NOTE(review): presumably "received" on this side -- confirm */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 623
 624 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 625 {
 626     MultiFDInit_t msg;
 627     int ret;
 628
 629     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 630     msg.version = cpu_to_be32(MULTIFD_VERSION);
 631     msg.id = p->id;
 632     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 633
 634     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 635     if (ret != 0) {
 636         return -1;
 637     }
 638     return 0;
 639 }
 640
 641 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 642 {
 643     MultiFDInit_t msg;
 644     int ret;
 645
 646     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 647     if (ret != 0) {
 648         return -1;
 649     }
 650
 651     be32_to_cpus(&msg.magic);
 652     be32_to_cpus(&msg.version);
 653
 654     if (msg.magic != MULTIFD_MAGIC) {
 655         error_setg(errp, "multifd: received packet magic %x "
 656                    "expected %x", msg.magic, MULTIFD_MAGIC);
 657         return -1;
 658     }
 659
 660     if (msg.version != MULTIFD_VERSION) {
 661         error_setg(errp, "multifd: received packet version %d "
 662                    "expected %d", msg.version, MULTIFD_VERSION);
 663         return -1;
 664     }
 665
 666     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 667         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 668         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 669
 670         error_setg(errp, "multifd: received uuid '%s' and expected "
 671                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 672         g_free(uuid);
 673         g_free(msg_uuid);
 674         return -1;
 675     }
 676
 677     if (msg.id > migrate_multifd_channels()) {
 678         error_setg(errp, "multifd: received channel version %d "
 679                    "expected %d", msg.version, MULTIFD_VERSION);
 680         return -1;
 681     }
 682
 683     return msg.id;
 684 }
 685
 686 static MultiFDPages_t *multifd_pages_init(size_t size)
 687 {
 688     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 689
 690     pages->allocated = size;
 691     pages->iov = g_new0(struct iovec, size);
 692     pages->offset = g_new0(ram_addr_t, size);
 693
 694     return pages;
 695 }
 696
 697 static void multifd_pages_clear(MultiFDPages_t *pages)
 698 {
 699     pages->used = 0;
 700     pages->allocated = 0;
 701     pages->packet_num = 0;
 702     pages->block = NULL;
 703     g_free(pages->iov);
 704     pages->iov = NULL;
 705     g_free(pages->offset);
 706     pages->offset = NULL;
 707     g_free(pages);
 708 }
 709
 710 static void multifd_send_fill_packet(MultiFDSendParams *p)
 711 {
 712     MultiFDPacket_t *packet = p->packet;
 713     int i;
 714
 715     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 716     packet->version = cpu_to_be32(MULTIFD_VERSION);
 717     packet->flags = cpu_to_be32(p->flags);
 718     packet->size = cpu_to_be32(migrate_multifd_page_count());
 719     packet->used = cpu_to_be32(p->pages->used);
 720     packet->packet_num = cpu_to_be64(p->packet_num);
 721
 722     if (p->pages->block) {
 723         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 724     }
 725
 726     for (i = 0; i < p->pages->used; i++) {
 727         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 728     }
 729 }
 730
 731 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 732 {
 733     MultiFDPacket_t *packet = p->packet;
 734     RAMBlock *block;
 735     int i;
 736
 737     be32_to_cpus(&packet->magic);
 738     if (packet->magic != MULTIFD_MAGIC) {
 739         error_setg(errp, "multifd: received packet "
 740                    "magic %x and expected magic %x",
 741                    packet->magic, MULTIFD_MAGIC);
 742         return -1;
 743     }
 744
 745     be32_to_cpus(&packet->version);
 746     if (packet->version != MULTIFD_VERSION) {
 747         error_setg(errp, "multifd: received packet "
 748                    "version %d and expected version %d",
 749                    packet->version, MULTIFD_VERSION);
 750         return -1;
 751     }
 752
 753     p->flags = be32_to_cpu(packet->flags);
 754
 755     be32_to_cpus(&packet->size);
 756     if (packet->size > migrate_multifd_page_count()) {
 757         error_setg(errp, "multifd: received packet "
 758                    "with size %d and expected maximum size %d",
 759                    packet->size, migrate_multifd_page_count()) ;
 760         return -1;
 761     }
 762
 763     p->pages->used = be32_to_cpu(packet->used);
 764     if (p->pages->used > packet->size) {
 765         error_setg(errp, "multifd: received packet "
 766                    "with size %d and expected maximum size %d",
 767                    p->pages->used, packet->size) ;
 768         return -1;
 769     }
 770
 771     p->packet_num = be64_to_cpu(packet->packet_num);
 772
 773     if (p->pages->used) {
 774         /* make sure that ramblock is 0 terminated */
 775         packet->ramblock[255] = 0;
 776         block = qemu_ram_block_by_name(packet->ramblock);
 777         if (!block) {
 778             error_setg(errp, "multifd: unknown ram block %s",
 779                        packet->ramblock);
 780             return -1;
 781         }
 782     }
 783
 784     for (i = 0; i < p->pages->used; i++) {
 785         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 786
 787         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 788             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 789                        " (max " RAM_ADDR_FMT ")",
 790                        offset, block->max_length);
 791             return -1;
 792         }
 793         p->pages->iov[i].iov_base = block->host + offset;
 794         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 795     }
 796
 797     return 0;
 798 }
 799
/* Global state of the multifd send side; freed and reset to NULL by
 * multifd_save_cleanup() */
struct {
    /* one entry per channel, indexed 0 .. migrate_multifd_channels() - 1 */
    MultiFDSendParams *params;
    /* number of created threads */
    int count;
    /* array of pages to sent */
    MultiFDPages_t *pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 813
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each time
 * that we need to send a batch of pages we swap the one in
 * multifd_send_state with the one of the channel that is sending it.
 * There are two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */
 831
/*
 * multifd_send_pages: hand the current page batch to an idle channel
 *
 * Waits on channels_ready, then scans the channels round-robin, starting
 * at next_channel (static, so it persists across calls), until one has
 * no pending job.  That channel's pages struct is reset and swapped with
 * multifd_send_state->pages, the next global packet number is assigned,
 * the transferred bytes are accounted in ram_counters and the channel is
 * woken through p->sem.
 */
static void multifd_send_pages(void)
{
    int i;
    static int next_channel;
    MultiFDSendParams *p = NULL; /* make happy gcc */
    MultiFDPages_t *pages = multifd_send_state->pages;
    uint64_t transferred;

    qemu_sem_wait(&multifd_send_state->channels_ready);
    /* no loop condition: the only exit is the break, taken with
     * p->mutex still held */
    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
        p = &multifd_send_state->params[i];

        qemu_mutex_lock(&p->mutex);
        if (!p->pending_job) {
            p->pending_job++;
            next_channel = (i + 1) % migrate_multifd_channels();
            break;
        }
        qemu_mutex_unlock(&p->mutex);
    }
    /* reset the channel's batch; it becomes the new main batch below */
    p->pages->used = 0;

    p->packet_num = multifd_send_state->packet_num++;
    p->pages->block = NULL;
    /* swap ownership of the two pages structs */
    multifd_send_state->pages = p->pages;
    p->pages = pages;
    /* page payload plus the packet header */
    transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
    ram_counters.multifd_bytes += transferred;
    ram_counters.transferred += transferred;;
    qemu_mutex_unlock(&p->mutex);
    qemu_sem_post(&p->sem);
}
 864
/*
 * multifd_queue_page: add one page to the batch being built
 *
 * @block: RAMBlock the page belongs to
 * @offset: offset of the page within @block
 *
 * A batch only holds pages of one block.  The page is appended when the
 * batch is empty or already belongs to @block; the batch is sent with
 * multifd_send_pages() once it is full, or when @block differs from the
 * batch's block.  In the latter case the page has not been stored yet,
 * so the function calls itself again to queue it in the fresh batch.
 */
static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
{
    MultiFDPages_t *pages = multifd_send_state->pages;

    if (!pages->block) {
        /* empty batch: it now belongs to this block */
        pages->block = block;
    }

    if (pages->block == block) {
        pages->offset[pages->used] = offset;
        pages->iov[pages->used].iov_base = block->host + offset;
        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
        pages->used++;

        if (pages->used < pages->allocated) {
            /* still room, nothing to send yet */
            return;
        }
    }

    multifd_send_pages();

    /* 'pages' still points at the batch that was just handed over */
    if (pages->block != block) {
        multifd_queue_page(block, offset);
    }
}
 890
 891 static void multifd_send_terminate_threads(Error *err)
 892 {
 893     int i;
 894
 895     if (err) {
 896         MigrationState *s = migrate_get_current();
 897         migrate_set_error(s, err);
 898         if (s->state == MIGRATION_STATUS_SETUP ||
 899             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 900             s->state == MIGRATION_STATUS_DEVICE ||
 901             s->state == MIGRATION_STATUS_ACTIVE) {
 902             migrate_set_state(&s->state, s->state,
 903                               MIGRATION_STATUS_FAILED);
 904         }
 905     }
 906
 907     for (i = 0; i < migrate_multifd_channels(); i++) {
 908         MultiFDSendParams *p = &multifd_send_state->params[i];
 909
 910         qemu_mutex_lock(&p->mutex);
 911         p->quit = true;
 912         qemu_sem_post(&p->sem);
 913         qemu_mutex_unlock(&p->mutex);
 914     }
 915 }
 916
 917 int multifd_save_cleanup(Error **errp)
 918 {
 919     int i;
 920     int ret = 0;
 921
 922     if (!migrate_use_multifd()) {
 923         return 0;
 924     }
 925     multifd_send_terminate_threads(NULL);
 926     for (i = 0; i < migrate_multifd_channels(); i++) {
 927         MultiFDSendParams *p = &multifd_send_state->params[i];
 928
 929         if (p->running) {
 930             qemu_thread_join(&p->thread);
 931         }
 932         socket_send_channel_destroy(p->c);
 933         p->c = NULL;
 934         qemu_mutex_destroy(&p->mutex);
 935         qemu_sem_destroy(&p->sem);
 936         qemu_sem_destroy(&p->sem_sync);
 937         g_free(p->name);
 938         p->name = NULL;
 939         multifd_pages_clear(p->pages);
 940         p->pages = NULL;
 941         p->packet_len = 0;
 942         g_free(p->packet);
 943         p->packet = NULL;
 944     }
 945     qemu_sem_destroy(&multifd_send_state->channels_ready);
 946     qemu_sem_destroy(&multifd_send_state->sem_sync);
 947     g_free(multifd_send_state->params);
 948     multifd_send_state->params = NULL;
 949     multifd_pages_clear(multifd_send_state->pages);
 950     multifd_send_state->pages = NULL;
 951     g_free(multifd_send_state);
 952     multifd_send_state = NULL;
 953     return ret;
 954 }
 955
 956 static void multifd_send_sync_main(void)
 957 {
 958     int i;
 959
 960     if (!migrate_use_multifd()) {
 961         return;
 962     }
 963     if (multifd_send_state->pages->used) {
 964         multifd_send_pages();
 965     }
 966     for (i = 0; i < migrate_multifd_channels(); i++) {
 967         MultiFDSendParams *p = &multifd_send_state->params[i];
 968
 969         trace_multifd_send_sync_main_signal(p->id);
 970
 971         qemu_mutex_lock(&p->mutex);
 972
 973         p->packet_num = multifd_send_state->packet_num++;
 974         p->flags |= MULTIFD_FLAG_SYNC;
 975         p->pending_job++;
 976         qemu_mutex_unlock(&p->mutex);
 977         qemu_sem_post(&p->sem);
 978     }
 979     for (i = 0; i < migrate_multifd_channels(); i++) {
 980         MultiFDSendParams *p = &multifd_send_state->params[i];
 981
 982         trace_multifd_send_sync_main_wait(p->id);
 983         qemu_sem_wait(&multifd_send_state->sem_sync);
 984     }
 985     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 986 }
 987
 988 static void *multifd_send_thread(void *opaque)
 989 {
 990     MultiFDSendParams *p = opaque;
 991     Error *local_err = NULL;
 992     int ret;
 993
 994     trace_multifd_send_thread_start(p->id);
 995     rcu_register_thread();
 996
 997     if (multifd_send_initial_packet(p, &local_err) < 0) {
 998         goto out;
 999     }
1000     /* initial packet */
1001     p->num_packets = 1;
1002
1003     while (true) {
1004         qemu_sem_wait(&p->sem);
1005         qemu_mutex_lock(&p->mutex);
1006
1007         if (p->pending_job) {
1008             uint32_t used = p->pages->used;
1009             uint64_t packet_num = p->packet_num;
1010             uint32_t flags = p->flags;
1011
1012             multifd_send_fill_packet(p);
1013             p->flags = 0;
1014             p->num_packets++;
1015             p->num_pages += used;
1016             p->pages->used = 0;
1017             qemu_mutex_unlock(&p->mutex);
1018
1019             trace_multifd_send(p->id, packet_num, used, flags);
1020
1021             ret = qio_channel_write_all(p->c, (void *)p->packet,
1022                                         p->packet_len, &local_err);
1023             if (ret != 0) {
1024                 break;
1025             }
1026
1027             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1028             if (ret != 0) {
1029                 break;
1030             }
1031
1032             qemu_mutex_lock(&p->mutex);
1033             p->pending_job--;
1034             qemu_mutex_unlock(&p->mutex);
1035
1036             if (flags & MULTIFD_FLAG_SYNC) {
1037                 qemu_sem_post(&multifd_send_state->sem_sync);
1038             }
1039             qemu_sem_post(&multifd_send_state->channels_ready);
1040         } else if (p->quit) {
1041             qemu_mutex_unlock(&p->mutex);
1042             break;
1043         } else {
1044             qemu_mutex_unlock(&p->mutex);
1045             /* sometimes there are spurious wakeups */
1046         }
1047     }
1048
1049 out:
1050     if (local_err) {
1051         multifd_send_terminate_threads(local_err);
1052     }
1053
1054     qemu_mutex_lock(&p->mutex);
1055     p->running = false;
1056     qemu_mutex_unlock(&p->mutex);
1057
1058     rcu_unregister_thread();
1059     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1060
1061     return NULL;
1062 }
1063
1064 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1065 {
1066     MultiFDSendParams *p = opaque;
1067     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1068     Error *local_err = NULL;
1069
1070     if (qio_task_propagate_error(task, &local_err)) {
1071         if (multifd_save_cleanup(&local_err) != 0) {
1072             migrate_set_error(migrate_get_current(), local_err);
1073         }
1074     } else {
1075         p->c = QIO_CHANNEL(sioc);
1076         qio_channel_set_delay(p->c, false);
1077         p->running = true;
1078         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1079                            QEMU_THREAD_JOINABLE);
1080
1081         atomic_inc(&multifd_send_state->count);
1082     }
1083 }
1084
1085 int multifd_save_setup(void)
1086 {
1087     int thread_count;
1088     uint32_t page_count = migrate_multifd_page_count();
1089     uint8_t i;
1090
1091     if (!migrate_use_multifd()) {
1092         return 0;
1093     }
1094     thread_count = migrate_multifd_channels();
1095     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1096     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1097     atomic_set(&multifd_send_state->count, 0);
1098     multifd_send_state->pages = multifd_pages_init(page_count);
1099     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1100     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1101
1102     for (i = 0; i < thread_count; i++) {
1103         MultiFDSendParams *p = &multifd_send_state->params[i];
1104
1105         qemu_mutex_init(&p->mutex);
1106         qemu_sem_init(&p->sem, 0);
1107         qemu_sem_init(&p->sem_sync, 0);
1108         p->quit = false;
1109         p->pending_job = 0;
1110         p->id = i;
1111         p->pages = multifd_pages_init(page_count);
1112         p->packet_len = sizeof(MultiFDPacket_t)
1113                       + sizeof(ram_addr_t) * page_count;
1114         p->packet = g_malloc0(p->packet_len);
1115         p->name = g_strdup_printf("multifdsend_%d", i);
1116         socket_send_channel_create(multifd_new_send_channel_async, p);
1117     }
1118     return 0;
1119 }
1120
1121 struct {
1122     MultiFDRecvParams *params;
1123     /* number of created threads */
1124     int count;
1125     /* syncs main thread and channels */
1126     QemuSemaphore sem_sync;
1127     /* global number of generated multifd packets */
1128     uint64_t packet_num;
1129 } *multifd_recv_state;
1130
1131 static void multifd_recv_terminate_threads(Error *err)
1132 {
1133     int i;
1134
1135     if (err) {
1136         MigrationState *s = migrate_get_current();
1137         migrate_set_error(s, err);
1138         if (s->state == MIGRATION_STATUS_SETUP ||
1139             s->state == MIGRATION_STATUS_ACTIVE) {
1140             migrate_set_state(&s->state, s->state,
1141                               MIGRATION_STATUS_FAILED);
1142         }
1143     }
1144
1145     for (i = 0; i < migrate_multifd_channels(); i++) {
1146         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1147
1148         qemu_mutex_lock(&p->mutex);
1149         /* We could arrive here for two reasons:
1150            - normal quit, i.e. everything went fine, just finished
1151            - error quit: We close the channels so the channel threads
1152              finish the qio_channel_read_all_eof() */
1153         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1154         qemu_mutex_unlock(&p->mutex);
1155     }
1156 }
1157
1158 int multifd_load_cleanup(Error **errp)
1159 {
1160     int i;
1161     int ret = 0;
1162
1163     if (!migrate_use_multifd()) {
1164         return 0;
1165     }
1166     multifd_recv_terminate_threads(NULL);
1167     for (i = 0; i < migrate_multifd_channels(); i++) {
1168         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1169
1170         if (p->running) {
1171             qemu_thread_join(&p->thread);
1172         }
1173         object_unref(OBJECT(p->c));
1174         p->c = NULL;
1175         qemu_mutex_destroy(&p->mutex);
1176         qemu_sem_destroy(&p->sem_sync);
1177         g_free(p->name);
1178         p->name = NULL;
1179         multifd_pages_clear(p->pages);
1180         p->pages = NULL;
1181         p->packet_len = 0;
1182         g_free(p->packet);
1183         p->packet = NULL;
1184     }
1185     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1186     g_free(multifd_recv_state->params);
1187     multifd_recv_state->params = NULL;
1188     g_free(multifd_recv_state);
1189     multifd_recv_state = NULL;
1190
1191     return ret;
1192 }
1193
1194 static void multifd_recv_sync_main(void)
1195 {
1196     int i;
1197
1198     if (!migrate_use_multifd()) {
1199         return;
1200     }
1201     for (i = 0; i < migrate_multifd_channels(); i++) {
1202         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1203
1204         trace_multifd_recv_sync_main_wait(p->id);
1205         qemu_sem_wait(&multifd_recv_state->sem_sync);
1206         qemu_mutex_lock(&p->mutex);
1207         if (multifd_recv_state->packet_num < p->packet_num) {
1208             multifd_recv_state->packet_num = p->packet_num;
1209         }
1210         qemu_mutex_unlock(&p->mutex);
1211     }
1212     for (i = 0; i < migrate_multifd_channels(); i++) {
1213         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1214
1215         trace_multifd_recv_sync_main_signal(p->id);
1216         qemu_sem_post(&p->sem_sync);
1217     }
1218     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1219 }
1220
1221 static void *multifd_recv_thread(void *opaque)
1222 {
1223     MultiFDRecvParams *p = opaque;
1224     Error *local_err = NULL;
1225     int ret;
1226
1227     trace_multifd_recv_thread_start(p->id);
1228     rcu_register_thread();
1229
1230     while (true) {
1231         uint32_t used;
1232         uint32_t flags;
1233
1234         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1235                                        p->packet_len, &local_err);
1236         if (ret == 0) {   /* EOF */
1237             break;
1238         }
1239         if (ret == -1) {   /* Error */
1240             break;
1241         }
1242
1243         qemu_mutex_lock(&p->mutex);
1244         ret = multifd_recv_unfill_packet(p, &local_err);
1245         if (ret) {
1246             qemu_mutex_unlock(&p->mutex);
1247             break;
1248         }
1249
1250         used = p->pages->used;
1251         flags = p->flags;
1252         trace_multifd_recv(p->id, p->packet_num, used, flags);
1253         p->num_packets++;
1254         p->num_pages += used;
1255         qemu_mutex_unlock(&p->mutex);
1256
1257         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1258         if (ret != 0) {
1259             break;
1260         }
1261
1262         if (flags & MULTIFD_FLAG_SYNC) {
1263             qemu_sem_post(&multifd_recv_state->sem_sync);
1264             qemu_sem_wait(&p->sem_sync);
1265         }
1266     }
1267
1268     if (local_err) {
1269         multifd_recv_terminate_threads(local_err);
1270     }
1271     qemu_mutex_lock(&p->mutex);
1272     p->running = false;
1273     qemu_mutex_unlock(&p->mutex);
1274
1275     rcu_unregister_thread();
1276     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1277
1278     return NULL;
1279 }
1280
1281 int multifd_load_setup(void)
1282 {
1283     int thread_count;
1284     uint32_t page_count = migrate_multifd_page_count();
1285     uint8_t i;
1286
1287     if (!migrate_use_multifd()) {
1288         return 0;
1289     }
1290     thread_count = migrate_multifd_channels();
1291     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1292     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1293     atomic_set(&multifd_recv_state->count, 0);
1294     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1295
1296     for (i = 0; i < thread_count; i++) {
1297         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1298
1299         qemu_mutex_init(&p->mutex);
1300         qemu_sem_init(&p->sem_sync, 0);
1301         p->id = i;
1302         p->pages = multifd_pages_init(page_count);
1303         p->packet_len = sizeof(MultiFDPacket_t)
1304                       + sizeof(ram_addr_t) * page_count;
1305         p->packet = g_malloc0(p->packet_len);
1306         p->name = g_strdup_printf("multifdrecv_%d", i);
1307     }
1308     return 0;
1309 }
1310
1311 bool multifd_recv_all_channels_created(void)
1312 {
1313     int thread_count = migrate_multifd_channels();
1314
1315     if (!migrate_use_multifd()) {
1316         return true;
1317     }
1318
1319     return thread_count == atomic_read(&multifd_recv_state->count);
1320 }
1321
1322 /* Return true if multifd is ready for the migration, otherwise false */
1323 bool multifd_recv_new_channel(QIOChannel *ioc)
1324 {
1325     MultiFDRecvParams *p;
1326     Error *local_err = NULL;
1327     int id;
1328
1329     id = multifd_recv_initial_packet(ioc, &local_err);
1330     if (id < 0) {
1331         multifd_recv_terminate_threads(local_err);
1332         return false;
1333     }
1334
1335     p = &multifd_recv_state->params[id];
1336     if (p->c != NULL) {
1337         error_setg(&local_err, "multifd: received id '%d' already setup'",
1338                    id);
1339         multifd_recv_terminate_threads(local_err);
1340         return false;
1341     }
1342     p->c = ioc;
1343     object_ref(OBJECT(ioc));
1344     /* initial packet */
1345     p->num_packets = 1;
1346
1347     p->running = true;
1348     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1349                        QEMU_THREAD_JOINABLE);
1350     atomic_inc(&multifd_recv_state->count);
1351     return multifd_recv_state->count == migrate_multifd_channels();
1352 }
1353
1354 /**
1355  * save_page_header: write page header to wire
1356  *
1357  * If this is the 1st block, it also writes the block identification
1358  *
1359  * Returns the number of bytes written
1360  *
1361  * @f: QEMUFile where to send the data
1362  * @block: block that contains the page we want to send
1363  * @offset: offset inside the block for the page
1364  *          in the lower bits, it contains flags
1365  */
1366 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1367                                ram_addr_t offset)
1368 {
1369     size_t size, len;
1370
1371     if (block == rs->last_sent_block) {
1372         offset |= RAM_SAVE_FLAG_CONTINUE;
1373     }
1374     qemu_put_be64(f, offset);
1375     size = 8;
1376
1377     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1378         len = strlen(block->idstr);
1379         qemu_put_byte(f, len);
1380         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1381         size += 1 + len;
1382         rs->last_sent_block = block;
1383     }
1384     return size;
1385 }
1386
1387 /**
1388  * mig_throttle_guest_down: throotle down the guest
1389  *
1390  * Reduce amount of guest cpu execution to hopefully slow down memory
1391  * writes. If guest dirty memory rate is reduced below the rate at
1392  * which we can transfer pages to the destination then we should be
1393  * able to complete migration. Some workloads dirty memory way too
1394  * fast and will not effectively converge, even with auto-converge.
1395  */
1396 static void mig_throttle_guest_down(void)
1397 {
1398     MigrationState *s = migrate_get_current();
1399     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1400     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1401     int pct_max = s->parameters.max_cpu_throttle;
1402
1403     /* We have not started throttling yet. Let's start it. */
1404     if (!cpu_throttle_active()) {
1405         cpu_throttle_set(pct_initial);
1406     } else {
1407         /* Throttling already on, just increase the rate */
1408         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1409                          pct_max));
1410     }
1411 }
1412
1413 /**
1414  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1415  *
1416  * @rs: current RAM state
1417  * @current_addr: address for the zero page
1418  *
1419  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1420  * The important thing is that a stale (not-yet-0'd) page be replaced
1421  * by the new data.
1422  * As a bonus, if the page wasn't in the cache it gets added so that
1423  * when a small write is made into the 0'd page it gets XBZRLE sent.
1424  */
1425 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1426 {
1427     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1428         return;
1429     }
1430
1431     /* We don't care if this fails to allocate a new cache page
1432      * as long as it updated an old one */
1433     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1434                  ram_counters.dirty_sync_count);
1435 }
1436
1437 #define ENCODING_FLAG_XBZRLE 0x1
1438
1439 /**
1440  * save_xbzrle_page: compress and send current page
1441  *
1442  * Returns: 1 means that we wrote the page
1443  *          0 means that page is identical to the one already sent
1444  *          -1 means that xbzrle would be longer than normal
1445  *
1446  * @rs: current RAM state
1447  * @current_data: pointer to the address of the page contents
1448  * @current_addr: addr of the page
1449  * @block: block that contains the page we want to send
1450  * @offset: offset inside the block for the page
1451  * @last_stage: if we are at the completion stage
1452  */
1453 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1454                             ram_addr_t current_addr, RAMBlock *block,
1455                             ram_addr_t offset, bool last_stage)
1456 {
1457     int encoded_len = 0, bytes_xbzrle;
1458     uint8_t *prev_cached_page;
1459
1460     if (!cache_is_cached(XBZRLE.cache, current_addr,
1461                          ram_counters.dirty_sync_count)) {
1462         xbzrle_counters.cache_miss++;
1463         if (!last_stage) {
1464             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1465                              ram_counters.dirty_sync_count) == -1) {
1466                 return -1;
1467             } else {
1468                 /* update *current_data when the page has been
1469                    inserted into cache */
1470                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1471             }
1472         }
1473         return -1;
1474     }
1475
1476     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1477
1478     /* save current buffer into memory */
1479     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1480
1481     /* XBZRLE encoding (if there is no overflow) */
1482     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1483                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1484                                        TARGET_PAGE_SIZE);
1485     if (encoded_len == 0) {
1486         trace_save_xbzrle_page_skipping();
1487         return 0;
1488     } else if (encoded_len == -1) {
1489         trace_save_xbzrle_page_overflow();
1490         xbzrle_counters.overflow++;
1491         /* update data in the cache */
1492         if (!last_stage) {
1493             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1494             *current_data = prev_cached_page;
1495         }
1496         return -1;
1497     }
1498
1499     /* we need to update the data in the cache, in order to get the same data */
1500     if (!last_stage) {
1501         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1502     }
1503
1504     /* Send XBZRLE based compressed page */
1505     bytes_xbzrle = save_page_header(rs, rs->f, block,
1506                                     offset | RAM_SAVE_FLAG_XBZRLE);
1507     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1508     qemu_put_be16(rs->f, encoded_len);
1509     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1510     bytes_xbzrle += encoded_len + 1 + 2;
1511     xbzrle_counters.pages++;
1512     xbzrle_counters.bytes += bytes_xbzrle;
1513     ram_counters.transferred += bytes_xbzrle;
1514
1515     return 1;
1516 }
1517
1518 /**
1519  * migration_bitmap_find_dirty: find the next dirty page from start
1520  *
1521  * Called with rcu_read_lock() to protect migration_bitmap
1522  *
1523  * Returns the byte offset within memory region of the start of a dirty page
1524  *
1525  * @rs: current RAM state
1526  * @rb: RAMBlock where to search for dirty pages
1527  * @start: page where we start the search
1528  */
1529 static inline
1530 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1531                                           unsigned long start)
1532 {
1533     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1534     unsigned long *bitmap = rb->bmap;
1535     unsigned long next;
1536
1537     if (!qemu_ram_is_migratable(rb)) {
1538         return size;
1539     }
1540
1541     if (rs->ram_bulk_stage && start > 0) {
1542         next = start + 1;
1543     } else {
1544         next = find_next_bit(bitmap, size, start);
1545     }
1546
1547     return next;
1548 }
1549
1550 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1551                                                 RAMBlock *rb,
1552                                                 unsigned long page)
1553 {
1554     bool ret;
1555
1556     ret = test_and_clear_bit(page, rb->bmap);
1557
1558     if (ret) {
1559         rs->migration_dirty_pages--;
1560     }
1561     return ret;
1562 }
1563
1564 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1565                                         ram_addr_t start, ram_addr_t length)
1566 {
1567     rs->migration_dirty_pages +=
1568         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1569                                               &rs->num_dirty_pages_period);
1570 }
1571
1572 /**
1573  * ram_pagesize_summary: calculate all the pagesizes of a VM
1574  *
1575  * Returns a summary bitmap of the page sizes of all RAMBlocks
1576  *
1577  * For VMs with just normal pages this is equivalent to the host page
1578  * size. If it's got some huge pages then it's the OR of all the
1579  * different page sizes.
1580  */
1581 uint64_t ram_pagesize_summary(void)
1582 {
1583     RAMBlock *block;
1584     uint64_t summary = 0;
1585
1586     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1587         summary |= block->page_size;
1588     }
1589
1590     return summary;
1591 }
1592
1593 static void migration_update_rates(RAMState *rs, int64_t end_time)
1594 {
1595     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1596
1597     /* calculate period counters */
1598     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1599                 / (end_time - rs->time_last_bitmap_sync);
1600
1601     if (!page_count) {
1602         return;
1603     }
1604
1605     if (migrate_use_xbzrle()) {
1606         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1607             rs->xbzrle_cache_miss_prev) / page_count;
1608         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1609     }
1610 }
1611
1612 static void migration_bitmap_sync(RAMState *rs)
1613 {
1614     RAMBlock *block;
1615     int64_t end_time;
1616     uint64_t bytes_xfer_now;
1617
1618     ram_counters.dirty_sync_count++;
1619
1620     if (!rs->time_last_bitmap_sync) {
1621         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1622     }
1623
1624     trace_migration_bitmap_sync_start();
1625     memory_global_dirty_log_sync();
1626
1627     qemu_mutex_lock(&rs->bitmap_mutex);
1628     rcu_read_lock();
1629     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1630         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1631     }
1632     ram_counters.remaining = ram_bytes_remaining();
1633     rcu_read_unlock();
1634     qemu_mutex_unlock(&rs->bitmap_mutex);
1635
1636     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1637
1638     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1639
1640     /* more than 1 second = 1000 millisecons */
1641     if (end_time > rs->time_last_bitmap_sync + 1000) {
1642         bytes_xfer_now = ram_counters.transferred;
1643
1644         /* During block migration the auto-converge logic incorrectly detects
1645          * that ram migration makes no progress. Avoid this by disabling the
1646          * throttling logic during the bulk phase of block migration. */
1647         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1648             /* The following detection logic can be refined later. For now:
1649                Check to see if the dirtied bytes is 50% more than the approx.
1650                amount of bytes that just got transferred since the last time we
1651                were in this routine. If that happens twice, start or increase
1652                throttling */
1653
1654             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1655                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1656                 (++rs->dirty_rate_high_cnt >= 2)) {
1657                     trace_migration_throttle();
1658                     rs->dirty_rate_high_cnt = 0;
1659                     mig_throttle_guest_down();
1660             }
1661         }
1662
1663         migration_update_rates(rs, end_time);
1664
1665         rs->target_page_count_prev = rs->target_page_count;
1666
1667         /* reset period counters */
1668         rs->time_last_bitmap_sync = end_time;
1669         rs->num_dirty_pages_period = 0;
1670         rs->bytes_xfer_prev = bytes_xfer_now;
1671     }
1672     if (migrate_use_events()) {
1673         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1674     }
1675 }
1676
1677 /**
1678  * save_zero_page_to_file: send the zero page to the file
1679  *
1680  * Returns the size of data written to the file, 0 means the page is not
1681  * a zero page
1682  *
1683  * @rs: current RAM state
1684  * @file: the file where the data is saved
1685  * @block: block that contains the page we want to send
1686  * @offset: offset inside the block for the page
1687  */
1688 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1689                                   RAMBlock *block, ram_addr_t offset)
1690 {
1691     uint8_t *p = block->host + offset;
1692     int len = 0;
1693
1694     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1695         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1696         qemu_put_byte(file, 0);
1697         len += 1;
1698     }
1699     return len;
1700 }
1701
1702 /**
1703  * save_zero_page: send the zero page to the stream
1704  *
1705  * Returns the number of pages written.
1706  *
1707  * @rs: current RAM state
1708  * @block: block that contains the page we want to send
1709  * @offset: offset inside the block for the page
1710  */
1711 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1712 {
1713     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1714
1715     if (len) {
1716         ram_counters.duplicate++;
1717         ram_counters.transferred += len;
1718         return 1;
1719     }
1720     return -1;
1721 }
1722
1723 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1724 {
1725     if (!migrate_release_ram() || !migration_in_postcopy()) {
1726         return;
1727     }
1728
1729     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1730 }
1731
1732 /*
1733  * @pages: the number of pages written by the control path,
1734  *        < 0 - error
1735  *        > 0 - number of pages written
1736  *
1737  * Return true if the pages has been saved, otherwise false is returned.
1738  */
1739 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1740                               int *pages)
1741 {
1742     uint64_t bytes_xmit = 0;
1743     int ret;
1744
1745     *pages = -1;
1746     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1747                                 &bytes_xmit);
1748     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1749         return false;
1750     }
1751
1752     if (bytes_xmit) {
1753         ram_counters.transferred += bytes_xmit;
1754         *pages = 1;
1755     }
1756
1757     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1758         return true;
1759     }
1760
1761     if (bytes_xmit > 0) {
1762         ram_counters.normal++;
1763     } else if (bytes_xmit == 0) {
1764         ram_counters.duplicate++;
1765     }
1766
1767     return true;
1768 }
1769
1770 /*
1771  * directly send the page to the stream
1772  *
1773  * Returns the number of pages written.
1774  *
1775  * @rs: current RAM state
1776  * @block: block that contains the page we want to send
1777  * @offset: offset inside the block for the page
1778  * @buf: the page to be sent
1779  * @async: send to page asyncly
1780  */
1781 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1782                             uint8_t *buf, bool async)
1783 {
1784     ram_counters.transferred += save_page_header(rs, rs->f, block,
1785                                                  offset | RAM_SAVE_FLAG_PAGE);
1786     if (async) {
1787         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1788                               migrate_release_ram() &
1789                               migration_in_postcopy());
1790     } else {
1791         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1792     }
1793     ram_counters.transferred += TARGET_PAGE_SIZE;
1794     ram_counters.normal++;
1795     return 1;
1796 }
1797
1798 /**
1799  * ram_save_page: send the given page to the stream
1800  *
1801  * Returns the number of pages written.
1802  *          < 0 - error
1803  *          >=0 - Number of pages written - this might legally be 0
1804  *                if xbzrle noticed the page was the same.
1805  *
1806  * @rs: current RAM state
1807  * @block: block that contains the page we want to send
1808  * @offset: offset inside the block for the page
1809  * @last_stage: if we are at the completion stage
1810  */
1811 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1812 {
1813     int pages = -1;
1814     uint8_t *p;
1815     bool send_async = true;
1816     RAMBlock *block = pss->block;
1817     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1818     ram_addr_t current_addr = block->offset + offset;
1819
1820     p = block->host + offset;
1821     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1822
1823     XBZRLE_cache_lock();
1824     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1825         migrate_use_xbzrle()) {
1826         pages = save_xbzrle_page(rs, &p, current_addr, block,
1827                                  offset, last_stage);
1828         if (!last_stage) {
1829             /* Can't send this cached data async, since the cache page
1830              * might get updated before it gets to the wire
1831              */
1832             send_async = false;
1833         }
1834     }
1835
1836     /* XBZRLE overflow or normal page */
1837     if (pages == -1) {
1838         pages = save_normal_page(rs, block, offset, p, send_async);
1839     }
1840
1841     XBZRLE_cache_unlock();
1842
1843     return pages;
1844 }
1845
1846 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1847                                  ram_addr_t offset)
1848 {
1849     multifd_queue_page(block, offset);
1850     ram_counters.normal++;
1851
1852     return 1;
1853 }
1854
1855 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1856                                  ram_addr_t offset, uint8_t *source_buf)
1857 {
1858     RAMState *rs = ram_state;
1859     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1860     bool zero_page = false;
1861     int ret;
1862
1863     if (save_zero_page_to_file(rs, f, block, offset)) {
1864         zero_page = true;
1865         goto exit;
1866     }
1867
1868     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1869
1870     /*
1871      * copy it to a internal buffer to avoid it being modified by VM
1872      * so that we can catch up the error during compression and
1873      * decompression
1874      */
1875     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1876     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1877     if (ret < 0) {
1878         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1879         error_report("compressed data failed!");
1880         return false;
1881     }
1882
1883 exit:
1884     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1885     return zero_page;
1886 }
1887
1888 static void
1889 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1890 {
1891     if (param->zero_page) {
1892         ram_counters.duplicate++;
1893     }
1894     ram_counters.transferred += bytes_xmit;
1895 }
1896
1897 static void flush_compressed_data(RAMState *rs)
1898 {
1899     int idx, len, thread_count;
1900
1901     if (!migrate_use_compression()) {
1902         return;
1903     }
1904     thread_count = migrate_compress_threads();
1905
1906     qemu_mutex_lock(&comp_done_lock);
1907     for (idx = 0; idx < thread_count; idx++) {
1908         while (!comp_param[idx].done) {
1909             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1910         }
1911     }
1912     qemu_mutex_unlock(&comp_done_lock);
1913
1914     for (idx = 0; idx < thread_count; idx++) {
1915         qemu_mutex_lock(&comp_param[idx].mutex);
1916         if (!comp_param[idx].quit) {
1917             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1918             /*
1919              * it's safe to fetch zero_page without holding comp_done_lock
1920              * as there is no further request submitted to the thread,
1921              * i.e, the thread should be waiting for a request at this point.
1922              */
1923             update_compress_thread_counts(&comp_param[idx], len);
1924         }
1925         qemu_mutex_unlock(&comp_param[idx].mutex);
1926     }
1927 }
1928
1929 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1930                                        ram_addr_t offset)
1931 {
1932     param->block = block;
1933     param->offset = offset;
1934 }
1935
1936 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1937                                            ram_addr_t offset)
1938 {
1939     int idx, thread_count, bytes_xmit = -1, pages = -1;
1940     bool wait = migrate_compress_wait_thread();
1941
1942     thread_count = migrate_compress_threads();
1943     qemu_mutex_lock(&comp_done_lock);
1944 retry:
1945     for (idx = 0; idx < thread_count; idx++) {
1946         if (comp_param[idx].done) {
1947             comp_param[idx].done = false;
1948             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1949             qemu_mutex_lock(&comp_param[idx].mutex);
1950             set_compress_params(&comp_param[idx], block, offset);
1951             qemu_cond_signal(&comp_param[idx].cond);
1952             qemu_mutex_unlock(&comp_param[idx].mutex);
1953             pages = 1;
1954             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1955             break;
1956         }
1957     }
1958
1959     /*
1960      * wait for the free thread if the user specifies 'compress-wait-thread',
1961      * otherwise we will post the page out in the main thread as normal page.
1962      */
1963     if (pages < 0 && wait) {
1964         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1965         goto retry;
1966     }
1967     qemu_mutex_unlock(&comp_done_lock);
1968
1969     return pages;
1970 }
1971
1972 /**
1973  * find_dirty_block: find the next dirty page and update any state
1974  * associated with the search process.
1975  *
1976  * Returns if a page is found
1977  *
1978  * @rs: current RAM state
1979  * @pss: data about the state of the current dirty page scan
1980  * @again: set to false if the search has scanned the whole of RAM
1981  */
1982 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1983 {
1984     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1985     if (pss->complete_round && pss->block == rs->last_seen_block &&
1986         pss->page >= rs->last_page) {
1987         /*
1988          * We've been once around the RAM and haven't found anything.
1989          * Give up.
1990          */
1991         *again = false;
1992         return false;
1993     }
1994     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1995         /* Didn't find anything in this RAM Block */
1996         pss->page = 0;
1997         pss->block = QLIST_NEXT_RCU(pss->block, next);
1998         if (!pss->block) {
1999             /*
2000              * If memory migration starts over, we will meet a dirtied page
2001              * which may still exists in compression threads's ring, so we
2002              * should flush the compressed data to make sure the new page
2003              * is not overwritten by the old one in the destination.
2004              *
2005              * Also If xbzrle is on, stop using the data compression at this
2006              * point. In theory, xbzrle can do better than compression.
2007              */
2008             flush_compressed_data(rs);
2009
2010             /* Hit the end of the list */
2011             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2012             /* Flag that we've looped */
2013             pss->complete_round = true;
2014             rs->ram_bulk_stage = false;
2015         }
2016         /* Didn't find anything this time, but try again on the new block */
2017         *again = true;
2018         return false;
2019     } else {
2020         /* Can go around again, but... */
2021         *again = true;
2022         /* We've found something so probably don't need to */
2023         return true;
2024     }
2025 }
2026
2027 /**
2028  * unqueue_page: gets a page of the queue
2029  *
2030  * Helper for 'get_queued_page' - gets a page off the queue
2031  *
2032  * Returns the block of the page (or NULL if none available)
2033  *
2034  * @rs: current RAM state
2035  * @offset: used to return the offset within the RAMBlock
2036  */
2037 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2038 {
2039     RAMBlock *block = NULL;
2040
2041     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2042         return NULL;
2043     }
2044
2045     qemu_mutex_lock(&rs->src_page_req_mutex);
2046     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2047         struct RAMSrcPageRequest *entry =
2048                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2049         block = entry->rb;
2050         *offset = entry->offset;
2051
2052         if (entry->len > TARGET_PAGE_SIZE) {
2053             entry->len -= TARGET_PAGE_SIZE;
2054             entry->offset += TARGET_PAGE_SIZE;
2055         } else {
2056             memory_region_unref(block->mr);
2057             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2058             g_free(entry);
2059             migration_consume_urgent_request();
2060         }
2061     }
2062     qemu_mutex_unlock(&rs->src_page_req_mutex);
2063
2064     return block;
2065 }
2066
2067 /**
2068  * get_queued_page: unqueue a page from the postocpy requests
2069  *
2070  * Skips pages that are already sent (!dirty)
2071  *
2072  * Returns if a queued page is found
2073  *
2074  * @rs: current RAM state
2075  * @pss: data about the state of the current dirty page scan
2076  */
2077 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2078 {
2079     RAMBlock  *block;
2080     ram_addr_t offset;
2081     bool dirty;
2082
2083     do {
2084         block = unqueue_page(rs, &offset);
2085         /*
2086          * We're sending this page, and since it's postcopy nothing else
2087          * will dirty it, and we must make sure it doesn't get sent again
2088          * even if this queue request was received after the background
2089          * search already sent it.
2090          */
2091         if (block) {
2092             unsigned long page;
2093
2094             page = offset >> TARGET_PAGE_BITS;
2095             dirty = test_bit(page, block->bmap);
2096             if (!dirty) {
2097                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2098                        page, test_bit(page, block->unsentmap));
2099             } else {
2100                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2101             }
2102         }
2103
2104     } while (block && !dirty);
2105
2106     if (block) {
2107         /*
2108          * As soon as we start servicing pages out of order, then we have
2109          * to kill the bulk stage, since the bulk stage assumes
2110          * in (migration_bitmap_find_and_reset_dirty) that every page is
2111          * dirty, that's no longer true.
2112          */
2113         rs->ram_bulk_stage = false;
2114
2115         /*
2116          * We want the background search to continue from the queued page
2117          * since the guest is likely to want other pages near to the page
2118          * it just requested.
2119          */
2120         pss->block = block;
2121         pss->page = offset >> TARGET_PAGE_BITS;
2122     }
2123
2124     return !!block;
2125 }
2126
2127 /**
2128  * migration_page_queue_free: drop any remaining pages in the ram
2129  * request queue
2130  *
2131  * It should be empty at the end anyway, but in error cases there may
2132  * be some left.  in case that there is any page left, we drop it.
2133  *
2134  */
2135 static void migration_page_queue_free(RAMState *rs)
2136 {
2137     struct RAMSrcPageRequest *mspr, *next_mspr;
2138     /* This queue generally should be empty - but in the case of a failed
2139      * migration might have some droppings in.
2140      */
2141     rcu_read_lock();
2142     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2143         memory_region_unref(mspr->rb->mr);
2144         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2145         g_free(mspr);
2146     }
2147     rcu_read_unlock();
2148 }
2149
2150 /**
2151  * ram_save_queue_pages: queue the page for transmission
2152  *
2153  * A request from postcopy destination for example.
2154  *
2155  * Returns zero on success or negative on error
2156  *
2157  * @rbname: Name of the RAMBLock of the request. NULL means the
2158  *          same that last one.
2159  * @start: starting address from the start of the RAMBlock
2160  * @len: length (in bytes) to send
2161  */
2162 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2163 {
2164     RAMBlock *ramblock;
2165     RAMState *rs = ram_state;
2166
2167     ram_counters.postcopy_requests++;
2168     rcu_read_lock();
2169     if (!rbname) {
2170         /* Reuse last RAMBlock */
2171         ramblock = rs->last_req_rb;
2172
2173         if (!ramblock) {
2174             /*
2175              * Shouldn't happen, we can't reuse the last RAMBlock if
2176              * it's the 1st request.
2177              */
2178             error_report("ram_save_queue_pages no previous block");
2179             goto err;
2180         }
2181     } else {
2182         ramblock = qemu_ram_block_by_name(rbname);
2183
2184         if (!ramblock) {
2185             /* We shouldn't be asked for a non-existent RAMBlock */
2186             error_report("ram_save_queue_pages no block '%s'", rbname);
2187             goto err;
2188         }
2189         rs->last_req_rb = ramblock;
2190     }
2191     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2192     if (start+len > ramblock->used_length) {
2193         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2194                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2195                      __func__, start, len, ramblock->used_length);
2196         goto err;
2197     }
2198
2199     struct RAMSrcPageRequest *new_entry =
2200         g_malloc0(sizeof(struct RAMSrcPageRequest));
2201     new_entry->rb = ramblock;
2202     new_entry->offset = start;
2203     new_entry->len = len;
2204
2205     memory_region_ref(ramblock->mr);
2206     qemu_mutex_lock(&rs->src_page_req_mutex);
2207     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2208     migration_make_urgent_request();
2209     qemu_mutex_unlock(&rs->src_page_req_mutex);
2210     rcu_read_unlock();
2211
2212     return 0;
2213
2214 err:
2215     rcu_read_unlock();
2216     return -1;
2217 }
2218
2219 static bool save_page_use_compression(RAMState *rs)
2220 {
2221     if (!migrate_use_compression()) {
2222         return false;
2223     }
2224
2225     /*
2226      * If xbzrle is on, stop using the data compression after first
2227      * round of migration even if compression is enabled. In theory,
2228      * xbzrle can do better than compression.
2229      */
2230     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2231         return true;
2232     }
2233
2234     return false;
2235 }
2236
2237 /*
2238  * try to compress the page before posting it out, return true if the page
2239  * has been properly handled by compression, otherwise needs other
2240  * paths to handle it
2241  */
2242 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2243 {
2244     if (!save_page_use_compression(rs)) {
2245         return false;
2246     }
2247
2248     /*
2249      * When starting the process of a new block, the first page of
2250      * the block should be sent out before other pages in the same
2251      * block, and all the pages in last block should have been sent
2252      * out, keeping this order is important, because the 'cont' flag
2253      * is used to avoid resending the block name.
2254      *
2255      * We post the fist page as normal page as compression will take
2256      * much CPU resource.
2257      */
2258     if (block != rs->last_sent_block) {
2259         flush_compressed_data(rs);
2260         return false;
2261     }
2262
2263     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2264         return true;
2265     }
2266
2267     return false;
2268 }
2269
2270 /**
2271  * ram_save_target_page: save one target page
2272  *
2273  * Returns the number of pages written
2274  *
2275  * @rs: current RAM state
2276  * @pss: data about the page we want to send
2277  * @last_stage: if we are at the completion stage
2278  */
2279 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2280                                 bool last_stage)
2281 {
2282     RAMBlock *block = pss->block;
2283     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2284     int res;
2285
2286     if (control_save_page(rs, block, offset, &res)) {
2287         return res;
2288     }
2289
2290     if (save_compress_page(rs, block, offset)) {
2291         return 1;
2292     }
2293
2294     res = save_zero_page(rs, block, offset);
2295     if (res > 0) {
2296         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2297          * page would be stale
2298          */
2299         if (!save_page_use_compression(rs)) {
2300             XBZRLE_cache_lock();
2301             xbzrle_cache_zero_page(rs, block->offset + offset);
2302             XBZRLE_cache_unlock();
2303         }
2304         ram_release_pages(block->idstr, offset, res);
2305         return res;
2306     }
2307
2308     /*
2309      * do not use multifd for compression as the first page in the new
2310      * block should be posted out before sending the compressed page
2311      */
2312     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2313         return ram_save_multifd_page(rs, block, offset);
2314     }
2315
2316     return ram_save_page(rs, pss, last_stage);
2317 }
2318
2319 /**
2320  * ram_save_host_page: save a whole host page
2321  *
2322  * Starting at *offset send pages up to the end of the current host
2323  * page. It's valid for the initial offset to point into the middle of
2324  * a host page in which case the remainder of the hostpage is sent.
2325  * Only dirty target pages are sent. Note that the host page size may
2326  * be a huge page for this block.
2327  * The saving stops at the boundary of the used_length of the block
2328  * if the RAMBlock isn't a multiple of the host page size.
2329  *
2330  * Returns the number of pages written or negative on error
2331  *
2332  * @rs: current RAM state
2333  * @ms: current migration state
2334  * @pss: data about the page we want to send
2335  * @last_stage: if we are at the completion stage
2336  */
2337 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2338                               bool last_stage)
2339 {
2340     int tmppages, pages = 0;
2341     size_t pagesize_bits =
2342         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2343
2344     if (!qemu_ram_is_migratable(pss->block)) {
2345         error_report("block %s should not be migrated !", pss->block->idstr);
2346         return 0;
2347     }
2348
2349     do {
2350         /* Check the pages is dirty and if it is send it */
2351         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2352             pss->page++;
2353             continue;
2354         }
2355
2356         tmppages = ram_save_target_page(rs, pss, last_stage);
2357         if (tmppages < 0) {
2358             return tmppages;
2359         }
2360
2361         pages += tmppages;
2362         if (pss->block->unsentmap) {
2363             clear_bit(pss->page, pss->block->unsentmap);
2364         }
2365
2366         pss->page++;
2367     } while ((pss->page & (pagesize_bits - 1)) &&
2368              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2369
2370     /* The offset we leave with is the last one we looked at */
2371     pss->page--;
2372     return pages;
2373 }
2374
2375 /**
2376  * ram_find_and_save_block: finds a dirty page and sends it to f
2377  *
2378  * Called within an RCU critical section.
2379  *
2380  * Returns the number of pages written where zero means no dirty pages,
2381  * or negative on error
2382  *
2383  * @rs: current RAM state
2384  * @last_stage: if we are at the completion stage
2385  *
2386  * On systems where host-page-size > target-page-size it will send all the
2387  * pages in a host page that are dirty.
2388  */
2389
2390 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2391 {
2392     PageSearchStatus pss;
2393     int pages = 0;
2394     bool again, found;
2395
2396     /* No dirty page as there is zero RAM */
2397     if (!ram_bytes_total()) {
2398         return pages;
2399     }
2400
2401     pss.block = rs->last_seen_block;
2402     pss.page = rs->last_page;
2403     pss.complete_round = false;
2404
2405     if (!pss.block) {
2406         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2407     }
2408
2409     do {
2410         again = true;
2411         found = get_queued_page(rs, &pss);
2412
2413         if (!found) {
2414             /* priority queue empty, so just search for something dirty */
2415             found = find_dirty_block(rs, &pss, &again);
2416         }
2417
2418         if (found) {
2419             pages = ram_save_host_page(rs, &pss, last_stage);
2420         }
2421     } while (!pages && again);
2422
2423     rs->last_seen_block = pss.block;
2424     rs->last_page = pss.page;
2425
2426     return pages;
2427 }
2428
2429 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2430 {
2431     uint64_t pages = size / TARGET_PAGE_SIZE;
2432
2433     if (zero) {
2434         ram_counters.duplicate += pages;
2435     } else {
2436         ram_counters.normal += pages;
2437         ram_counters.transferred += size;
2438         qemu_update_position(f, size);
2439     }
2440 }
2441
2442 uint64_t ram_bytes_total(void)
2443 {
2444     RAMBlock *block;
2445     uint64_t total = 0;
2446
2447     rcu_read_lock();
2448     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2449         total += block->used_length;
2450     }
2451     rcu_read_unlock();
2452     return total;
2453 }
2454
2455 static void xbzrle_load_setup(void)
2456 {
2457     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2458 }
2459
2460 static void xbzrle_load_cleanup(void)
2461 {
2462     g_free(XBZRLE.decoded_buf);
2463     XBZRLE.decoded_buf = NULL;
2464 }
2465
2466 static void ram_state_cleanup(RAMState **rsp)
2467 {
2468     if (*rsp) {
2469         migration_page_queue_free(*rsp);
2470         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2471         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2472         g_free(*rsp);
2473         *rsp = NULL;
2474     }
2475 }
2476
2477 static void xbzrle_cleanup(void)
2478 {
2479     XBZRLE_cache_lock();
2480     if (XBZRLE.cache) {
2481         cache_fini(XBZRLE.cache);
2482         g_free(XBZRLE.encoded_buf);
2483         g_free(XBZRLE.current_buf);
2484         g_free(XBZRLE.zero_target_page);
2485         XBZRLE.cache = NULL;
2486         XBZRLE.encoded_buf = NULL;
2487         XBZRLE.current_buf = NULL;
2488         XBZRLE.zero_target_page = NULL;
2489     }
2490     XBZRLE_cache_unlock();
2491 }
2492
2493 static void ram_save_cleanup(void *opaque)
2494 {
2495     RAMState **rsp = opaque;
2496     RAMBlock *block;
2497
2498     /* caller have hold iothread lock or is in a bh, so there is
2499      * no writing race against this migration_bitmap
2500      */
2501     memory_global_dirty_log_stop();
2502
2503     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2504         g_free(block->bmap);
2505         block->bmap = NULL;
2506         g_free(block->unsentmap);
2507         block->unsentmap = NULL;
2508     }
2509
2510     xbzrle_cleanup();
2511     compress_threads_save_cleanup();
2512     ram_state_cleanup(rsp);
2513 }
2514
2515 static void ram_state_reset(RAMState *rs)
2516 {
2517     rs->last_seen_block = NULL;
2518     rs->last_sent_block = NULL;
2519     rs->last_page = 0;
2520     rs->last_version = ram_list.version;
2521     rs->ram_bulk_stage = true;
2522 }
2523
2524 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2525
/*
 * Dump a bitmap to stderr, 128 bits per line, printing '1' for a set bit
 * and '.' for a clear one.  Each printed line is prefixed with the index
 * of its first bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * 'todump' is the bitmap to dump and 'pages' the number of bits in it.
 * There is no fallback bitmap here: a NULL 'todump' is ignored rather
 * than dereferenced.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    /* Unsigned counters: they are compared against 'pages' throughout. */
    unsigned long cur;
    unsigned long linelen = 128;
    char linebuf[129];

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        unsigned long curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (linelen > pages - cur) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
2559
2560 /* **** functions for postcopy ***** */
2561
2562 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2563 {
2564     struct RAMBlock *block;
2565
2566     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2567         unsigned long *bitmap = block->bmap;
2568         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2569         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2570
2571         while (run_start < range) {
2572             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2573             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2574                               (run_end - run_start) << TARGET_PAGE_BITS);
2575             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2576         }
2577     }
2578 }
2579
2580 /**
2581  * postcopy_send_discard_bm_ram: discard a RAMBlock
2582  *
2583  * Returns zero on success
2584  *
2585  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2586  * Note: At this point the 'unsentmap' is the processed bitmap combined
2587  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2588  *
2589  * @ms: current migration state
2590  * @pds: state for postcopy
2591  * @start: RAMBlock starting page
2592  * @length: RAMBlock size
2593  */
2594 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2595                                         PostcopyDiscardState *pds,
2596                                         RAMBlock *block)
2597 {
2598     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2599     unsigned long current;
2600     unsigned long *unsentmap = block->unsentmap;
2601
2602     for (current = 0; current < end; ) {
2603         unsigned long one = find_next_bit(unsentmap, end, current);
2604
2605         if (one <= end) {
2606             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2607             unsigned long discard_length;
2608
2609             if (zero >= end) {
2610                 discard_length = end - one;
2611             } else {
2612                 discard_length = zero - one;
2613             }
2614             if (discard_length) {
2615                 postcopy_discard_send_range(ms, pds, one, discard_length);
2616             }
2617             current = one + discard_length;
2618         } else {
2619             current = one;
2620         }
2621     }
2622
2623     return 0;
2624 }
2625
2626 /**
2627  * postcopy_each_ram_send_discard: discard all RAMBlocks
2628  *
2629  * Returns 0 for success or negative for error
2630  *
2631  * Utility for the outgoing postcopy code.
2632  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2633  *   passing it bitmap indexes and name.
2634  * (qemu_ram_foreach_block ends up passing unscaled lengths
2635  *  which would mean postcopy code would have to deal with target page)
2636  *
2637  * @ms: current migration state
2638  */
2639 static int postcopy_each_ram_send_discard(MigrationState *ms)
2640 {
2641     struct RAMBlock *block;
2642     int ret;
2643
2644     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2645         PostcopyDiscardState *pds =
2646             postcopy_discard_send_init(ms, block->idstr);
2647
2648         /*
2649          * Postcopy sends chunks of bitmap over the wire, but it
2650          * just needs indexes at this point, avoids it having
2651          * target page specific code.
2652          */
2653         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2654         postcopy_discard_send_finish(ms, pds);
2655         if (ret) {
2656             return ret;
2657         }
2658     }
2659
2660     return 0;
2661 }
2662
2663 /**
2664  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
2665  *
2666  * Helper for postcopy_chunk_hostpages; it's called twice to
2667  * canonicalize the two bitmaps, that are similar, but one is
2668  * inverted.
2669  *
2670  * Postcopy requires that all target pages in a hostpage are dirty or
2671  * clean, not a mix.  This function canonicalizes the bitmaps.
2672  *
2673  * @ms: current migration state
2674  * @unsent_pass: if true we need to canonicalize partially unsent host pages
2675  *               otherwise we need to canonicalize partially dirty host pages
2676  * @block: block that contains the page we want to canonicalize
2677  * @pds: state for postcopy
2678  */
2679 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2680                                           RAMBlock *block,
2681                                           PostcopyDiscardState *pds)
2682 {
2683     RAMState *rs = ram_state;
2684     unsigned long *bitmap = block->bmap;
2685     unsigned long *unsentmap = block->unsentmap;
2686     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2687     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2688     unsigned long run_start;
2689
2690     if (block->page_size == TARGET_PAGE_SIZE) {
2691         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2692         return;
2693     }
2694
2695     if (unsent_pass) {
2696         /* Find a sent page */
2697         run_start = find_next_zero_bit(unsentmap, pages, 0);
2698     } else {
2699         /* Find a dirty page */
2700         run_start = find_next_bit(bitmap, pages, 0);
2701     }
2702
2703     while (run_start < pages) {
2704         bool do_fixup = false;
2705         unsigned long fixup_start_addr;
2706         unsigned long host_offset;
2707
2708         /*
2709          * If the start of this run of pages is in the middle of a host
2710          * page, then we need to fixup this host page.
2711          */
2712         host_offset = run_start % host_ratio;
2713         if (host_offset) {
2714             do_fixup = true;
2715             run_start -= host_offset;
2716             fixup_start_addr = run_start;
2717             /* For the next pass */
2718             run_start = run_start + host_ratio;
2719         } else {
2720             /* Find the end of this run */
2721             unsigned long run_end;
2722             if (unsent_pass) {
2723                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
2724             } else {
2725                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2726             }
2727             /*
2728              * If the end isn't at the start of a host page, then the
2729              * run doesn't finish at the end of a host page
2730              * and we need to discard.
2731              */
2732             host_offset = run_end % host_ratio;
2733             if (host_offset) {
2734                 do_fixup = true;
2735                 fixup_start_addr = run_end - host_offset;
2736                 /*
2737                  * This host page has gone, the next loop iteration starts
2738                  * from after the fixup
2739                  */
2740                 run_start = fixup_start_addr + host_ratio;
2741             } else {
2742                 /*
2743                  * No discards on this iteration, next loop starts from
2744                  * next sent/dirty page
2745                  */
2746                 run_start = run_end + 1;
2747             }
2748         }
2749
2750         if (do_fixup) {
2751             unsigned long page;
2752
2753             /* Tell the destination to discard this page */
2754             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2755                 /* For the unsent_pass we:
2756                  *     discard partially sent pages
2757                  * For the !unsent_pass (dirty) we:
2758                  *     discard partially dirty pages that were sent
2759                  *     (any partially sent pages were already discarded
2760                  *     by the previous unsent_pass)
2761                  */
2762                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
2763                                             host_ratio);
2764             }
2765
2766             /* Clean up the bitmap */
2767             for (page = fixup_start_addr;
2768                  page < fixup_start_addr + host_ratio; page++) {
2769                 /* All pages in this host page are now not sent */
2770                 set_bit(page, unsentmap);
2771
2772                 /*
2773                  * Remark them as dirty, updating the count for any pages
2774                  * that weren't previously dirty.
2775                  */
2776                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2777             }
2778         }
2779
2780         if (unsent_pass) {
2781             /* Find the next sent page for the next iteration */
2782             run_start = find_next_zero_bit(unsentmap, pages, run_start);
2783         } else {
2784             /* Find the next dirty page for the next iteration */
2785             run_start = find_next_bit(bitmap, pages, run_start);
2786         }
2787     }
2788 }
2789
2790 /**
2791  * postcopy_chuck_hostpages: discrad any partially sent host page
2792  *
2793  * Utility for the outgoing postcopy code.
2794  *
2795  * Discard any partially sent host-page size chunks, mark any partially
2796  * dirty host-page size chunks as all dirty.  In this case the host-page
2797  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2798  *
2799  * Returns zero on success
2800  *
2801  * @ms: current migration state
2802  * @block: block we want to work with
2803  */
2804 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2805 {
2806     PostcopyDiscardState *pds =
2807         postcopy_discard_send_init(ms, block->idstr);
2808
2809     /* First pass: Discard all partially sent host pages */
2810     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2811     /*
2812      * Second pass: Ensure that all partially dirty host pages are made
2813      * fully dirty.
2814      */
2815     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2816
2817     postcopy_discard_send_finish(ms, pds);
2818     return 0;
2819 }
2820
2821 /**
2822  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2823  *
2824  * Returns zero on success
2825  *
2826  * Transmit the set of pages to be discarded after precopy to the target
2827  * these are pages that:
2828  *     a) Have been previously transmitted but are now dirty again
2829  *     b) Pages that have never been transmitted, this ensures that
2830  *        any pages on the destination that have been mapped by background
2831  *        tasks get discarded (transparent huge pages is the specific concern)
2832  * Hopefully this is pretty sparse
2833  *
2834  * @ms: current migration state
2835  */
2836 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2837 {
2838     RAMState *rs = ram_state;
2839     RAMBlock *block;
2840     int ret;
2841
2842     rcu_read_lock();
2843
2844     /* This should be our last sync, the src is now paused */
2845     migration_bitmap_sync(rs);
2846
2847     /* Easiest way to make sure we don't resume in the middle of a host-page */
2848     rs->last_seen_block = NULL;
2849     rs->last_sent_block = NULL;
2850     rs->last_page = 0;
2851
2852     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2853         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2854         unsigned long *bitmap = block->bmap;
2855         unsigned long *unsentmap = block->unsentmap;
2856
2857         if (!unsentmap) {
2858             /* We don't have a safe way to resize the sentmap, so
2859              * if the bitmap was resized it will be NULL at this
2860              * point.
2861              */
2862             error_report("migration ram resized during precopy phase");
2863             rcu_read_unlock();
2864             return -EINVAL;
2865         }
2866         /* Deal with TPS != HPS and huge pages */
2867         ret = postcopy_chunk_hostpages(ms, block);
2868         if (ret) {
2869             rcu_read_unlock();
2870             return ret;
2871         }
2872
2873         /*
2874          * Update the unsentmap to be unsentmap = unsentmap | dirty
2875          */
2876         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2877 #ifdef DEBUG_POSTCOPY
2878         ram_debug_dump_bitmap(unsentmap, true, pages);
2879 #endif
2880     }
2881     trace_ram_postcopy_send_discard_bitmap();
2882
2883     ret = postcopy_each_ram_send_discard(ms);
2884     rcu_read_unlock();
2885
2886     return ret;
2887 }
2888
2889 /**
2890  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2891  *
2892  * Returns zero on success
2893  *
2894  * @rbname: name of the RAMBlock of the request. NULL means the
2895  *          same that last one.
2896  * @start: RAMBlock starting page
2897  * @length: RAMBlock size
2898  */
2899 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2900 {
2901     int ret = -1;
2902
2903     trace_ram_discard_range(rbname, start, length);
2904
2905     rcu_read_lock();
2906     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2907
2908     if (!rb) {
2909         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2910         goto err;
2911     }
2912
2913     /*
2914      * On source VM, we don't need to update the received bitmap since
2915      * we don't even have one.
2916      */
2917     if (rb->receivedmap) {
2918         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2919                      length >> qemu_target_page_bits());
2920     }
2921
2922     ret = ram_block_discard_range(rb, start, length);
2923
2924 err:
2925     rcu_read_unlock();
2926
2927     return ret;
2928 }
2929
2930 /*
2931  * For every allocation, we will try not to crash the VM if the
2932  * allocation failed.
2933  */
2934 static int xbzrle_init(void)
2935 {
2936     Error *local_err = NULL;
2937
2938     if (!migrate_use_xbzrle()) {
2939         return 0;
2940     }
2941
2942     XBZRLE_cache_lock();
2943
2944     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2945     if (!XBZRLE.zero_target_page) {
2946         error_report("%s: Error allocating zero page", __func__);
2947         goto err_out;
2948     }
2949
2950     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2951                               TARGET_PAGE_SIZE, &local_err);
2952     if (!XBZRLE.cache) {
2953         error_report_err(local_err);
2954         goto free_zero_page;
2955     }
2956
2957     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2958     if (!XBZRLE.encoded_buf) {
2959         error_report("%s: Error allocating encoded_buf", __func__);
2960         goto free_cache;
2961     }
2962
2963     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2964     if (!XBZRLE.current_buf) {
2965         error_report("%s: Error allocating current_buf", __func__);
2966         goto free_encoded_buf;
2967     }
2968
2969     /* We are all good */
2970     XBZRLE_cache_unlock();
2971     return 0;
2972
2973 free_encoded_buf:
2974     g_free(XBZRLE.encoded_buf);
2975     XBZRLE.encoded_buf = NULL;
2976 free_cache:
2977     cache_fini(XBZRLE.cache);
2978     XBZRLE.cache = NULL;
2979 free_zero_page:
2980     g_free(XBZRLE.zero_target_page);
2981     XBZRLE.zero_target_page = NULL;
2982 err_out:
2983     XBZRLE_cache_unlock();
2984     return -ENOMEM;
2985 }
2986
2987 static int ram_state_init(RAMState **rsp)
2988 {
2989     *rsp = g_try_new0(RAMState, 1);
2990
2991     if (!*rsp) {
2992         error_report("%s: Init ramstate fail", __func__);
2993         return -1;
2994     }
2995
2996     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2997     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2998     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2999
3000     /*
3001      * Count the total number of pages used by ram blocks not including any
3002      * gaps due to alignment or unplugs.
3003      */
3004     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3005
3006     ram_state_reset(*rsp);
3007
3008     return 0;
3009 }
3010
3011 static void ram_list_init_bitmaps(void)
3012 {
3013     RAMBlock *block;
3014     unsigned long pages;
3015
3016     /* Skip setting bitmap if there is no RAM */
3017     if (ram_bytes_total()) {
3018         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3019             pages = block->max_length >> TARGET_PAGE_BITS;
3020             block->bmap = bitmap_new(pages);
3021             bitmap_set(block->bmap, 0, pages);
3022             if (migrate_postcopy_ram()) {
3023                 block->unsentmap = bitmap_new(pages);
3024                 bitmap_set(block->unsentmap, 0, pages);
3025             }
3026         }
3027     }
3028 }
3029
/*
 * Create the migration bitmaps, start global dirty logging and do a first
 * bitmap sync for @rs.
 *
 * The whole sequence runs with the iothread lock, the ramlist lock and the
 * RCU read lock held; they are taken in that order and dropped in reverse.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();

    ram_list_init_bitmaps();
    memory_global_dirty_log_start();
    /* Sync right after dirty logging has been switched on */
    migration_bitmap_sync(rs);

    rcu_read_unlock();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3045
3046 static int ram_init_all(RAMState **rsp)
3047 {
3048     if (ram_state_init(rsp)) {
3049         return -1;
3050     }
3051
3052     if (xbzrle_init()) {
3053         ram_state_cleanup(rsp);
3054         return -1;
3055     }
3056
3057     ram_init_bitmaps(*rsp);
3058
3059     return 0;
3060 }
3061
3062 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3063 {
3064     RAMBlock *block;
3065     uint64_t pages = 0;
3066
3067     /*
3068      * Postcopy is not using xbzrle/compression, so no need for that.
3069      * Also, since source are already halted, we don't need to care
3070      * about dirty page logging as well.
3071      */
3072
3073     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3074         pages += bitmap_count_one(block->bmap,
3075                                   block->used_length >> TARGET_PAGE_BITS);
3076     }
3077
3078     /* This may not be aligned with current bitmaps. Recalculate. */
3079     rs->migration_dirty_pages = pages;
3080
3081     rs->last_seen_block = NULL;
3082     rs->last_sent_block = NULL;
3083     rs->last_page = 0;
3084     rs->last_version = ram_list.version;
3085     /*
3086      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3087      * matter what we have sent.
3088      */
3089     rs->ram_bulk_stage = false;
3090
3091     /* Update RAMState cache of output QEMUFile */
3092     rs->f = out;
3093
3094     trace_ram_state_resume_prepare(pages);
3095 }
3096
3097 /*
3098  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3099  * long-running RCU critical section.  When rcu-reclaims in the code
3100  * start to become numerous it will be necessary to reduce the
3101  * granularity of these critical sections.
3102  */
3103
3104 /**
3105  * ram_save_setup: Setup RAM for migration
3106  *
3107  * Returns zero to indicate success and negative for error
3108  *
3109  * @f: QEMUFile where to send the data
3110  * @opaque: RAMState pointer
3111  */
3112 static int ram_save_setup(QEMUFile *f, void *opaque)
3113 {
3114     RAMState **rsp = opaque;
3115     RAMBlock *block;
3116
3117     if (compress_threads_save_setup()) {
3118         return -1;
3119     }
3120
3121     /* migration has already setup the bitmap, reuse it. */
3122     if (!migration_in_colo_state()) {
3123         if (ram_init_all(rsp) != 0) {
3124             compress_threads_save_cleanup();
3125             return -1;
3126         }
3127     }
3128     (*rsp)->f = f;
3129
3130     rcu_read_lock();
3131
3132     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3133
3134     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3135         qemu_put_byte(f, strlen(block->idstr));
3136         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3137         qemu_put_be64(f, block->used_length);
3138         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3139             qemu_put_be64(f, block->page_size);
3140         }
3141     }
3142
3143     rcu_read_unlock();
3144
3145     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3146     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3147
3148     multifd_send_sync_main();
3149     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3150     qemu_fflush(f);
3151
3152     return 0;
3153 }
3154
3155 /**
3156  * ram_save_iterate: iterative stage for migration
3157  *
3158  * Returns zero to indicate success and negative for error
3159  *
3160  * @f: QEMUFile where to send the data
3161  * @opaque: RAMState pointer
3162  */
3163 static int ram_save_iterate(QEMUFile *f, void *opaque)
3164 {
3165     RAMState **temp = opaque;
3166     RAMState *rs = *temp;
3167     int ret;
3168     int i;
3169     int64_t t0;
3170     int done = 0;
3171
3172     if (blk_mig_bulk_active()) {
3173         /* Avoid transferring ram during bulk phase of block migration as
3174          * the bulk phase will usually take a long time and transferring
3175          * ram updates during that time is pointless. */
3176         goto out;
3177     }
3178
3179     rcu_read_lock();
3180     if (ram_list.version != rs->last_version) {
3181         ram_state_reset(rs);
3182     }
3183
3184     /* Read version before ram_list.blocks */
3185     smp_rmb();
3186
3187     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3188
3189     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3190     i = 0;
3191     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3192             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3193         int pages;
3194
3195         if (qemu_file_get_error(f)) {
3196             break;
3197         }
3198
3199         pages = ram_find_and_save_block(rs, false);
3200         /* no more pages to sent */
3201         if (pages == 0) {
3202             done = 1;
3203             break;
3204         }
3205
3206         if (pages < 0) {
3207             qemu_file_set_error(f, pages);
3208             break;
3209         }
3210
3211         rs->target_page_count += pages;
3212
3213         /* we want to check in the 1st loop, just in case it was the 1st time
3214            and we had to sync the dirty bitmap.
3215            qemu_get_clock_ns() is a bit expensive, so we only check each some
3216            iterations
3217         */
3218         if ((i & 63) == 0) {
3219             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3220             if (t1 > MAX_WAIT) {
3221                 trace_ram_save_iterate_big_wait(t1, i);
3222                 break;
3223             }
3224         }
3225         i++;
3226     }
3227     rcu_read_unlock();
3228
3229     /*
3230      * Must occur before EOS (or any QEMUFile operation)
3231      * because of RDMA protocol.
3232      */
3233     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3234
3235     multifd_send_sync_main();
3236 out:
3237     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3238     qemu_fflush(f);
3239     ram_counters.transferred += 8;
3240
3241     ret = qemu_file_get_error(f);
3242     if (ret < 0) {
3243         return ret;
3244     }
3245
3246     return done;
3247 }
3248
3249 /**
3250  * ram_save_complete: function called to send the remaining amount of ram
3251  *
3252  * Returns zero to indicate success or negative on error
3253  *
3254  * Called with iothread lock
3255  *
3256  * @f: QEMUFile where to send the data
3257  * @opaque: RAMState pointer
3258  */
3259 static int ram_save_complete(QEMUFile *f, void *opaque)
3260 {
3261     RAMState **temp = opaque;
3262     RAMState *rs = *temp;
3263     int ret = 0;
3264
3265     rcu_read_lock();
3266
3267     if (!migration_in_postcopy()) {
3268         migration_bitmap_sync(rs);
3269     }
3270
3271     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3272
3273     /* try transferring iterative blocks of memory */
3274
3275     /* flush all remaining blocks regardless of rate limiting */
3276     while (true) {
3277         int pages;
3278
3279         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3280         /* no more blocks to sent */
3281         if (pages == 0) {
3282             break;
3283         }
3284         if (pages < 0) {
3285             ret = pages;
3286             break;
3287         }
3288     }
3289
3290     flush_compressed_data(rs);
3291     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3292
3293     rcu_read_unlock();
3294
3295     multifd_send_sync_main();
3296     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3297     qemu_fflush(f);
3298
3299     return ret;
3300 }
3301
3302 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3303                              uint64_t *res_precopy_only,
3304                              uint64_t *res_compatible,
3305                              uint64_t *res_postcopy_only)
3306 {
3307     RAMState **temp = opaque;
3308     RAMState *rs = *temp;
3309     uint64_t remaining_size;
3310
3311     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3312
3313     if (!migration_in_postcopy() &&
3314         remaining_size < max_size) {
3315         qemu_mutex_lock_iothread();
3316         rcu_read_lock();
3317         migration_bitmap_sync(rs);
3318         rcu_read_unlock();
3319         qemu_mutex_unlock_iothread();
3320         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3321     }
3322
3323     if (migrate_postcopy_ram()) {
3324         /* We can do postcopy, and all the data is postcopiable */
3325         *res_compatible += remaining_size;
3326     } else {
3327         *res_precopy_only += remaining_size;
3328     }
3329 }
3330
3331 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3332 {
3333     unsigned int xh_len;
3334     int xh_flags;
3335     uint8_t *loaded_data;
3336
3337     /* extract RLE header */
3338     xh_flags = qemu_get_byte(f);
3339     xh_len = qemu_get_be16(f);
3340
3341     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3342         error_report("Failed to load XBZRLE page - wrong compression!");
3343         return -1;
3344     }
3345
3346     if (xh_len > TARGET_PAGE_SIZE) {
3347         error_report("Failed to load XBZRLE page - len overflow!");
3348         return -1;
3349     }
3350     loaded_data = XBZRLE.decoded_buf;
3351     /* load data and decode */
3352     /* it can change loaded_data to point to an internal buffer */
3353     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3354
3355     /* decode RLE */
3356     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3357                              TARGET_PAGE_SIZE) == -1) {
3358         error_report("Failed to load XBZRLE page - decode error!");
3359         return -1;
3360     }
3361
3362     return 0;
3363 }
3364
3365 /**
3366  * ram_block_from_stream: read a RAMBlock id from the migration stream
3367  *
3368  * Must be called from within a rcu critical section.
3369  *
3370  * Returns a pointer from within the RCU-protected ram_list.
3371  *
3372  * @f: QEMUFile where to read the data from
3373  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3374  */
3375 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3376 {
3377     static RAMBlock *block = NULL;
3378     char id[256];
3379     uint8_t len;
3380
3381     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3382         if (!block) {
3383             error_report("Ack, bad migration stream!");
3384             return NULL;
3385         }
3386         return block;
3387     }
3388
3389     len = qemu_get_byte(f);
3390     qemu_get_buffer(f, (uint8_t *)id, len);
3391     id[len] = 0;
3392
3393     block = qemu_ram_block_by_name(id);
3394     if (!block) {
3395         error_report("Can't find block %s", id);
3396         return NULL;
3397     }
3398
3399     if (!qemu_ram_is_migratable(block)) {
3400         error_report("block %s should not be migrated !", id);
3401         return NULL;
3402     }
3403
3404     return block;
3405 }
3406
3407 static inline void *host_from_ram_block_offset(RAMBlock *block,
3408                                                ram_addr_t offset)
3409 {
3410     if (!offset_in_ramblock(block, offset)) {
3411         return NULL;
3412     }
3413
3414     return block->host + offset;
3415 }
3416
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  Memory that already reads as zero
 * is left alone when the fill byte is zero, so it is not written to.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Zero fill requested and the range is zero already: nothing to do */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3433
3434 /* return the size after decompression, or negative value on error */
3435 static int
3436 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3437                      const uint8_t *source, size_t source_len)
3438 {
3439     int err;
3440
3441     err = inflateReset(stream);
3442     if (err != Z_OK) {
3443         return -1;
3444     }
3445
3446     stream->avail_in = source_len;
3447     stream->next_in = (uint8_t *)source;
3448     stream->avail_out = dest_len;
3449     stream->next_out = dest;
3450
3451     err = inflate(stream, Z_NO_FLUSH);
3452     if (err != Z_STREAM_END) {
3453         return -1;
3454     }
3455
3456     return stream->total_out;
3457 }
3458
3459 static void *do_data_decompress(void *opaque)
3460 {
3461     DecompressParam *param = opaque;
3462     unsigned long pagesize;
3463     uint8_t *des;
3464     int len, ret;
3465
3466     qemu_mutex_lock(&param->mutex);
3467     while (!param->quit) {
3468         if (param->des) {
3469             des = param->des;
3470             len = param->len;
3471             param->des = 0;
3472             qemu_mutex_unlock(&param->mutex);
3473
3474             pagesize = TARGET_PAGE_SIZE;
3475
3476             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3477                                        param->compbuf, len);
3478             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3479                 error_report("decompress data failed");
3480                 qemu_file_set_error(decomp_file, ret);
3481             }
3482
3483             qemu_mutex_lock(&decomp_done_lock);
3484             param->done = true;
3485             qemu_cond_signal(&decomp_done_cond);
3486             qemu_mutex_unlock(&decomp_done_lock);
3487
3488             qemu_mutex_lock(&param->mutex);
3489         } else {
3490             qemu_cond_wait(&param->cond, &param->mutex);
3491         }
3492     }
3493     qemu_mutex_unlock(&param->mutex);
3494
3495     return NULL;
3496 }
3497
3498 static int wait_for_decompress_done(void)
3499 {
3500     int idx, thread_count;
3501
3502     if (!migrate_use_compression()) {
3503         return 0;
3504     }
3505
3506     thread_count = migrate_decompress_threads();
3507     qemu_mutex_lock(&decomp_done_lock);
3508     for (idx = 0; idx < thread_count; idx++) {
3509         while (!decomp_param[idx].done) {
3510             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3511         }
3512     }
3513     qemu_mutex_unlock(&decomp_done_lock);
3514     return qemu_file_get_error(decomp_file);
3515 }
3516
3517 static void compress_threads_load_cleanup(void)
3518 {
3519     int i, thread_count;
3520
3521     if (!migrate_use_compression()) {
3522         return;
3523     }
3524     thread_count = migrate_decompress_threads();
3525     for (i = 0; i < thread_count; i++) {
3526         /*
3527          * we use it as a indicator which shows if the thread is
3528          * properly init'd or not
3529          */
3530         if (!decomp_param[i].compbuf) {
3531             break;
3532         }
3533
3534         qemu_mutex_lock(&decomp_param[i].mutex);
3535         decomp_param[i].quit = true;
3536         qemu_cond_signal(&decomp_param[i].cond);
3537         qemu_mutex_unlock(&decomp_param[i].mutex);
3538     }
3539     for (i = 0; i < thread_count; i++) {
3540         if (!decomp_param[i].compbuf) {
3541             break;
3542         }
3543
3544         qemu_thread_join(decompress_threads + i);
3545         qemu_mutex_destroy(&decomp_param[i].mutex);
3546         qemu_cond_destroy(&decomp_param[i].cond);
3547         inflateEnd(&decomp_param[i].stream);
3548         g_free(decomp_param[i].compbuf);
3549         decomp_param[i].compbuf = NULL;
3550     }
3551     g_free(decompress_threads);
3552     g_free(decomp_param);
3553     decompress_threads = NULL;
3554     decomp_param = NULL;
3555     decomp_file = NULL;
3556 }
3557
3558 static int compress_threads_load_setup(QEMUFile *f)
3559 {
3560     int i, thread_count;
3561
3562     if (!migrate_use_compression()) {
3563         return 0;
3564     }
3565
3566     thread_count = migrate_decompress_threads();
3567     decompress_threads = g_new0(QemuThread, thread_count);
3568     decomp_param = g_new0(DecompressParam, thread_count);
3569     qemu_mutex_init(&decomp_done_lock);
3570     qemu_cond_init(&decomp_done_cond);
3571     decomp_file = f;
3572     for (i = 0; i < thread_count; i++) {
3573         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3574             goto exit;
3575         }
3576
3577         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3578         qemu_mutex_init(&decomp_param[i].mutex);
3579         qemu_cond_init(&decomp_param[i].cond);
3580         decomp_param[i].done = true;
3581         decomp_param[i].quit = false;
3582         qemu_thread_create(decompress_threads + i, "decompress",
3583                            do_data_decompress, decomp_param + i,
3584                            QEMU_THREAD_JOINABLE);
3585     }
3586     return 0;
3587 exit:
3588     compress_threads_load_cleanup();
3589     return -1;
3590 }
3591
3592 static void decompress_data_with_multi_threads(QEMUFile *f,
3593                                                void *host, int len)
3594 {
3595     int idx, thread_count;
3596
3597     thread_count = migrate_decompress_threads();
3598     qemu_mutex_lock(&decomp_done_lock);
3599     while (true) {
3600         for (idx = 0; idx < thread_count; idx++) {
3601             if (decomp_param[idx].done) {
3602                 decomp_param[idx].done = false;
3603                 qemu_mutex_lock(&decomp_param[idx].mutex);
3604                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3605                 decomp_param[idx].des = host;
3606                 decomp_param[idx].len = len;
3607                 qemu_cond_signal(&decomp_param[idx].cond);
3608                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3609                 break;
3610             }
3611         }
3612         if (idx < thread_count) {
3613             break;
3614         } else {
3615             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3616         }
3617     }
3618     qemu_mutex_unlock(&decomp_done_lock);
3619 }
3620
3621 /**
3622  * ram_load_setup: Setup RAM for migration incoming side
3623  *
3624  * Returns zero to indicate success and negative for error
3625  *
3626  * @f: QEMUFile where to receive the data
3627  * @opaque: RAMState pointer
3628  */
3629 static int ram_load_setup(QEMUFile *f, void *opaque)
3630 {
3631     if (compress_threads_load_setup(f)) {
3632         return -1;
3633     }
3634
3635     xbzrle_load_setup();
3636     ramblock_recv_map_init();
3637     return 0;
3638 }
3639
3640 static int ram_load_cleanup(void *opaque)
3641 {
3642     RAMBlock *rb;
3643
3644     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3645         if (ramblock_is_pmem(rb)) {
3646             pmem_persist(rb->host, rb->used_length);
3647         }
3648     }
3649
3650     xbzrle_load_cleanup();
3651     compress_threads_load_cleanup();
3652
3653     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3654         g_free(rb->receivedmap);
3655         rb->receivedmap = NULL;
3656     }
3657     return 0;
3658 }
3659
3660 /**
3661  * ram_postcopy_incoming_init: allocate postcopy data structures
3662  *
3663  * Returns 0 for success and negative if there was one error
3664  *
3665  * @mis: current migration incoming state
3666  *
3667  * Allocate data structures etc needed by incoming migration with
3668  * postcopy-ram. postcopy-ram's similarly names
3669  * postcopy_ram_incoming_init does the work.
3670  */
3671 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3672 {
3673     return postcopy_ram_incoming_init(mis);
3674 }
3675
3676 /**
3677  * ram_load_postcopy: load a page in postcopy case
3678  *
3679  * Returns 0 for success or -errno in case of error
3680  *
3681  * Called in postcopy mode by ram_load().
3682  * rcu_read_lock is taken prior to this being called.
3683  *
3684  * @f: QEMUFile where to send the data
3685  */
3686 static int ram_load_postcopy(QEMUFile *f)
3687 {
3688     int flags = 0, ret = 0;
3689     bool place_needed = false;
3690     bool matches_target_page_size = false;
3691     MigrationIncomingState *mis = migration_incoming_get_current();
3692     /* Temporary page that is later 'placed' */
3693     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3694     void *last_host = NULL;
3695     bool all_zero = false;
3696
3697     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3698         ram_addr_t addr;
3699         void *host = NULL;
3700         void *page_buffer = NULL;
3701         void *place_source = NULL;
3702         RAMBlock *block = NULL;
3703         uint8_t ch;
3704
3705         addr = qemu_get_be64(f);
3706
3707         /*
3708          * If qemu file error, we should stop here, and then "addr"
3709          * may be invalid
3710          */
3711         ret = qemu_file_get_error(f);
3712         if (ret) {
3713             break;
3714         }
3715
3716         flags = addr & ~TARGET_PAGE_MASK;
3717         addr &= TARGET_PAGE_MASK;
3718
3719         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3720         place_needed = false;
3721         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3722             block = ram_block_from_stream(f, flags);
3723
3724             host = host_from_ram_block_offset(block, addr);
3725             if (!host) {
3726                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3727                 ret = -EINVAL;
3728                 break;
3729             }
3730             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3731             /*
3732              * Postcopy requires that we place whole host pages atomically;
3733              * these may be huge pages for RAMBlocks that are backed by
3734              * hugetlbfs.
3735              * To make it atomic, the data is read into a temporary page
3736              * that's moved into place later.
3737              * The migration protocol uses,  possibly smaller, target-pages
3738              * however the source ensures it always sends all the components
3739              * of a host page in order.
3740              */
3741             page_buffer = postcopy_host_page +
3742                           ((uintptr_t)host & (block->page_size - 1));
3743             /* If all TP are zero then we can optimise the place */
3744             if (!((uintptr_t)host & (block->page_size - 1))) {
3745                 all_zero = true;
3746             } else {
3747                 /* not the 1st TP within the HP */
3748                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3749                     error_report("Non-sequential target page %p/%p",
3750                                   host, last_host);
3751                     ret = -EINVAL;
3752                     break;
3753                 }
3754             }
3755
3756
3757             /*
3758              * If it's the last part of a host page then we place the host
3759              * page
3760              */
3761             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3762                                      (block->page_size - 1)) == 0;
3763             place_source = postcopy_host_page;
3764         }
3765         last_host = host;
3766
3767         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3768         case RAM_SAVE_FLAG_ZERO:
3769             ch = qemu_get_byte(f);
3770             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3771             if (ch) {
3772                 all_zero = false;
3773             }
3774             break;
3775
3776         case RAM_SAVE_FLAG_PAGE:
3777             all_zero = false;
3778             if (!matches_target_page_size) {
3779                 /* For huge pages, we always use temporary buffer */
3780                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3781             } else {
3782                 /*
3783                  * For small pages that matches target page size, we
3784                  * avoid the qemu_file copy.  Instead we directly use
3785                  * the buffer of QEMUFile to place the page.  Note: we
3786                  * cannot do any QEMUFile operation before using that
3787                  * buffer to make sure the buffer is valid when
3788                  * placing the page.
3789                  */
3790                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3791                                          TARGET_PAGE_SIZE);
3792             }
3793             break;
3794         case RAM_SAVE_FLAG_EOS:
3795             /* normal exit */
3796             multifd_recv_sync_main();
3797             break;
3798         default:
3799             error_report("Unknown combination of migration flags: %#x"
3800                          " (postcopy mode)", flags);
3801             ret = -EINVAL;
3802             break;
3803         }
3804
3805         /* Detect for any possible file errors */
3806         if (!ret && qemu_file_get_error(f)) {
3807             ret = qemu_file_get_error(f);
3808         }
3809
3810         if (!ret && place_needed) {
3811             /* This gets called at the last target page in the host page */
3812             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3813
3814             if (all_zero) {
3815                 ret = postcopy_place_page_zero(mis, place_dest,
3816                                                block);
3817             } else {
3818                 ret = postcopy_place_page(mis, place_dest,
3819                                           place_source, block);
3820             }
3821         }
3822     }
3823
3824     return ret;
3825 }
3826
3827 static bool postcopy_is_advised(void)
3828 {
3829     PostcopyState ps = postcopy_state_get();
3830     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3831 }
3832
3833 static bool postcopy_is_running(void)
3834 {
3835     PostcopyState ps = postcopy_state_get();
3836     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3837 }
3838
3839 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3840 {
3841     int flags = 0, ret = 0, invalid_flags = 0;
3842     static uint64_t seq_iter;
3843     int len = 0;
3844     /*
3845      * If system is running in postcopy mode, page inserts to host memory must
3846      * be atomic
3847      */
3848     bool postcopy_running = postcopy_is_running();
3849     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3850     bool postcopy_advised = postcopy_is_advised();
3851
3852     seq_iter++;
3853
3854     if (version_id != 4) {
3855         ret = -EINVAL;
3856     }
3857
3858     if (!migrate_use_compression()) {
3859         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3860     }
3861     /* This RCU critical section can be very long running.
3862      * When RCU reclaims in the code start to become numerous,
3863      * it will be necessary to reduce the granularity of this
3864      * critical section.
3865      */
3866     rcu_read_lock();
3867
3868     if (postcopy_running) {
3869         ret = ram_load_postcopy(f);
3870     }
3871
3872     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3873         ram_addr_t addr, total_ram_bytes;
3874         void *host = NULL;
3875         uint8_t ch;
3876
3877         addr = qemu_get_be64(f);
3878         flags = addr & ~TARGET_PAGE_MASK;
3879         addr &= TARGET_PAGE_MASK;
3880
3881         if (flags & invalid_flags) {
3882             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3883                 error_report("Received an unexpected compressed page");
3884             }
3885
3886             ret = -EINVAL;
3887             break;
3888         }
3889
3890         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3891                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3892             RAMBlock *block = ram_block_from_stream(f, flags);
3893
3894             host = host_from_ram_block_offset(block, addr);
3895             if (!host) {
3896                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3897                 ret = -EINVAL;
3898                 break;
3899             }
3900             ramblock_recv_bitmap_set(block, host);
3901             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3902         }
3903
3904         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3905         case RAM_SAVE_FLAG_MEM_SIZE:
3906             /* Synchronize RAM block list */
3907             total_ram_bytes = addr;
3908             while (!ret && total_ram_bytes) {
3909                 RAMBlock *block;
3910                 char id[256];
3911                 ram_addr_t length;
3912
3913                 len = qemu_get_byte(f);
3914                 qemu_get_buffer(f, (uint8_t *)id, len);
3915                 id[len] = 0;
3916                 length = qemu_get_be64(f);
3917
3918                 block = qemu_ram_block_by_name(id);
3919                 if (block && !qemu_ram_is_migratable(block)) {
3920                     error_report("block %s should not be migrated !", id);
3921                     ret = -EINVAL;
3922                 } else if (block) {
3923                     if (length != block->used_length) {
3924                         Error *local_err = NULL;
3925
3926                         ret = qemu_ram_resize(block, length,
3927                                               &local_err);
3928                         if (local_err) {
3929                             error_report_err(local_err);
3930                         }
3931                     }
3932                     /* For postcopy we need to check hugepage sizes match */
3933                     if (postcopy_advised &&
3934                         block->page_size != qemu_host_page_size) {
3935                         uint64_t remote_page_size = qemu_get_be64(f);
3936                         if (remote_page_size != block->page_size) {
3937                             error_report("Mismatched RAM page size %s "
3938                                          "(local) %zd != %" PRId64,
3939                                          id, block->page_size,
3940                                          remote_page_size);
3941                             ret = -EINVAL;
3942                         }
3943                     }
3944                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3945                                           block->idstr);
3946                 } else {
3947                     error_report("Unknown ramblock \"%s\", cannot "
3948                                  "accept migration", id);
3949                     ret = -EINVAL;
3950                 }
3951
3952                 total_ram_bytes -= length;
3953             }
3954             break;
3955
3956         case RAM_SAVE_FLAG_ZERO:
3957             ch = qemu_get_byte(f);
3958             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3959             break;
3960
3961         case RAM_SAVE_FLAG_PAGE:
3962             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3963             break;
3964
3965         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3966             len = qemu_get_be32(f);
3967             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3968                 error_report("Invalid compressed data length: %d", len);
3969                 ret = -EINVAL;
3970                 break;
3971             }
3972             decompress_data_with_multi_threads(f, host, len);
3973             break;
3974
3975         case RAM_SAVE_FLAG_XBZRLE:
3976             if (load_xbzrle(f, addr, host) < 0) {
3977                 error_report("Failed to decompress XBZRLE page at "
3978                              RAM_ADDR_FMT, addr);
3979                 ret = -EINVAL;
3980                 break;
3981             }
3982             break;
3983         case RAM_SAVE_FLAG_EOS:
3984             /* normal exit */
3985             multifd_recv_sync_main();
3986             break;
3987         default:
3988             if (flags & RAM_SAVE_FLAG_HOOK) {
3989                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3990             } else {
3991                 error_report("Unknown combination of migration flags: %#x",
3992                              flags);
3993                 ret = -EINVAL;
3994             }
3995         }
3996         if (!ret) {
3997             ret = qemu_file_get_error(f);
3998         }
3999     }
4000
4001     ret |= wait_for_decompress_done();
4002     rcu_read_unlock();
4003     trace_ram_load_complete(ret, seq_iter);
4004     return ret;
4005 }
4006
4007 static bool ram_has_postcopy(void *opaque)
4008 {
4009     RAMBlock *rb;
4010     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
4011         if (ramblock_is_pmem(rb)) {
4012             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4013                          "is not supported now!", rb->idstr, rb->host);
4014             return false;
4015         }
4016     }
4017
4018     return migrate_postcopy_ram();
4019 }
4020
4021 /* Sync all the dirty bitmap with destination VM.  */
4022 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4023 {
4024     RAMBlock *block;
4025     QEMUFile *file = s->to_dst_file;
4026     int ramblock_count = 0;
4027
4028     trace_ram_dirty_bitmap_sync_start();
4029
4030     RAMBLOCK_FOREACH_MIGRATABLE(block) {
4031         qemu_savevm_send_recv_bitmap(file, block->idstr);
4032         trace_ram_dirty_bitmap_request(block->idstr);
4033         ramblock_count++;
4034     }
4035
4036     trace_ram_dirty_bitmap_sync_wait();
4037
4038     /* Wait until all the ramblocks' dirty bitmap synced */
4039     while (ramblock_count--) {
4040         qemu_sem_wait(&s->rp_state.rp_sem);
4041     }
4042
4043     trace_ram_dirty_bitmap_sync_complete();
4044
4045     return 0;
4046 }
4047
4048 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4049 {
4050     qemu_sem_post(&s->rp_state.rp_sem);
4051 }
4052
4053 /*
4054  * Read the received bitmap, revert it as the initial dirty bitmap.
4055  * This is only used when the postcopy migration is paused but wants
4056  * to resume from a middle point.
4057  */
4058 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4059 {
4060     int ret = -EINVAL;
4061     QEMUFile *file = s->rp_state.from_dst_file;
4062     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4063     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4064     uint64_t size, end_mark;
4065
4066     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4067
4068     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4069         error_report("%s: incorrect state %s", __func__,
4070                      MigrationStatus_str(s->state));
4071         return -EINVAL;
4072     }
4073
4074     /*
4075      * Note: see comments in ramblock_recv_bitmap_send() on why we
4076      * need the endianess convertion, and the paddings.
4077      */
4078     local_size = ROUND_UP(local_size, 8);
4079
4080     /* Add paddings */
4081     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4082
4083     size = qemu_get_be64(file);
4084
4085     /* The size of the bitmap should match with our ramblock */
4086     if (size != local_size) {
4087         error_report("%s: ramblock '%s' bitmap size mismatch "
4088                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4089                      block->idstr, size, local_size);
4090         ret = -EINVAL;
4091         goto out;
4092     }
4093
4094     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4095     end_mark = qemu_get_be64(file);
4096
4097     ret = qemu_file_get_error(file);
4098     if (ret || size != local_size) {
4099         error_report("%s: read bitmap failed for ramblock '%s': %d"
4100                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4101                      __func__, block->idstr, ret, local_size, size);
4102         ret = -EIO;
4103         goto out;
4104     }
4105
4106     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4107         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4108                      __func__, block->idstr, end_mark);
4109         ret = -EINVAL;
4110         goto out;
4111     }
4112
4113     /*
4114      * Endianess convertion. We are during postcopy (though paused).
4115      * The dirty bitmap won't change. We can directly modify it.
4116      */
4117     bitmap_from_le(block->bmap, le_bitmap, nbits);
4118
4119     /*
4120      * What we received is "received bitmap". Revert it as the initial
4121      * dirty bitmap for this ramblock.
4122      */
4123     bitmap_complement(block->bmap, block->bmap, nbits);
4124
4125     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4126
4127     /*
4128      * We succeeded to sync bitmap for current ramblock. If this is
4129      * the last one to sync, we need to notify the main send thread.
4130      */
4131     ram_dirty_bitmap_reload_notify(s);
4132
4133     ret = 0;
4134 out:
4135     g_free(le_bitmap);
4136     return ret;
4137 }
4138
4139 static int ram_resume_prepare(MigrationState *s, void *opaque)
4140 {
4141     RAMState *rs = *(RAMState **)opaque;
4142     int ret;
4143
4144     ret = ram_dirty_bitmap_sync_all(s, rs);
4145     if (ret) {
4146         return ret;
4147     }
4148
4149     ram_state_resume_prepare(rs, s->to_dst_file);
4150
4151     return 0;
4152 }
4153
4154 static SaveVMHandlers savevm_ram_handlers = {
4155     .save_setup = ram_save_setup,
4156     .save_live_iterate = ram_save_iterate,
4157     .save_live_complete_postcopy = ram_save_complete,
4158     .save_live_complete_precopy = ram_save_complete,
4159     .has_postcopy = ram_has_postcopy,
4160     .save_live_pending = ram_save_pending,
4161     .load_state = ram_load,
4162     .save_cleanup = ram_save_cleanup,
4163     .load_setup = ram_load_setup,
4164     .load_cleanup = ram_load_cleanup,
4165     .resume_prepare = ram_resume_prepare,
4166 };
4167
4168 void ram_mig_init(void)
4169 {
4170     qemu_mutex_init(&XBZRLE.lock);
4171     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4172 }