migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "qemu/pmem.h"
  37 #include "xbzrle.h"
  38 #include "ram.h"
  39 #include "migration.h"
  40 #include "socket.h"
  41 #include "migration/register.h"
  42 #include "migration/misc.h"
  43 #include "qemu-file.h"
  44 #include "postcopy-ram.h"
  45 #include "page_cache.h"
  46 #include "qemu/error-report.h"
  47 #include "qapi/error.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/sysemu.h"
  57 #include "qemu/uuid.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60
  61 /***********************************************************/
  62 /* ram save/restore */
  63
  64 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  65  * worked for pages that where filled with the same char.  We switched
  66  * it to only search for the zero value.  And to avoid confusion with
  67  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  68  */
  69
  70 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  71 #define RAM_SAVE_FLAG_ZERO     0x02
  72 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  73 #define RAM_SAVE_FLAG_PAGE     0x08
  74 #define RAM_SAVE_FLAG_EOS      0x10
  75 #define RAM_SAVE_FLAG_CONTINUE 0x20
  76 #define RAM_SAVE_FLAG_XBZRLE   0x40
  77 /* 0x80 is reserved in migration.h start with 0x100 next */
  78 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  79
  80 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  81 {
  82     return buffer_is_zero(p, size);
  83 }
  84
  85 XBZRLECacheStats xbzrle_counters;
  86
  87 /* struct contains XBZRLE cache and a static page
  88    used by the compression */
  89 static struct {
  90     /* buffer used for XBZRLE encoding */
  91     uint8_t *encoded_buf;
  92     /* buffer for storing page content */
  93     uint8_t *current_buf;
  94     /* Cache for XBZRLE, Protected by lock. */
  95     PageCache *cache;
  96     QemuMutex lock;
  97     /* it will store a page full of zeros */
  98     uint8_t *zero_target_page;
  99     /* buffer used for XBZRLE decoding */
 100     uint8_t *decoded_buf;
 101 } XBZRLE;
 102
 103 static void XBZRLE_cache_lock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_lock(&XBZRLE.lock);
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle())
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113 }
 114
 115 /**
 116  * xbzrle_cache_resize: resize the xbzrle cache
 117  *
 118  * This function is called from qmp_migrate_set_cache_size in main
 119  * thread, possibly while a migration is in progress.  A running
 120  * migration may be using the cache and might finish during this call,
 121  * hence changes to the cache are protected by XBZRLE.lock().
 122  *
 123  * Returns 0 for success or -1 for error
 124  *
 125  * @new_size: new cache size
 126  * @errp: set *errp if the check failed, with reason
 127  */
 128 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 129 {
 130     PageCache *new_cache;
 131     int64_t ret = 0;
 132
 133     /* Check for truncation */
 134     if (new_size != (size_t)new_size) {
 135         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 136                    "exceeding address space");
 137         return -1;
 138     }
 139
 140     if (new_size == migrate_xbzrle_cache_size()) {
 141         /* nothing to do */
 142         return 0;
 143     }
 144
 145     XBZRLE_cache_lock();
 146
 147     if (XBZRLE.cache != NULL) {
 148         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 149         if (!new_cache) {
 150             ret = -1;
 151             goto out;
 152         }
 153
 154         cache_fini(XBZRLE.cache);
 155         XBZRLE.cache = new_cache;
 156     }
 157 out:
 158     XBZRLE_cache_unlock();
 159     return ret;
 160 }
 161
 162 /* Should be holding either ram_list.mutex, or the RCU lock. */
 163 #define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 164     INTERNAL_RAMBLOCK_FOREACH(block)                   \
 165         if (!qemu_ram_is_migratable(block)) {} else
 166
 167 #undef RAMBLOCK_FOREACH
 168
 169 static void ramblock_recv_map_init(void)
 170 {
 171     RAMBlock *rb;
 172
 173     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 174         assert(!rb->receivedmap);
 175         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 176     }
 177 }
 178
 179 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 180 {
 181     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 182                     rb->receivedmap);
 183 }
 184
 185 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 186 {
 187     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 188 }
 189
 190 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 191 {
 192     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 193 }
 194
 195 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 196                                     size_t nr)
 197 {
 198     bitmap_set_atomic(rb->receivedmap,
 199                       ramblock_recv_bitmap_offset(host_addr, rb),
 200                       nr);
 201 }
 202
 203 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 204
 205 /*
 206  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 207  *
 208  * Returns >0 if success with sent bytes, or <0 if error.
 209  */
 210 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 211                                   const char *block_name)
 212 {
 213     RAMBlock *block = qemu_ram_block_by_name(block_name);
 214     unsigned long *le_bitmap, nbits;
 215     uint64_t size;
 216
 217     if (!block) {
 218         error_report("%s: invalid block name: %s", __func__, block_name);
 219         return -1;
 220     }
 221
 222     nbits = block->used_length >> TARGET_PAGE_BITS;
 223
 224     /*
 225      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 226      * machines we may need 4 more bytes for padding (see below
 227      * comment). So extend it a bit before hand.
 228      */
 229     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 230
 231     /*
 232      * Always use little endian when sending the bitmap. This is
 233      * required that when source and destination VMs are not using the
 234      * same endianess. (Note: big endian won't work.)
 235      */
 236     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 237
 238     /* Size of the bitmap, in bytes */
 239     size = DIV_ROUND_UP(nbits, 8);
 240
 241     /*
 242      * size is always aligned to 8 bytes for 64bit machines, but it
 243      * may not be true for 32bit machines. We need this padding to
 244      * make sure the migration can survive even between 32bit and
 245      * 64bit machines.
 246      */
 247     size = ROUND_UP(size, 8);
 248
 249     qemu_put_be64(file, size);
 250     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 251     /*
 252      * Mark as an end, in case the middle part is screwed up due to
 253      * some "misterious" reason.
 254      */
 255     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 256     qemu_fflush(file);
 257
 258     g_free(le_bitmap);
 259
 260     if (qemu_file_get_error(file)) {
 261         return qemu_file_get_error(file);
 262     }
 263
 264     return size + sizeof(size);
 265 }
 266
 267 /*
 268  * An outstanding page request, on the source, having been received
 269  * and queued
 270  */
 271 struct RAMSrcPageRequest {
 272     RAMBlock *rb;
 273     hwaddr    offset;
 274     hwaddr    len;
 275
 276     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 277 };
 278
 279 /* State of RAM for migration */
 280 struct RAMState {
 281     /* QEMUFile used for this migration */
 282     QEMUFile *f;
 283     /* Last block that we have visited searching for dirty pages */
 284     RAMBlock *last_seen_block;
 285     /* Last block from where we have sent data */
 286     RAMBlock *last_sent_block;
 287     /* Last dirty target page we have sent */
 288     ram_addr_t last_page;
 289     /* last ram version we have seen */
 290     uint32_t last_version;
 291     /* We are in the first round */
 292     bool ram_bulk_stage;
 293     /* How many times we have dirty too many pages */
 294     int dirty_rate_high_cnt;
 295     /* these variables are used for bitmap sync */
 296     /* last time we did a full bitmap_sync */
 297     int64_t time_last_bitmap_sync;
 298     /* bytes transferred at start_time */
 299     uint64_t bytes_xfer_prev;
 300     /* number of dirty pages since start_time */
 301     uint64_t num_dirty_pages_period;
 302     /* xbzrle misses since the beginning of the period */
 303     uint64_t xbzrle_cache_miss_prev;
 304     /* number of iterations at the beginning of period */
 305     uint64_t iterations_prev;
 306     /* Iterations since start */
 307     uint64_t iterations;
 308     /* number of dirty bits in the bitmap */
 309     uint64_t migration_dirty_pages;
 310     /* protects modification of the bitmap */
 311     QemuMutex bitmap_mutex;
 312     /* The RAMBlock used in the last src_page_requests */
 313     RAMBlock *last_req_rb;
 314     /* Queue of outstanding page requests from the destination */
 315     QemuMutex src_page_req_mutex;
 316     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 317 };
 318 typedef struct RAMState RAMState;
 319
 320 static RAMState *ram_state;
 321
 322 uint64_t ram_bytes_remaining(void)
 323 {
 324     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 325                        0;
 326 }
 327
 328 MigrationStats ram_counters;
 329
 330 /* used by the search for pages to send */
 331 struct PageSearchStatus {
 332     /* Current block being searched */
 333     RAMBlock    *block;
 334     /* Current page to search from */
 335     unsigned long page;
 336     /* Set once we wrap around */
 337     bool         complete_round;
 338 };
 339 typedef struct PageSearchStatus PageSearchStatus;
 340
 341 struct CompressParam {
 342     bool done;
 343     bool quit;
 344     bool zero_page;
 345     QEMUFile *file;
 346     QemuMutex mutex;
 347     QemuCond cond;
 348     RAMBlock *block;
 349     ram_addr_t offset;
 350
 351     /* internally used fields */
 352     z_stream stream;
 353     uint8_t *originbuf;
 354 };
 355 typedef struct CompressParam CompressParam;
 356
 357 struct DecompressParam {
 358     bool done;
 359     bool quit;
 360     QemuMutex mutex;
 361     QemuCond cond;
 362     void *des;
 363     uint8_t *compbuf;
 364     int len;
 365     z_stream stream;
 366 };
 367 typedef struct DecompressParam DecompressParam;
 368
 369 static CompressParam *comp_param;
 370 static QemuThread *compress_threads;
 371 /* comp_done_cond is used to wake up the migration thread when
 372  * one of the compression threads has finished the compression.
 373  * comp_done_lock is used to co-work with comp_done_cond.
 374  */
 375 static QemuMutex comp_done_lock;
 376 static QemuCond comp_done_cond;
 377 /* The empty QEMUFileOps will be used by file in CompressParam */
 378 static const QEMUFileOps empty_ops = { };
 379
 380 static QEMUFile *decomp_file;
 381 static DecompressParam *decomp_param;
 382 static QemuThread *decompress_threads;
 383 static QemuMutex decomp_done_lock;
 384 static QemuCond decomp_done_cond;
 385
 386 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 387                                  ram_addr_t offset, uint8_t *source_buf);
 388
 389 static void *do_data_compress(void *opaque)
 390 {
 391     CompressParam *param = opaque;
 392     RAMBlock *block;
 393     ram_addr_t offset;
 394     bool zero_page;
 395
 396     qemu_mutex_lock(&param->mutex);
 397     while (!param->quit) {
 398         if (param->block) {
 399             block = param->block;
 400             offset = param->offset;
 401             param->block = NULL;
 402             qemu_mutex_unlock(&param->mutex);
 403
 404             zero_page = do_compress_ram_page(param->file, &param->stream,
 405                                              block, offset, param->originbuf);
 406
 407             qemu_mutex_lock(&comp_done_lock);
 408             param->done = true;
 409             param->zero_page = zero_page;
 410             qemu_cond_signal(&comp_done_cond);
 411             qemu_mutex_unlock(&comp_done_lock);
 412
 413             qemu_mutex_lock(&param->mutex);
 414         } else {
 415             qemu_cond_wait(&param->cond, &param->mutex);
 416         }
 417     }
 418     qemu_mutex_unlock(&param->mutex);
 419
 420     return NULL;
 421 }
 422
 423 static inline void terminate_compression_threads(void)
 424 {
 425     int idx, thread_count;
 426
 427     thread_count = migrate_compress_threads();
 428
 429     for (idx = 0; idx < thread_count; idx++) {
 430         qemu_mutex_lock(&comp_param[idx].mutex);
 431         comp_param[idx].quit = true;
 432         qemu_cond_signal(&comp_param[idx].cond);
 433         qemu_mutex_unlock(&comp_param[idx].mutex);
 434     }
 435 }
 436
 437 static void compress_threads_save_cleanup(void)
 438 {
 439     int i, thread_count;
 440
 441     if (!migrate_use_compression()) {
 442         return;
 443     }
 444     terminate_compression_threads();
 445     thread_count = migrate_compress_threads();
 446     for (i = 0; i < thread_count; i++) {
 447         /*
 448          * we use it as a indicator which shows if the thread is
 449          * properly init'd or not
 450          */
 451         if (!comp_param[i].file) {
 452             break;
 453         }
 454         qemu_thread_join(compress_threads + i);
 455         qemu_mutex_destroy(&comp_param[i].mutex);
 456         qemu_cond_destroy(&comp_param[i].cond);
 457         deflateEnd(&comp_param[i].stream);
 458         g_free(comp_param[i].originbuf);
 459         qemu_fclose(comp_param[i].file);
 460         comp_param[i].file = NULL;
 461     }
 462     qemu_mutex_destroy(&comp_done_lock);
 463     qemu_cond_destroy(&comp_done_cond);
 464     g_free(compress_threads);
 465     g_free(comp_param);
 466     compress_threads = NULL;
 467     comp_param = NULL;
 468 }
 469
 470 static int compress_threads_save_setup(void)
 471 {
 472     int i, thread_count;
 473
 474     if (!migrate_use_compression()) {
 475         return 0;
 476     }
 477     thread_count = migrate_compress_threads();
 478     compress_threads = g_new0(QemuThread, thread_count);
 479     comp_param = g_new0(CompressParam, thread_count);
 480     qemu_cond_init(&comp_done_cond);
 481     qemu_mutex_init(&comp_done_lock);
 482     for (i = 0; i < thread_count; i++) {
 483         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 484         if (!comp_param[i].originbuf) {
 485             goto exit;
 486         }
 487
 488         if (deflateInit(&comp_param[i].stream,
 489                         migrate_compress_level()) != Z_OK) {
 490             g_free(comp_param[i].originbuf);
 491             goto exit;
 492         }
 493
 494         /* comp_param[i].file is just used as a dummy buffer to save data,
 495          * set its ops to empty.
 496          */
 497         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 498         comp_param[i].done = true;
 499         comp_param[i].quit = false;
 500         qemu_mutex_init(&comp_param[i].mutex);
 501         qemu_cond_init(&comp_param[i].cond);
 502         qemu_thread_create(compress_threads + i, "compress",
 503                            do_data_compress, comp_param + i,
 504                            QEMU_THREAD_JOINABLE);
 505     }
 506     return 0;
 507
 508 exit:
 509     compress_threads_save_cleanup();
 510     return -1;
 511 }
 512
 513 /* Multiple fd's */
 514
 515 #define MULTIFD_MAGIC 0x11223344U
 516 #define MULTIFD_VERSION 1
 517
 518 #define MULTIFD_FLAG_SYNC (1 << 0)
 519
 520 typedef struct {
 521     uint32_t magic;
 522     uint32_t version;
 523     unsigned char uuid[16]; /* QemuUUID */
 524     uint8_t id;
 525 } __attribute__((packed)) MultiFDInit_t;
 526
 527 typedef struct {
 528     uint32_t magic;
 529     uint32_t version;
 530     uint32_t flags;
 531     uint32_t size;
 532     uint32_t used;
 533     uint64_t packet_num;
 534     char ramblock[256];
 535     uint64_t offset[];
 536 } __attribute__((packed)) MultiFDPacket_t;
 537
 538 typedef struct {
 539     /* number of used pages */
 540     uint32_t used;
 541     /* number of allocated pages */
 542     uint32_t allocated;
 543     /* global number of generated multifd packets */
 544     uint64_t packet_num;
 545     /* offset of each page */
 546     ram_addr_t *offset;
 547     /* pointer to each page */
 548     struct iovec *iov;
 549     RAMBlock *block;
 550 } MultiFDPages_t;
 551
 552 typedef struct {
 553     /* this fields are not changed once the thread is created */
 554     /* channel number */
 555     uint8_t id;
 556     /* channel thread name */
 557     char *name;
 558     /* channel thread id */
 559     QemuThread thread;
 560     /* communication channel */
 561     QIOChannel *c;
 562     /* sem where to wait for more work */
 563     QemuSemaphore sem;
 564     /* this mutex protects the following parameters */
 565     QemuMutex mutex;
 566     /* is this channel thread running */
 567     bool running;
 568     /* should this thread finish */
 569     bool quit;
 570     /* thread has work to do */
 571     int pending_job;
 572     /* array of pages to sent */
 573     MultiFDPages_t *pages;
 574     /* packet allocated len */
 575     uint32_t packet_len;
 576     /* pointer to the packet */
 577     MultiFDPacket_t *packet;
 578     /* multifd flags for each packet */
 579     uint32_t flags;
 580     /* global number of generated multifd packets */
 581     uint64_t packet_num;
 582     /* thread local variables */
 583     /* packets sent through this channel */
 584     uint64_t num_packets;
 585     /* pages sent through this channel */
 586     uint64_t num_pages;
 587     /* syncs main thread and channels */
 588     QemuSemaphore sem_sync;
 589 }  MultiFDSendParams;
 590
 591 typedef struct {
 592     /* this fields are not changed once the thread is created */
 593     /* channel number */
 594     uint8_t id;
 595     /* channel thread name */
 596     char *name;
 597     /* channel thread id */
 598     QemuThread thread;
 599     /* communication channel */
 600     QIOChannel *c;
 601     /* this mutex protects the following parameters */
 602     QemuMutex mutex;
 603     /* is this channel thread running */
 604     bool running;
 605     /* array of pages to receive */
 606     MultiFDPages_t *pages;
 607     /* packet allocated len */
 608     uint32_t packet_len;
 609     /* pointer to the packet */
 610     MultiFDPacket_t *packet;
 611     /* multifd flags for each packet */
 612     uint32_t flags;
 613     /* global number of generated multifd packets */
 614     uint64_t packet_num;
 615     /* thread local variables */
 616     /* packets sent through this channel */
 617     uint64_t num_packets;
 618     /* pages sent through this channel */
 619     uint64_t num_pages;
 620     /* syncs main thread and channels */
 621     QemuSemaphore sem_sync;
 622 } MultiFDRecvParams;
 623
 624 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 625 {
 626     MultiFDInit_t msg;
 627     int ret;
 628
 629     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 630     msg.version = cpu_to_be32(MULTIFD_VERSION);
 631     msg.id = p->id;
 632     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 633
 634     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 635     if (ret != 0) {
 636         return -1;
 637     }
 638     return 0;
 639 }
 640
 641 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 642 {
 643     MultiFDInit_t msg;
 644     int ret;
 645
 646     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 647     if (ret != 0) {
 648         return -1;
 649     }
 650
 651     be32_to_cpus(&msg.magic);
 652     be32_to_cpus(&msg.version);
 653
 654     if (msg.magic != MULTIFD_MAGIC) {
 655         error_setg(errp, "multifd: received packet magic %x "
 656                    "expected %x", msg.magic, MULTIFD_MAGIC);
 657         return -1;
 658     }
 659
 660     if (msg.version != MULTIFD_VERSION) {
 661         error_setg(errp, "multifd: received packet version %d "
 662                    "expected %d", msg.version, MULTIFD_VERSION);
 663         return -1;
 664     }
 665
 666     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 667         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 668         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 669
 670         error_setg(errp, "multifd: received uuid '%s' and expected "
 671                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 672         g_free(uuid);
 673         g_free(msg_uuid);
 674         return -1;
 675     }
 676
 677     if (msg.id > migrate_multifd_channels()) {
 678         error_setg(errp, "multifd: received channel version %d "
 679                    "expected %d", msg.version, MULTIFD_VERSION);
 680         return -1;
 681     }
 682
 683     return msg.id;
 684 }
 685
 686 static MultiFDPages_t *multifd_pages_init(size_t size)
 687 {
 688     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 689
 690     pages->allocated = size;
 691     pages->iov = g_new0(struct iovec, size);
 692     pages->offset = g_new0(ram_addr_t, size);
 693
 694     return pages;
 695 }
 696
 697 static void multifd_pages_clear(MultiFDPages_t *pages)
 698 {
 699     pages->used = 0;
 700     pages->allocated = 0;
 701     pages->packet_num = 0;
 702     pages->block = NULL;
 703     g_free(pages->iov);
 704     pages->iov = NULL;
 705     g_free(pages->offset);
 706     pages->offset = NULL;
 707     g_free(pages);
 708 }
 709
 710 static void multifd_send_fill_packet(MultiFDSendParams *p)
 711 {
 712     MultiFDPacket_t *packet = p->packet;
 713     int i;
 714
 715     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 716     packet->version = cpu_to_be32(MULTIFD_VERSION);
 717     packet->flags = cpu_to_be32(p->flags);
 718     packet->size = cpu_to_be32(migrate_multifd_page_count());
 719     packet->used = cpu_to_be32(p->pages->used);
 720     packet->packet_num = cpu_to_be64(p->packet_num);
 721
 722     if (p->pages->block) {
 723         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 724     }
 725
 726     for (i = 0; i < p->pages->used; i++) {
 727         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 728     }
 729 }
 730
 731 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 732 {
 733     MultiFDPacket_t *packet = p->packet;
 734     RAMBlock *block;
 735     int i;
 736
 737     be32_to_cpus(&packet->magic);
 738     if (packet->magic != MULTIFD_MAGIC) {
 739         error_setg(errp, "multifd: received packet "
 740                    "magic %x and expected magic %x",
 741                    packet->magic, MULTIFD_MAGIC);
 742         return -1;
 743     }
 744
 745     be32_to_cpus(&packet->version);
 746     if (packet->version != MULTIFD_VERSION) {
 747         error_setg(errp, "multifd: received packet "
 748                    "version %d and expected version %d",
 749                    packet->version, MULTIFD_VERSION);
 750         return -1;
 751     }
 752
 753     p->flags = be32_to_cpu(packet->flags);
 754
 755     be32_to_cpus(&packet->size);
 756     if (packet->size > migrate_multifd_page_count()) {
 757         error_setg(errp, "multifd: received packet "
 758                    "with size %d and expected maximum size %d",
 759                    packet->size, migrate_multifd_page_count()) ;
 760         return -1;
 761     }
 762
 763     p->pages->used = be32_to_cpu(packet->used);
 764     if (p->pages->used > packet->size) {
 765         error_setg(errp, "multifd: received packet "
 766                    "with size %d and expected maximum size %d",
 767                    p->pages->used, packet->size) ;
 768         return -1;
 769     }
 770
 771     p->packet_num = be64_to_cpu(packet->packet_num);
 772
 773     if (p->pages->used) {
 774         /* make sure that ramblock is 0 terminated */
 775         packet->ramblock[255] = 0;
 776         block = qemu_ram_block_by_name(packet->ramblock);
 777         if (!block) {
 778             error_setg(errp, "multifd: unknown ram block %s",
 779                        packet->ramblock);
 780             return -1;
 781         }
 782     }
 783
 784     for (i = 0; i < p->pages->used; i++) {
 785         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 786
 787         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 788             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 789                        " (max " RAM_ADDR_FMT ")",
 790                        offset, block->max_length);
 791             return -1;
 792         }
 793         p->pages->iov[i].iov_base = block->host + offset;
 794         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 795     }
 796
 797     return 0;
 798 }
 799
 800 struct {
 801     MultiFDSendParams *params;
 802     /* number of created threads */
 803     int count;
 804     /* array of pages to sent */
 805     MultiFDPages_t *pages;
 806     /* syncs main thread and channels */
 807     QemuSemaphore sem_sync;
 808     /* global number of generated multifd packets */
 809     uint64_t packet_num;
 810     /* send channels ready */
 811     QemuSemaphore channels_ready;
 812 } *multifd_send_state;
 813
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each time
 * that we need to send a batch of pages we interchange the ones between
 * multifd_send_state and the channel that is sending it.  There are
 * two reasons for that:
 *    - to not have to do so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread is using the channel mutex when changing it, and the channel
 * has to have finished with its own, otherwise pending_job can't be
 * false.
 */
 831
 832 static void multifd_send_pages(void)
 833 {
 834     int i;
 835     static int next_channel;
 836     MultiFDSendParams *p = NULL; /* make happy gcc */
 837     MultiFDPages_t *pages = multifd_send_state->pages;
 838     uint64_t transferred;
 839
 840     qemu_sem_wait(&multifd_send_state->channels_ready);
 841     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 842         p = &multifd_send_state->params[i];
 843
 844         qemu_mutex_lock(&p->mutex);
 845         if (!p->pending_job) {
 846             p->pending_job++;
 847             next_channel = (i + 1) % migrate_multifd_channels();
 848             break;
 849         }
 850         qemu_mutex_unlock(&p->mutex);
 851     }
 852     p->pages->used = 0;
 853
 854     p->packet_num = multifd_send_state->packet_num++;
 855     p->pages->block = NULL;
 856     multifd_send_state->pages = p->pages;
 857     p->pages = pages;
 858     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 859     ram_counters.multifd_bytes += transferred;
 860     ram_counters.transferred += transferred;;
 861     qemu_mutex_unlock(&p->mutex);
 862     qemu_sem_post(&p->sem);
 863 }
 864
 865 static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 866 {
 867     MultiFDPages_t *pages = multifd_send_state->pages;
 868
 869     if (!pages->block) {
 870         pages->block = block;
 871     }
 872
 873     if (pages->block == block) {
 874         pages->offset[pages->used] = offset;
 875         pages->iov[pages->used].iov_base = block->host + offset;
 876         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 877         pages->used++;
 878
 879         if (pages->used < pages->allocated) {
 880             return;
 881         }
 882     }
 883
 884     multifd_send_pages();
 885
 886     if (pages->block != block) {
 887         multifd_queue_page(block, offset);
 888     }
 889 }
 890
 891 static void multifd_send_terminate_threads(Error *err)
 892 {
 893     int i;
 894
 895     if (err) {
 896         MigrationState *s = migrate_get_current();
 897         migrate_set_error(s, err);
 898         if (s->state == MIGRATION_STATUS_SETUP ||
 899             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 900             s->state == MIGRATION_STATUS_DEVICE ||
 901             s->state == MIGRATION_STATUS_ACTIVE) {
 902             migrate_set_state(&s->state, s->state,
 903                               MIGRATION_STATUS_FAILED);
 904         }
 905     }
 906
 907     for (i = 0; i < migrate_multifd_channels(); i++) {
 908         MultiFDSendParams *p = &multifd_send_state->params[i];
 909
 910         qemu_mutex_lock(&p->mutex);
 911         p->quit = true;
 912         qemu_sem_post(&p->sem);
 913         qemu_mutex_unlock(&p->mutex);
 914     }
 915 }
 916
 917 int multifd_save_cleanup(Error **errp)
 918 {
 919     int i;
 920     int ret = 0;
 921
 922     if (!migrate_use_multifd()) {
 923         return 0;
 924     }
 925     multifd_send_terminate_threads(NULL);
 926     for (i = 0; i < migrate_multifd_channels(); i++) {
 927         MultiFDSendParams *p = &multifd_send_state->params[i];
 928
 929         if (p->running) {
 930             qemu_thread_join(&p->thread);
 931         }
 932         socket_send_channel_destroy(p->c);
 933         p->c = NULL;
 934         qemu_mutex_destroy(&p->mutex);
 935         qemu_sem_destroy(&p->sem);
 936         qemu_sem_destroy(&p->sem_sync);
 937         g_free(p->name);
 938         p->name = NULL;
 939         multifd_pages_clear(p->pages);
 940         p->pages = NULL;
 941         p->packet_len = 0;
 942         g_free(p->packet);
 943         p->packet = NULL;
 944     }
 945     qemu_sem_destroy(&multifd_send_state->channels_ready);
 946     qemu_sem_destroy(&multifd_send_state->sem_sync);
 947     g_free(multifd_send_state->params);
 948     multifd_send_state->params = NULL;
 949     multifd_pages_clear(multifd_send_state->pages);
 950     multifd_send_state->pages = NULL;
 951     g_free(multifd_send_state);
 952     multifd_send_state = NULL;
 953     return ret;
 954 }
 955
 956 static void multifd_send_sync_main(void)
 957 {
 958     int i;
 959
 960     if (!migrate_use_multifd()) {
 961         return;
 962     }
 963     if (multifd_send_state->pages->used) {
 964         multifd_send_pages();
 965     }
 966     for (i = 0; i < migrate_multifd_channels(); i++) {
 967         MultiFDSendParams *p = &multifd_send_state->params[i];
 968
 969         trace_multifd_send_sync_main_signal(p->id);
 970
 971         qemu_mutex_lock(&p->mutex);
 972
 973         p->packet_num = multifd_send_state->packet_num++;
 974         p->flags |= MULTIFD_FLAG_SYNC;
 975         p->pending_job++;
 976         qemu_mutex_unlock(&p->mutex);
 977         qemu_sem_post(&p->sem);
 978     }
 979     for (i = 0; i < migrate_multifd_channels(); i++) {
 980         MultiFDSendParams *p = &multifd_send_state->params[i];
 981
 982         trace_multifd_send_sync_main_wait(p->id);
 983         qemu_sem_wait(&multifd_send_state->sem_sync);
 984     }
 985     trace_multifd_send_sync_main(multifd_send_state->packet_num);
 986 }
 987
 988 static void *multifd_send_thread(void *opaque)
 989 {
 990     MultiFDSendParams *p = opaque;
 991     Error *local_err = NULL;
 992     int ret;
 993
 994     trace_multifd_send_thread_start(p->id);
 995     rcu_register_thread();
 996
 997     if (multifd_send_initial_packet(p, &local_err) < 0) {
 998         goto out;
 999     }
1000     /* initial packet */
1001     p->num_packets = 1;
1002
1003     while (true) {
1004         qemu_sem_wait(&p->sem);
1005         qemu_mutex_lock(&p->mutex);
1006
1007         if (p->pending_job) {
1008             uint32_t used = p->pages->used;
1009             uint64_t packet_num = p->packet_num;
1010             uint32_t flags = p->flags;
1011
1012             multifd_send_fill_packet(p);
1013             p->flags = 0;
1014             p->num_packets++;
1015             p->num_pages += used;
1016             p->pages->used = 0;
1017             qemu_mutex_unlock(&p->mutex);
1018
1019             trace_multifd_send(p->id, packet_num, used, flags);
1020
1021             ret = qio_channel_write_all(p->c, (void *)p->packet,
1022                                         p->packet_len, &local_err);
1023             if (ret != 0) {
1024                 break;
1025             }
1026
1027             ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1028             if (ret != 0) {
1029                 break;
1030             }
1031
1032             qemu_mutex_lock(&p->mutex);
1033             p->pending_job--;
1034             qemu_mutex_unlock(&p->mutex);
1035
1036             if (flags & MULTIFD_FLAG_SYNC) {
1037                 qemu_sem_post(&multifd_send_state->sem_sync);
1038             }
1039             qemu_sem_post(&multifd_send_state->channels_ready);
1040         } else if (p->quit) {
1041             qemu_mutex_unlock(&p->mutex);
1042             break;
1043         } else {
1044             qemu_mutex_unlock(&p->mutex);
1045             /* sometimes there are spurious wakeups */
1046         }
1047     }
1048
1049 out:
1050     if (local_err) {
1051         multifd_send_terminate_threads(local_err);
1052     }
1053
1054     qemu_mutex_lock(&p->mutex);
1055     p->running = false;
1056     qemu_mutex_unlock(&p->mutex);
1057
1058     rcu_unregister_thread();
1059     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1060
1061     return NULL;
1062 }
1063
1064 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1065 {
1066     MultiFDSendParams *p = opaque;
1067     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1068     Error *local_err = NULL;
1069
1070     if (qio_task_propagate_error(task, &local_err)) {
1071         if (multifd_save_cleanup(&local_err) != 0) {
1072             migrate_set_error(migrate_get_current(), local_err);
1073         }
1074     } else {
1075         p->c = QIO_CHANNEL(sioc);
1076         qio_channel_set_delay(p->c, false);
1077         p->running = true;
1078         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1079                            QEMU_THREAD_JOINABLE);
1080
1081         atomic_inc(&multifd_send_state->count);
1082     }
1083 }
1084
1085 int multifd_save_setup(void)
1086 {
1087     int thread_count;
1088     uint32_t page_count = migrate_multifd_page_count();
1089     uint8_t i;
1090
1091     if (!migrate_use_multifd()) {
1092         return 0;
1093     }
1094     thread_count = migrate_multifd_channels();
1095     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1096     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1097     atomic_set(&multifd_send_state->count, 0);
1098     multifd_send_state->pages = multifd_pages_init(page_count);
1099     qemu_sem_init(&multifd_send_state->sem_sync, 0);
1100     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1101
1102     for (i = 0; i < thread_count; i++) {
1103         MultiFDSendParams *p = &multifd_send_state->params[i];
1104
1105         qemu_mutex_init(&p->mutex);
1106         qemu_sem_init(&p->sem, 0);
1107         qemu_sem_init(&p->sem_sync, 0);
1108         p->quit = false;
1109         p->pending_job = 0;
1110         p->id = i;
1111         p->pages = multifd_pages_init(page_count);
1112         p->packet_len = sizeof(MultiFDPacket_t)
1113                       + sizeof(ram_addr_t) * page_count;
1114         p->packet = g_malloc0(p->packet_len);
1115         p->name = g_strdup_printf("multifdsend_%d", i);
1116         socket_send_channel_create(multifd_new_send_channel_async, p);
1117     }
1118     return 0;
1119 }
1120
1121 struct {
1122     MultiFDRecvParams *params;
1123     /* number of created threads */
1124     int count;
1125     /* syncs main thread and channels */
1126     QemuSemaphore sem_sync;
1127     /* global number of generated multifd packets */
1128     uint64_t packet_num;
1129 } *multifd_recv_state;
1130
1131 static void multifd_recv_terminate_threads(Error *err)
1132 {
1133     int i;
1134
1135     if (err) {
1136         MigrationState *s = migrate_get_current();
1137         migrate_set_error(s, err);
1138         if (s->state == MIGRATION_STATUS_SETUP ||
1139             s->state == MIGRATION_STATUS_ACTIVE) {
1140             migrate_set_state(&s->state, s->state,
1141                               MIGRATION_STATUS_FAILED);
1142         }
1143     }
1144
1145     for (i = 0; i < migrate_multifd_channels(); i++) {
1146         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1147
1148         qemu_mutex_lock(&p->mutex);
1149         /* We could arrive here for two reasons:
1150            - normal quit, i.e. everything went fine, just finished
1151            - error quit: We close the channels so the channel threads
1152              finish the qio_channel_read_all_eof() */
1153         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1154         qemu_mutex_unlock(&p->mutex);
1155     }
1156 }
1157
1158 int multifd_load_cleanup(Error **errp)
1159 {
1160     int i;
1161     int ret = 0;
1162
1163     if (!migrate_use_multifd()) {
1164         return 0;
1165     }
1166     multifd_recv_terminate_threads(NULL);
1167     for (i = 0; i < migrate_multifd_channels(); i++) {
1168         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1169
1170         if (p->running) {
1171             qemu_thread_join(&p->thread);
1172         }
1173         object_unref(OBJECT(p->c));
1174         p->c = NULL;
1175         qemu_mutex_destroy(&p->mutex);
1176         qemu_sem_destroy(&p->sem_sync);
1177         g_free(p->name);
1178         p->name = NULL;
1179         multifd_pages_clear(p->pages);
1180         p->pages = NULL;
1181         p->packet_len = 0;
1182         g_free(p->packet);
1183         p->packet = NULL;
1184     }
1185     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1186     g_free(multifd_recv_state->params);
1187     multifd_recv_state->params = NULL;
1188     g_free(multifd_recv_state);
1189     multifd_recv_state = NULL;
1190
1191     return ret;
1192 }
1193
1194 static void multifd_recv_sync_main(void)
1195 {
1196     int i;
1197
1198     if (!migrate_use_multifd()) {
1199         return;
1200     }
1201     for (i = 0; i < migrate_multifd_channels(); i++) {
1202         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1203
1204         trace_multifd_recv_sync_main_wait(p->id);
1205         qemu_sem_wait(&multifd_recv_state->sem_sync);
1206         qemu_mutex_lock(&p->mutex);
1207         if (multifd_recv_state->packet_num < p->packet_num) {
1208             multifd_recv_state->packet_num = p->packet_num;
1209         }
1210         qemu_mutex_unlock(&p->mutex);
1211     }
1212     for (i = 0; i < migrate_multifd_channels(); i++) {
1213         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1214
1215         trace_multifd_recv_sync_main_signal(p->id);
1216         qemu_sem_post(&p->sem_sync);
1217     }
1218     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1219 }
1220
1221 static void *multifd_recv_thread(void *opaque)
1222 {
1223     MultiFDRecvParams *p = opaque;
1224     Error *local_err = NULL;
1225     int ret;
1226
1227     trace_multifd_recv_thread_start(p->id);
1228     rcu_register_thread();
1229
1230     while (true) {
1231         uint32_t used;
1232         uint32_t flags;
1233
1234         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1235                                        p->packet_len, &local_err);
1236         if (ret == 0) {   /* EOF */
1237             break;
1238         }
1239         if (ret == -1) {   /* Error */
1240             break;
1241         }
1242
1243         qemu_mutex_lock(&p->mutex);
1244         ret = multifd_recv_unfill_packet(p, &local_err);
1245         if (ret) {
1246             qemu_mutex_unlock(&p->mutex);
1247             break;
1248         }
1249
1250         used = p->pages->used;
1251         flags = p->flags;
1252         trace_multifd_recv(p->id, p->packet_num, used, flags);
1253         p->num_packets++;
1254         p->num_pages += used;
1255         qemu_mutex_unlock(&p->mutex);
1256
1257         ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1258         if (ret != 0) {
1259             break;
1260         }
1261
1262         if (flags & MULTIFD_FLAG_SYNC) {
1263             qemu_sem_post(&multifd_recv_state->sem_sync);
1264             qemu_sem_wait(&p->sem_sync);
1265         }
1266     }
1267
1268     if (local_err) {
1269         multifd_recv_terminate_threads(local_err);
1270     }
1271     qemu_mutex_lock(&p->mutex);
1272     p->running = false;
1273     qemu_mutex_unlock(&p->mutex);
1274
1275     rcu_unregister_thread();
1276     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1277
1278     return NULL;
1279 }
1280
1281 int multifd_load_setup(void)
1282 {
1283     int thread_count;
1284     uint32_t page_count = migrate_multifd_page_count();
1285     uint8_t i;
1286
1287     if (!migrate_use_multifd()) {
1288         return 0;
1289     }
1290     thread_count = migrate_multifd_channels();
1291     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1292     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1293     atomic_set(&multifd_recv_state->count, 0);
1294     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1295
1296     for (i = 0; i < thread_count; i++) {
1297         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1298
1299         qemu_mutex_init(&p->mutex);
1300         qemu_sem_init(&p->sem_sync, 0);
1301         p->id = i;
1302         p->pages = multifd_pages_init(page_count);
1303         p->packet_len = sizeof(MultiFDPacket_t)
1304                       + sizeof(ram_addr_t) * page_count;
1305         p->packet = g_malloc0(p->packet_len);
1306         p->name = g_strdup_printf("multifdrecv_%d", i);
1307     }
1308     return 0;
1309 }
1310
1311 bool multifd_recv_all_channels_created(void)
1312 {
1313     int thread_count = migrate_multifd_channels();
1314
1315     if (!migrate_use_multifd()) {
1316         return true;
1317     }
1318
1319     return thread_count == atomic_read(&multifd_recv_state->count);
1320 }
1321
1322 /* Return true if multifd is ready for the migration, otherwise false */
1323 bool multifd_recv_new_channel(QIOChannel *ioc)
1324 {
1325     MultiFDRecvParams *p;
1326     Error *local_err = NULL;
1327     int id;
1328
1329     id = multifd_recv_initial_packet(ioc, &local_err);
1330     if (id < 0) {
1331         multifd_recv_terminate_threads(local_err);
1332         return false;
1333     }
1334
1335     p = &multifd_recv_state->params[id];
1336     if (p->c != NULL) {
1337         error_setg(&local_err, "multifd: received id '%d' already setup'",
1338                    id);
1339         multifd_recv_terminate_threads(local_err);
1340         return false;
1341     }
1342     p->c = ioc;
1343     object_ref(OBJECT(ioc));
1344     /* initial packet */
1345     p->num_packets = 1;
1346
1347     p->running = true;
1348     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1349                        QEMU_THREAD_JOINABLE);
1350     atomic_inc(&multifd_recv_state->count);
1351     return multifd_recv_state->count == migrate_multifd_channels();
1352 }
1353
1354 /**
1355  * save_page_header: write page header to wire
1356  *
1357  * If this is the 1st block, it also writes the block identification
1358  *
1359  * Returns the number of bytes written
1360  *
1361  * @f: QEMUFile where to send the data
1362  * @block: block that contains the page we want to send
1363  * @offset: offset inside the block for the page
1364  *          in the lower bits, it contains flags
1365  */
1366 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1367                                ram_addr_t offset)
1368 {
1369     size_t size, len;
1370
1371     if (block == rs->last_sent_block) {
1372         offset |= RAM_SAVE_FLAG_CONTINUE;
1373     }
1374     qemu_put_be64(f, offset);
1375     size = 8;
1376
1377     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1378         len = strlen(block->idstr);
1379         qemu_put_byte(f, len);
1380         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1381         size += 1 + len;
1382         rs->last_sent_block = block;
1383     }
1384     return size;
1385 }
1386
1387 /**
1388  * mig_throttle_guest_down: throotle down the guest
1389  *
1390  * Reduce amount of guest cpu execution to hopefully slow down memory
1391  * writes. If guest dirty memory rate is reduced below the rate at
1392  * which we can transfer pages to the destination then we should be
1393  * able to complete migration. Some workloads dirty memory way too
1394  * fast and will not effectively converge, even with auto-converge.
1395  */
1396 static void mig_throttle_guest_down(void)
1397 {
1398     MigrationState *s = migrate_get_current();
1399     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1400     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1401     int pct_max = s->parameters.max_cpu_throttle;
1402
1403     /* We have not started throttling yet. Let's start it. */
1404     if (!cpu_throttle_active()) {
1405         cpu_throttle_set(pct_initial);
1406     } else {
1407         /* Throttling already on, just increase the rate */
1408         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1409                          pct_max));
1410     }
1411 }
1412
1413 /**
1414  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1415  *
1416  * @rs: current RAM state
1417  * @current_addr: address for the zero page
1418  *
1419  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1420  * The important thing is that a stale (not-yet-0'd) page be replaced
1421  * by the new data.
1422  * As a bonus, if the page wasn't in the cache it gets added so that
1423  * when a small write is made into the 0'd page it gets XBZRLE sent.
1424  */
1425 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1426 {
1427     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1428         return;
1429     }
1430
1431     /* We don't care if this fails to allocate a new cache page
1432      * as long as it updated an old one */
1433     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1434                  ram_counters.dirty_sync_count);
1435 }
1436
1437 #define ENCODING_FLAG_XBZRLE 0x1
1438
1439 /**
1440  * save_xbzrle_page: compress and send current page
1441  *
1442  * Returns: 1 means that we wrote the page
1443  *          0 means that page is identical to the one already sent
1444  *          -1 means that xbzrle would be longer than normal
1445  *
1446  * @rs: current RAM state
1447  * @current_data: pointer to the address of the page contents
1448  * @current_addr: addr of the page
1449  * @block: block that contains the page we want to send
1450  * @offset: offset inside the block for the page
1451  * @last_stage: if we are at the completion stage
1452  */
1453 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1454                             ram_addr_t current_addr, RAMBlock *block,
1455                             ram_addr_t offset, bool last_stage)
1456 {
1457     int encoded_len = 0, bytes_xbzrle;
1458     uint8_t *prev_cached_page;
1459
1460     if (!cache_is_cached(XBZRLE.cache, current_addr,
1461                          ram_counters.dirty_sync_count)) {
1462         xbzrle_counters.cache_miss++;
1463         if (!last_stage) {
1464             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1465                              ram_counters.dirty_sync_count) == -1) {
1466                 return -1;
1467             } else {
1468                 /* update *current_data when the page has been
1469                    inserted into cache */
1470                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1471             }
1472         }
1473         return -1;
1474     }
1475
1476     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1477
1478     /* save current buffer into memory */
1479     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1480
1481     /* XBZRLE encoding (if there is no overflow) */
1482     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1483                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1484                                        TARGET_PAGE_SIZE);
1485     if (encoded_len == 0) {
1486         trace_save_xbzrle_page_skipping();
1487         return 0;
1488     } else if (encoded_len == -1) {
1489         trace_save_xbzrle_page_overflow();
1490         xbzrle_counters.overflow++;
1491         /* update data in the cache */
1492         if (!last_stage) {
1493             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1494             *current_data = prev_cached_page;
1495         }
1496         return -1;
1497     }
1498
1499     /* we need to update the data in the cache, in order to get the same data */
1500     if (!last_stage) {
1501         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1502     }
1503
1504     /* Send XBZRLE based compressed page */
1505     bytes_xbzrle = save_page_header(rs, rs->f, block,
1506                                     offset | RAM_SAVE_FLAG_XBZRLE);
1507     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1508     qemu_put_be16(rs->f, encoded_len);
1509     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1510     bytes_xbzrle += encoded_len + 1 + 2;
1511     xbzrle_counters.pages++;
1512     xbzrle_counters.bytes += bytes_xbzrle;
1513     ram_counters.transferred += bytes_xbzrle;
1514
1515     return 1;
1516 }
1517
1518 /**
1519  * migration_bitmap_find_dirty: find the next dirty page from start
1520  *
1521  * Called with rcu_read_lock() to protect migration_bitmap
1522  *
1523  * Returns the byte offset within memory region of the start of a dirty page
1524  *
1525  * @rs: current RAM state
1526  * @rb: RAMBlock where to search for dirty pages
1527  * @start: page where we start the search
1528  */
1529 static inline
1530 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1531                                           unsigned long start)
1532 {
1533     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1534     unsigned long *bitmap = rb->bmap;
1535     unsigned long next;
1536
1537     if (!qemu_ram_is_migratable(rb)) {
1538         return size;
1539     }
1540
1541     if (rs->ram_bulk_stage && start > 0) {
1542         next = start + 1;
1543     } else {
1544         next = find_next_bit(bitmap, size, start);
1545     }
1546
1547     return next;
1548 }
1549
1550 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1551                                                 RAMBlock *rb,
1552                                                 unsigned long page)
1553 {
1554     bool ret;
1555
1556     ret = test_and_clear_bit(page, rb->bmap);
1557
1558     if (ret) {
1559         rs->migration_dirty_pages--;
1560     }
1561     return ret;
1562 }
1563
1564 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1565                                         ram_addr_t start, ram_addr_t length)
1566 {
1567     rs->migration_dirty_pages +=
1568         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1569                                               &rs->num_dirty_pages_period);
1570 }
1571
1572 /**
1573  * ram_pagesize_summary: calculate all the pagesizes of a VM
1574  *
1575  * Returns a summary bitmap of the page sizes of all RAMBlocks
1576  *
1577  * For VMs with just normal pages this is equivalent to the host page
1578  * size. If it's got some huge pages then it's the OR of all the
1579  * different page sizes.
1580  */
1581 uint64_t ram_pagesize_summary(void)
1582 {
1583     RAMBlock *block;
1584     uint64_t summary = 0;
1585
1586     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1587         summary |= block->page_size;
1588     }
1589
1590     return summary;
1591 }
1592
1593 static void migration_update_rates(RAMState *rs, int64_t end_time)
1594 {
1595     uint64_t iter_count = rs->iterations - rs->iterations_prev;
1596
1597     /* calculate period counters */
1598     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1599                 / (end_time - rs->time_last_bitmap_sync);
1600
1601     if (!iter_count) {
1602         return;
1603     }
1604
1605     if (migrate_use_xbzrle()) {
1606         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1607             rs->xbzrle_cache_miss_prev) / iter_count;
1608         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1609     }
1610 }
1611
1612 static void migration_bitmap_sync(RAMState *rs)
1613 {
1614     RAMBlock *block;
1615     int64_t end_time;
1616     uint64_t bytes_xfer_now;
1617
1618     ram_counters.dirty_sync_count++;
1619
1620     if (!rs->time_last_bitmap_sync) {
1621         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1622     }
1623
1624     trace_migration_bitmap_sync_start();
1625     memory_global_dirty_log_sync();
1626
1627     qemu_mutex_lock(&rs->bitmap_mutex);
1628     rcu_read_lock();
1629     RAMBLOCK_FOREACH_MIGRATABLE(block) {
1630         migration_bitmap_sync_range(rs, block, 0, block->used_length);
1631     }
1632     ram_counters.remaining = ram_bytes_remaining();
1633     rcu_read_unlock();
1634     qemu_mutex_unlock(&rs->bitmap_mutex);
1635
1636     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1637
1638     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1639
1640     /* more than 1 second = 1000 millisecons */
1641     if (end_time > rs->time_last_bitmap_sync + 1000) {
1642         bytes_xfer_now = ram_counters.transferred;
1643
1644         /* During block migration the auto-converge logic incorrectly detects
1645          * that ram migration makes no progress. Avoid this by disabling the
1646          * throttling logic during the bulk phase of block migration. */
1647         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1648             /* The following detection logic can be refined later. For now:
1649                Check to see if the dirtied bytes is 50% more than the approx.
1650                amount of bytes that just got transferred since the last time we
1651                were in this routine. If that happens twice, start or increase
1652                throttling */
1653
1654             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1655                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1656                 (++rs->dirty_rate_high_cnt >= 2)) {
1657                     trace_migration_throttle();
1658                     rs->dirty_rate_high_cnt = 0;
1659                     mig_throttle_guest_down();
1660             }
1661         }
1662
1663         migration_update_rates(rs, end_time);
1664
1665         rs->iterations_prev = rs->iterations;
1666
1667         /* reset period counters */
1668         rs->time_last_bitmap_sync = end_time;
1669         rs->num_dirty_pages_period = 0;
1670         rs->bytes_xfer_prev = bytes_xfer_now;
1671     }
1672     if (migrate_use_events()) {
1673         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1674     }
1675 }
1676
1677 /**
1678  * save_zero_page_to_file: send the zero page to the file
1679  *
1680  * Returns the size of data written to the file, 0 means the page is not
1681  * a zero page
1682  *
1683  * @rs: current RAM state
1684  * @file: the file where the data is saved
1685  * @block: block that contains the page we want to send
1686  * @offset: offset inside the block for the page
1687  */
1688 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1689                                   RAMBlock *block, ram_addr_t offset)
1690 {
1691     uint8_t *p = block->host + offset;
1692     int len = 0;
1693
1694     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1695         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1696         qemu_put_byte(file, 0);
1697         len += 1;
1698     }
1699     return len;
1700 }
1701
1702 /**
1703  * save_zero_page: send the zero page to the stream
1704  *
1705  * Returns the number of pages written.
1706  *
1707  * @rs: current RAM state
1708  * @block: block that contains the page we want to send
1709  * @offset: offset inside the block for the page
1710  */
1711 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1712 {
1713     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1714
1715     if (len) {
1716         ram_counters.duplicate++;
1717         ram_counters.transferred += len;
1718         return 1;
1719     }
1720     return -1;
1721 }
1722
1723 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1724 {
1725     if (!migrate_release_ram() || !migration_in_postcopy()) {
1726         return;
1727     }
1728
1729     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1730 }
1731
1732 /*
1733  * @pages: the number of pages written by the control path,
1734  *        < 0 - error
1735  *        > 0 - number of pages written
1736  *
1737  * Return true if the pages has been saved, otherwise false is returned.
1738  */
1739 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1740                               int *pages)
1741 {
1742     uint64_t bytes_xmit = 0;
1743     int ret;
1744
1745     *pages = -1;
1746     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1747                                 &bytes_xmit);
1748     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1749         return false;
1750     }
1751
1752     if (bytes_xmit) {
1753         ram_counters.transferred += bytes_xmit;
1754         *pages = 1;
1755     }
1756
1757     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1758         return true;
1759     }
1760
1761     if (bytes_xmit > 0) {
1762         ram_counters.normal++;
1763     } else if (bytes_xmit == 0) {
1764         ram_counters.duplicate++;
1765     }
1766
1767     return true;
1768 }
1769
1770 /*
1771  * directly send the page to the stream
1772  *
1773  * Returns the number of pages written.
1774  *
1775  * @rs: current RAM state
1776  * @block: block that contains the page we want to send
1777  * @offset: offset inside the block for the page
1778  * @buf: the page to be sent
1779  * @async: send to page asyncly
1780  */
1781 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1782                             uint8_t *buf, bool async)
1783 {
1784     ram_counters.transferred += save_page_header(rs, rs->f, block,
1785                                                  offset | RAM_SAVE_FLAG_PAGE);
1786     if (async) {
1787         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1788                               migrate_release_ram() &
1789                               migration_in_postcopy());
1790     } else {
1791         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1792     }
1793     ram_counters.transferred += TARGET_PAGE_SIZE;
1794     ram_counters.normal++;
1795     return 1;
1796 }
1797
1798 /**
1799  * ram_save_page: send the given page to the stream
1800  *
1801  * Returns the number of pages written.
1802  *          < 0 - error
1803  *          >=0 - Number of pages written - this might legally be 0
1804  *                if xbzrle noticed the page was the same.
1805  *
1806  * @rs: current RAM state
1807  * @block: block that contains the page we want to send
1808  * @offset: offset inside the block for the page
1809  * @last_stage: if we are at the completion stage
1810  */
1811 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1812 {
1813     int pages = -1;
1814     uint8_t *p;
1815     bool send_async = true;
1816     RAMBlock *block = pss->block;
1817     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1818     ram_addr_t current_addr = block->offset + offset;
1819
1820     p = block->host + offset;
1821     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1822
1823     XBZRLE_cache_lock();
1824     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1825         migrate_use_xbzrle()) {
1826         pages = save_xbzrle_page(rs, &p, current_addr, block,
1827                                  offset, last_stage);
1828         if (!last_stage) {
1829             /* Can't send this cached data async, since the cache page
1830              * might get updated before it gets to the wire
1831              */
1832             send_async = false;
1833         }
1834     }
1835
1836     /* XBZRLE overflow or normal page */
1837     if (pages == -1) {
1838         pages = save_normal_page(rs, block, offset, p, send_async);
1839     }
1840
1841     XBZRLE_cache_unlock();
1842
1843     return pages;
1844 }
1845
1846 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1847                                  ram_addr_t offset)
1848 {
1849     multifd_queue_page(block, offset);
1850     ram_counters.normal++;
1851
1852     return 1;
1853 }
1854
1855 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1856                                  ram_addr_t offset, uint8_t *source_buf)
1857 {
1858     RAMState *rs = ram_state;
1859     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1860     bool zero_page = false;
1861     int ret;
1862
1863     if (save_zero_page_to_file(rs, f, block, offset)) {
1864         zero_page = true;
1865         goto exit;
1866     }
1867
1868     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1869
1870     /*
1871      * copy it to a internal buffer to avoid it being modified by VM
1872      * so that we can catch up the error during compression and
1873      * decompression
1874      */
1875     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1876     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1877     if (ret < 0) {
1878         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1879         error_report("compressed data failed!");
1880         return false;
1881     }
1882
1883 exit:
1884     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1885     return zero_page;
1886 }
1887
1888 static void
1889 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1890 {
1891     if (param->zero_page) {
1892         ram_counters.duplicate++;
1893     }
1894     ram_counters.transferred += bytes_xmit;
1895 }
1896
1897 static void flush_compressed_data(RAMState *rs)
1898 {
1899     int idx, len, thread_count;
1900
1901     if (!migrate_use_compression()) {
1902         return;
1903     }
1904     thread_count = migrate_compress_threads();
1905
1906     qemu_mutex_lock(&comp_done_lock);
1907     for (idx = 0; idx < thread_count; idx++) {
1908         while (!comp_param[idx].done) {
1909             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1910         }
1911     }
1912     qemu_mutex_unlock(&comp_done_lock);
1913
1914     for (idx = 0; idx < thread_count; idx++) {
1915         qemu_mutex_lock(&comp_param[idx].mutex);
1916         if (!comp_param[idx].quit) {
1917             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1918             /*
1919              * it's safe to fetch zero_page without holding comp_done_lock
1920              * as there is no further request submitted to the thread,
1921              * i.e, the thread should be waiting for a request at this point.
1922              */
1923             update_compress_thread_counts(&comp_param[idx], len);
1924         }
1925         qemu_mutex_unlock(&comp_param[idx].mutex);
1926     }
1927 }
1928
1929 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1930                                        ram_addr_t offset)
1931 {
1932     param->block = block;
1933     param->offset = offset;
1934 }
1935
1936 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1937                                            ram_addr_t offset)
1938 {
1939     int idx, thread_count, bytes_xmit = -1, pages = -1;
1940     bool wait = migrate_compress_wait_thread();
1941
1942     thread_count = migrate_compress_threads();
1943     qemu_mutex_lock(&comp_done_lock);
1944 retry:
1945     for (idx = 0; idx < thread_count; idx++) {
1946         if (comp_param[idx].done) {
1947             comp_param[idx].done = false;
1948             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1949             qemu_mutex_lock(&comp_param[idx].mutex);
1950             set_compress_params(&comp_param[idx], block, offset);
1951             qemu_cond_signal(&comp_param[idx].cond);
1952             qemu_mutex_unlock(&comp_param[idx].mutex);
1953             pages = 1;
1954             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1955             break;
1956         }
1957     }
1958
1959     /*
1960      * wait for the free thread if the user specifies 'compress-wait-thread',
1961      * otherwise we will post the page out in the main thread as normal page.
1962      */
1963     if (pages < 0 && wait) {
1964         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1965         goto retry;
1966     }
1967     qemu_mutex_unlock(&comp_done_lock);
1968
1969     return pages;
1970 }
1971
1972 /**
1973  * find_dirty_block: find the next dirty page and update any state
1974  * associated with the search process.
1975  *
1976  * Returns if a page is found
1977  *
1978  * @rs: current RAM state
1979  * @pss: data about the state of the current dirty page scan
1980  * @again: set to false if the search has scanned the whole of RAM
1981  */
1982 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1983 {
1984     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1985     if (pss->complete_round && pss->block == rs->last_seen_block &&
1986         pss->page >= rs->last_page) {
1987         /*
1988          * We've been once around the RAM and haven't found anything.
1989          * Give up.
1990          */
1991         *again = false;
1992         return false;
1993     }
1994     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1995         /* Didn't find anything in this RAM Block */
1996         pss->page = 0;
1997         pss->block = QLIST_NEXT_RCU(pss->block, next);
1998         if (!pss->block) {
1999             /* Hit the end of the list */
2000             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
2001             /* Flag that we've looped */
2002             pss->complete_round = true;
2003             rs->ram_bulk_stage = false;
2004             if (migrate_use_xbzrle()) {
2005                 /* If xbzrle is on, stop using the data compression at this
2006                  * point. In theory, xbzrle can do better than compression.
2007                  */
2008                 flush_compressed_data(rs);
2009             }
2010         }
2011         /* Didn't find anything this time, but try again on the new block */
2012         *again = true;
2013         return false;
2014     } else {
2015         /* Can go around again, but... */
2016         *again = true;
2017         /* We've found something so probably don't need to */
2018         return true;
2019     }
2020 }
2021
2022 /**
2023  * unqueue_page: gets a page of the queue
2024  *
2025  * Helper for 'get_queued_page' - gets a page off the queue
2026  *
2027  * Returns the block of the page (or NULL if none available)
2028  *
2029  * @rs: current RAM state
2030  * @offset: used to return the offset within the RAMBlock
2031  */
2032 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2033 {
2034     RAMBlock *block = NULL;
2035
2036     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2037         return NULL;
2038     }
2039
2040     qemu_mutex_lock(&rs->src_page_req_mutex);
2041     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2042         struct RAMSrcPageRequest *entry =
2043                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2044         block = entry->rb;
2045         *offset = entry->offset;
2046
2047         if (entry->len > TARGET_PAGE_SIZE) {
2048             entry->len -= TARGET_PAGE_SIZE;
2049             entry->offset += TARGET_PAGE_SIZE;
2050         } else {
2051             memory_region_unref(block->mr);
2052             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2053             g_free(entry);
2054             migration_consume_urgent_request();
2055         }
2056     }
2057     qemu_mutex_unlock(&rs->src_page_req_mutex);
2058
2059     return block;
2060 }
2061
2062 /**
2063  * get_queued_page: unqueue a page from the postocpy requests
2064  *
2065  * Skips pages that are already sent (!dirty)
2066  *
2067  * Returns if a queued page is found
2068  *
2069  * @rs: current RAM state
2070  * @pss: data about the state of the current dirty page scan
2071  */
2072 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2073 {
2074     RAMBlock  *block;
2075     ram_addr_t offset;
2076     bool dirty;
2077
2078     do {
2079         block = unqueue_page(rs, &offset);
2080         /*
2081          * We're sending this page, and since it's postcopy nothing else
2082          * will dirty it, and we must make sure it doesn't get sent again
2083          * even if this queue request was received after the background
2084          * search already sent it.
2085          */
2086         if (block) {
2087             unsigned long page;
2088
2089             page = offset >> TARGET_PAGE_BITS;
2090             dirty = test_bit(page, block->bmap);
2091             if (!dirty) {
2092                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2093                        page, test_bit(page, block->unsentmap));
2094             } else {
2095                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2096             }
2097         }
2098
2099     } while (block && !dirty);
2100
2101     if (block) {
2102         /*
2103          * As soon as we start servicing pages out of order, then we have
2104          * to kill the bulk stage, since the bulk stage assumes
2105          * in (migration_bitmap_find_and_reset_dirty) that every page is
2106          * dirty, that's no longer true.
2107          */
2108         rs->ram_bulk_stage = false;
2109
2110         /*
2111          * We want the background search to continue from the queued page
2112          * since the guest is likely to want other pages near to the page
2113          * it just requested.
2114          */
2115         pss->block = block;
2116         pss->page = offset >> TARGET_PAGE_BITS;
2117     }
2118
2119     return !!block;
2120 }
2121
2122 /**
2123  * migration_page_queue_free: drop any remaining pages in the ram
2124  * request queue
2125  *
2126  * It should be empty at the end anyway, but in error cases there may
2127  * be some left.  in case that there is any page left, we drop it.
2128  *
2129  */
2130 static void migration_page_queue_free(RAMState *rs)
2131 {
2132     struct RAMSrcPageRequest *mspr, *next_mspr;
2133     /* This queue generally should be empty - but in the case of a failed
2134      * migration might have some droppings in.
2135      */
2136     rcu_read_lock();
2137     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2138         memory_region_unref(mspr->rb->mr);
2139         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2140         g_free(mspr);
2141     }
2142     rcu_read_unlock();
2143 }
2144
2145 /**
2146  * ram_save_queue_pages: queue the page for transmission
2147  *
2148  * A request from postcopy destination for example.
2149  *
2150  * Returns zero on success or negative on error
2151  *
2152  * @rbname: Name of the RAMBLock of the request. NULL means the
2153  *          same that last one.
2154  * @start: starting address from the start of the RAMBlock
2155  * @len: length (in bytes) to send
2156  */
2157 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2158 {
2159     RAMBlock *ramblock;
2160     RAMState *rs = ram_state;
2161
2162     ram_counters.postcopy_requests++;
2163     rcu_read_lock();
2164     if (!rbname) {
2165         /* Reuse last RAMBlock */
2166         ramblock = rs->last_req_rb;
2167
2168         if (!ramblock) {
2169             /*
2170              * Shouldn't happen, we can't reuse the last RAMBlock if
2171              * it's the 1st request.
2172              */
2173             error_report("ram_save_queue_pages no previous block");
2174             goto err;
2175         }
2176     } else {
2177         ramblock = qemu_ram_block_by_name(rbname);
2178
2179         if (!ramblock) {
2180             /* We shouldn't be asked for a non-existent RAMBlock */
2181             error_report("ram_save_queue_pages no block '%s'", rbname);
2182             goto err;
2183         }
2184         rs->last_req_rb = ramblock;
2185     }
2186     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2187     if (start+len > ramblock->used_length) {
2188         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2189                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2190                      __func__, start, len, ramblock->used_length);
2191         goto err;
2192     }
2193
2194     struct RAMSrcPageRequest *new_entry =
2195         g_malloc0(sizeof(struct RAMSrcPageRequest));
2196     new_entry->rb = ramblock;
2197     new_entry->offset = start;
2198     new_entry->len = len;
2199
2200     memory_region_ref(ramblock->mr);
2201     qemu_mutex_lock(&rs->src_page_req_mutex);
2202     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2203     migration_make_urgent_request();
2204     qemu_mutex_unlock(&rs->src_page_req_mutex);
2205     rcu_read_unlock();
2206
2207     return 0;
2208
2209 err:
2210     rcu_read_unlock();
2211     return -1;
2212 }
2213
2214 static bool save_page_use_compression(RAMState *rs)
2215 {
2216     if (!migrate_use_compression()) {
2217         return false;
2218     }
2219
2220     /*
2221      * If xbzrle is on, stop using the data compression after first
2222      * round of migration even if compression is enabled. In theory,
2223      * xbzrle can do better than compression.
2224      */
2225     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2226         return true;
2227     }
2228
2229     return false;
2230 }
2231
2232 /*
2233  * try to compress the page before posting it out, return true if the page
2234  * has been properly handled by compression, otherwise needs other
2235  * paths to handle it
2236  */
2237 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2238 {
2239     if (!save_page_use_compression(rs)) {
2240         return false;
2241     }
2242
2243     /*
2244      * When starting the process of a new block, the first page of
2245      * the block should be sent out before other pages in the same
2246      * block, and all the pages in last block should have been sent
2247      * out, keeping this order is important, because the 'cont' flag
2248      * is used to avoid resending the block name.
2249      *
2250      * We post the fist page as normal page as compression will take
2251      * much CPU resource.
2252      */
2253     if (block != rs->last_sent_block) {
2254         flush_compressed_data(rs);
2255         return false;
2256     }
2257
2258     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2259         return true;
2260     }
2261
2262     return false;
2263 }
2264
2265 /**
2266  * ram_save_target_page: save one target page
2267  *
2268  * Returns the number of pages written
2269  *
2270  * @rs: current RAM state
2271  * @pss: data about the page we want to send
2272  * @last_stage: if we are at the completion stage
2273  */
2274 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2275                                 bool last_stage)
2276 {
2277     RAMBlock *block = pss->block;
2278     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2279     int res;
2280
2281     if (control_save_page(rs, block, offset, &res)) {
2282         return res;
2283     }
2284
2285     if (save_compress_page(rs, block, offset)) {
2286         return 1;
2287     }
2288
2289     res = save_zero_page(rs, block, offset);
2290     if (res > 0) {
2291         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2292          * page would be stale
2293          */
2294         if (!save_page_use_compression(rs)) {
2295             XBZRLE_cache_lock();
2296             xbzrle_cache_zero_page(rs, block->offset + offset);
2297             XBZRLE_cache_unlock();
2298         }
2299         ram_release_pages(block->idstr, offset, res);
2300         return res;
2301     }
2302
2303     /*
2304      * do not use multifd for compression as the first page in the new
2305      * block should be posted out before sending the compressed page
2306      */
2307     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2308         return ram_save_multifd_page(rs, block, offset);
2309     }
2310
2311     return ram_save_page(rs, pss, last_stage);
2312 }
2313
2314 /**
2315  * ram_save_host_page: save a whole host page
2316  *
2317  * Starting at *offset send pages up to the end of the current host
2318  * page. It's valid for the initial offset to point into the middle of
2319  * a host page in which case the remainder of the hostpage is sent.
2320  * Only dirty target pages are sent. Note that the host page size may
2321  * be a huge page for this block.
2322  * The saving stops at the boundary of the used_length of the block
2323  * if the RAMBlock isn't a multiple of the host page size.
2324  *
2325  * Returns the number of pages written or negative on error
2326  *
2327  * @rs: current RAM state
2328  * @ms: current migration state
2329  * @pss: data about the page we want to send
2330  * @last_stage: if we are at the completion stage
2331  */
2332 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2333                               bool last_stage)
2334 {
2335     int tmppages, pages = 0;
2336     size_t pagesize_bits =
2337         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2338
2339     if (!qemu_ram_is_migratable(pss->block)) {
2340         error_report("block %s should not be migrated !", pss->block->idstr);
2341         return 0;
2342     }
2343
2344     do {
2345         /* Check the pages is dirty and if it is send it */
2346         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2347             pss->page++;
2348             continue;
2349         }
2350
2351         tmppages = ram_save_target_page(rs, pss, last_stage);
2352         if (tmppages < 0) {
2353             return tmppages;
2354         }
2355
2356         pages += tmppages;
2357         if (pss->block->unsentmap) {
2358             clear_bit(pss->page, pss->block->unsentmap);
2359         }
2360
2361         pss->page++;
2362     } while ((pss->page & (pagesize_bits - 1)) &&
2363              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2364
2365     /* The offset we leave with is the last one we looked at */
2366     pss->page--;
2367     return pages;
2368 }
2369
2370 /**
2371  * ram_find_and_save_block: finds a dirty page and sends it to f
2372  *
2373  * Called within an RCU critical section.
2374  *
2375  * Returns the number of pages written where zero means no dirty pages
2376  *
2377  * @rs: current RAM state
2378  * @last_stage: if we are at the completion stage
2379  *
2380  * On systems where host-page-size > target-page-size it will send all the
2381  * pages in a host page that are dirty.
2382  */
2383
2384 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2385 {
2386     PageSearchStatus pss;
2387     int pages = 0;
2388     bool again, found;
2389
2390     /* No dirty page as there is zero RAM */
2391     if (!ram_bytes_total()) {
2392         return pages;
2393     }
2394
2395     pss.block = rs->last_seen_block;
2396     pss.page = rs->last_page;
2397     pss.complete_round = false;
2398
2399     if (!pss.block) {
2400         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2401     }
2402
2403     do {
2404         again = true;
2405         found = get_queued_page(rs, &pss);
2406
2407         if (!found) {
2408             /* priority queue empty, so just search for something dirty */
2409             found = find_dirty_block(rs, &pss, &again);
2410         }
2411
2412         if (found) {
2413             pages = ram_save_host_page(rs, &pss, last_stage);
2414         }
2415     } while (!pages && again);
2416
2417     rs->last_seen_block = pss.block;
2418     rs->last_page = pss.page;
2419
2420     return pages;
2421 }
2422
2423 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2424 {
2425     uint64_t pages = size / TARGET_PAGE_SIZE;
2426
2427     if (zero) {
2428         ram_counters.duplicate += pages;
2429     } else {
2430         ram_counters.normal += pages;
2431         ram_counters.transferred += size;
2432         qemu_update_position(f, size);
2433     }
2434 }
2435
2436 uint64_t ram_bytes_total(void)
2437 {
2438     RAMBlock *block;
2439     uint64_t total = 0;
2440
2441     rcu_read_lock();
2442     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2443         total += block->used_length;
2444     }
2445     rcu_read_unlock();
2446     return total;
2447 }
2448
2449 static void xbzrle_load_setup(void)
2450 {
2451     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2452 }
2453
2454 static void xbzrle_load_cleanup(void)
2455 {
2456     g_free(XBZRLE.decoded_buf);
2457     XBZRLE.decoded_buf = NULL;
2458 }
2459
2460 static void ram_state_cleanup(RAMState **rsp)
2461 {
2462     if (*rsp) {
2463         migration_page_queue_free(*rsp);
2464         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2465         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2466         g_free(*rsp);
2467         *rsp = NULL;
2468     }
2469 }
2470
2471 static void xbzrle_cleanup(void)
2472 {
2473     XBZRLE_cache_lock();
2474     if (XBZRLE.cache) {
2475         cache_fini(XBZRLE.cache);
2476         g_free(XBZRLE.encoded_buf);
2477         g_free(XBZRLE.current_buf);
2478         g_free(XBZRLE.zero_target_page);
2479         XBZRLE.cache = NULL;
2480         XBZRLE.encoded_buf = NULL;
2481         XBZRLE.current_buf = NULL;
2482         XBZRLE.zero_target_page = NULL;
2483     }
2484     XBZRLE_cache_unlock();
2485 }
2486
2487 static void ram_save_cleanup(void *opaque)
2488 {
2489     RAMState **rsp = opaque;
2490     RAMBlock *block;
2491
2492     /* caller have hold iothread lock or is in a bh, so there is
2493      * no writing race against this migration_bitmap
2494      */
2495     memory_global_dirty_log_stop();
2496
2497     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2498         g_free(block->bmap);
2499         block->bmap = NULL;
2500         g_free(block->unsentmap);
2501         block->unsentmap = NULL;
2502     }
2503
2504     xbzrle_cleanup();
2505     compress_threads_save_cleanup();
2506     ram_state_cleanup(rsp);
2507 }
2508
2509 static void ram_state_reset(RAMState *rs)
2510 {
2511     rs->last_seen_block = NULL;
2512     rs->last_sent_block = NULL;
2513     rs->last_page = 0;
2514     rs->last_version = ram_list.version;
2515     rs->ram_bulk_stage = true;
2516 }
2517
#define MAX_WAIT 50 /* in milliseconds; half of the buffered_file limit */
2519
/*
 * Debug helper: print the bitmap 'todump' to stderr, 128 bits per line,
 * using '1' for a set bit and '.' for a clear bit.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * 'pages' is the number of bits in the bitmap.
 * If 'todump' is NULL there is nothing to dump and the function returns.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    const int64_t max_linelen = 128;
    char linebuf[129];
    int64_t cur;

    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += max_linelen) {
        int64_t linelen = max_linelen;
        int64_t curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        /* Only lines containing at least one unexpected bit are printed */
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2553
2554 /* **** functions for postcopy ***** */
2555
2556 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2557 {
2558     struct RAMBlock *block;
2559
2560     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2561         unsigned long *bitmap = block->bmap;
2562         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2563         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2564
2565         while (run_start < range) {
2566             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2567             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2568                               (run_end - run_start) << TARGET_PAGE_BITS);
2569             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2570         }
2571     }
2572 }
2573
2574 /**
2575  * postcopy_send_discard_bm_ram: discard a RAMBlock
2576  *
2577  * Returns zero on success
2578  *
2579  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2580  * Note: At this point the 'unsentmap' is the processed bitmap combined
2581  *       with the dirtymap; so a '1' means it's either dirty or unsent.
2582  *
2583  * @ms: current migration state
2584  * @pds: state for postcopy
2585  * @start: RAMBlock starting page
2586  * @length: RAMBlock size
2587  */
2588 static int postcopy_send_discard_bm_ram(MigrationState *ms,
2589                                         PostcopyDiscardState *pds,
2590                                         RAMBlock *block)
2591 {
2592     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2593     unsigned long current;
2594     unsigned long *unsentmap = block->unsentmap;
2595
2596     for (current = 0; current < end; ) {
2597         unsigned long one = find_next_bit(unsentmap, end, current);
2598
2599         if (one <= end) {
2600             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2601             unsigned long discard_length;
2602
2603             if (zero >= end) {
2604                 discard_length = end - one;
2605             } else {
2606                 discard_length = zero - one;
2607             }
2608             if (discard_length) {
2609                 postcopy_discard_send_range(ms, pds, one, discard_length);
2610             }
2611             current = one + discard_length;
2612         } else {
2613             current = one;
2614         }
2615     }
2616
2617     return 0;
2618 }
2619
2620 /**
2621  * postcopy_each_ram_send_discard: discard all RAMBlocks
2622  *
2623  * Returns 0 for success or negative for error
2624  *
2625  * Utility for the outgoing postcopy code.
2626  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2627  *   passing it bitmap indexes and name.
2628  * (qemu_ram_foreach_block ends up passing unscaled lengths
2629  *  which would mean postcopy code would have to deal with target page)
2630  *
2631  * @ms: current migration state
2632  */
2633 static int postcopy_each_ram_send_discard(MigrationState *ms)
2634 {
2635     struct RAMBlock *block;
2636     int ret;
2637
2638     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2639         PostcopyDiscardState *pds =
2640             postcopy_discard_send_init(ms, block->idstr);
2641
2642         /*
2643          * Postcopy sends chunks of bitmap over the wire, but it
2644          * just needs indexes at this point, avoids it having
2645          * target page specific code.
2646          */
2647         ret = postcopy_send_discard_bm_ram(ms, pds, block);
2648         postcopy_discard_send_finish(ms, pds);
2649         if (ret) {
2650             return ret;
2651         }
2652     }
2653
2654     return 0;
2655 }
2656
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * The walk looks at runs of sent pages (unsent_pass) or dirty pages
 * (!unsent_pass).  Whenever a run starts or ends in the middle of a host
 * page, that whole host page is marked unsent and dirty in both bitmaps,
 * and, where applicable, a discard for it is sent to the destination.
 *
 * @ms: current migration state
 * @unsent_pass: if true we need to canonicalize partially unsent host pages
 *               otherwise we need to canonicalize partially dirty host pages
 * @block: block that contains the page we want to canonicalize
 * @pds: state for postcopy
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned long *unsentmap = block->unsentmap;
    /* Number of target pages per host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    /* Number of target pages in the block */
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, pages, 0);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, pages, 0);
    }

    while (run_start < pages) {
        bool do_fixup = false;
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of this host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, pages, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, pages, run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, pages, run_start);
        }
    }
}
2783
2784 /**
2785  * postcopy_chuck_hostpages: discrad any partially sent host page
2786  *
2787  * Utility for the outgoing postcopy code.
2788  *
2789  * Discard any partially sent host-page size chunks, mark any partially
2790  * dirty host-page size chunks as all dirty.  In this case the host-page
2791  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2792  *
2793  * Returns zero on success
2794  *
2795  * @ms: current migration state
2796  * @block: block we want to work with
2797  */
2798 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2799 {
2800     PostcopyDiscardState *pds =
2801         postcopy_discard_send_init(ms, block->idstr);
2802
2803     /* First pass: Discard all partially sent host pages */
2804     postcopy_chunk_hostpages_pass(ms, true, block, pds);
2805     /*
2806      * Second pass: Ensure that all partially dirty host pages are made
2807      * fully dirty.
2808      */
2809     postcopy_chunk_hostpages_pass(ms, false, block, pds);
2810
2811     postcopy_discard_send_finish(ms, pds);
2812     return 0;
2813 }
2814
2815 /**
2816  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2817  *
2818  * Returns zero on success
2819  *
2820  * Transmit the set of pages to be discarded after precopy to the target
2821  * these are pages that:
2822  *     a) Have been previously transmitted but are now dirty again
2823  *     b) Pages that have never been transmitted, this ensures that
2824  *        any pages on the destination that have been mapped by background
2825  *        tasks get discarded (transparent huge pages is the specific concern)
2826  * Hopefully this is pretty sparse
2827  *
2828  * @ms: current migration state
2829  */
2830 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2831 {
2832     RAMState *rs = ram_state;
2833     RAMBlock *block;
2834     int ret;
2835
2836     rcu_read_lock();
2837
2838     /* This should be our last sync, the src is now paused */
2839     migration_bitmap_sync(rs);
2840
2841     /* Easiest way to make sure we don't resume in the middle of a host-page */
2842     rs->last_seen_block = NULL;
2843     rs->last_sent_block = NULL;
2844     rs->last_page = 0;
2845
2846     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2847         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2848         unsigned long *bitmap = block->bmap;
2849         unsigned long *unsentmap = block->unsentmap;
2850
2851         if (!unsentmap) {
2852             /* We don't have a safe way to resize the sentmap, so
2853              * if the bitmap was resized it will be NULL at this
2854              * point.
2855              */
2856             error_report("migration ram resized during precopy phase");
2857             rcu_read_unlock();
2858             return -EINVAL;
2859         }
2860         /* Deal with TPS != HPS and huge pages */
2861         ret = postcopy_chunk_hostpages(ms, block);
2862         if (ret) {
2863             rcu_read_unlock();
2864             return ret;
2865         }
2866
2867         /*
2868          * Update the unsentmap to be unsentmap = unsentmap | dirty
2869          */
2870         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2871 #ifdef DEBUG_POSTCOPY
2872         ram_debug_dump_bitmap(unsentmap, true, pages);
2873 #endif
2874     }
2875     trace_ram_postcopy_send_discard_bitmap();
2876
2877     ret = postcopy_each_ram_send_discard(ms);
2878     rcu_read_unlock();
2879
2880     return ret;
2881 }
2882
2883 /**
2884  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2885  *
2886  * Returns zero on success
2887  *
2888  * @rbname: name of the RAMBlock of the request. NULL means the
2889  *          same that last one.
2890  * @start: RAMBlock starting page
2891  * @length: RAMBlock size
2892  */
2893 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2894 {
2895     int ret = -1;
2896
2897     trace_ram_discard_range(rbname, start, length);
2898
2899     rcu_read_lock();
2900     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2901
2902     if (!rb) {
2903         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2904         goto err;
2905     }
2906
2907     /*
2908      * On source VM, we don't need to update the received bitmap since
2909      * we don't even have one.
2910      */
2911     if (rb->receivedmap) {
2912         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2913                      length >> qemu_target_page_bits());
2914     }
2915
2916     ret = ram_block_discard_range(rb, start, length);
2917
2918 err:
2919     rcu_read_unlock();
2920
2921     return ret;
2922 }
2923
2924 /*
2925  * For every allocation, we will try not to crash the VM if the
2926  * allocation failed.
2927  */
2928 static int xbzrle_init(void)
2929 {
2930     Error *local_err = NULL;
2931
2932     if (!migrate_use_xbzrle()) {
2933         return 0;
2934     }
2935
2936     XBZRLE_cache_lock();
2937
2938     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2939     if (!XBZRLE.zero_target_page) {
2940         error_report("%s: Error allocating zero page", __func__);
2941         goto err_out;
2942     }
2943
2944     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2945                               TARGET_PAGE_SIZE, &local_err);
2946     if (!XBZRLE.cache) {
2947         error_report_err(local_err);
2948         goto free_zero_page;
2949     }
2950
2951     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2952     if (!XBZRLE.encoded_buf) {
2953         error_report("%s: Error allocating encoded_buf", __func__);
2954         goto free_cache;
2955     }
2956
2957     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2958     if (!XBZRLE.current_buf) {
2959         error_report("%s: Error allocating current_buf", __func__);
2960         goto free_encoded_buf;
2961     }
2962
2963     /* We are all good */
2964     XBZRLE_cache_unlock();
2965     return 0;
2966
2967 free_encoded_buf:
2968     g_free(XBZRLE.encoded_buf);
2969     XBZRLE.encoded_buf = NULL;
2970 free_cache:
2971     cache_fini(XBZRLE.cache);
2972     XBZRLE.cache = NULL;
2973 free_zero_page:
2974     g_free(XBZRLE.zero_target_page);
2975     XBZRLE.zero_target_page = NULL;
2976 err_out:
2977     XBZRLE_cache_unlock();
2978     return -ENOMEM;
2979 }
2980
2981 static int ram_state_init(RAMState **rsp)
2982 {
2983     *rsp = g_try_new0(RAMState, 1);
2984
2985     if (!*rsp) {
2986         error_report("%s: Init ramstate fail", __func__);
2987         return -1;
2988     }
2989
2990     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2991     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2992     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2993
2994     /*
2995      * Count the total number of pages used by ram blocks not including any
2996      * gaps due to alignment or unplugs.
2997      */
2998     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2999
3000     ram_state_reset(*rsp);
3001
3002     return 0;
3003 }
3004
3005 static void ram_list_init_bitmaps(void)
3006 {
3007     RAMBlock *block;
3008     unsigned long pages;
3009
3010     /* Skip setting bitmap if there is no RAM */
3011     if (ram_bytes_total()) {
3012         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3013             pages = block->max_length >> TARGET_PAGE_BITS;
3014             block->bmap = bitmap_new(pages);
3015             bitmap_set(block->bmap, 0, pages);
3016             if (migrate_postcopy_ram()) {
3017                 block->unsentmap = bitmap_new(pages);
3018                 bitmap_set(block->unsentmap, 0, pages);
3019             }
3020         }
3021     }
3022 }
3023
3024 static void ram_init_bitmaps(RAMState *rs)
3025 {
3026     /* For memory_global_dirty_log_start below.  */
3027     qemu_mutex_lock_iothread();
3028     qemu_mutex_lock_ramlist();
3029     rcu_read_lock();
3030
3031     ram_list_init_bitmaps();
3032     memory_global_dirty_log_start();
3033     migration_bitmap_sync(rs);
3034
3035     rcu_read_unlock();
3036     qemu_mutex_unlock_ramlist();
3037     qemu_mutex_unlock_iothread();
3038 }
3039
3040 static int ram_init_all(RAMState **rsp)
3041 {
3042     if (ram_state_init(rsp)) {
3043         return -1;
3044     }
3045
3046     if (xbzrle_init()) {
3047         ram_state_cleanup(rsp);
3048         return -1;
3049     }
3050
3051     ram_init_bitmaps(*rsp);
3052
3053     return 0;
3054 }
3055
3056 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3057 {
3058     RAMBlock *block;
3059     uint64_t pages = 0;
3060
3061     /*
3062      * Postcopy is not using xbzrle/compression, so no need for that.
3063      * Also, since source are already halted, we don't need to care
3064      * about dirty page logging as well.
3065      */
3066
3067     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3068         pages += bitmap_count_one(block->bmap,
3069                                   block->used_length >> TARGET_PAGE_BITS);
3070     }
3071
3072     /* This may not be aligned with current bitmaps. Recalculate. */
3073     rs->migration_dirty_pages = pages;
3074
3075     rs->last_seen_block = NULL;
3076     rs->last_sent_block = NULL;
3077     rs->last_page = 0;
3078     rs->last_version = ram_list.version;
3079     /*
3080      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3081      * matter what we have sent.
3082      */
3083     rs->ram_bulk_stage = false;
3084
3085     /* Update RAMState cache of output QEMUFile */
3086     rs->f = out;
3087
3088     trace_ram_state_resume_prepare(pages);
3089 }
3090
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3097
3098 /**
3099  * ram_save_setup: Setup RAM for migration
3100  *
3101  * Returns zero to indicate success and negative for error
3102  *
3103  * @f: QEMUFile where to send the data
3104  * @opaque: RAMState pointer
3105  */
3106 static int ram_save_setup(QEMUFile *f, void *opaque)
3107 {
3108     RAMState **rsp = opaque;
3109     RAMBlock *block;
3110
3111     if (compress_threads_save_setup()) {
3112         return -1;
3113     }
3114
3115     /* migration has already setup the bitmap, reuse it. */
3116     if (!migration_in_colo_state()) {
3117         if (ram_init_all(rsp) != 0) {
3118             compress_threads_save_cleanup();
3119             return -1;
3120         }
3121     }
3122     (*rsp)->f = f;
3123
3124     rcu_read_lock();
3125
3126     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3127
3128     RAMBLOCK_FOREACH_MIGRATABLE(block) {
3129         qemu_put_byte(f, strlen(block->idstr));
3130         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3131         qemu_put_be64(f, block->used_length);
3132         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3133             qemu_put_be64(f, block->page_size);
3134         }
3135     }
3136
3137     rcu_read_unlock();
3138
3139     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3140     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3141
3142     multifd_send_sync_main();
3143     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3144     qemu_fflush(f);
3145
3146     return 0;
3147 }
3148
3149 /**
3150  * ram_save_iterate: iterative stage for migration
3151  *
3152  * Returns zero to indicate success and negative for error
3153  *
3154  * @f: QEMUFile where to send the data
3155  * @opaque: RAMState pointer
3156  */
3157 static int ram_save_iterate(QEMUFile *f, void *opaque)
3158 {
3159     RAMState **temp = opaque;
3160     RAMState *rs = *temp;
3161     int ret;
3162     int i;
3163     int64_t t0;
3164     int done = 0;
3165
3166     if (blk_mig_bulk_active()) {
3167         /* Avoid transferring ram during bulk phase of block migration as
3168          * the bulk phase will usually take a long time and transferring
3169          * ram updates during that time is pointless. */
3170         goto out;
3171     }
3172
3173     rcu_read_lock();
3174     if (ram_list.version != rs->last_version) {
3175         ram_state_reset(rs);
3176     }
3177
3178     /* Read version before ram_list.blocks */
3179     smp_rmb();
3180
3181     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3182
3183     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3184     i = 0;
3185     while ((ret = qemu_file_rate_limit(f)) == 0 ||
3186             !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3187         int pages;
3188
3189         if (qemu_file_get_error(f)) {
3190             break;
3191         }
3192
3193         pages = ram_find_and_save_block(rs, false);
3194         /* no more pages to sent */
3195         if (pages == 0) {
3196             done = 1;
3197             break;
3198         }
3199         rs->iterations++;
3200
3201         /* we want to check in the 1st loop, just in case it was the 1st time
3202            and we had to sync the dirty bitmap.
3203            qemu_get_clock_ns() is a bit expensive, so we only check each some
3204            iterations
3205         */
3206         if ((i & 63) == 0) {
3207             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3208             if (t1 > MAX_WAIT) {
3209                 trace_ram_save_iterate_big_wait(t1, i);
3210                 break;
3211             }
3212         }
3213         i++;
3214     }
3215     flush_compressed_data(rs);
3216     rcu_read_unlock();
3217
3218     /*
3219      * Must occur before EOS (or any QEMUFile operation)
3220      * because of RDMA protocol.
3221      */
3222     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3223
3224     multifd_send_sync_main();
3225 out:
3226     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3227     qemu_fflush(f);
3228     ram_counters.transferred += 8;
3229
3230     ret = qemu_file_get_error(f);
3231     if (ret < 0) {
3232         return ret;
3233     }
3234
3235     return done;
3236 }
3237
3238 /**
3239  * ram_save_complete: function called to send the remaining amount of ram
3240  *
3241  * Returns zero to indicate success
3242  *
3243  * Called with iothread lock
3244  *
3245  * @f: QEMUFile where to send the data
3246  * @opaque: RAMState pointer
3247  */
3248 static int ram_save_complete(QEMUFile *f, void *opaque)
3249 {
3250     RAMState **temp = opaque;
3251     RAMState *rs = *temp;
3252
3253     rcu_read_lock();
3254
3255     if (!migration_in_postcopy()) {
3256         migration_bitmap_sync(rs);
3257     }
3258
3259     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3260
3261     /* try transferring iterative blocks of memory */
3262
3263     /* flush all remaining blocks regardless of rate limiting */
3264     while (true) {
3265         int pages;
3266
3267         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3268         /* no more blocks to sent */
3269         if (pages == 0) {
3270             break;
3271         }
3272     }
3273
3274     flush_compressed_data(rs);
3275     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3276
3277     rcu_read_unlock();
3278
3279     multifd_send_sync_main();
3280     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3281     qemu_fflush(f);
3282
3283     return 0;
3284 }
3285
3286 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3287                              uint64_t *res_precopy_only,
3288                              uint64_t *res_compatible,
3289                              uint64_t *res_postcopy_only)
3290 {
3291     RAMState **temp = opaque;
3292     RAMState *rs = *temp;
3293     uint64_t remaining_size;
3294
3295     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3296
3297     if (!migration_in_postcopy() &&
3298         remaining_size < max_size) {
3299         qemu_mutex_lock_iothread();
3300         rcu_read_lock();
3301         migration_bitmap_sync(rs);
3302         rcu_read_unlock();
3303         qemu_mutex_unlock_iothread();
3304         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3305     }
3306
3307     if (migrate_postcopy_ram()) {
3308         /* We can do postcopy, and all the data is postcopiable */
3309         *res_compatible += remaining_size;
3310     } else {
3311         *res_precopy_only += remaining_size;
3312     }
3313 }
3314
3315 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3316 {
3317     unsigned int xh_len;
3318     int xh_flags;
3319     uint8_t *loaded_data;
3320
3321     /* extract RLE header */
3322     xh_flags = qemu_get_byte(f);
3323     xh_len = qemu_get_be16(f);
3324
3325     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3326         error_report("Failed to load XBZRLE page - wrong compression!");
3327         return -1;
3328     }
3329
3330     if (xh_len > TARGET_PAGE_SIZE) {
3331         error_report("Failed to load XBZRLE page - len overflow!");
3332         return -1;
3333     }
3334     loaded_data = XBZRLE.decoded_buf;
3335     /* load data and decode */
3336     /* it can change loaded_data to point to an internal buffer */
3337     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3338
3339     /* decode RLE */
3340     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3341                              TARGET_PAGE_SIZE) == -1) {
3342         error_report("Failed to load XBZRLE page - decode error!");
3343         return -1;
3344     }
3345
3346     return 0;
3347 }
3348
3349 /**
3350  * ram_block_from_stream: read a RAMBlock id from the migration stream
3351  *
3352  * Must be called from within a rcu critical section.
3353  *
3354  * Returns a pointer from within the RCU-protected ram_list.
3355  *
3356  * @f: QEMUFile where to read the data from
3357  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3358  */
3359 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3360 {
3361     static RAMBlock *block = NULL;
3362     char id[256];
3363     uint8_t len;
3364
3365     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3366         if (!block) {
3367             error_report("Ack, bad migration stream!");
3368             return NULL;
3369         }
3370         return block;
3371     }
3372
3373     len = qemu_get_byte(f);
3374     qemu_get_buffer(f, (uint8_t *)id, len);
3375     id[len] = 0;
3376
3377     block = qemu_ram_block_by_name(id);
3378     if (!block) {
3379         error_report("Can't find block %s", id);
3380         return NULL;
3381     }
3382
3383     if (!qemu_ram_is_migratable(block)) {
3384         error_report("block %s should not be migrated !", id);
3385         return NULL;
3386     }
3387
3388     return block;
3389 }
3390
3391 static inline void *host_from_ram_block_offset(RAMBlock *block,
3392                                                ram_addr_t offset)
3393 {
3394     if (!offset_in_ramblock(block, offset)) {
3395         return NULL;
3396     }
3397
3398     return block->host + offset;
3399 }
3400
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been determined to be filled
 * with a single byte, fill it in.  Memory that is already all zero is
 * left untouched when the fill byte is zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Nothing to write if we want zeroes and already have them */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3417
3418 /* return the size after decompression, or negative value on error */
3419 static int
3420 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3421                      const uint8_t *source, size_t source_len)
3422 {
3423     int err;
3424
3425     err = inflateReset(stream);
3426     if (err != Z_OK) {
3427         return -1;
3428     }
3429
3430     stream->avail_in = source_len;
3431     stream->next_in = (uint8_t *)source;
3432     stream->avail_out = dest_len;
3433     stream->next_out = dest;
3434
3435     err = inflate(stream, Z_NO_FLUSH);
3436     if (err != Z_STREAM_END) {
3437         return -1;
3438     }
3439
3440     return stream->total_out;
3441 }
3442
3443 static void *do_data_decompress(void *opaque)
3444 {
3445     DecompressParam *param = opaque;
3446     unsigned long pagesize;
3447     uint8_t *des;
3448     int len, ret;
3449
3450     qemu_mutex_lock(&param->mutex);
3451     while (!param->quit) {
3452         if (param->des) {
3453             des = param->des;
3454             len = param->len;
3455             param->des = 0;
3456             qemu_mutex_unlock(&param->mutex);
3457
3458             pagesize = TARGET_PAGE_SIZE;
3459
3460             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3461                                        param->compbuf, len);
3462             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3463                 error_report("decompress data failed");
3464                 qemu_file_set_error(decomp_file, ret);
3465             }
3466
3467             qemu_mutex_lock(&decomp_done_lock);
3468             param->done = true;
3469             qemu_cond_signal(&decomp_done_cond);
3470             qemu_mutex_unlock(&decomp_done_lock);
3471
3472             qemu_mutex_lock(&param->mutex);
3473         } else {
3474             qemu_cond_wait(&param->cond, &param->mutex);
3475         }
3476     }
3477     qemu_mutex_unlock(&param->mutex);
3478
3479     return NULL;
3480 }
3481
3482 static int wait_for_decompress_done(void)
3483 {
3484     int idx, thread_count;
3485
3486     if (!migrate_use_compression()) {
3487         return 0;
3488     }
3489
3490     thread_count = migrate_decompress_threads();
3491     qemu_mutex_lock(&decomp_done_lock);
3492     for (idx = 0; idx < thread_count; idx++) {
3493         while (!decomp_param[idx].done) {
3494             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3495         }
3496     }
3497     qemu_mutex_unlock(&decomp_done_lock);
3498     return qemu_file_get_error(decomp_file);
3499 }
3500
3501 static void compress_threads_load_cleanup(void)
3502 {
3503     int i, thread_count;
3504
3505     if (!migrate_use_compression()) {
3506         return;
3507     }
3508     thread_count = migrate_decompress_threads();
3509     for (i = 0; i < thread_count; i++) {
3510         /*
3511          * we use it as a indicator which shows if the thread is
3512          * properly init'd or not
3513          */
3514         if (!decomp_param[i].compbuf) {
3515             break;
3516         }
3517
3518         qemu_mutex_lock(&decomp_param[i].mutex);
3519         decomp_param[i].quit = true;
3520         qemu_cond_signal(&decomp_param[i].cond);
3521         qemu_mutex_unlock(&decomp_param[i].mutex);
3522     }
3523     for (i = 0; i < thread_count; i++) {
3524         if (!decomp_param[i].compbuf) {
3525             break;
3526         }
3527
3528         qemu_thread_join(decompress_threads + i);
3529         qemu_mutex_destroy(&decomp_param[i].mutex);
3530         qemu_cond_destroy(&decomp_param[i].cond);
3531         inflateEnd(&decomp_param[i].stream);
3532         g_free(decomp_param[i].compbuf);
3533         decomp_param[i].compbuf = NULL;
3534     }
3535     g_free(decompress_threads);
3536     g_free(decomp_param);
3537     decompress_threads = NULL;
3538     decomp_param = NULL;
3539     decomp_file = NULL;
3540 }
3541
3542 static int compress_threads_load_setup(QEMUFile *f)
3543 {
3544     int i, thread_count;
3545
3546     if (!migrate_use_compression()) {
3547         return 0;
3548     }
3549
3550     thread_count = migrate_decompress_threads();
3551     decompress_threads = g_new0(QemuThread, thread_count);
3552     decomp_param = g_new0(DecompressParam, thread_count);
3553     qemu_mutex_init(&decomp_done_lock);
3554     qemu_cond_init(&decomp_done_cond);
3555     decomp_file = f;
3556     for (i = 0; i < thread_count; i++) {
3557         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3558             goto exit;
3559         }
3560
3561         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3562         qemu_mutex_init(&decomp_param[i].mutex);
3563         qemu_cond_init(&decomp_param[i].cond);
3564         decomp_param[i].done = true;
3565         decomp_param[i].quit = false;
3566         qemu_thread_create(decompress_threads + i, "decompress",
3567                            do_data_decompress, decomp_param + i,
3568                            QEMU_THREAD_JOINABLE);
3569     }
3570     return 0;
3571 exit:
3572     compress_threads_load_cleanup();
3573     return -1;
3574 }
3575
3576 static void decompress_data_with_multi_threads(QEMUFile *f,
3577                                                void *host, int len)
3578 {
3579     int idx, thread_count;
3580
3581     thread_count = migrate_decompress_threads();
3582     qemu_mutex_lock(&decomp_done_lock);
3583     while (true) {
3584         for (idx = 0; idx < thread_count; idx++) {
3585             if (decomp_param[idx].done) {
3586                 decomp_param[idx].done = false;
3587                 qemu_mutex_lock(&decomp_param[idx].mutex);
3588                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3589                 decomp_param[idx].des = host;
3590                 decomp_param[idx].len = len;
3591                 qemu_cond_signal(&decomp_param[idx].cond);
3592                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3593                 break;
3594             }
3595         }
3596         if (idx < thread_count) {
3597             break;
3598         } else {
3599             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3600         }
3601     }
3602     qemu_mutex_unlock(&decomp_done_lock);
3603 }
3604
3605 /**
3606  * ram_load_setup: Setup RAM for migration incoming side
3607  *
3608  * Returns zero to indicate success and negative for error
3609  *
3610  * @f: QEMUFile where to receive the data
3611  * @opaque: RAMState pointer
3612  */
3613 static int ram_load_setup(QEMUFile *f, void *opaque)
3614 {
3615     if (compress_threads_load_setup(f)) {
3616         return -1;
3617     }
3618
3619     xbzrle_load_setup();
3620     ramblock_recv_map_init();
3621     return 0;
3622 }
3623
3624 static int ram_load_cleanup(void *opaque)
3625 {
3626     RAMBlock *rb;
3627
3628     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3629         if (ramblock_is_pmem(rb)) {
3630             pmem_persist(rb->host, rb->used_length);
3631         }
3632     }
3633
3634     xbzrle_load_cleanup();
3635     compress_threads_load_cleanup();
3636
3637     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3638         g_free(rb->receivedmap);
3639         rb->receivedmap = NULL;
3640     }
3641     return 0;
3642 }
3643
3644 /**
3645  * ram_postcopy_incoming_init: allocate postcopy data structures
3646  *
3647  * Returns 0 for success and negative if there was one error
3648  *
3649  * @mis: current migration incoming state
3650  *
3651  * Allocate data structures etc needed by incoming migration with
3652  * postcopy-ram. postcopy-ram's similarly names
3653  * postcopy_ram_incoming_init does the work.
3654  */
3655 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3656 {
3657     return postcopy_ram_incoming_init(mis);
3658 }
3659
3660 /**
3661  * ram_load_postcopy: load a page in postcopy case
3662  *
3663  * Returns 0 for success or -errno in case of error
3664  *
3665  * Called in postcopy mode by ram_load().
3666  * rcu_read_lock is taken prior to this being called.
3667  *
3668  * @f: QEMUFile where to send the data
3669  */
3670 static int ram_load_postcopy(QEMUFile *f)
3671 {
3672     int flags = 0, ret = 0;
3673     bool place_needed = false;
3674     bool matches_target_page_size = false;
3675     MigrationIncomingState *mis = migration_incoming_get_current();
3676     /* Temporary page that is later 'placed' */
3677     void *postcopy_host_page = postcopy_get_tmp_page(mis);
3678     void *last_host = NULL;
3679     bool all_zero = false;
3680
3681     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3682         ram_addr_t addr;
3683         void *host = NULL;
3684         void *page_buffer = NULL;
3685         void *place_source = NULL;
3686         RAMBlock *block = NULL;
3687         uint8_t ch;
3688
3689         addr = qemu_get_be64(f);
3690
3691         /*
3692          * If qemu file error, we should stop here, and then "addr"
3693          * may be invalid
3694          */
3695         ret = qemu_file_get_error(f);
3696         if (ret) {
3697             break;
3698         }
3699
3700         flags = addr & ~TARGET_PAGE_MASK;
3701         addr &= TARGET_PAGE_MASK;
3702
3703         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3704         place_needed = false;
3705         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3706             block = ram_block_from_stream(f, flags);
3707
3708             host = host_from_ram_block_offset(block, addr);
3709             if (!host) {
3710                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3711                 ret = -EINVAL;
3712                 break;
3713             }
3714             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3715             /*
3716              * Postcopy requires that we place whole host pages atomically;
3717              * these may be huge pages for RAMBlocks that are backed by
3718              * hugetlbfs.
3719              * To make it atomic, the data is read into a temporary page
3720              * that's moved into place later.
3721              * The migration protocol uses,  possibly smaller, target-pages
3722              * however the source ensures it always sends all the components
3723              * of a host page in order.
3724              */
3725             page_buffer = postcopy_host_page +
3726                           ((uintptr_t)host & (block->page_size - 1));
3727             /* If all TP are zero then we can optimise the place */
3728             if (!((uintptr_t)host & (block->page_size - 1))) {
3729                 all_zero = true;
3730             } else {
3731                 /* not the 1st TP within the HP */
3732                 if (host != (last_host + TARGET_PAGE_SIZE)) {
3733                     error_report("Non-sequential target page %p/%p",
3734                                   host, last_host);
3735                     ret = -EINVAL;
3736                     break;
3737                 }
3738             }
3739
3740
3741             /*
3742              * If it's the last part of a host page then we place the host
3743              * page
3744              */
3745             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3746                                      (block->page_size - 1)) == 0;
3747             place_source = postcopy_host_page;
3748         }
3749         last_host = host;
3750
3751         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3752         case RAM_SAVE_FLAG_ZERO:
3753             ch = qemu_get_byte(f);
3754             memset(page_buffer, ch, TARGET_PAGE_SIZE);
3755             if (ch) {
3756                 all_zero = false;
3757             }
3758             break;
3759
3760         case RAM_SAVE_FLAG_PAGE:
3761             all_zero = false;
3762             if (!matches_target_page_size) {
3763                 /* For huge pages, we always use temporary buffer */
3764                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3765             } else {
3766                 /*
3767                  * For small pages that matches target page size, we
3768                  * avoid the qemu_file copy.  Instead we directly use
3769                  * the buffer of QEMUFile to place the page.  Note: we
3770                  * cannot do any QEMUFile operation before using that
3771                  * buffer to make sure the buffer is valid when
3772                  * placing the page.
3773                  */
3774                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3775                                          TARGET_PAGE_SIZE);
3776             }
3777             break;
3778         case RAM_SAVE_FLAG_EOS:
3779             /* normal exit */
3780             multifd_recv_sync_main();
3781             break;
3782         default:
3783             error_report("Unknown combination of migration flags: %#x"
3784                          " (postcopy mode)", flags);
3785             ret = -EINVAL;
3786             break;
3787         }
3788
3789         /* Detect for any possible file errors */
3790         if (!ret && qemu_file_get_error(f)) {
3791             ret = qemu_file_get_error(f);
3792         }
3793
3794         if (!ret && place_needed) {
3795             /* This gets called at the last target page in the host page */
3796             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3797
3798             if (all_zero) {
3799                 ret = postcopy_place_page_zero(mis, place_dest,
3800                                                block);
3801             } else {
3802                 ret = postcopy_place_page(mis, place_dest,
3803                                           place_source, block);
3804             }
3805         }
3806     }
3807
3808     return ret;
3809 }
3810
3811 static bool postcopy_is_advised(void)
3812 {
3813     PostcopyState ps = postcopy_state_get();
3814     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3815 }
3816
3817 static bool postcopy_is_running(void)
3818 {
3819     PostcopyState ps = postcopy_state_get();
3820     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3821 }
3822
3823 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3824 {
3825     int flags = 0, ret = 0, invalid_flags = 0;
3826     static uint64_t seq_iter;
3827     int len = 0;
3828     /*
3829      * If system is running in postcopy mode, page inserts to host memory must
3830      * be atomic
3831      */
3832     bool postcopy_running = postcopy_is_running();
3833     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3834     bool postcopy_advised = postcopy_is_advised();
3835
3836     seq_iter++;
3837
3838     if (version_id != 4) {
3839         ret = -EINVAL;
3840     }
3841
3842     if (!migrate_use_compression()) {
3843         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3844     }
3845     /* This RCU critical section can be very long running.
3846      * When RCU reclaims in the code start to become numerous,
3847      * it will be necessary to reduce the granularity of this
3848      * critical section.
3849      */
3850     rcu_read_lock();
3851
3852     if (postcopy_running) {
3853         ret = ram_load_postcopy(f);
3854     }
3855
3856     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3857         ram_addr_t addr, total_ram_bytes;
3858         void *host = NULL;
3859         uint8_t ch;
3860
3861         addr = qemu_get_be64(f);
3862         flags = addr & ~TARGET_PAGE_MASK;
3863         addr &= TARGET_PAGE_MASK;
3864
3865         if (flags & invalid_flags) {
3866             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3867                 error_report("Received an unexpected compressed page");
3868             }
3869
3870             ret = -EINVAL;
3871             break;
3872         }
3873
3874         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3875                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3876             RAMBlock *block = ram_block_from_stream(f, flags);
3877
3878             host = host_from_ram_block_offset(block, addr);
3879             if (!host) {
3880                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3881                 ret = -EINVAL;
3882                 break;
3883             }
3884             ramblock_recv_bitmap_set(block, host);
3885             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3886         }
3887
3888         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3889         case RAM_SAVE_FLAG_MEM_SIZE:
3890             /* Synchronize RAM block list */
3891             total_ram_bytes = addr;
3892             while (!ret && total_ram_bytes) {
3893                 RAMBlock *block;
3894                 char id[256];
3895                 ram_addr_t length;
3896
3897                 len = qemu_get_byte(f);
3898                 qemu_get_buffer(f, (uint8_t *)id, len);
3899                 id[len] = 0;
3900                 length = qemu_get_be64(f);
3901
3902                 block = qemu_ram_block_by_name(id);
3903                 if (block && !qemu_ram_is_migratable(block)) {
3904                     error_report("block %s should not be migrated !", id);
3905                     ret = -EINVAL;
3906                 } else if (block) {
3907                     if (length != block->used_length) {
3908                         Error *local_err = NULL;
3909
3910                         ret = qemu_ram_resize(block, length,
3911                                               &local_err);
3912                         if (local_err) {
3913                             error_report_err(local_err);
3914                         }
3915                     }
3916                     /* For postcopy we need to check hugepage sizes match */
3917                     if (postcopy_advised &&
3918                         block->page_size != qemu_host_page_size) {
3919                         uint64_t remote_page_size = qemu_get_be64(f);
3920                         if (remote_page_size != block->page_size) {
3921                             error_report("Mismatched RAM page size %s "
3922                                          "(local) %zd != %" PRId64,
3923                                          id, block->page_size,
3924                                          remote_page_size);
3925                             ret = -EINVAL;
3926                         }
3927                     }
3928                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3929                                           block->idstr);
3930                 } else {
3931                     error_report("Unknown ramblock \"%s\", cannot "
3932                                  "accept migration", id);
3933                     ret = -EINVAL;
3934                 }
3935
3936                 total_ram_bytes -= length;
3937             }
3938             break;
3939
3940         case RAM_SAVE_FLAG_ZERO:
3941             ch = qemu_get_byte(f);
3942             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3943             break;
3944
3945         case RAM_SAVE_FLAG_PAGE:
3946             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3947             break;
3948
3949         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3950             len = qemu_get_be32(f);
3951             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3952                 error_report("Invalid compressed data length: %d", len);
3953                 ret = -EINVAL;
3954                 break;
3955             }
3956             decompress_data_with_multi_threads(f, host, len);
3957             break;
3958
3959         case RAM_SAVE_FLAG_XBZRLE:
3960             if (load_xbzrle(f, addr, host) < 0) {
3961                 error_report("Failed to decompress XBZRLE page at "
3962                              RAM_ADDR_FMT, addr);
3963                 ret = -EINVAL;
3964                 break;
3965             }
3966             break;
3967         case RAM_SAVE_FLAG_EOS:
3968             /* normal exit */
3969             multifd_recv_sync_main();
3970             break;
3971         default:
3972             if (flags & RAM_SAVE_FLAG_HOOK) {
3973                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3974             } else {
3975                 error_report("Unknown combination of migration flags: %#x",
3976                              flags);
3977                 ret = -EINVAL;
3978             }
3979         }
3980         if (!ret) {
3981             ret = qemu_file_get_error(f);
3982         }
3983     }
3984
3985     ret |= wait_for_decompress_done();
3986     rcu_read_unlock();
3987     trace_ram_load_complete(ret, seq_iter);
3988     return ret;
3989 }
3990
3991 static bool ram_has_postcopy(void *opaque)
3992 {
3993     RAMBlock *rb;
3994     RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3995         if (ramblock_is_pmem(rb)) {
3996             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3997                          "is not supported now!", rb->idstr, rb->host);
3998             return false;
3999         }
4000     }
4001
4002     return migrate_postcopy_ram();
4003 }
4004
4005 /* Sync all the dirty bitmap with destination VM.  */
4006 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4007 {
4008     RAMBlock *block;
4009     QEMUFile *file = s->to_dst_file;
4010     int ramblock_count = 0;
4011
4012     trace_ram_dirty_bitmap_sync_start();
4013
4014     RAMBLOCK_FOREACH_MIGRATABLE(block) {
4015         qemu_savevm_send_recv_bitmap(file, block->idstr);
4016         trace_ram_dirty_bitmap_request(block->idstr);
4017         ramblock_count++;
4018     }
4019
4020     trace_ram_dirty_bitmap_sync_wait();
4021
4022     /* Wait until all the ramblocks' dirty bitmap synced */
4023     while (ramblock_count--) {
4024         qemu_sem_wait(&s->rp_state.rp_sem);
4025     }
4026
4027     trace_ram_dirty_bitmap_sync_complete();
4028
4029     return 0;
4030 }
4031
4032 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4033 {
4034     qemu_sem_post(&s->rp_state.rp_sem);
4035 }
4036
4037 /*
4038  * Read the received bitmap, revert it as the initial dirty bitmap.
4039  * This is only used when the postcopy migration is paused but wants
4040  * to resume from a middle point.
4041  */
4042 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4043 {
4044     int ret = -EINVAL;
4045     QEMUFile *file = s->rp_state.from_dst_file;
4046     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4047     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4048     uint64_t size, end_mark;
4049
4050     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4051
4052     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4053         error_report("%s: incorrect state %s", __func__,
4054                      MigrationStatus_str(s->state));
4055         return -EINVAL;
4056     }
4057
4058     /*
4059      * Note: see comments in ramblock_recv_bitmap_send() on why we
4060      * need the endianess convertion, and the paddings.
4061      */
4062     local_size = ROUND_UP(local_size, 8);
4063
4064     /* Add paddings */
4065     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4066
4067     size = qemu_get_be64(file);
4068
4069     /* The size of the bitmap should match with our ramblock */
4070     if (size != local_size) {
4071         error_report("%s: ramblock '%s' bitmap size mismatch "
4072                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4073                      block->idstr, size, local_size);
4074         ret = -EINVAL;
4075         goto out;
4076     }
4077
4078     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4079     end_mark = qemu_get_be64(file);
4080
4081     ret = qemu_file_get_error(file);
4082     if (ret || size != local_size) {
4083         error_report("%s: read bitmap failed for ramblock '%s': %d"
4084                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4085                      __func__, block->idstr, ret, local_size, size);
4086         ret = -EIO;
4087         goto out;
4088     }
4089
4090     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4091         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4092                      __func__, block->idstr, end_mark);
4093         ret = -EINVAL;
4094         goto out;
4095     }
4096
4097     /*
4098      * Endianess convertion. We are during postcopy (though paused).
4099      * The dirty bitmap won't change. We can directly modify it.
4100      */
4101     bitmap_from_le(block->bmap, le_bitmap, nbits);
4102
4103     /*
4104      * What we received is "received bitmap". Revert it as the initial
4105      * dirty bitmap for this ramblock.
4106      */
4107     bitmap_complement(block->bmap, block->bmap, nbits);
4108
4109     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4110
4111     /*
4112      * We succeeded to sync bitmap for current ramblock. If this is
4113      * the last one to sync, we need to notify the main send thread.
4114      */
4115     ram_dirty_bitmap_reload_notify(s);
4116
4117     ret = 0;
4118 out:
4119     g_free(le_bitmap);
4120     return ret;
4121 }
4122
4123 static int ram_resume_prepare(MigrationState *s, void *opaque)
4124 {
4125     RAMState *rs = *(RAMState **)opaque;
4126     int ret;
4127
4128     ret = ram_dirty_bitmap_sync_all(s, rs);
4129     if (ret) {
4130         return ret;
4131     }
4132
4133     ram_state_resume_prepare(rs, s->to_dst_file);
4134
4135     return 0;
4136 }
4137
4138 static SaveVMHandlers savevm_ram_handlers = {
4139     .save_setup = ram_save_setup,
4140     .save_live_iterate = ram_save_iterate,
4141     .save_live_complete_postcopy = ram_save_complete,
4142     .save_live_complete_precopy = ram_save_complete,
4143     .has_postcopy = ram_has_postcopy,
4144     .save_live_pending = ram_save_pending,
4145     .load_state = ram_load,
4146     .save_cleanup = ram_save_cleanup,
4147     .load_setup = ram_load_setup,
4148     .load_cleanup = ram_load_cleanup,
4149     .resume_prepare = ram_resume_prepare,
4150 };
4151
4152 void ram_mig_init(void)
4153 {
4154     qemu_mutex_init(&XBZRLE.lock);
4155     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4156 }