migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/error.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "savevm.h"
  56 #include "qemu/iov.h"
  57 #include "multifd.h"
  58
  59 /***********************************************************/
  60 /* ram save/restore */
  61
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
 *
 * These flags are OR-ed into the low bits of the page offset that
 * save_page_header() writes to the stream.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Same block as the previous header: the block id string is omitted */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Page payload is XBZRLE encoded (see save_xbzrle_page()) */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  77
/*
 * is_zero_range: thin wrapper around buffer_is_zero()
 *
 * Returns whatever buffer_is_zero() reports for the @size bytes at @p.
 */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}
  82
/* XBZRLE statistics (cache misses, overflows, pages and bytes sent) */
XBZRLECacheStats xbzrle_counters;

/*
 * Global XBZRLE state: the page cache plus the scratch buffers used
 * while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content: a copy of the page being encoded */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock() / XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 100
 101 static void XBZRLE_cache_lock(void)
 102 {
 103     if (migrate_use_xbzrle())
 104         qemu_mutex_lock(&XBZRLE.lock);
 105 }
 106
 107 static void XBZRLE_cache_unlock(void)
 108 {
 109     if (migrate_use_xbzrle())
 110         qemu_mutex_unlock(&XBZRLE.lock);
 111 }
 112
 113 /**
 114  * xbzrle_cache_resize: resize the xbzrle cache
 115  *
 116  * This function is called from qmp_migrate_set_cache_size in main
 117  * thread, possibly while a migration is in progress.  A running
 118  * migration may be using the cache and might finish during this call,
 119  * hence changes to the cache are protected by XBZRLE.lock().
 120  *
 121  * Returns 0 for success or -1 for error
 122  *
 123  * @new_size: new cache size
 124  * @errp: set *errp if the check failed, with reason
 125  */
 126 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127 {
 128     PageCache *new_cache;
 129     int64_t ret = 0;
 130
 131     /* Check for truncation */
 132     if (new_size != (size_t)new_size) {
 133         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                    "exceeding address space");
 135         return -1;
 136     }
 137
 138     if (new_size == migrate_xbzrle_cache_size()) {
 139         /* nothing to do */
 140         return 0;
 141     }
 142
 143     XBZRLE_cache_lock();
 144
 145     if (XBZRLE.cache != NULL) {
 146         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147         if (!new_cache) {
 148             ret = -1;
 149             goto out;
 150         }
 151
 152         cache_fini(XBZRLE.cache);
 153         XBZRLE.cache = new_cache;
 154     }
 155 out:
 156     XBZRLE_cache_unlock();
 157     return ret;
 158 }
 159
 160 static bool ramblock_is_ignored(RAMBlock *block)
 161 {
 162     return !qemu_ram_is_migratable(block) ||
 163            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 164 }
 165
/*
 * Iterate over every RAMBlock for which ramblock_is_ignored() is false.
 * Should be holding either ram_list.mutex, or the RCU lock.
 *
 * The trailing "if (...) {} else" makes the statement that follows the
 * macro the loop body, executed only for blocks that pass the filter.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Iterate over every RAMBlock for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably undefined so that code below cannot use the
 * unfiltered iterator by accident - confirm.
 */
#undef RAMBLOCK_FOREACH
 176
 177 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 178 {
 179     RAMBlock *block;
 180     int ret = 0;
 181
 182     RCU_READ_LOCK_GUARD();
 183
 184     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 185         ret = func(block, opaque);
 186         if (ret) {
 187             break;
 188         }
 189     }
 190     return ret;
 191 }
 192
 193 static void ramblock_recv_map_init(void)
 194 {
 195     RAMBlock *rb;
 196
 197     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 198         assert(!rb->receivedmap);
 199         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 200     }
 201 }
 202
 203 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 204 {
 205     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 206                     rb->receivedmap);
 207 }
 208
 209 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 210 {
 211     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 212 }
 213
/*
 * Atomically set the receivedmap bit of @rb that
 * ramblock_recv_bitmap_offset() computes for @host_addr.
 */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}
 218
 219 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 220                                     size_t nr)
 221 {
 222     bitmap_set_atomic(rb->receivedmap,
 223                       ramblock_recv_bitmap_offset(host_addr, rb),
 224                       nr);
 225 }
 226
 227 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 228
 229 /*
 230  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 231  *
 232  * Returns >0 if success with sent bytes, or <0 if error.
 233  */
 234 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 235                                   const char *block_name)
 236 {
 237     RAMBlock *block = qemu_ram_block_by_name(block_name);
 238     unsigned long *le_bitmap, nbits;
 239     uint64_t size;
 240
 241     if (!block) {
 242         error_report("%s: invalid block name: %s", __func__, block_name);
 243         return -1;
 244     }
 245
 246     nbits = block->used_length >> TARGET_PAGE_BITS;
 247
 248     /*
 249      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 250      * machines we may need 4 more bytes for padding (see below
 251      * comment). So extend it a bit before hand.
 252      */
 253     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 254
 255     /*
 256      * Always use little endian when sending the bitmap. This is
 257      * required that when source and destination VMs are not using the
 258      * same endianess. (Note: big endian won't work.)
 259      */
 260     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 261
 262     /* Size of the bitmap, in bytes */
 263     size = DIV_ROUND_UP(nbits, 8);
 264
 265     /*
 266      * size is always aligned to 8 bytes for 64bit machines, but it
 267      * may not be true for 32bit machines. We need this padding to
 268      * make sure the migration can survive even between 32bit and
 269      * 64bit machines.
 270      */
 271     size = ROUND_UP(size, 8);
 272
 273     qemu_put_be64(file, size);
 274     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 275     /*
 276      * Mark as an end, in case the middle part is screwed up due to
 277      * some "misterious" reason.
 278      */
 279     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 280     qemu_fflush(file);
 281
 282     g_free(le_bitmap);
 283
 284     if (qemu_file_get_error(file)) {
 285         return qemu_file_get_error(file);
 286     }
 287
 288     return size + sizeof(size);
 289 }
 290
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked on RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably byte offset within rb and length in bytes */
    hwaddr    offset;
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 302
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /*
     * Last block from where we have sent data; save_page_header() omits
     * the block id string when the next page belongs to the same block
     */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times the guest was found to dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no compression thread was free */
    uint64_t compress_thread_busy_prev;
    /* amount bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* The RAM migration state; NULL checks below treat NULL as "no state" */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 358
/* Initialise the (empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 363
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 368
/* Unregister @n via notifier_with_return_remove(). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 373
 374 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 375 {
 376     PrecopyNotifyData pnd;
 377     pnd.reason = reason;
 378     pnd.errp = errp;
 379
 380     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 381 }
 382
 383 void precopy_enable_free_page_optimization(void)
 384 {
 385     if (!ram_state) {
 386         return;
 387     }
 388
 389     ram_state->fpo_enabled = true;
 390 }
 391
 392 uint64_t ram_bytes_remaining(void)
 393 {
 394     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 395                        0;
 396 }
 397
/* Global RAM migration counters (transferred bytes, sync count, ...) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

/* Global counters for the multi-thread compression feature */
CompressionStats compression_counters;
 412
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, when a request finished */
    bool done;
    /* Tells the worker loop to exit */
    bool quit;
    /* Result of do_compress_ram_page() for the last request */
    bool zero_page;
    /* Dummy QEMUFile used as a buffer for the compressed data */
    QEMUFile *file;
    /* mutex/cond guard quit, block and offset */
    QemuMutex mutex;
    QemuCond cond;
    /* Pending request: block is non-NULL while a page is queued */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer handed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 428
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are outside this part of the
 * file; the field notes below are assumptions to confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination the page is decompressed into */
    void *des;
    /* presumably the compressed input and its length in bytes */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 440
/* One CompressParam / QemuThread per compression thread */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Defined later in the file; its result is stored in CompressParam.zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 460
 461 static void *do_data_compress(void *opaque)
 462 {
 463     CompressParam *param = opaque;
 464     RAMBlock *block;
 465     ram_addr_t offset;
 466     bool zero_page;
 467
 468     qemu_mutex_lock(&param->mutex);
 469     while (!param->quit) {
 470         if (param->block) {
 471             block = param->block;
 472             offset = param->offset;
 473             param->block = NULL;
 474             qemu_mutex_unlock(&param->mutex);
 475
 476             zero_page = do_compress_ram_page(param->file, &param->stream,
 477                                              block, offset, param->originbuf);
 478
 479             qemu_mutex_lock(&comp_done_lock);
 480             param->done = true;
 481             param->zero_page = zero_page;
 482             qemu_cond_signal(&comp_done_cond);
 483             qemu_mutex_unlock(&comp_done_lock);
 484
 485             qemu_mutex_lock(&param->mutex);
 486         } else {
 487             qemu_cond_wait(&param->cond, &param->mutex);
 488         }
 489     }
 490     qemu_mutex_unlock(&param->mutex);
 491
 492     return NULL;
 493 }
 494
 495 static void compress_threads_save_cleanup(void)
 496 {
 497     int i, thread_count;
 498
 499     if (!migrate_use_compression() || !comp_param) {
 500         return;
 501     }
 502
 503     thread_count = migrate_compress_threads();
 504     for (i = 0; i < thread_count; i++) {
 505         /*
 506          * we use it as a indicator which shows if the thread is
 507          * properly init'd or not
 508          */
 509         if (!comp_param[i].file) {
 510             break;
 511         }
 512
 513         qemu_mutex_lock(&comp_param[i].mutex);
 514         comp_param[i].quit = true;
 515         qemu_cond_signal(&comp_param[i].cond);
 516         qemu_mutex_unlock(&comp_param[i].mutex);
 517
 518         qemu_thread_join(compress_threads + i);
 519         qemu_mutex_destroy(&comp_param[i].mutex);
 520         qemu_cond_destroy(&comp_param[i].cond);
 521         deflateEnd(&comp_param[i].stream);
 522         g_free(comp_param[i].originbuf);
 523         qemu_fclose(comp_param[i].file);
 524         comp_param[i].file = NULL;
 525     }
 526     qemu_mutex_destroy(&comp_done_lock);
 527     qemu_cond_destroy(&comp_done_cond);
 528     g_free(compress_threads);
 529     g_free(comp_param);
 530     compress_threads = NULL;
 531     comp_param = NULL;
 532 }
 533
 534 static int compress_threads_save_setup(void)
 535 {
 536     int i, thread_count;
 537
 538     if (!migrate_use_compression()) {
 539         return 0;
 540     }
 541     thread_count = migrate_compress_threads();
 542     compress_threads = g_new0(QemuThread, thread_count);
 543     comp_param = g_new0(CompressParam, thread_count);
 544     qemu_cond_init(&comp_done_cond);
 545     qemu_mutex_init(&comp_done_lock);
 546     for (i = 0; i < thread_count; i++) {
 547         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 548         if (!comp_param[i].originbuf) {
 549             goto exit;
 550         }
 551
 552         if (deflateInit(&comp_param[i].stream,
 553                         migrate_compress_level()) != Z_OK) {
 554             g_free(comp_param[i].originbuf);
 555             goto exit;
 556         }
 557
 558         /* comp_param[i].file is just used as a dummy buffer to save data,
 559          * set its ops to empty.
 560          */
 561         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 562         comp_param[i].done = true;
 563         comp_param[i].quit = false;
 564         qemu_mutex_init(&comp_param[i].mutex);
 565         qemu_cond_init(&comp_param[i].cond);
 566         qemu_thread_create(compress_threads + i, "compress",
 567                            do_data_compress, comp_param + i,
 568                            QEMU_THREAD_JOINABLE);
 569     }
 570     return 0;
 571
 572 exit:
 573     compress_threads_save_cleanup();
 574     return -1;
 575 }
 576
 577 /**
 578  * save_page_header: write page header to wire
 579  *
 580  * If this is the 1st block, it also writes the block identification
 581  *
 582  * Returns the number of bytes written
 583  *
 584  * @f: QEMUFile where to send the data
 585  * @block: block that contains the page we want to send
 586  * @offset: offset inside the block for the page
 587  *          in the lower bits, it contains flags
 588  */
 589 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 590                                ram_addr_t offset)
 591 {
 592     size_t size, len;
 593
 594     if (block == rs->last_sent_block) {
 595         offset |= RAM_SAVE_FLAG_CONTINUE;
 596     }
 597     qemu_put_be64(f, offset);
 598     size = 8;
 599
 600     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 601         len = strlen(block->idstr);
 602         qemu_put_byte(f, len);
 603         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 604         size += 1 + len;
 605         rs->last_sent_block = block;
 606     }
 607     return size;
 608 }
 609
 610 /**
 611  * mig_throttle_guest_down: throotle down the guest
 612  *
 613  * Reduce amount of guest cpu execution to hopefully slow down memory
 614  * writes. If guest dirty memory rate is reduced below the rate at
 615  * which we can transfer pages to the destination then we should be
 616  * able to complete migration. Some workloads dirty memory way too
 617  * fast and will not effectively converge, even with auto-converge.
 618  */
 619 static void mig_throttle_guest_down(void)
 620 {
 621     MigrationState *s = migrate_get_current();
 622     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 623     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 624     int pct_max = s->parameters.max_cpu_throttle;
 625
 626     /* We have not started throttling yet. Let's start it. */
 627     if (!cpu_throttle_active()) {
 628         cpu_throttle_set(pct_initial);
 629     } else {
 630         /* Throttling already on, just increase the rate */
 631         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
 632                          pct_max));
 633     }
 634 }
 635
 636 /**
 637  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 638  *
 639  * @rs: current RAM state
 640  * @current_addr: address for the zero page
 641  *
 642  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 643  * The important thing is that a stale (not-yet-0'd) page be replaced
 644  * by the new data.
 645  * As a bonus, if the page wasn't in the cache it gets added so that
 646  * when a small write is made into the 0'd page it gets XBZRLE sent.
 647  */
 648 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 649 {
 650     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 651         return;
 652     }
 653
 654     /* We don't care if this fails to allocate a new cache page
 655      * as long as it updated an old one */
 656     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 657                  ram_counters.dirty_sync_count);
 658 }
 659
 660 #define ENCODING_FLAG_XBZRLE 0x1
 661
 662 /**
 663  * save_xbzrle_page: compress and send current page
 664  *
 665  * Returns: 1 means that we wrote the page
 666  *          0 means that page is identical to the one already sent
 667  *          -1 means that xbzrle would be longer than normal
 668  *
 669  * @rs: current RAM state
 670  * @current_data: pointer to the address of the page contents
 671  * @current_addr: addr of the page
 672  * @block: block that contains the page we want to send
 673  * @offset: offset inside the block for the page
 674  * @last_stage: if we are at the completion stage
 675  */
 676 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 677                             ram_addr_t current_addr, RAMBlock *block,
 678                             ram_addr_t offset, bool last_stage)
 679 {
 680     int encoded_len = 0, bytes_xbzrle;
 681     uint8_t *prev_cached_page;
 682
 683     if (!cache_is_cached(XBZRLE.cache, current_addr,
 684                          ram_counters.dirty_sync_count)) {
 685         xbzrle_counters.cache_miss++;
 686         if (!last_stage) {
 687             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 688                              ram_counters.dirty_sync_count) == -1) {
 689                 return -1;
 690             } else {
 691                 /* update *current_data when the page has been
 692                    inserted into cache */
 693                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 694             }
 695         }
 696         return -1;
 697     }
 698
 699     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 700
 701     /* save current buffer into memory */
 702     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 703
 704     /* XBZRLE encoding (if there is no overflow) */
 705     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 706                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 707                                        TARGET_PAGE_SIZE);
 708
 709     /*
 710      * Update the cache contents, so that it corresponds to the data
 711      * sent, in all cases except where we skip the page.
 712      */
 713     if (!last_stage && encoded_len != 0) {
 714         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 715         /*
 716          * In the case where we couldn't compress, ensure that the caller
 717          * sends the data from the cache, since the guest might have
 718          * changed the RAM since we copied it.
 719          */
 720         *current_data = prev_cached_page;
 721     }
 722
 723     if (encoded_len == 0) {
 724         trace_save_xbzrle_page_skipping();
 725         return 0;
 726     } else if (encoded_len == -1) {
 727         trace_save_xbzrle_page_overflow();
 728         xbzrle_counters.overflow++;
 729         return -1;
 730     }
 731
 732     /* Send XBZRLE based compressed page */
 733     bytes_xbzrle = save_page_header(rs, rs->f, block,
 734                                     offset | RAM_SAVE_FLAG_XBZRLE);
 735     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 736     qemu_put_be16(rs->f, encoded_len);
 737     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 738     bytes_xbzrle += encoded_len + 1 + 2;
 739     xbzrle_counters.pages++;
 740     xbzrle_counters.bytes += bytes_xbzrle;
 741     ram_counters.transferred += bytes_xbzrle;
 742
 743     return 1;
 744 }
 745
 746 /**
 747  * migration_bitmap_find_dirty: find the next dirty page from start
 748  *
 749  * Returns the page offset within memory region of the start of a dirty page
 750  *
 751  * @rs: current RAM state
 752  * @rb: RAMBlock where to search for dirty pages
 753  * @start: page where we start the search
 754  */
 755 static inline
 756 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 757                                           unsigned long start)
 758 {
 759     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 760     unsigned long *bitmap = rb->bmap;
 761     unsigned long next;
 762
 763     if (ramblock_is_ignored(rb)) {
 764         return size;
 765     }
 766
 767     /*
 768      * When the free page optimization is enabled, we need to check the bitmap
 769      * to send the non-free pages rather than all the pages in the bulk stage.
 770      */
 771     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 772         next = start + 1;
 773     } else {
 774         next = find_next_bit(bitmap, size, start);
 775     }
 776
 777     return next;
 778 }
 779
 780 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 781                                                 RAMBlock *rb,
 782                                                 unsigned long page)
 783 {
 784     bool ret;
 785
 786     qemu_mutex_lock(&rs->bitmap_mutex);
 787
 788     /*
 789      * Clear dirty bitmap if needed.  This _must_ be called before we
 790      * send any of the page in the chunk because we need to make sure
 791      * we can capture further page content changes when we sync dirty
 792      * log the next time.  So as long as we are going to send any of
 793      * the page in the chunk we clear the remote dirty bitmap for all.
 794      * Clearing it earlier won't be a problem, but too late will.
 795      */
 796     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 797         uint8_t shift = rb->clear_bmap_shift;
 798         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 799         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 800
 801         /*
 802          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 803          * can make things easier sometimes since then start address
 804          * of the small chunk will always be 64 pages aligned so the
 805          * bitmap will always be aligned to unsigned long.  We should
 806          * even be able to remove this restriction but I'm simply
 807          * keeping it.
 808          */
 809         assert(shift >= 6);
 810         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 811         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 812     }
 813
 814     ret = test_and_clear_bit(page, rb->bmap);
 815
 816     if (ret) {
 817         rs->migration_dirty_pages--;
 818     }
 819     qemu_mutex_unlock(&rs->bitmap_mutex);
 820
 821     return ret;
 822 }
 823
 824 /* Called with RCU critical section */
 825 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 826 {
 827     rs->migration_dirty_pages +=
 828         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
 829                                               &rs->num_dirty_pages_period);
 830 }
 831
/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks that are
 * not ignored
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 *
 * NOTE(review): no lock is taken here although the iterator macro asks
 * for ram_list.mutex or the RCU lock - presumably the caller holds it.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Each block contributes its page size to the OR-ed summary */
        summary |= block->page_size;
    }

    return summary;
}
 852
/*
 * Total number of pages handled so far: the sum of the normal,
 * duplicate, compressed and xbzrle page counters.
 */
uint64_t ram_get_total_transferred_pages(void)
{
    return  ram_counters.normal + ram_counters.duplicate +
                compression_counters.pages + xbzrle_counters.pages;
}
 858
 859 static void migration_update_rates(RAMState *rs, int64_t end_time)
 860 {
 861     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 862     double compressed_size;
 863
 864     /* calculate period counters */
 865     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 866                 / (end_time - rs->time_last_bitmap_sync);
 867
 868     if (!page_count) {
 869         return;
 870     }
 871
 872     if (migrate_use_xbzrle()) {
 873         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 874             rs->xbzrle_cache_miss_prev) / page_count;
 875         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 876     }
 877
 878     if (migrate_use_compression()) {
 879         compression_counters.busy_rate = (double)(compression_counters.busy -
 880             rs->compress_thread_busy_prev) / page_count;
 881         rs->compress_thread_busy_prev = compression_counters.busy;
 882
 883         compressed_size = compression_counters.compressed_size -
 884                           rs->compressed_size_prev;
 885         if (compressed_size) {
 886             double uncompressed_size = (compression_counters.pages -
 887                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 888
 889             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 890             compression_counters.compression_rate =
 891                                         uncompressed_size / compressed_size;
 892
 893             rs->compress_pages_prev = compression_counters.pages;
 894             rs->compressed_size_prev = compression_counters.compressed_size;
 895         }
 896     }
 897 }
 898
/*
 * Synchronize the migration dirty bitmaps with the global dirty log and
 * refresh the statistics derived from them.
 *
 * Every call increments ram_counters.dirty_sync_count, syncs the dirty
 * bitmap of each non-ignored RAMBlock while holding rs->bitmap_mutex and
 * updates ram_counters.remaining.  When more than one second has elapsed
 * since the period counters were last reset, it also runs the
 * auto-converge check, updates the rates and resets the period counters.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* First sync: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes exceed 50% of the approx.
               amount of bytes that just got transferred since the last time
               we were in this routine. If that happens twice, start or
               increase throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    /* Emit the migration pass event with the current sync count */
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
 964
 965 static void migration_bitmap_sync_precopy(RAMState *rs)
 966 {
 967     Error *local_err = NULL;
 968
 969     /*
 970      * The current notifier usage is just an optimization to migration, so we
 971      * don't stop the normal migration process in the error case.
 972      */
 973     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
 974         error_report_err(local_err);
 975     }
 976
 977     migration_bitmap_sync(rs);
 978
 979     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
 980         error_report_err(local_err);
 981     }
 982 }
 983
 984 /**
 985  * save_zero_page_to_file: send the zero page to the file
 986  *
 987  * Returns the size of data written to the file, 0 means the page is not
 988  * a zero page
 989  *
 990  * @rs: current RAM state
 991  * @file: the file where the data is saved
 992  * @block: block that contains the page we want to send
 993  * @offset: offset inside the block for the page
 994  */
 995 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
 996                                   RAMBlock *block, ram_addr_t offset)
 997 {
 998     uint8_t *p = block->host + offset;
 999     int len = 0;
1000
1001     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1002         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1003         qemu_put_byte(file, 0);
1004         len += 1;
1005     }
1006     return len;
1007 }
1008
1009 /**
1010  * save_zero_page: send the zero page to the stream
1011  *
1012  * Returns the number of pages written.
1013  *
1014  * @rs: current RAM state
1015  * @block: block that contains the page we want to send
1016  * @offset: offset inside the block for the page
1017  */
1018 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1019 {
1020     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1021
1022     if (len) {
1023         ram_counters.duplicate++;
1024         ram_counters.transferred += len;
1025         return 1;
1026     }
1027     return -1;
1028 }
1029
1030 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1031 {
1032     if (!migrate_release_ram() || !migration_in_postcopy()) {
1033         return;
1034     }
1035
1036     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1037 }
1038
1039 /*
1040  * @pages: the number of pages written by the control path,
1041  *        < 0 - error
1042  *        > 0 - number of pages written
1043  *
1044  * Return true if the pages has been saved, otherwise false is returned.
1045  */
1046 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1047                               int *pages)
1048 {
1049     uint64_t bytes_xmit = 0;
1050     int ret;
1051
1052     *pages = -1;
1053     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1054                                 &bytes_xmit);
1055     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1056         return false;
1057     }
1058
1059     if (bytes_xmit) {
1060         ram_counters.transferred += bytes_xmit;
1061         *pages = 1;
1062     }
1063
1064     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1065         return true;
1066     }
1067
1068     if (bytes_xmit > 0) {
1069         ram_counters.normal++;
1070     } else if (bytes_xmit == 0) {
1071         ram_counters.duplicate++;
1072     }
1073
1074     return true;
1075 }
1076
1077 /*
1078  * directly send the page to the stream
1079  *
1080  * Returns the number of pages written.
1081  *
1082  * @rs: current RAM state
1083  * @block: block that contains the page we want to send
1084  * @offset: offset inside the block for the page
1085  * @buf: the page to be sent
1086  * @async: send to page asyncly
1087  */
1088 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1089                             uint8_t *buf, bool async)
1090 {
1091     ram_counters.transferred += save_page_header(rs, rs->f, block,
1092                                                  offset | RAM_SAVE_FLAG_PAGE);
1093     if (async) {
1094         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1095                               migrate_release_ram() &
1096                               migration_in_postcopy());
1097     } else {
1098         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1099     }
1100     ram_counters.transferred += TARGET_PAGE_SIZE;
1101     ram_counters.normal++;
1102     return 1;
1103 }
1104
1105 /**
1106  * ram_save_page: send the given page to the stream
1107  *
1108  * Returns the number of pages written.
1109  *          < 0 - error
1110  *          >=0 - Number of pages written - this might legally be 0
1111  *                if xbzrle noticed the page was the same.
1112  *
1113  * @rs: current RAM state
1114  * @block: block that contains the page we want to send
1115  * @offset: offset inside the block for the page
1116  * @last_stage: if we are at the completion stage
1117  */
1118 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1119 {
1120     int pages = -1;
1121     uint8_t *p;
1122     bool send_async = true;
1123     RAMBlock *block = pss->block;
1124     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1125     ram_addr_t current_addr = block->offset + offset;
1126
1127     p = block->host + offset;
1128     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1129
1130     XBZRLE_cache_lock();
1131     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1132         migrate_use_xbzrle()) {
1133         pages = save_xbzrle_page(rs, &p, current_addr, block,
1134                                  offset, last_stage);
1135         if (!last_stage) {
1136             /* Can't send this cached data async, since the cache page
1137              * might get updated before it gets to the wire
1138              */
1139             send_async = false;
1140         }
1141     }
1142
1143     /* XBZRLE overflow or normal page */
1144     if (pages == -1) {
1145         pages = save_normal_page(rs, block, offset, p, send_async);
1146     }
1147
1148     XBZRLE_cache_unlock();
1149
1150     return pages;
1151 }
1152
1153 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1154                                  ram_addr_t offset)
1155 {
1156     if (multifd_queue_page(rs->f, block, offset) < 0) {
1157         return -1;
1158     }
1159     ram_counters.normal++;
1160
1161     return 1;
1162 }
1163
1164 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1165                                  ram_addr_t offset, uint8_t *source_buf)
1166 {
1167     RAMState *rs = ram_state;
1168     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1169     bool zero_page = false;
1170     int ret;
1171
1172     if (save_zero_page_to_file(rs, f, block, offset)) {
1173         zero_page = true;
1174         goto exit;
1175     }
1176
1177     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1178
1179     /*
1180      * copy it to a internal buffer to avoid it being modified by VM
1181      * so that we can catch up the error during compression and
1182      * decompression
1183      */
1184     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1185     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1186     if (ret < 0) {
1187         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1188         error_report("compressed data failed!");
1189         return false;
1190     }
1191
1192 exit:
1193     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1194     return zero_page;
1195 }
1196
1197 static void
1198 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1199 {
1200     ram_counters.transferred += bytes_xmit;
1201
1202     if (param->zero_page) {
1203         ram_counters.duplicate++;
1204         return;
1205     }
1206
1207     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1208     compression_counters.compressed_size += bytes_xmit - 8;
1209     compression_counters.pages++;
1210 }
1211
1212 static bool save_page_use_compression(RAMState *rs);
1213
/*
 * Wait for every compression thread to finish its current request and
 * move the data buffered by each thread into the migration stream.
 *
 * Does nothing when compression is not in use for saving pages
 * (see save_page_use_compression()).
 *
 * @rs: current RAM state
 */
static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    /* Block until all threads have flagged their request as done */
    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    /* Drain each thread's buffer into rs->f, skipping quitting threads */
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
1245
1246 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1247                                        ram_addr_t offset)
1248 {
1249     param->block = block;
1250     param->offset = offset;
1251 }
1252
/*
 * Hand the page over to an idle compression thread.
 *
 * The first thread found with its 'done' flag set gets the request; the
 * data it had buffered from its previous request is first moved into the
 * migration stream and accounted for.
 *
 * Returns 1 if the page was handed to a thread, -1 if no thread was free
 * and 'compress-wait-thread' is not set.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            /* Flush the thread's previous output before reusing it */
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
1288
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan; its block
 *       and page are advanced as the search proceeds
 * @again: set to false if the search has scanned the whole of RAM,
 *         set to true in every other case
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    /* The returned page index lies at or past the end of the used range */
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
1344
1345 /**
1346  * unqueue_page: gets a page of the queue
1347  *
1348  * Helper for 'get_queued_page' - gets a page off the queue
1349  *
1350  * Returns the block of the page (or NULL if none available)
1351  *
1352  * @rs: current RAM state
1353  * @offset: used to return the offset within the RAMBlock
1354  */
1355 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1356 {
1357     RAMBlock *block = NULL;
1358
1359     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1360         return NULL;
1361     }
1362
1363     qemu_mutex_lock(&rs->src_page_req_mutex);
1364     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1365         struct RAMSrcPageRequest *entry =
1366                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1367         block = entry->rb;
1368         *offset = entry->offset;
1369
1370         if (entry->len > TARGET_PAGE_SIZE) {
1371             entry->len -= TARGET_PAGE_SIZE;
1372             entry->offset += TARGET_PAGE_SIZE;
1373         } else {
1374             memory_region_unref(block->mr);
1375             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1376             g_free(entry);
1377             migration_consume_urgent_request();
1378         }
1379     }
1380     qemu_mutex_unlock(&rs->src_page_req_mutex);
1381
1382     return block;
1383 }
1384
1385 /**
1386  * get_queued_page: unqueue a page from the postcopy requests
1387  *
1388  * Skips pages that are already sent (!dirty)
1389  *
1390  * Returns true if a queued page is found
1391  *
1392  * @rs: current RAM state
1393  * @pss: data about the state of the current dirty page scan
1394  */
1395 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1396 {
1397     RAMBlock  *block;
1398     ram_addr_t offset;
1399     bool dirty;
1400
1401     do {
1402         block = unqueue_page(rs, &offset);
1403         /*
1404          * We're sending this page, and since it's postcopy nothing else
1405          * will dirty it, and we must make sure it doesn't get sent again
1406          * even if this queue request was received after the background
1407          * search already sent it.
1408          */
1409         if (block) {
1410             unsigned long page;
1411
1412             page = offset >> TARGET_PAGE_BITS;
1413             dirty = test_bit(page, block->bmap);
1414             if (!dirty) {
1415                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1416                                                 page);
1417             } else {
1418                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1419             }
1420         }
1421
1422     } while (block && !dirty);
1423
1424     if (block) {
1425         /*
1426          * As soon as we start servicing pages out of order, then we have
1427          * to kill the bulk stage, since the bulk stage assumes
1428          * in (migration_bitmap_find_and_reset_dirty) that every page is
1429          * dirty, that's no longer true.
1430          */
1431         rs->ram_bulk_stage = false;
1432
1433         /*
1434          * We want the background search to continue from the queued page
1435          * since the guest is likely to want other pages near to the page
1436          * it just requested.
1437          */
1438         pss->block = block;
1439         pss->page = offset >> TARGET_PAGE_BITS;
1440
1441         /*
1442          * This unqueued page would break the "one round" check, even is
1443          * really rare.
1444          */
1445         pss->complete_round = false;
1446     }
1447
1448     return !!block;
1449 }
1450
1451 /**
1452  * migration_page_queue_free: drop any remaining pages in the ram
1453  * request queue
1454  *
1455  * It should be empty at the end anyway, but in error cases there may
1456  * be some left.  in case that there is any page left, we drop it.
1457  *
1458  */
1459 static void migration_page_queue_free(RAMState *rs)
1460 {
1461     struct RAMSrcPageRequest *mspr, *next_mspr;
1462     /* This queue generally should be empty - but in the case of a failed
1463      * migration might have some droppings in.
1464      */
1465     RCU_READ_LOCK_GUARD();
1466     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1467         memory_region_unref(mspr->rb->mr);
1468         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1469         g_free(mspr);
1470     }
1471 }
1472
1473 /**
1474  * ram_save_queue_pages: queue the page for transmission
1475  *
1476  * A request from postcopy destination for example.
1477  *
1478  * Returns zero on success or negative on error
1479  *
1480  * @rbname: Name of the RAMBLock of the request. NULL means the
1481  *          same that last one.
1482  * @start: starting address from the start of the RAMBlock
1483  * @len: length (in bytes) to send
1484  */
1485 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1486 {
1487     RAMBlock *ramblock;
1488     RAMState *rs = ram_state;
1489
1490     ram_counters.postcopy_requests++;
1491     RCU_READ_LOCK_GUARD();
1492
1493     if (!rbname) {
1494         /* Reuse last RAMBlock */
1495         ramblock = rs->last_req_rb;
1496
1497         if (!ramblock) {
1498             /*
1499              * Shouldn't happen, we can't reuse the last RAMBlock if
1500              * it's the 1st request.
1501              */
1502             error_report("ram_save_queue_pages no previous block");
1503             return -1;
1504         }
1505     } else {
1506         ramblock = qemu_ram_block_by_name(rbname);
1507
1508         if (!ramblock) {
1509             /* We shouldn't be asked for a non-existent RAMBlock */
1510             error_report("ram_save_queue_pages no block '%s'", rbname);
1511             return -1;
1512         }
1513         rs->last_req_rb = ramblock;
1514     }
1515     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1516     if (start+len > ramblock->used_length) {
1517         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1518                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1519                      __func__, start, len, ramblock->used_length);
1520         return -1;
1521     }
1522
1523     struct RAMSrcPageRequest *new_entry =
1524         g_malloc0(sizeof(struct RAMSrcPageRequest));
1525     new_entry->rb = ramblock;
1526     new_entry->offset = start;
1527     new_entry->len = len;
1528
1529     memory_region_ref(ramblock->mr);
1530     qemu_mutex_lock(&rs->src_page_req_mutex);
1531     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1532     migration_make_urgent_request();
1533     qemu_mutex_unlock(&rs->src_page_req_mutex);
1534
1535     return 0;
1536 }
1537
1538 static bool save_page_use_compression(RAMState *rs)
1539 {
1540     if (!migrate_use_compression()) {
1541         return false;
1542     }
1543
1544     /*
1545      * If xbzrle is on, stop using the data compression after first
1546      * round of migration even if compression is enabled. In theory,
1547      * xbzrle can do better than compression.
1548      */
1549     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1550         return true;
1551     }
1552
1553     return false;
1554 }
1555
1556 /*
1557  * try to compress the page before posting it out, return true if the page
1558  * has been properly handled by compression, otherwise needs other
1559  * paths to handle it
1560  */
1561 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1562 {
1563     if (!save_page_use_compression(rs)) {
1564         return false;
1565     }
1566
1567     /*
1568      * When starting the process of a new block, the first page of
1569      * the block should be sent out before other pages in the same
1570      * block, and all the pages in last block should have been sent
1571      * out, keeping this order is important, because the 'cont' flag
1572      * is used to avoid resending the block name.
1573      *
1574      * We post the fist page as normal page as compression will take
1575      * much CPU resource.
1576      */
1577     if (block != rs->last_sent_block) {
1578         flush_compressed_data(rs);
1579         return false;
1580     }
1581
1582     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1583         return true;
1584     }
1585
1586     compression_counters.busy++;
1587     return false;
1588 }
1589
1590 /**
1591  * ram_save_target_page: save one target page
1592  *
1593  * Returns the number of pages written
1594  *
1595  * @rs: current RAM state
1596  * @pss: data about the page we want to send
1597  * @last_stage: if we are at the completion stage
1598  */
1599 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1600                                 bool last_stage)
1601 {
1602     RAMBlock *block = pss->block;
1603     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1604     int res;
1605
1606     if (control_save_page(rs, block, offset, &res)) {
1607         return res;
1608     }
1609
1610     if (save_compress_page(rs, block, offset)) {
1611         return 1;
1612     }
1613
1614     res = save_zero_page(rs, block, offset);
1615     if (res > 0) {
1616         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1617          * page would be stale
1618          */
1619         if (!save_page_use_compression(rs)) {
1620             XBZRLE_cache_lock();
1621             xbzrle_cache_zero_page(rs, block->offset + offset);
1622             XBZRLE_cache_unlock();
1623         }
1624         ram_release_pages(block->idstr, offset, res);
1625         return res;
1626     }
1627
1628     /*
1629      * Do not use multifd for:
1630      * 1. Compression as the first page in the new block should be posted out
1631      *    before sending the compressed page
1632      * 2. In postcopy as one whole host page should be placed
1633      */
1634     if (!save_page_use_compression(rs) && migrate_use_multifd()
1635         && !migration_in_postcopy()) {
1636         return ram_save_multifd_page(rs, block, offset);
1637     }
1638
1639     return ram_save_page(rs, pss, last_stage);
1640 }
1641
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at pss->page send pages up to the end of the current host
 * page. It's valid for the initial page to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send; on a successful return
 *       pss->page is the last page that was looked at
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /* Number of target pages per host page of this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            /* 'continue' still evaluates the loop condition below */
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
1696
1697 /**
1698  * ram_find_and_save_block: finds a dirty page and sends it to f
1699  *
1700  * Called within an RCU critical section.
1701  *
1702  * Returns the number of pages written where zero means no dirty pages,
1703  * or negative on error
1704  *
1705  * @rs: current RAM state
1706  * @last_stage: if we are at the completion stage
1707  *
1708  * On systems where host-page-size > target-page-size it will send all the
1709  * pages in a host page that are dirty.
1710  */
1711
1712 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1713 {
1714     PageSearchStatus pss;
1715     int pages = 0;
1716     bool again, found;
1717
1718     /* No dirty page as there is zero RAM */
1719     if (!ram_bytes_total()) {
1720         return pages;
1721     }
1722
1723     pss.block = rs->last_seen_block;
1724     pss.page = rs->last_page;
1725     pss.complete_round = false;
1726
1727     if (!pss.block) {
1728         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1729     }
1730
1731     do {
1732         again = true;
1733         found = get_queued_page(rs, &pss);
1734
1735         if (!found) {
1736             /* priority queue empty, so just search for something dirty */
1737             found = find_dirty_block(rs, &pss, &again);
1738         }
1739
1740         if (found) {
1741             pages = ram_save_host_page(rs, &pss, last_stage);
1742         }
1743     } while (!pages && again);
1744
1745     rs->last_seen_block = pss.block;
1746     rs->last_page = pss.page;
1747
1748     return pages;
1749 }
1750
1751 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1752 {
1753     uint64_t pages = size / TARGET_PAGE_SIZE;
1754
1755     if (zero) {
1756         ram_counters.duplicate += pages;
1757     } else {
1758         ram_counters.normal += pages;
1759         ram_counters.transferred += size;
1760         qemu_update_position(f, size);
1761     }
1762 }
1763
1764 static uint64_t ram_bytes_total_common(bool count_ignored)
1765 {
1766     RAMBlock *block;
1767     uint64_t total = 0;
1768
1769     RCU_READ_LOCK_GUARD();
1770
1771     if (count_ignored) {
1772         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1773             total += block->used_length;
1774         }
1775     } else {
1776         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1777             total += block->used_length;
1778         }
1779     }
1780     return total;
1781 }
1782
/* Total used length, in bytes, of the RAMBlocks that are not ignored */
uint64_t ram_bytes_total(void)
{
    uint64_t total = ram_bytes_total_common(false);

    return total;
}
1787
1788 static void xbzrle_load_setup(void)
1789 {
1790     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1791 }
1792
1793 static void xbzrle_load_cleanup(void)
1794 {
1795     g_free(XBZRLE.decoded_buf);
1796     XBZRLE.decoded_buf = NULL;
1797 }
1798
1799 static void ram_state_cleanup(RAMState **rsp)
1800 {
1801     if (*rsp) {
1802         migration_page_queue_free(*rsp);
1803         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1804         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1805         g_free(*rsp);
1806         *rsp = NULL;
1807     }
1808 }
1809
1810 static void xbzrle_cleanup(void)
1811 {
1812     XBZRLE_cache_lock();
1813     if (XBZRLE.cache) {
1814         cache_fini(XBZRLE.cache);
1815         g_free(XBZRLE.encoded_buf);
1816         g_free(XBZRLE.current_buf);
1817         g_free(XBZRLE.zero_target_page);
1818         XBZRLE.cache = NULL;
1819         XBZRLE.encoded_buf = NULL;
1820         XBZRLE.current_buf = NULL;
1821         XBZRLE.zero_target_page = NULL;
1822     }
1823     XBZRLE_cache_unlock();
1824 }
1825
1826 static void ram_save_cleanup(void *opaque)
1827 {
1828     RAMState **rsp = opaque;
1829     RAMBlock *block;
1830
1831     /* caller have hold iothread lock or is in a bh, so there is
1832      * no writing race against the migration bitmap
1833      */
1834     memory_global_dirty_log_stop();
1835
1836     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1837         g_free(block->clear_bmap);
1838         block->clear_bmap = NULL;
1839         g_free(block->bmap);
1840         block->bmap = NULL;
1841     }
1842
1843     xbzrle_cleanup();
1844     compress_threads_save_cleanup();
1845     ram_state_cleanup(rsp);
1846 }
1847
1848 static void ram_state_reset(RAMState *rs)
1849 {
1850     rs->last_seen_block = NULL;
1851     rs->last_sent_block = NULL;
1852     rs->last_page = 0;
1853     rs->last_version = ram_list.version;
1854     rs->ram_bulk_stage = true;
1855     rs->fpo_enabled = false;
1856 }
1857
/*
 * Time budget, in milliseconds, that ram_save_iterate() compares its
 * elapsed send-loop time against before leaving the loop.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
1859
/*
 * Debug helper: print a bitmap to stderr, up to 128 bits per line, using
 * '1' for a set bit and '.' for a clear one.  Each printed line is
 * prefixed with the index of its first bit in hex.
 *
 * @todump:   bitmap to print.  A NULL bitmap is tolerated and nothing is
 *            printed for it (there is no implicit default bitmap).
 * @expected: the value you expect the bitmap mostly to be full of; lines
 *            in which every bit equals this value are not printed.
 * @pages:    number of bits of @todump to examine.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* Never hand a NULL bitmap to test_bit() */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            /* PRIx64 takes an unsigned 64-bit argument */
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1893
1894 /* **** functions for postcopy ***** */
1895
1896 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1897 {
1898     struct RAMBlock *block;
1899
1900     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1901         unsigned long *bitmap = block->bmap;
1902         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1903         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1904
1905         while (run_start < range) {
1906             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1907             ram_discard_range(block->idstr,
1908                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1909                               ((ram_addr_t)(run_end - run_start))
1910                                 << TARGET_PAGE_BITS);
1911             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1912         }
1913     }
1914 }
1915
1916 /**
1917  * postcopy_send_discard_bm_ram: discard a RAMBlock
1918  *
1919  * Returns zero on success
1920  *
1921  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1922  *
1923  * @ms: current migration state
1924  * @block: RAMBlock to discard
1925  */
1926 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1927 {
1928     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1929     unsigned long current;
1930     unsigned long *bitmap = block->bmap;
1931
1932     for (current = 0; current < end; ) {
1933         unsigned long one = find_next_bit(bitmap, end, current);
1934         unsigned long zero, discard_length;
1935
1936         if (one >= end) {
1937             break;
1938         }
1939
1940         zero = find_next_zero_bit(bitmap, end, one + 1);
1941
1942         if (zero >= end) {
1943             discard_length = end - one;
1944         } else {
1945             discard_length = zero - one;
1946         }
1947         postcopy_discard_send_range(ms, one, discard_length);
1948         current = one + discard_length;
1949     }
1950
1951     return 0;
1952 }
1953
1954 /**
1955  * postcopy_each_ram_send_discard: discard all RAMBlocks
1956  *
1957  * Returns 0 for success or negative for error
1958  *
1959  * Utility for the outgoing postcopy code.
1960  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1961  *   passing it bitmap indexes and name.
1962  * (qemu_ram_foreach_block ends up passing unscaled lengths
1963  *  which would mean postcopy code would have to deal with target page)
1964  *
1965  * @ms: current migration state
1966  */
1967 static int postcopy_each_ram_send_discard(MigrationState *ms)
1968 {
1969     struct RAMBlock *block;
1970     int ret;
1971
1972     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1973         postcopy_discard_send_init(ms, block->idstr);
1974
1975         /*
1976          * Postcopy sends chunks of bitmap over the wire, but it
1977          * just needs indexes at this point, avoids it having
1978          * target page specific code.
1979          */
1980         ret = postcopy_send_discard_bm_ram(ms, block);
1981         postcopy_discard_send_finish(ms);
1982         if (ret) {
1983             return ret;
1984         }
1985     }
1986
1987     return 0;
1988 }
1989
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the block's dirty bitmap
 * by marking every target page of a partially dirty host page as dirty,
 * and bumps the migration_dirty_pages count of the global ram_state for
 * each bit that was newly set.
 *
 * Nothing is done when the block's page size equals TARGET_PAGE_SIZE.
 *
 * @ms: current migration state (not used by this function)
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Number of target pages per host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If the dirty run starts on a host-page boundary its start needs
         * no fixup, so move on to the end of the run instead.  If it
         * starts in the middle of a host page, run_start is left alone
         * and the unaligned check below fixes up that host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and the host page containing the end is fixed up below.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            /* First target page of the host page containing run_start */
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Resume the search at the start of the following host page */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
2056
/**
 * postcopy_chunk_hostpages: make partially dirty host pages fully dirty
 *
 * Utility for the outgoing postcopy code.
 *
 * Marks any partially dirty host-page size chunks of @block as all dirty
 * by running postcopy_chunk_hostpages_pass on it, bracketed by
 * postcopy_discard_send_init / postcopy_discard_send_finish.  In this case
 * the host-page is the host-page for the particular RAMBlock, i.e. it
 * might be a huge page.
 *
 * Returns zero (there is no failure path in this function)
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /*
     * Ensure that all partially dirty host pages are made fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, block);

    postcopy_discard_send_finish(ms);
    return 0;
}
2083
2084 /**
2085  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2086  *
2087  * Returns zero on success
2088  *
2089  * Transmit the set of pages to be discarded after precopy to the target
2090  * these are pages that:
2091  *     a) Have been previously transmitted but are now dirty again
2092  *     b) Pages that have never been transmitted, this ensures that
2093  *        any pages on the destination that have been mapped by background
2094  *        tasks get discarded (transparent huge pages is the specific concern)
2095  * Hopefully this is pretty sparse
2096  *
2097  * @ms: current migration state
2098  */
2099 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2100 {
2101     RAMState *rs = ram_state;
2102     RAMBlock *block;
2103     int ret;
2104
2105     RCU_READ_LOCK_GUARD();
2106
2107     /* This should be our last sync, the src is now paused */
2108     migration_bitmap_sync(rs);
2109
2110     /* Easiest way to make sure we don't resume in the middle of a host-page */
2111     rs->last_seen_block = NULL;
2112     rs->last_sent_block = NULL;
2113     rs->last_page = 0;
2114
2115     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2116         /* Deal with TPS != HPS and huge pages */
2117         ret = postcopy_chunk_hostpages(ms, block);
2118         if (ret) {
2119             return ret;
2120         }
2121
2122 #ifdef DEBUG_POSTCOPY
2123         ram_debug_dump_bitmap(block->bmap, true,
2124                               block->used_length >> TARGET_PAGE_BITS);
2125 #endif
2126     }
2127     trace_ram_postcopy_send_discard_bitmap();
2128
2129     ret = postcopy_each_ram_send_discard(ms);
2130
2131     return ret;
2132 }
2133
2134 /**
2135  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2136  *
2137  * Returns zero on success
2138  *
2139  * @rbname: name of the RAMBlock of the request. NULL means the
2140  *          same that last one.
2141  * @start: RAMBlock starting page
2142  * @length: RAMBlock size
2143  */
2144 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2145 {
2146     trace_ram_discard_range(rbname, start, length);
2147
2148     RCU_READ_LOCK_GUARD();
2149     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2150
2151     if (!rb) {
2152         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2153         return -1;
2154     }
2155
2156     /*
2157      * On source VM, we don't need to update the received bitmap since
2158      * we don't even have one.
2159      */
2160     if (rb->receivedmap) {
2161         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2162                      length >> qemu_target_page_bits());
2163     }
2164
2165     return ram_block_discard_range(rb, start, length);
2166 }
2167
2168 /*
2169  * For every allocation, we will try not to crash the VM if the
2170  * allocation failed.
2171  */
2172 static int xbzrle_init(void)
2173 {
2174     Error *local_err = NULL;
2175
2176     if (!migrate_use_xbzrle()) {
2177         return 0;
2178     }
2179
2180     XBZRLE_cache_lock();
2181
2182     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2183     if (!XBZRLE.zero_target_page) {
2184         error_report("%s: Error allocating zero page", __func__);
2185         goto err_out;
2186     }
2187
2188     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2189                               TARGET_PAGE_SIZE, &local_err);
2190     if (!XBZRLE.cache) {
2191         error_report_err(local_err);
2192         goto free_zero_page;
2193     }
2194
2195     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2196     if (!XBZRLE.encoded_buf) {
2197         error_report("%s: Error allocating encoded_buf", __func__);
2198         goto free_cache;
2199     }
2200
2201     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2202     if (!XBZRLE.current_buf) {
2203         error_report("%s: Error allocating current_buf", __func__);
2204         goto free_encoded_buf;
2205     }
2206
2207     /* We are all good */
2208     XBZRLE_cache_unlock();
2209     return 0;
2210
2211 free_encoded_buf:
2212     g_free(XBZRLE.encoded_buf);
2213     XBZRLE.encoded_buf = NULL;
2214 free_cache:
2215     cache_fini(XBZRLE.cache);
2216     XBZRLE.cache = NULL;
2217 free_zero_page:
2218     g_free(XBZRLE.zero_target_page);
2219     XBZRLE.zero_target_page = NULL;
2220 err_out:
2221     XBZRLE_cache_unlock();
2222     return -ENOMEM;
2223 }
2224
2225 static int ram_state_init(RAMState **rsp)
2226 {
2227     *rsp = g_try_new0(RAMState, 1);
2228
2229     if (!*rsp) {
2230         error_report("%s: Init ramstate fail", __func__);
2231         return -1;
2232     }
2233
2234     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2235     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2236     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2237
2238     /*
2239      * Count the total number of pages used by ram blocks not including any
2240      * gaps due to alignment or unplugs.
2241      * This must match with the initial values of dirty bitmap.
2242      */
2243     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2244     ram_state_reset(*rsp);
2245
2246     return 0;
2247 }
2248
2249 static void ram_list_init_bitmaps(void)
2250 {
2251     MigrationState *ms = migrate_get_current();
2252     RAMBlock *block;
2253     unsigned long pages;
2254     uint8_t shift;
2255
2256     /* Skip setting bitmap if there is no RAM */
2257     if (ram_bytes_total()) {
2258         shift = ms->clear_bitmap_shift;
2259         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2260             error_report("clear_bitmap_shift (%u) too big, using "
2261                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2262             shift = CLEAR_BITMAP_SHIFT_MAX;
2263         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2264             error_report("clear_bitmap_shift (%u) too small, using "
2265                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2266             shift = CLEAR_BITMAP_SHIFT_MIN;
2267         }
2268
2269         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2270             pages = block->max_length >> TARGET_PAGE_BITS;
2271             /*
2272              * The initial dirty bitmap for migration must be set with all
2273              * ones to make sure we'll migrate every guest RAM page to
2274              * destination.
2275              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2276              * new migration after a failed migration, ram_list.
2277              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2278              * guest memory.
2279              */
2280             block->bmap = bitmap_new(pages);
2281             bitmap_set(block->bmap, 0, pages);
2282             block->clear_bmap_shift = shift;
2283             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2284         }
2285     }
2286 }
2287
/*
 * Create the per-block bitmaps, start global dirty logging and do a first
 * precopy bitmap sync for @rs.
 *
 * The three steps run inside an RCU read-side section, with the iothread
 * lock and the ramlist lock held; the locks are released in the reverse
 * order of acquisition.
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        memory_global_dirty_log_start();
        migration_bitmap_sync_precopy(rs);
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
2302
2303 static int ram_init_all(RAMState **rsp)
2304 {
2305     if (ram_state_init(rsp)) {
2306         return -1;
2307     }
2308
2309     if (xbzrle_init()) {
2310         ram_state_cleanup(rsp);
2311         return -1;
2312     }
2313
2314     ram_init_bitmaps(*rsp);
2315
2316     return 0;
2317 }
2318
2319 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2320 {
2321     RAMBlock *block;
2322     uint64_t pages = 0;
2323
2324     /*
2325      * Postcopy is not using xbzrle/compression, so no need for that.
2326      * Also, since source are already halted, we don't need to care
2327      * about dirty page logging as well.
2328      */
2329
2330     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2331         pages += bitmap_count_one(block->bmap,
2332                                   block->used_length >> TARGET_PAGE_BITS);
2333     }
2334
2335     /* This may not be aligned with current bitmaps. Recalculate. */
2336     rs->migration_dirty_pages = pages;
2337
2338     rs->last_seen_block = NULL;
2339     rs->last_sent_block = NULL;
2340     rs->last_page = 0;
2341     rs->last_version = ram_list.version;
2342     /*
2343      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2344      * matter what we have sent.
2345      */
2346     rs->ram_bulk_stage = false;
2347
2348     /* Update RAMState cache of output QEMUFile */
2349     rs->f = out;
2350
2351     trace_ram_state_resume_prepare(pages);
2352 }
2353
2354 /*
2355  * This function clears bits of the free pages reported by the caller from the
2356  * migration dirty bitmap. @addr is the host address corresponding to the
2357  * start of the continuous guest free pages, and @len is the total bytes of
2358  * those pages.
2359  */
2360 void qemu_guest_free_page_hint(void *addr, size_t len)
2361 {
2362     RAMBlock *block;
2363     ram_addr_t offset;
2364     size_t used_len, start, npages;
2365     MigrationState *s = migrate_get_current();
2366
2367     /* This function is currently expected to be used during live migration */
2368     if (!migration_is_setup_or_active(s->state)) {
2369         return;
2370     }
2371
2372     for (; len > 0; len -= used_len, addr += used_len) {
2373         block = qemu_ram_block_from_host(addr, false, &offset);
2374         if (unlikely(!block || offset >= block->used_length)) {
2375             /*
2376              * The implementation might not support RAMBlock resize during
2377              * live migration, but it could happen in theory with future
2378              * updates. So we add a check here to capture that case.
2379              */
2380             error_report_once("%s unexpected error", __func__);
2381             return;
2382         }
2383
2384         if (len <= block->used_length - offset) {
2385             used_len = len;
2386         } else {
2387             used_len = block->used_length - offset;
2388         }
2389
2390         start = offset >> TARGET_PAGE_BITS;
2391         npages = used_len >> TARGET_PAGE_BITS;
2392
2393         qemu_mutex_lock(&ram_state->bitmap_mutex);
2394         ram_state->migration_dirty_pages -=
2395                       bitmap_count_one_with_offset(block->bmap, start, npages);
2396         bitmap_clear(block->bmap, start, npages);
2397         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2398     }
2399 }
2400
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * become numerous, it will be necessary to reduce the granularity of
 * these critical sections.
 */
2407
2408 /**
2409  * ram_save_setup: Setup RAM for migration
2410  *
2411  * Returns zero to indicate success and negative for error
2412  *
2413  * @f: QEMUFile where to send the data
2414  * @opaque: RAMState pointer
2415  */
2416 static int ram_save_setup(QEMUFile *f, void *opaque)
2417 {
2418     RAMState **rsp = opaque;
2419     RAMBlock *block;
2420
2421     if (compress_threads_save_setup()) {
2422         return -1;
2423     }
2424
2425     /* migration has already setup the bitmap, reuse it. */
2426     if (!migration_in_colo_state()) {
2427         if (ram_init_all(rsp) != 0) {
2428             compress_threads_save_cleanup();
2429             return -1;
2430         }
2431     }
2432     (*rsp)->f = f;
2433
2434     WITH_RCU_READ_LOCK_GUARD() {
2435         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2436
2437         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2438             qemu_put_byte(f, strlen(block->idstr));
2439             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2440             qemu_put_be64(f, block->used_length);
2441             if (migrate_postcopy_ram() && block->page_size !=
2442                                           qemu_host_page_size) {
2443                 qemu_put_be64(f, block->page_size);
2444             }
2445             if (migrate_ignore_shared()) {
2446                 qemu_put_be64(f, block->mr->addr);
2447             }
2448         }
2449     }
2450
2451     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2452     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2453
2454     multifd_send_sync_main(f);
2455     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2456     qemu_fflush(f);
2457
2458     return 0;
2459 }
2460
2461 /**
2462  * ram_save_iterate: iterative stage for migration
2463  *
2464  * Returns zero to indicate success and negative for error
2465  *
2466  * @f: QEMUFile where to send the data
2467  * @opaque: RAMState pointer
2468  */
2469 static int ram_save_iterate(QEMUFile *f, void *opaque)
2470 {
2471     RAMState **temp = opaque;
2472     RAMState *rs = *temp;
2473     int ret = 0;
2474     int i;
2475     int64_t t0;
2476     int done = 0;
2477
2478     if (blk_mig_bulk_active()) {
2479         /* Avoid transferring ram during bulk phase of block migration as
2480          * the bulk phase will usually take a long time and transferring
2481          * ram updates during that time is pointless. */
2482         goto out;
2483     }
2484
2485     WITH_RCU_READ_LOCK_GUARD() {
2486         if (ram_list.version != rs->last_version) {
2487             ram_state_reset(rs);
2488         }
2489
2490         /* Read version before ram_list.blocks */
2491         smp_rmb();
2492
2493         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2494
2495         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2496         i = 0;
2497         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2498                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2499             int pages;
2500
2501             if (qemu_file_get_error(f)) {
2502                 break;
2503             }
2504
2505             pages = ram_find_and_save_block(rs, false);
2506             /* no more pages to sent */
2507             if (pages == 0) {
2508                 done = 1;
2509                 break;
2510             }
2511
2512             if (pages < 0) {
2513                 qemu_file_set_error(f, pages);
2514                 break;
2515             }
2516
2517             rs->target_page_count += pages;
2518
2519             /*
2520              * During postcopy, it is necessary to make sure one whole host
2521              * page is sent in one chunk.
2522              */
2523             if (migrate_postcopy_ram()) {
2524                 flush_compressed_data(rs);
2525             }
2526
2527             /*
2528              * we want to check in the 1st loop, just in case it was the 1st
2529              * time and we had to sync the dirty bitmap.
2530              * qemu_clock_get_ns() is a bit expensive, so we only check each
2531              * some iterations
2532              */
2533             if ((i & 63) == 0) {
2534                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2535                               1000000;
2536                 if (t1 > MAX_WAIT) {
2537                     trace_ram_save_iterate_big_wait(t1, i);
2538                     break;
2539                 }
2540             }
2541             i++;
2542         }
2543     }
2544
2545     /*
2546      * Must occur before EOS (or any QEMUFile operation)
2547      * because of RDMA protocol.
2548      */
2549     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2550
2551 out:
2552     if (ret >= 0
2553         && migration_is_setup_or_active(migrate_get_current()->state)) {
2554         multifd_send_sync_main(rs->f);
2555         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2556         qemu_fflush(f);
2557         ram_counters.transferred += 8;
2558
2559         ret = qemu_file_get_error(f);
2560     }
2561     if (ret < 0) {
2562         return ret;
2563     }
2564
2565     return done;
2566 }
2567
2568 /**
2569  * ram_save_complete: function called to send the remaining amount of ram
2570  *
2571  * Returns zero to indicate success or negative on error
2572  *
2573  * Called with iothread lock
2574  *
2575  * @f: QEMUFile where to send the data
2576  * @opaque: RAMState pointer
2577  */
2578 static int ram_save_complete(QEMUFile *f, void *opaque)
2579 {
2580     RAMState **temp = opaque;
2581     RAMState *rs = *temp;
2582     int ret = 0;
2583
2584     WITH_RCU_READ_LOCK_GUARD() {
2585         if (!migration_in_postcopy()) {
2586             migration_bitmap_sync_precopy(rs);
2587         }
2588
2589         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2590
2591         /* try transferring iterative blocks of memory */
2592
2593         /* flush all remaining blocks regardless of rate limiting */
2594         while (true) {
2595             int pages;
2596
2597             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2598             /* no more blocks to sent */
2599             if (pages == 0) {
2600                 break;
2601             }
2602             if (pages < 0) {
2603                 ret = pages;
2604                 break;
2605             }
2606         }
2607
2608         flush_compressed_data(rs);
2609         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2610     }
2611
2612     if (ret >= 0) {
2613         multifd_send_sync_main(rs->f);
2614         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2615         qemu_fflush(f);
2616     }
2617
2618     return ret;
2619 }
2620
2621 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2622                              uint64_t *res_precopy_only,
2623                              uint64_t *res_compatible,
2624                              uint64_t *res_postcopy_only)
2625 {
2626     RAMState **temp = opaque;
2627     RAMState *rs = *temp;
2628     uint64_t remaining_size;
2629
2630     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2631
2632     if (!migration_in_postcopy() &&
2633         remaining_size < max_size) {
2634         qemu_mutex_lock_iothread();
2635         WITH_RCU_READ_LOCK_GUARD() {
2636             migration_bitmap_sync_precopy(rs);
2637         }
2638         qemu_mutex_unlock_iothread();
2639         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2640     }
2641
2642     if (migrate_postcopy_ram()) {
2643         /* We can do postcopy, and all the data is postcopiable */
2644         *res_compatible += remaining_size;
2645     } else {
2646         *res_precopy_only += remaining_size;
2647     }
2648 }
2649
2650 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2651 {
2652     unsigned int xh_len;
2653     int xh_flags;
2654     uint8_t *loaded_data;
2655
2656     /* extract RLE header */
2657     xh_flags = qemu_get_byte(f);
2658     xh_len = qemu_get_be16(f);
2659
2660     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2661         error_report("Failed to load XBZRLE page - wrong compression!");
2662         return -1;
2663     }
2664
2665     if (xh_len > TARGET_PAGE_SIZE) {
2666         error_report("Failed to load XBZRLE page - len overflow!");
2667         return -1;
2668     }
2669     loaded_data = XBZRLE.decoded_buf;
2670     /* load data and decode */
2671     /* it can change loaded_data to point to an internal buffer */
2672     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2673
2674     /* decode RLE */
2675     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2676                              TARGET_PAGE_SIZE) == -1) {
2677         error_report("Failed to load XBZRLE page - decode error!");
2678         return -1;
2679     }
2680
2681     return 0;
2682 }
2683
2684 /**
2685  * ram_block_from_stream: read a RAMBlock id from the migration stream
2686  *
2687  * Must be called from within a rcu critical section.
2688  *
2689  * Returns a pointer from within the RCU-protected ram_list.
2690  *
2691  * @f: QEMUFile where to read the data from
2692  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2693  */
2694 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2695 {
2696     static RAMBlock *block = NULL;
2697     char id[256];
2698     uint8_t len;
2699
2700     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2701         if (!block) {
2702             error_report("Ack, bad migration stream!");
2703             return NULL;
2704         }
2705         return block;
2706     }
2707
2708     len = qemu_get_byte(f);
2709     qemu_get_buffer(f, (uint8_t *)id, len);
2710     id[len] = 0;
2711
2712     block = qemu_ram_block_by_name(id);
2713     if (!block) {
2714         error_report("Can't find block %s", id);
2715         return NULL;
2716     }
2717
2718     if (ramblock_is_ignored(block)) {
2719         error_report("block %s should not be migrated !", id);
2720         return NULL;
2721     }
2722
2723     return block;
2724 }
2725
2726 static inline void *host_from_ram_block_offset(RAMBlock *block,
2727                                                ram_addr_t offset)
2728 {
2729     if (!offset_in_ramblock(block, offset)) {
2730         return NULL;
2731     }
2732
2733     return block->host + offset;
2734 }
2735
2736 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2737                                                  ram_addr_t offset)
2738 {
2739     if (!offset_in_ramblock(block, offset)) {
2740         return NULL;
2741     }
2742     if (!block->colo_cache) {
2743         error_report("%s: colo_cache is NULL in block :%s",
2744                      __func__, block->idstr);
2745         return NULL;
2746     }
2747
2748     /*
2749     * During colo checkpoint, we need bitmap of these migrated pages.
2750     * It help us to decide which pages in ram cache should be flushed
2751     * into VM's RAM later.
2752     */
2753     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2754         ram_state->migration_dirty_pages++;
2755     }
2756     return block->colo_cache + offset;
2757 }
2758
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with the byte @ch.  The write is skipped
 * when @ch is zero and the range already reads as all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already zero; nothing to write */
        return;
    }

    memset(host, ch, size);
}
2775
2776 /* return the size after decompression, or negative value on error */
2777 static int
2778 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2779                      const uint8_t *source, size_t source_len)
2780 {
2781     int err;
2782
2783     err = inflateReset(stream);
2784     if (err != Z_OK) {
2785         return -1;
2786     }
2787
2788     stream->avail_in = source_len;
2789     stream->next_in = (uint8_t *)source;
2790     stream->avail_out = dest_len;
2791     stream->next_out = dest;
2792
2793     err = inflate(stream, Z_NO_FLUSH);
2794     if (err != Z_STREAM_END) {
2795         return -1;
2796     }
2797
2798     return stream->total_out;
2799 }
2800
2801 static void *do_data_decompress(void *opaque)
2802 {
2803     DecompressParam *param = opaque;
2804     unsigned long pagesize;
2805     uint8_t *des;
2806     int len, ret;
2807
2808     qemu_mutex_lock(&param->mutex);
2809     while (!param->quit) {
2810         if (param->des) {
2811             des = param->des;
2812             len = param->len;
2813             param->des = 0;
2814             qemu_mutex_unlock(&param->mutex);
2815
2816             pagesize = TARGET_PAGE_SIZE;
2817
2818             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2819                                        param->compbuf, len);
2820             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2821                 error_report("decompress data failed");
2822                 qemu_file_set_error(decomp_file, ret);
2823             }
2824
2825             qemu_mutex_lock(&decomp_done_lock);
2826             param->done = true;
2827             qemu_cond_signal(&decomp_done_cond);
2828             qemu_mutex_unlock(&decomp_done_lock);
2829
2830             qemu_mutex_lock(&param->mutex);
2831         } else {
2832             qemu_cond_wait(&param->cond, &param->mutex);
2833         }
2834     }
2835     qemu_mutex_unlock(&param->mutex);
2836
2837     return NULL;
2838 }
2839
2840 static int wait_for_decompress_done(void)
2841 {
2842     int idx, thread_count;
2843
2844     if (!migrate_use_compression()) {
2845         return 0;
2846     }
2847
2848     thread_count = migrate_decompress_threads();
2849     qemu_mutex_lock(&decomp_done_lock);
2850     for (idx = 0; idx < thread_count; idx++) {
2851         while (!decomp_param[idx].done) {
2852             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2853         }
2854     }
2855     qemu_mutex_unlock(&decomp_done_lock);
2856     return qemu_file_get_error(decomp_file);
2857 }
2858
2859 static void compress_threads_load_cleanup(void)
2860 {
2861     int i, thread_count;
2862
2863     if (!migrate_use_compression()) {
2864         return;
2865     }
2866     thread_count = migrate_decompress_threads();
2867     for (i = 0; i < thread_count; i++) {
2868         /*
2869          * we use it as a indicator which shows if the thread is
2870          * properly init'd or not
2871          */
2872         if (!decomp_param[i].compbuf) {
2873             break;
2874         }
2875
2876         qemu_mutex_lock(&decomp_param[i].mutex);
2877         decomp_param[i].quit = true;
2878         qemu_cond_signal(&decomp_param[i].cond);
2879         qemu_mutex_unlock(&decomp_param[i].mutex);
2880     }
2881     for (i = 0; i < thread_count; i++) {
2882         if (!decomp_param[i].compbuf) {
2883             break;
2884         }
2885
2886         qemu_thread_join(decompress_threads + i);
2887         qemu_mutex_destroy(&decomp_param[i].mutex);
2888         qemu_cond_destroy(&decomp_param[i].cond);
2889         inflateEnd(&decomp_param[i].stream);
2890         g_free(decomp_param[i].compbuf);
2891         decomp_param[i].compbuf = NULL;
2892     }
2893     g_free(decompress_threads);
2894     g_free(decomp_param);
2895     decompress_threads = NULL;
2896     decomp_param = NULL;
2897     decomp_file = NULL;
2898 }
2899
2900 static int compress_threads_load_setup(QEMUFile *f)
2901 {
2902     int i, thread_count;
2903
2904     if (!migrate_use_compression()) {
2905         return 0;
2906     }
2907
2908     thread_count = migrate_decompress_threads();
2909     decompress_threads = g_new0(QemuThread, thread_count);
2910     decomp_param = g_new0(DecompressParam, thread_count);
2911     qemu_mutex_init(&decomp_done_lock);
2912     qemu_cond_init(&decomp_done_cond);
2913     decomp_file = f;
2914     for (i = 0; i < thread_count; i++) {
2915         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2916             goto exit;
2917         }
2918
2919         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2920         qemu_mutex_init(&decomp_param[i].mutex);
2921         qemu_cond_init(&decomp_param[i].cond);
2922         decomp_param[i].done = true;
2923         decomp_param[i].quit = false;
2924         qemu_thread_create(decompress_threads + i, "decompress",
2925                            do_data_decompress, decomp_param + i,
2926                            QEMU_THREAD_JOINABLE);
2927     }
2928     return 0;
2929 exit:
2930     compress_threads_load_cleanup();
2931     return -1;
2932 }
2933
2934 static void decompress_data_with_multi_threads(QEMUFile *f,
2935                                                void *host, int len)
2936 {
2937     int idx, thread_count;
2938
2939     thread_count = migrate_decompress_threads();
2940     qemu_mutex_lock(&decomp_done_lock);
2941     while (true) {
2942         for (idx = 0; idx < thread_count; idx++) {
2943             if (decomp_param[idx].done) {
2944                 decomp_param[idx].done = false;
2945                 qemu_mutex_lock(&decomp_param[idx].mutex);
2946                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2947                 decomp_param[idx].des = host;
2948                 decomp_param[idx].len = len;
2949                 qemu_cond_signal(&decomp_param[idx].cond);
2950                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2951                 break;
2952             }
2953         }
2954         if (idx < thread_count) {
2955             break;
2956         } else {
2957             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2958         }
2959     }
2960     qemu_mutex_unlock(&decomp_done_lock);
2961 }
2962
2963 /*
2964  * colo cache: this is for secondary VM, we cache the whole
2965  * memory of the secondary VM, it is need to hold the global lock
2966  * to call this helper.
2967  */
2968 int colo_init_ram_cache(void)
2969 {
2970     RAMBlock *block;
2971
2972     WITH_RCU_READ_LOCK_GUARD() {
2973         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2974             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2975                                                     NULL,
2976                                                     false);
2977             if (!block->colo_cache) {
2978                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
2979                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2980                              block->used_length);
2981                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2982                     if (block->colo_cache) {
2983                         qemu_anon_ram_free(block->colo_cache, block->used_length);
2984                         block->colo_cache = NULL;
2985                     }
2986                 }
2987                 return -errno;
2988             }
2989             memcpy(block->colo_cache, block->host, block->used_length);
2990         }
2991     }
2992
2993     /*
2994     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
2995     * with to decide which page in cache should be flushed into SVM's RAM. Here
2996     * we use the same name 'ram_bitmap' as for migration.
2997     */
2998     if (ram_bytes_total()) {
2999         RAMBlock *block;
3000
3001         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3002             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3003
3004             block->bmap = bitmap_new(pages);
3005             bitmap_set(block->bmap, 0, pages);
3006         }
3007     }
3008     ram_state = g_new0(RAMState, 1);
3009     ram_state->migration_dirty_pages = 0;
3010     qemu_mutex_init(&ram_state->bitmap_mutex);
3011     memory_global_dirty_log_start();
3012
3013     return 0;
3014 }
3015
3016 /* It is need to hold the global lock to call this helper */
3017 void colo_release_ram_cache(void)
3018 {
3019     RAMBlock *block;
3020
3021     memory_global_dirty_log_stop();
3022     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3023         g_free(block->bmap);
3024         block->bmap = NULL;
3025     }
3026
3027     WITH_RCU_READ_LOCK_GUARD() {
3028         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3029             if (block->colo_cache) {
3030                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3031                 block->colo_cache = NULL;
3032             }
3033         }
3034     }
3035     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3036     g_free(ram_state);
3037     ram_state = NULL;
3038 }
3039
3040 /**
3041  * ram_load_setup: Setup RAM for migration incoming side
3042  *
3043  * Returns zero to indicate success and negative for error
3044  *
3045  * @f: QEMUFile where to receive the data
3046  * @opaque: RAMState pointer
3047  */
3048 static int ram_load_setup(QEMUFile *f, void *opaque)
3049 {
3050     if (compress_threads_load_setup(f)) {
3051         return -1;
3052     }
3053
3054     xbzrle_load_setup();
3055     ramblock_recv_map_init();
3056
3057     return 0;
3058 }
3059
3060 static int ram_load_cleanup(void *opaque)
3061 {
3062     RAMBlock *rb;
3063
3064     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3065         qemu_ram_block_writeback(rb);
3066     }
3067
3068     xbzrle_load_cleanup();
3069     compress_threads_load_cleanup();
3070
3071     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3072         g_free(rb->receivedmap);
3073         rb->receivedmap = NULL;
3074     }
3075
3076     return 0;
3077 }
3078
3079 /**
3080  * ram_postcopy_incoming_init: allocate postcopy data structures
3081  *
3082  * Returns 0 for success and negative if there was one error
3083  *
3084  * @mis: current migration incoming state
3085  *
3086  * Allocate data structures etc needed by incoming migration with
3087  * postcopy-ram. postcopy-ram's similarly names
3088  * postcopy_ram_incoming_init does the work.
3089  */
3090 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3091 {
3092     return postcopy_ram_incoming_init(mis);
3093 }
3094
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads records from the stream until an EOS flag or an error is seen.
 * Target pages are accumulated until a whole host page has been received,
 * which is then placed with postcopy_place_page() or, when every target
 * page was a zero page, postcopy_place_page_zero().
 *
 * @f: QEMUFile where to receive the data from
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    /* Start of the host page currently being assembled */
    void *this_host = NULL;
    /* True while every target page seen for the current host page is zero */
    bool all_zero = false;
    /* Number of target pages received so far for the current host page */
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* The low bits of the header word carry flags, the rest the offset */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (target_pages == 1) {
                all_zero = true;
                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                    block->page_size);
            } else {
                /* not the 1st TP within the HP */
                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
                    (uintptr_t)this_host) {
                    error_report("Non-same host page %p/%p",
                                  host, this_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
                target_pages = 0;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            /* Reject lengths that cannot fit a worker's compbuf */
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                       block->page_size);

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
        }
    }

    return ret;
}
3275
3276 static bool postcopy_is_advised(void)
3277 {
3278     PostcopyState ps = postcopy_state_get();
3279     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3280 }
3281
3282 static bool postcopy_is_running(void)
3283 {
3284     PostcopyState ps = postcopy_state_get();
3285     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3286 }
3287
3288 /*
3289  * Flush content of RAM cache into SVM's memory.
3290  * Only flush the pages that be dirtied by PVM or SVM or both.
3291  */
3292 static void colo_flush_ram_cache(void)
3293 {
3294     RAMBlock *block = NULL;
3295     void *dst_host;
3296     void *src_host;
3297     unsigned long offset = 0;
3298
3299     memory_global_dirty_log_sync();
3300     WITH_RCU_READ_LOCK_GUARD() {
3301         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3302             ramblock_sync_dirty_bitmap(ram_state, block);
3303         }
3304     }
3305
3306     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3307     WITH_RCU_READ_LOCK_GUARD() {
3308         block = QLIST_FIRST_RCU(&ram_list.blocks);
3309
3310         while (block) {
3311             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3312
3313             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3314                 >= block->used_length) {
3315                 offset = 0;
3316                 block = QLIST_NEXT_RCU(block, next);
3317             } else {
3318                 migration_bitmap_clear_dirty(ram_state, block, offset);
3319                 dst_host = block->host
3320                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3321                 src_host = block->colo_cache
3322                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3323                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3324             }
3325         }
3326     }
3327     trace_colo_flush_ram_cache_end();
3328 }
3329
/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads records from the stream until an EOS flag or an error is seen and
 * finally waits for all outstanding decompression work.
 *
 * @f: QEMUFile where to receive the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    /* Compressed pages are only acceptable when compression is enabled */
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let main loop run, but an iteration of
         * the main loop is expensive, so do it each some iterations
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        /* The low bits of the header word carry flags, the rest the offset */
        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        /* Page-carrying records: resolve the destination address first */
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            /*
             * After going into COLO, we should load the Page into colo_cache.
             */
            if (migration_incoming_in_colo_state()) {
                host = colo_cache_from_block_offset(block, addr);
            } else {
                host = host_from_ram_block_offset(block, addr);
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                /* Block name: one length byte followed by that many bytes */
                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        /* NOTE: this 'addr' shadows the loop-level 'addr' */
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            /* Reject lengths that cannot fit a worker's compbuf */
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        /* Pick up any stream error recorded while handling this record */
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    /* Drain the decompression workers even on the error path */
    ret |= wait_for_decompress_done();
    return ret;
}
3514
3515 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3516 {
3517     int ret = 0;
3518     static uint64_t seq_iter;
3519     /*
3520      * If system is running in postcopy mode, page inserts to host memory must
3521      * be atomic
3522      */
3523     bool postcopy_running = postcopy_is_running();
3524
3525     seq_iter++;
3526
3527     if (version_id != 4) {
3528         return -EINVAL;
3529     }
3530
3531     /*
3532      * This RCU critical section can be very long running.
3533      * When RCU reclaims in the code start to become numerous,
3534      * it will be necessary to reduce the granularity of this
3535      * critical section.
3536      */
3537     WITH_RCU_READ_LOCK_GUARD() {
3538         if (postcopy_running) {
3539             ret = ram_load_postcopy(f);
3540         } else {
3541             ret = ram_load_precopy(f);
3542         }
3543     }
3544     trace_ram_load_complete(ret, seq_iter);
3545
3546     if (!ret  && migration_incoming_in_colo_state()) {
3547         colo_flush_ram_cache();
3548     }
3549     return ret;
3550 }
3551
3552 static bool ram_has_postcopy(void *opaque)
3553 {
3554     RAMBlock *rb;
3555     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3556         if (ramblock_is_pmem(rb)) {
3557             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3558                          "is not supported now!", rb->idstr, rb->host);
3559             return false;
3560         }
3561     }
3562
3563     return migrate_postcopy_ram();
3564 }
3565
3566 /* Sync all the dirty bitmap with destination VM.  */
3567 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3568 {
3569     RAMBlock *block;
3570     QEMUFile *file = s->to_dst_file;
3571     int ramblock_count = 0;
3572
3573     trace_ram_dirty_bitmap_sync_start();
3574
3575     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3576         qemu_savevm_send_recv_bitmap(file, block->idstr);
3577         trace_ram_dirty_bitmap_request(block->idstr);
3578         ramblock_count++;
3579     }
3580
3581     trace_ram_dirty_bitmap_sync_wait();
3582
3583     /* Wait until all the ramblocks' dirty bitmap synced */
3584     while (ramblock_count--) {
3585         qemu_sem_wait(&s->rp_state.rp_sem);
3586     }
3587
3588     trace_ram_dirty_bitmap_sync_complete();
3589
3590     return 0;
3591 }
3592
3593 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3594 {
3595     qemu_sem_post(&s->rp_state.rp_sem);
3596 }
3597
3598 /*
3599  * Read the received bitmap, revert it as the initial dirty bitmap.
3600  * This is only used when the postcopy migration is paused but wants
3601  * to resume from a middle point.
3602  */
3603 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3604 {
3605     int ret = -EINVAL;
3606     QEMUFile *file = s->rp_state.from_dst_file;
3607     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3608     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3609     uint64_t size, end_mark;
3610
3611     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3612
3613     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3614         error_report("%s: incorrect state %s", __func__,
3615                      MigrationStatus_str(s->state));
3616         return -EINVAL;
3617     }
3618
3619     /*
3620      * Note: see comments in ramblock_recv_bitmap_send() on why we
3621      * need the endianess convertion, and the paddings.
3622      */
3623     local_size = ROUND_UP(local_size, 8);
3624
3625     /* Add paddings */
3626     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3627
3628     size = qemu_get_be64(file);
3629
3630     /* The size of the bitmap should match with our ramblock */
3631     if (size != local_size) {
3632         error_report("%s: ramblock '%s' bitmap size mismatch "
3633                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3634                      block->idstr, size, local_size);
3635         ret = -EINVAL;
3636         goto out;
3637     }
3638
3639     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3640     end_mark = qemu_get_be64(file);
3641
3642     ret = qemu_file_get_error(file);
3643     if (ret || size != local_size) {
3644         error_report("%s: read bitmap failed for ramblock '%s': %d"
3645                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3646                      __func__, block->idstr, ret, local_size, size);
3647         ret = -EIO;
3648         goto out;
3649     }
3650
3651     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3652         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3653                      __func__, block->idstr, end_mark);
3654         ret = -EINVAL;
3655         goto out;
3656     }
3657
3658     /*
3659      * Endianess convertion. We are during postcopy (though paused).
3660      * The dirty bitmap won't change. We can directly modify it.
3661      */
3662     bitmap_from_le(block->bmap, le_bitmap, nbits);
3663
3664     /*
3665      * What we received is "received bitmap". Revert it as the initial
3666      * dirty bitmap for this ramblock.
3667      */
3668     bitmap_complement(block->bmap, block->bmap, nbits);
3669
3670     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3671
3672     /*
3673      * We succeeded to sync bitmap for current ramblock. If this is
3674      * the last one to sync, we need to notify the main send thread.
3675      */
3676     ram_dirty_bitmap_reload_notify(s);
3677
3678     ret = 0;
3679 out:
3680     g_free(le_bitmap);
3681     return ret;
3682 }
3683
3684 static int ram_resume_prepare(MigrationState *s, void *opaque)
3685 {
3686     RAMState *rs = *(RAMState **)opaque;
3687     int ret;
3688
3689     ret = ram_dirty_bitmap_sync_all(s, rs);
3690     if (ret) {
3691         return ret;
3692     }
3693
3694     ram_state_resume_prepare(rs, s->to_dst_file);
3695
3696     return 0;
3697 }
3698
3699 static SaveVMHandlers savevm_ram_handlers = {
3700     .save_setup = ram_save_setup,
3701     .save_live_iterate = ram_save_iterate,
3702     .save_live_complete_postcopy = ram_save_complete,
3703     .save_live_complete_precopy = ram_save_complete,
3704     .has_postcopy = ram_has_postcopy,
3705     .save_live_pending = ram_save_pending,
3706     .load_state = ram_load,
3707     .save_cleanup = ram_save_cleanup,
3708     .load_setup = ram_load_setup,
3709     .load_cleanup = ram_load_cleanup,
3710     .resume_prepare = ram_resume_prepare,
3711 };
3712
3713 void ram_mig_init(void)
3714 {
3715     qemu_mutex_init(&XBZRLE.lock);
3716     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3717 }