migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include "qemu/cutils.h"
  32 #include "qemu/bitops.h"
  33 #include "qemu/bitmap.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram.h"
  37 #include "migration.h"
  38 #include "migration/register.h"
  39 #include "migration/misc.h"
  40 #include "qemu-file.h"
  41 #include "postcopy-ram.h"
  42 #include "page_cache.h"
  43 #include "qemu/error-report.h"
  44 #include "qapi/error.h"
  45 #include "qapi/qapi-types-migration.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "block.h"
  54 #include "sysemu/sysemu.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  And to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
 */
  68
/*
 * Flag bits for the RAM migration stream.  save_page_header() writes them
 * OR-ed into the low bits of the 64-bit page offset word.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Set when the page belongs to the same block as the previous header */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Set on pages sent XBZRLE-encoded (see save_xbzrle_page()) */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
  79 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80 {
  81     return buffer_is_zero(p, size);
  82 }
  83
/* XBZRLE statistics (cache misses, pages, bytes, overflows, rates) */
XBZRLECacheStats xbzrle_counters;
  85
/*
 * State shared by the XBZRLE code in this file: the page cache plus the
 * scratch buffers used while encoding a page.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (snapshot of the page being encoded) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle()) {
 105         qemu_mutex_lock(&XBZRLE.lock);
 106     }
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle()) {
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113     }
 114 }
 115
 116 /**
 117  * xbzrle_cache_resize: resize the xbzrle cache
 118  *
 119  * This function is called from qmp_migrate_set_cache_size in main
 120  * thread, possibly while a migration is in progress.  A running
 121  * migration may be using the cache and might finish during this call,
 122  * hence changes to the cache are protected by XBZRLE.lock().
 123  *
 124  * Returns 0 for success or -1 for error
 125  *
 126  * @new_size: new cache size
 127  * @errp: set *errp if the check failed, with reason
 128  */
 129 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 130 {
 131     PageCache *new_cache;
 132     int64_t ret = 0;
 133
 134     /* Check for truncation */
 135     if (new_size != (size_t)new_size) {
 136         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 137                    "exceeding address space");
 138         return -1;
 139     }
 140
 141     if (new_size == migrate_xbzrle_cache_size()) {
 142         /* nothing to do */
 143         return 0;
 144     }
 145
 146     XBZRLE_cache_lock();
 147
 148     if (XBZRLE.cache != NULL) {
 149         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 150         if (!new_cache) {
 151             ret = -1;
 152             goto out;
 153         }
 154
 155         cache_fini(XBZRLE.cache);
 156         XBZRLE.cache = new_cache;
 157     }
 158 out:
 159     XBZRLE_cache_unlock();
 160     return ret;
 161 }
 162
 163 bool ramblock_is_ignored(RAMBlock *block)
 164 {
 165     return !qemu_ram_is_migratable(block) ||
 166            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 167 }
 168
 169 #undef RAMBLOCK_FOREACH
 170
 171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 172 {
 173     RAMBlock *block;
 174     int ret = 0;
 175
 176     RCU_READ_LOCK_GUARD();
 177
 178     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 179         ret = func(block, opaque);
 180         if (ret) {
 181             break;
 182         }
 183     }
 184     return ret;
 185 }
 186
 187 static void ramblock_recv_map_init(void)
 188 {
 189     RAMBlock *rb;
 190
 191     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 192         assert(!rb->receivedmap);
 193         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 194     }
 195 }
 196
 197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 198 {
 199     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 200                     rb->receivedmap);
 201 }
 202
 203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 204 {
 205     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 206 }
 207
 208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 209 {
 210     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 211 }
 212
 213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 214                                     size_t nr)
 215 {
 216     bitmap_set_atomic(rb->receivedmap,
 217                       ramblock_recv_bitmap_offset(host_addr, rb),
 218                       nr);
 219 }
 220
 221 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 222
 223 /*
 224  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 225  *
 226  * Returns >0 if success with sent bytes, or <0 if error.
 227  */
 228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 229                                   const char *block_name)
 230 {
 231     RAMBlock *block = qemu_ram_block_by_name(block_name);
 232     unsigned long *le_bitmap, nbits;
 233     uint64_t size;
 234
 235     if (!block) {
 236         error_report("%s: invalid block name: %s", __func__, block_name);
 237         return -1;
 238     }
 239
 240     nbits = block->used_length >> TARGET_PAGE_BITS;
 241
 242     /*
 243      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 244      * machines we may need 4 more bytes for padding (see below
 245      * comment). So extend it a bit before hand.
 246      */
 247     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 248
 249     /*
 250      * Always use little endian when sending the bitmap. This is
 251      * required that when source and destination VMs are not using the
 252      * same endianness. (Note: big endian won't work.)
 253      */
 254     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 255
 256     /* Size of the bitmap, in bytes */
 257     size = DIV_ROUND_UP(nbits, 8);
 258
 259     /*
 260      * size is always aligned to 8 bytes for 64bit machines, but it
 261      * may not be true for 32bit machines. We need this padding to
 262      * make sure the migration can survive even between 32bit and
 263      * 64bit machines.
 264      */
 265     size = ROUND_UP(size, 8);
 266
 267     qemu_put_be64(file, size);
 268     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 269     /*
 270      * Mark as an end, in case the middle part is screwed up due to
 271      * some "mysterious" reason.
 272      */
 273     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 274     qemu_fflush(file);
 275
 276     g_free(le_bitmap);
 277
 278     if (qemu_file_get_error(file)) {
 279         return qemu_file_get_error(file);
 280     }
 281
 282     return size + sizeof(size);
 283 }
 284
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably byte offset and length within rb -- confirm */
    hwaddr    offset;
    hwaddr    len;

    /* Linkage for RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 296
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /*
     * Last block from where we have sent data; save_page_header() uses it
     * to decide whether the block name can be omitted
     */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 352
/* Current RAM migration state; the helpers below treat NULL as "none" */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 356
 357 void precopy_infrastructure_init(void)
 358 {
 359     notifier_with_return_list_init(&precopy_notifier_list);
 360 }
 361
 362 void precopy_add_notifier(NotifierWithReturn *n)
 363 {
 364     notifier_with_return_list_add(&precopy_notifier_list, n);
 365 }
 366
 367 void precopy_remove_notifier(NotifierWithReturn *n)
 368 {
 369     notifier_with_return_remove(n);
 370 }
 371
 372 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 373 {
 374     PrecopyNotifyData pnd;
 375     pnd.reason = reason;
 376     pnd.errp = errp;
 377
 378     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 379 }
 380
 381 void precopy_enable_free_page_optimization(void)
 382 {
 383     if (!ram_state) {
 384         return;
 385     }
 386
 387     ram_state->fpo_enabled = true;
 388 }
 389
 390 uint64_t ram_bytes_remaining(void)
 391 {
 392     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 393                        0;
 394 }
 395
/* RAM migration statistics (transferred bytes, page counts, rates, ...) */
MigrationStats ram_counters;
 397
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 408
/* Compression statistics (pages, busy count, compressed size, rates) */
CompressionStats compression_counters;
 410
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* Set under mutex to ask the worker to exit */
    bool quit;
    /* Result of do_compress_ram_page() for the last page handled */
    bool zero_page;
    /* Dummy QEMUFile the worker writes its output into */
    QEMUFile *file;
    /* mutex/cond protect and signal the block/offset/quit hand-off */
    QemuMutex mutex;
    QemuCond cond;
    /* Pending work item; the worker resets block to NULL when it takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    /* TARGET_PAGE_SIZE scratch buffer passed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 426
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the users of this struct are outside this view; field
 * notes below are inferred from names and the CompressParam counterpart
 * -- confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* presumably the destination for the decompressed page */
    void *des;
    /* presumably the compressed input buffer and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 438
/*
 * Arrays with one entry per compression thread, allocated by
 * compress_threads_save_setup() and freed by
 * compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts of the variables above */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: used by do_data_compress() below */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 458
/*
 * Body of a compression thread (started by compress_threads_save_setup()).
 *
 * @opaque: the thread's CompressParam
 *
 * Loops until param->quit is set: sleeps on param->cond until a block is
 * posted in param->block, compresses that page into param->file, then
 * reports completion through param->done and comp_done_cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the work item and mark the slot empty before unlocking */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result and wake whoever waits on comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing to do: wait for a new block or a quit request */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 492
 493 static void compress_threads_save_cleanup(void)
 494 {
 495     int i, thread_count;
 496
 497     if (!migrate_use_compression() || !comp_param) {
 498         return;
 499     }
 500
 501     thread_count = migrate_compress_threads();
 502     for (i = 0; i < thread_count; i++) {
 503         /*
 504          * we use it as a indicator which shows if the thread is
 505          * properly init'd or not
 506          */
 507         if (!comp_param[i].file) {
 508             break;
 509         }
 510
 511         qemu_mutex_lock(&comp_param[i].mutex);
 512         comp_param[i].quit = true;
 513         qemu_cond_signal(&comp_param[i].cond);
 514         qemu_mutex_unlock(&comp_param[i].mutex);
 515
 516         qemu_thread_join(compress_threads + i);
 517         qemu_mutex_destroy(&comp_param[i].mutex);
 518         qemu_cond_destroy(&comp_param[i].cond);
 519         deflateEnd(&comp_param[i].stream);
 520         g_free(comp_param[i].originbuf);
 521         qemu_fclose(comp_param[i].file);
 522         comp_param[i].file = NULL;
 523     }
 524     qemu_mutex_destroy(&comp_done_lock);
 525     qemu_cond_destroy(&comp_done_cond);
 526     g_free(compress_threads);
 527     g_free(comp_param);
 528     compress_threads = NULL;
 529     comp_param = NULL;
 530 }
 531
 532 static int compress_threads_save_setup(void)
 533 {
 534     int i, thread_count;
 535
 536     if (!migrate_use_compression()) {
 537         return 0;
 538     }
 539     thread_count = migrate_compress_threads();
 540     compress_threads = g_new0(QemuThread, thread_count);
 541     comp_param = g_new0(CompressParam, thread_count);
 542     qemu_cond_init(&comp_done_cond);
 543     qemu_mutex_init(&comp_done_lock);
 544     for (i = 0; i < thread_count; i++) {
 545         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 546         if (!comp_param[i].originbuf) {
 547             goto exit;
 548         }
 549
 550         if (deflateInit(&comp_param[i].stream,
 551                         migrate_compress_level()) != Z_OK) {
 552             g_free(comp_param[i].originbuf);
 553             goto exit;
 554         }
 555
 556         /* comp_param[i].file is just used as a dummy buffer to save data,
 557          * set its ops to empty.
 558          */
 559         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 560         comp_param[i].done = true;
 561         comp_param[i].quit = false;
 562         qemu_mutex_init(&comp_param[i].mutex);
 563         qemu_cond_init(&comp_param[i].cond);
 564         qemu_thread_create(compress_threads + i, "compress",
 565                            do_data_compress, comp_param + i,
 566                            QEMU_THREAD_JOINABLE);
 567     }
 568     return 0;
 569
 570 exit:
 571     compress_threads_save_cleanup();
 572     return -1;
 573 }
 574
 575 /**
 576  * save_page_header: write page header to wire
 577  *
 578  * If this is the 1st block, it also writes the block identification
 579  *
 580  * Returns the number of bytes written
 581  *
 582  * @f: QEMUFile where to send the data
 583  * @block: block that contains the page we want to send
 584  * @offset: offset inside the block for the page
 585  *          in the lower bits, it contains flags
 586  */
 587 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 588                                ram_addr_t offset)
 589 {
 590     size_t size, len;
 591
 592     if (block == rs->last_sent_block) {
 593         offset |= RAM_SAVE_FLAG_CONTINUE;
 594     }
 595     qemu_put_be64(f, offset);
 596     size = 8;
 597
 598     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 599         len = strlen(block->idstr);
 600         qemu_put_byte(f, len);
 601         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 602         size += 1 + len;
 603         rs->last_sent_block = block;
 604     }
 605     return size;
 606 }
 607
 608 /**
 609  * mig_throttle_guest_down: throotle down the guest
 610  *
 611  * Reduce amount of guest cpu execution to hopefully slow down memory
 612  * writes. If guest dirty memory rate is reduced below the rate at
 613  * which we can transfer pages to the destination then we should be
 614  * able to complete migration. Some workloads dirty memory way too
 615  * fast and will not effectively converge, even with auto-converge.
 616  */
 617 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 618                                     uint64_t bytes_dirty_threshold)
 619 {
 620     MigrationState *s = migrate_get_current();
 621     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 622     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 623     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 624     int pct_max = s->parameters.max_cpu_throttle;
 625
 626     uint64_t throttle_now = cpu_throttle_get_percentage();
 627     uint64_t cpu_now, cpu_ideal, throttle_inc;
 628
 629     /* We have not started throttling yet. Let's start it. */
 630     if (!cpu_throttle_active()) {
 631         cpu_throttle_set(pct_initial);
 632     } else {
 633         /* Throttling already on, just increase the rate */
 634         if (!pct_tailslow) {
 635             throttle_inc = pct_increment;
 636         } else {
 637             /* Compute the ideal CPU percentage used by Guest, which may
 638              * make the dirty rate match the dirty rate threshold. */
 639             cpu_now = 100 - throttle_now;
 640             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 641                         bytes_dirty_period);
 642             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 643         }
 644         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 645     }
 646 }
 647
 648 /**
 649  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 650  *
 651  * @rs: current RAM state
 652  * @current_addr: address for the zero page
 653  *
 654  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 655  * The important thing is that a stale (not-yet-0'd) page be replaced
 656  * by the new data.
 657  * As a bonus, if the page wasn't in the cache it gets added so that
 658  * when a small write is made into the 0'd page it gets XBZRLE sent.
 659  */
 660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 661 {
 662     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 663         return;
 664     }
 665
 666     /* We don't care if this fails to allocate a new cache page
 667      * as long as it updated an old one */
 668     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 669                  ram_counters.dirty_sync_count);
 670 }
 671
/* Encoding byte written right after the header of an XBZRLE page */
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * On a cache miss the page is inserted into the cache (unless this is
 * the last stage) and -1 is returned so the caller sends it normally.
 * On a cache hit the page is encoded against the cached copy and, if
 * that produced output that fits, sent with RAM_SAVE_FLAG_XBZRLE.
 *
 * *current_data may be redirected to the cached copy of the page, so
 * that the caller sends data that matches the cache contents.
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *             (also returned on a cache miss)
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        /* The cache is left untouched during the last stage */
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        /* Page unchanged since it was last sent: nothing to write */
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        /* Encoding did not fit: count a full page, caller sends it raw */
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    /* 1 byte of encoding flag + 2 bytes of length + the payload */
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
 774
 775 /**
 776  * migration_bitmap_find_dirty: find the next dirty page from start
 777  *
 778  * Returns the page offset within memory region of the start of a dirty page
 779  *
 780  * @rs: current RAM state
 781  * @rb: RAMBlock where to search for dirty pages
 782  * @start: page where we start the search
 783  */
 784 static inline
 785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 786                                           unsigned long start)
 787 {
 788     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 789     unsigned long *bitmap = rb->bmap;
 790     unsigned long next;
 791
 792     if (ramblock_is_ignored(rb)) {
 793         return size;
 794     }
 795
 796     /*
 797      * When the free page optimization is enabled, we need to check the bitmap
 798      * to send the non-free pages rather than all the pages in the bulk stage.
 799      */
 800     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 801         next = start + 1;
 802     } else {
 803         next = find_next_bit(bitmap, size, start);
 804     }
 805
 806     return next;
 807 }
 808
 809 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 810                                                 RAMBlock *rb,
 811                                                 unsigned long page)
 812 {
 813     bool ret;
 814
 815     qemu_mutex_lock(&rs->bitmap_mutex);
 816
 817     /*
 818      * Clear dirty bitmap if needed.  This _must_ be called before we
 819      * send any of the page in the chunk because we need to make sure
 820      * we can capture further page content changes when we sync dirty
 821      * log the next time.  So as long as we are going to send any of
 822      * the page in the chunk we clear the remote dirty bitmap for all.
 823      * Clearing it earlier won't be a problem, but too late will.
 824      */
 825     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 826         uint8_t shift = rb->clear_bmap_shift;
 827         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 828         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 829
 830         /*
 831          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 832          * can make things easier sometimes since then start address
 833          * of the small chunk will always be 64 pages aligned so the
 834          * bitmap will always be aligned to unsigned long.  We should
 835          * even be able to remove this restriction but I'm simply
 836          * keeping it.
 837          */
 838         assert(shift >= 6);
 839         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 840         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 841     }
 842
 843     ret = test_and_clear_bit(page, rb->bmap);
 844
 845     if (ret) {
 846         rs->migration_dirty_pages--;
 847     }
 848     qemu_mutex_unlock(&rs->bitmap_mutex);
 849
 850     return ret;
 851 }
 852
 853 /* Called with RCU critical section */
 854 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 855 {
 856     uint64_t new_dirty_pages =
 857         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 858
 859     rs->migration_dirty_pages += new_dirty_pages;
 860     rs->num_dirty_pages_period += new_dirty_pages;
 861 }
 862
 863 /**
 864  * ram_pagesize_summary: calculate all the pagesizes of a VM
 865  *
 866  * Returns a summary bitmap of the page sizes of all RAMBlocks
 867  *
 868  * For VMs with just normal pages this is equivalent to the host page
 869  * size. If it's got some huge pages then it's the OR of all the
 870  * different page sizes.
 871  */
 872 uint64_t ram_pagesize_summary(void)
 873 {
 874     RAMBlock *block;
 875     uint64_t summary = 0;
 876
 877     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 878         summary |= block->page_size;
 879     }
 880
 881     return summary;
 882 }
 883
 884 uint64_t ram_get_total_transferred_pages(void)
 885 {
 886     return  ram_counters.normal + ram_counters.duplicate +
 887                 compression_counters.pages + xbzrle_counters.pages;
 888 }
 889
 890 static void migration_update_rates(RAMState *rs, int64_t end_time)
 891 {
 892     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 893     double compressed_size;
 894
 895     /* calculate period counters */
 896     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 897                 / (end_time - rs->time_last_bitmap_sync);
 898
 899     if (!page_count) {
 900         return;
 901     }
 902
 903     if (migrate_use_xbzrle()) {
 904         double encoded_size, unencoded_size;
 905
 906         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 907             rs->xbzrle_cache_miss_prev) / page_count;
 908         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 909         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 910                          TARGET_PAGE_SIZE;
 911         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 912         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 913             xbzrle_counters.encoding_rate = 0;
 914         } else {
 915             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 916         }
 917         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 918         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 919     }
 920
 921     if (migrate_use_compression()) {
 922         compression_counters.busy_rate = (double)(compression_counters.busy -
 923             rs->compress_thread_busy_prev) / page_count;
 924         rs->compress_thread_busy_prev = compression_counters.busy;
 925
 926         compressed_size = compression_counters.compressed_size -
 927                           rs->compressed_size_prev;
 928         if (compressed_size) {
 929             double uncompressed_size = (compression_counters.pages -
 930                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 931
 932             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 933             compression_counters.compression_rate =
 934                                         uncompressed_size / compressed_size;
 935
 936             rs->compress_pages_prev = compression_counters.pages;
 937             rs->compressed_size_prev = compression_counters.compressed_size;
 938         }
 939     }
 940 }
 941
 942 static void migration_trigger_throttle(RAMState *rs)
 943 {
 944     MigrationState *s = migrate_get_current();
 945     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 946
 947     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 948     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 949     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 950
 951     /* During block migration the auto-converge logic incorrectly detects
 952      * that ram migration makes no progress. Avoid this by disabling the
 953      * throttling logic during the bulk phase of block migration. */
 954     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 955         /* The following detection logic can be refined later. For now:
 956            Check to see if the ratio between dirtied bytes and the approx.
 957            amount of bytes that just got transferred since the last time
 958            we were in this routine reaches the threshold. If that happens
 959            twice, start or increase throttling. */
 960
 961         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 962             (++rs->dirty_rate_high_cnt >= 2)) {
 963             trace_migration_throttle();
 964             rs->dirty_rate_high_cnt = 0;
 965             mig_throttle_guest_down(bytes_dirty_period,
 966                                     bytes_dirty_threshold);
 967         }
 968     }
 969 }
 970
 971 static void migration_bitmap_sync(RAMState *rs)
 972 {
 973     RAMBlock *block;
 974     int64_t end_time;
 975
 976     ram_counters.dirty_sync_count++;
 977
 978     if (!rs->time_last_bitmap_sync) {
 979         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 980     }
 981
 982     trace_migration_bitmap_sync_start();
 983     memory_global_dirty_log_sync();
 984
 985     qemu_mutex_lock(&rs->bitmap_mutex);
 986     WITH_RCU_READ_LOCK_GUARD() {
 987         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 988             ramblock_sync_dirty_bitmap(rs, block);
 989         }
 990         ram_counters.remaining = ram_bytes_remaining();
 991     }
 992     qemu_mutex_unlock(&rs->bitmap_mutex);
 993
 994     memory_global_after_dirty_log_sync();
 995     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 996
 997     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 998
 999     /* more than 1 second = 1000 millisecons */
1000     if (end_time > rs->time_last_bitmap_sync + 1000) {
1001         migration_trigger_throttle(rs);
1002
1003         migration_update_rates(rs, end_time);
1004
1005         rs->target_page_count_prev = rs->target_page_count;
1006
1007         /* reset period counters */
1008         rs->time_last_bitmap_sync = end_time;
1009         rs->num_dirty_pages_period = 0;
1010         rs->bytes_xfer_prev = ram_counters.transferred;
1011     }
1012     if (migrate_use_events()) {
1013         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1014     }
1015 }
1016
1017 static void migration_bitmap_sync_precopy(RAMState *rs)
1018 {
1019     Error *local_err = NULL;
1020
1021     /*
1022      * The current notifier usage is just an optimization to migration, so we
1023      * don't stop the normal migration process in the error case.
1024      */
1025     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1026         error_report_err(local_err);
1027         local_err = NULL;
1028     }
1029
1030     migration_bitmap_sync(rs);
1031
1032     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1033         error_report_err(local_err);
1034     }
1035 }
1036
1037 /**
1038  * save_zero_page_to_file: send the zero page to the file
1039  *
1040  * Returns the size of data written to the file, 0 means the page is not
1041  * a zero page
1042  *
1043  * @rs: current RAM state
1044  * @file: the file where the data is saved
1045  * @block: block that contains the page we want to send
1046  * @offset: offset inside the block for the page
1047  */
1048 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1049                                   RAMBlock *block, ram_addr_t offset)
1050 {
1051     uint8_t *p = block->host + offset;
1052     int len = 0;
1053
1054     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1055         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1056         qemu_put_byte(file, 0);
1057         len += 1;
1058     }
1059     return len;
1060 }
1061
1062 /**
1063  * save_zero_page: send the zero page to the stream
1064  *
1065  * Returns the number of pages written.
1066  *
1067  * @rs: current RAM state
1068  * @block: block that contains the page we want to send
1069  * @offset: offset inside the block for the page
1070  */
1071 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1072 {
1073     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1074
1075     if (len) {
1076         ram_counters.duplicate++;
1077         ram_counters.transferred += len;
1078         return 1;
1079     }
1080     return -1;
1081 }
1082
1083 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1084 {
1085     if (!migrate_release_ram() || !migration_in_postcopy()) {
1086         return;
1087     }
1088
1089     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1090 }
1091
1092 /*
1093  * @pages: the number of pages written by the control path,
1094  *        < 0 - error
1095  *        > 0 - number of pages written
1096  *
1097  * Return true if the pages has been saved, otherwise false is returned.
1098  */
1099 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1100                               int *pages)
1101 {
1102     uint64_t bytes_xmit = 0;
1103     int ret;
1104
1105     *pages = -1;
1106     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1107                                 &bytes_xmit);
1108     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1109         return false;
1110     }
1111
1112     if (bytes_xmit) {
1113         ram_counters.transferred += bytes_xmit;
1114         *pages = 1;
1115     }
1116
1117     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1118         return true;
1119     }
1120
1121     if (bytes_xmit > 0) {
1122         ram_counters.normal++;
1123     } else if (bytes_xmit == 0) {
1124         ram_counters.duplicate++;
1125     }
1126
1127     return true;
1128 }
1129
1130 /*
1131  * directly send the page to the stream
1132  *
1133  * Returns the number of pages written.
1134  *
1135  * @rs: current RAM state
1136  * @block: block that contains the page we want to send
1137  * @offset: offset inside the block for the page
1138  * @buf: the page to be sent
1139  * @async: send to page asyncly
1140  */
1141 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1142                             uint8_t *buf, bool async)
1143 {
1144     ram_counters.transferred += save_page_header(rs, rs->f, block,
1145                                                  offset | RAM_SAVE_FLAG_PAGE);
1146     if (async) {
1147         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1148                               migrate_release_ram() &
1149                               migration_in_postcopy());
1150     } else {
1151         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1152     }
1153     ram_counters.transferred += TARGET_PAGE_SIZE;
1154     ram_counters.normal++;
1155     return 1;
1156 }
1157
1158 /**
1159  * ram_save_page: send the given page to the stream
1160  *
1161  * Returns the number of pages written.
1162  *          < 0 - error
1163  *          >=0 - Number of pages written - this might legally be 0
1164  *                if xbzrle noticed the page was the same.
1165  *
1166  * @rs: current RAM state
1167  * @block: block that contains the page we want to send
1168  * @offset: offset inside the block for the page
1169  * @last_stage: if we are at the completion stage
1170  */
1171 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1172 {
1173     int pages = -1;
1174     uint8_t *p;
1175     bool send_async = true;
1176     RAMBlock *block = pss->block;
1177     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1178     ram_addr_t current_addr = block->offset + offset;
1179
1180     p = block->host + offset;
1181     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1182
1183     XBZRLE_cache_lock();
1184     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1185         migrate_use_xbzrle()) {
1186         pages = save_xbzrle_page(rs, &p, current_addr, block,
1187                                  offset, last_stage);
1188         if (!last_stage) {
1189             /* Can't send this cached data async, since the cache page
1190              * might get updated before it gets to the wire
1191              */
1192             send_async = false;
1193         }
1194     }
1195
1196     /* XBZRLE overflow or normal page */
1197     if (pages == -1) {
1198         pages = save_normal_page(rs, block, offset, p, send_async);
1199     }
1200
1201     XBZRLE_cache_unlock();
1202
1203     return pages;
1204 }
1205
1206 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1207                                  ram_addr_t offset)
1208 {
1209     if (multifd_queue_page(rs->f, block, offset) < 0) {
1210         return -1;
1211     }
1212     ram_counters.normal++;
1213
1214     return 1;
1215 }
1216
1217 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1218                                  ram_addr_t offset, uint8_t *source_buf)
1219 {
1220     RAMState *rs = ram_state;
1221     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1222     bool zero_page = false;
1223     int ret;
1224
1225     if (save_zero_page_to_file(rs, f, block, offset)) {
1226         zero_page = true;
1227         goto exit;
1228     }
1229
1230     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1231
1232     /*
1233      * copy it to a internal buffer to avoid it being modified by VM
1234      * so that we can catch up the error during compression and
1235      * decompression
1236      */
1237     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1238     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1239     if (ret < 0) {
1240         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1241         error_report("compressed data failed!");
1242         return false;
1243     }
1244
1245 exit:
1246     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1247     return zero_page;
1248 }
1249
1250 static void
1251 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1252 {
1253     ram_counters.transferred += bytes_xmit;
1254
1255     if (param->zero_page) {
1256         ram_counters.duplicate++;
1257         return;
1258     }
1259
1260     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1261     compression_counters.compressed_size += bytes_xmit - 8;
1262     compression_counters.pages++;
1263 }
1264
1265 static bool save_page_use_compression(RAMState *rs);
1266
1267 static void flush_compressed_data(RAMState *rs)
1268 {
1269     int idx, len, thread_count;
1270
1271     if (!save_page_use_compression(rs)) {
1272         return;
1273     }
1274     thread_count = migrate_compress_threads();
1275
1276     qemu_mutex_lock(&comp_done_lock);
1277     for (idx = 0; idx < thread_count; idx++) {
1278         while (!comp_param[idx].done) {
1279             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1280         }
1281     }
1282     qemu_mutex_unlock(&comp_done_lock);
1283
1284     for (idx = 0; idx < thread_count; idx++) {
1285         qemu_mutex_lock(&comp_param[idx].mutex);
1286         if (!comp_param[idx].quit) {
1287             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1288             /*
1289              * it's safe to fetch zero_page without holding comp_done_lock
1290              * as there is no further request submitted to the thread,
1291              * i.e, the thread should be waiting for a request at this point.
1292              */
1293             update_compress_thread_counts(&comp_param[idx], len);
1294         }
1295         qemu_mutex_unlock(&comp_param[idx].mutex);
1296     }
1297 }
1298
1299 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1300                                        ram_addr_t offset)
1301 {
1302     param->block = block;
1303     param->offset = offset;
1304 }
1305
1306 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1307                                            ram_addr_t offset)
1308 {
1309     int idx, thread_count, bytes_xmit = -1, pages = -1;
1310     bool wait = migrate_compress_wait_thread();
1311
1312     thread_count = migrate_compress_threads();
1313     qemu_mutex_lock(&comp_done_lock);
1314 retry:
1315     for (idx = 0; idx < thread_count; idx++) {
1316         if (comp_param[idx].done) {
1317             comp_param[idx].done = false;
1318             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1319             qemu_mutex_lock(&comp_param[idx].mutex);
1320             set_compress_params(&comp_param[idx], block, offset);
1321             qemu_cond_signal(&comp_param[idx].cond);
1322             qemu_mutex_unlock(&comp_param[idx].mutex);
1323             pages = 1;
1324             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1325             break;
1326         }
1327     }
1328
1329     /*
1330      * wait for the free thread if the user specifies 'compress-wait-thread',
1331      * otherwise we will post the page out in the main thread as normal page.
1332      */
1333     if (pages < 0 && wait) {
1334         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1335         goto retry;
1336     }
1337     qemu_mutex_unlock(&comp_done_lock);
1338
1339     return pages;
1340 }
1341
1342 /**
1343  * find_dirty_block: find the next dirty page and update any state
1344  * associated with the search process.
1345  *
1346  * Returns true if a page is found
1347  *
1348  * @rs: current RAM state
1349  * @pss: data about the state of the current dirty page scan
1350  * @again: set to false if the search has scanned the whole of RAM
1351  */
1352 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1353 {
1354     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1355     if (pss->complete_round && pss->block == rs->last_seen_block &&
1356         pss->page >= rs->last_page) {
1357         /*
1358          * We've been once around the RAM and haven't found anything.
1359          * Give up.
1360          */
1361         *again = false;
1362         return false;
1363     }
1364     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1365         >= pss->block->used_length) {
1366         /* Didn't find anything in this RAM Block */
1367         pss->page = 0;
1368         pss->block = QLIST_NEXT_RCU(pss->block, next);
1369         if (!pss->block) {
1370             /*
1371              * If memory migration starts over, we will meet a dirtied page
1372              * which may still exists in compression threads's ring, so we
1373              * should flush the compressed data to make sure the new page
1374              * is not overwritten by the old one in the destination.
1375              *
1376              * Also If xbzrle is on, stop using the data compression at this
1377              * point. In theory, xbzrle can do better than compression.
1378              */
1379             flush_compressed_data(rs);
1380
1381             /* Hit the end of the list */
1382             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1383             /* Flag that we've looped */
1384             pss->complete_round = true;
1385             rs->ram_bulk_stage = false;
1386         }
1387         /* Didn't find anything this time, but try again on the new block */
1388         *again = true;
1389         return false;
1390     } else {
1391         /* Can go around again, but... */
1392         *again = true;
1393         /* We've found something so probably don't need to */
1394         return true;
1395     }
1396 }
1397
1398 /**
1399  * unqueue_page: gets a page of the queue
1400  *
1401  * Helper for 'get_queued_page' - gets a page off the queue
1402  *
1403  * Returns the block of the page (or NULL if none available)
1404  *
1405  * @rs: current RAM state
1406  * @offset: used to return the offset within the RAMBlock
1407  */
1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1409 {
1410     RAMBlock *block = NULL;
1411
1412     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1413         return NULL;
1414     }
1415
1416     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1417     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1418         struct RAMSrcPageRequest *entry =
1419                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1420         block = entry->rb;
1421         *offset = entry->offset;
1422
1423         if (entry->len > TARGET_PAGE_SIZE) {
1424             entry->len -= TARGET_PAGE_SIZE;
1425             entry->offset += TARGET_PAGE_SIZE;
1426         } else {
1427             memory_region_unref(block->mr);
1428             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1429             g_free(entry);
1430             migration_consume_urgent_request();
1431         }
1432     }
1433
1434     return block;
1435 }
1436
1437 /**
1438  * get_queued_page: unqueue a page from the postcopy requests
1439  *
1440  * Skips pages that are already sent (!dirty)
1441  *
1442  * Returns true if a queued page is found
1443  *
1444  * @rs: current RAM state
1445  * @pss: data about the state of the current dirty page scan
1446  */
1447 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1448 {
1449     RAMBlock  *block;
1450     ram_addr_t offset;
1451     bool dirty;
1452
1453     do {
1454         block = unqueue_page(rs, &offset);
1455         /*
1456          * We're sending this page, and since it's postcopy nothing else
1457          * will dirty it, and we must make sure it doesn't get sent again
1458          * even if this queue request was received after the background
1459          * search already sent it.
1460          */
1461         if (block) {
1462             unsigned long page;
1463
1464             page = offset >> TARGET_PAGE_BITS;
1465             dirty = test_bit(page, block->bmap);
1466             if (!dirty) {
1467                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1468                                                 page);
1469             } else {
1470                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1471             }
1472         }
1473
1474     } while (block && !dirty);
1475
1476     if (block) {
1477         /*
1478          * As soon as we start servicing pages out of order, then we have
1479          * to kill the bulk stage, since the bulk stage assumes
1480          * in (migration_bitmap_find_and_reset_dirty) that every page is
1481          * dirty, that's no longer true.
1482          */
1483         rs->ram_bulk_stage = false;
1484
1485         /*
1486          * We want the background search to continue from the queued page
1487          * since the guest is likely to want other pages near to the page
1488          * it just requested.
1489          */
1490         pss->block = block;
1491         pss->page = offset >> TARGET_PAGE_BITS;
1492
1493         /*
1494          * This unqueued page would break the "one round" check, even is
1495          * really rare.
1496          */
1497         pss->complete_round = false;
1498     }
1499
1500     return !!block;
1501 }
1502
1503 /**
1504  * migration_page_queue_free: drop any remaining pages in the ram
1505  * request queue
1506  *
1507  * It should be empty at the end anyway, but in error cases there may
1508  * be some left.  in case that there is any page left, we drop it.
1509  *
1510  */
1511 static void migration_page_queue_free(RAMState *rs)
1512 {
1513     struct RAMSrcPageRequest *mspr, *next_mspr;
1514     /* This queue generally should be empty - but in the case of a failed
1515      * migration might have some droppings in.
1516      */
1517     RCU_READ_LOCK_GUARD();
1518     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1519         memory_region_unref(mspr->rb->mr);
1520         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1521         g_free(mspr);
1522     }
1523 }
1524
1525 /**
1526  * ram_save_queue_pages: queue the page for transmission
1527  *
1528  * A request from postcopy destination for example.
1529  *
1530  * Returns zero on success or negative on error
1531  *
1532  * @rbname: Name of the RAMBLock of the request. NULL means the
1533  *          same that last one.
1534  * @start: starting address from the start of the RAMBlock
1535  * @len: length (in bytes) to send
1536  */
1537 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1538 {
1539     RAMBlock *ramblock;
1540     RAMState *rs = ram_state;
1541
1542     ram_counters.postcopy_requests++;
1543     RCU_READ_LOCK_GUARD();
1544
1545     if (!rbname) {
1546         /* Reuse last RAMBlock */
1547         ramblock = rs->last_req_rb;
1548
1549         if (!ramblock) {
1550             /*
1551              * Shouldn't happen, we can't reuse the last RAMBlock if
1552              * it's the 1st request.
1553              */
1554             error_report("ram_save_queue_pages no previous block");
1555             return -1;
1556         }
1557     } else {
1558         ramblock = qemu_ram_block_by_name(rbname);
1559
1560         if (!ramblock) {
1561             /* We shouldn't be asked for a non-existent RAMBlock */
1562             error_report("ram_save_queue_pages no block '%s'", rbname);
1563             return -1;
1564         }
1565         rs->last_req_rb = ramblock;
1566     }
1567     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1568     if (start + len > ramblock->used_length) {
1569         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1570                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1571                      __func__, start, len, ramblock->used_length);
1572         return -1;
1573     }
1574
1575     struct RAMSrcPageRequest *new_entry =
1576         g_malloc0(sizeof(struct RAMSrcPageRequest));
1577     new_entry->rb = ramblock;
1578     new_entry->offset = start;
1579     new_entry->len = len;
1580
1581     memory_region_ref(ramblock->mr);
1582     qemu_mutex_lock(&rs->src_page_req_mutex);
1583     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1584     migration_make_urgent_request();
1585     qemu_mutex_unlock(&rs->src_page_req_mutex);
1586
1587     return 0;
1588 }
1589
1590 static bool save_page_use_compression(RAMState *rs)
1591 {
1592     if (!migrate_use_compression()) {
1593         return false;
1594     }
1595
1596     /*
1597      * If xbzrle is on, stop using the data compression after first
1598      * round of migration even if compression is enabled. In theory,
1599      * xbzrle can do better than compression.
1600      */
1601     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1602         return true;
1603     }
1604
1605     return false;
1606 }
1607
1608 /*
1609  * try to compress the page before posting it out, return true if the page
1610  * has been properly handled by compression, otherwise needs other
1611  * paths to handle it
1612  */
1613 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1614 {
1615     if (!save_page_use_compression(rs)) {
1616         return false;
1617     }
1618
1619     /*
1620      * When starting the process of a new block, the first page of
1621      * the block should be sent out before other pages in the same
1622      * block, and all the pages in last block should have been sent
1623      * out, keeping this order is important, because the 'cont' flag
1624      * is used to avoid resending the block name.
1625      *
1626      * We post the fist page as normal page as compression will take
1627      * much CPU resource.
1628      */
1629     if (block != rs->last_sent_block) {
1630         flush_compressed_data(rs);
1631         return false;
1632     }
1633
1634     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1635         return true;
1636     }
1637
1638     compression_counters.busy++;
1639     return false;
1640 }
1641
1642 /**
1643  * ram_save_target_page: save one target page
1644  *
1645  * Returns the number of pages written
1646  *
1647  * @rs: current RAM state
1648  * @pss: data about the page we want to send
1649  * @last_stage: if we are at the completion stage
1650  */
1651 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1652                                 bool last_stage)
1653 {
1654     RAMBlock *block = pss->block;
1655     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1656     int res;
1657
1658     if (control_save_page(rs, block, offset, &res)) {
1659         return res;
1660     }
1661
1662     if (save_compress_page(rs, block, offset)) {
1663         return 1;
1664     }
1665
1666     res = save_zero_page(rs, block, offset);
1667     if (res > 0) {
1668         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1669          * page would be stale
1670          */
1671         if (!save_page_use_compression(rs)) {
1672             XBZRLE_cache_lock();
1673             xbzrle_cache_zero_page(rs, block->offset + offset);
1674             XBZRLE_cache_unlock();
1675         }
1676         ram_release_pages(block->idstr, offset, res);
1677         return res;
1678     }
1679
1680     /*
1681      * Do not use multifd for:
1682      * 1. Compression as the first page in the new block should be posted out
1683      *    before sending the compressed page
1684      * 2. In postcopy as one whole host page should be placed
1685      */
1686     if (!save_page_use_compression(rs) && migrate_use_multifd()
1687         && !migration_in_postcopy()) {
1688         return ram_save_multifd_page(rs, block, offset);
1689     }
1690
1691     return ram_save_page(rs, pss, last_stage);
1692 }
1693
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at pss->page send pages up to the end of the current host
 * page. It's valid for the initial page to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error.  Returns 0,
 * after reporting an error, if the block is an ignored one.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send; on a non-error return
 *       pss->page is the last page that was looked at, on error it is
 *       left at the page that failed
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /*
     * Despite its name this is the number of target pages per host page
     * of this block; it is used as a mask below, which presumably relies
     * on it being a power of two - TODO confirm.
     */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            /* 'continue' still evaluates the loop condition below */
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
1748
1749 /**
1750  * ram_find_and_save_block: finds a dirty page and sends it to f
1751  *
1752  * Called within an RCU critical section.
1753  *
1754  * Returns the number of pages written where zero means no dirty pages,
1755  * or negative on error
1756  *
1757  * @rs: current RAM state
1758  * @last_stage: if we are at the completion stage
1759  *
1760  * On systems where host-page-size > target-page-size it will send all the
1761  * pages in a host page that are dirty.
1762  */
1763
1764 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1765 {
1766     PageSearchStatus pss;
1767     int pages = 0;
1768     bool again, found;
1769
1770     /* No dirty page as there is zero RAM */
1771     if (!ram_bytes_total()) {
1772         return pages;
1773     }
1774
1775     pss.block = rs->last_seen_block;
1776     pss.page = rs->last_page;
1777     pss.complete_round = false;
1778
1779     if (!pss.block) {
1780         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1781     }
1782
1783     do {
1784         again = true;
1785         found = get_queued_page(rs, &pss);
1786
1787         if (!found) {
1788             /* priority queue empty, so just search for something dirty */
1789             found = find_dirty_block(rs, &pss, &again);
1790         }
1791
1792         if (found) {
1793             pages = ram_save_host_page(rs, &pss, last_stage);
1794         }
1795     } while (!pages && again);
1796
1797     rs->last_seen_block = pss.block;
1798     rs->last_page = pss.page;
1799
1800     return pages;
1801 }
1802
1803 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1804 {
1805     uint64_t pages = size / TARGET_PAGE_SIZE;
1806
1807     if (zero) {
1808         ram_counters.duplicate += pages;
1809     } else {
1810         ram_counters.normal += pages;
1811         ram_counters.transferred += size;
1812         qemu_update_position(f, size);
1813     }
1814 }
1815
1816 static uint64_t ram_bytes_total_common(bool count_ignored)
1817 {
1818     RAMBlock *block;
1819     uint64_t total = 0;
1820
1821     RCU_READ_LOCK_GUARD();
1822
1823     if (count_ignored) {
1824         RAMBLOCK_FOREACH_MIGRATABLE(block) {
1825             total += block->used_length;
1826         }
1827     } else {
1828         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1829             total += block->used_length;
1830         }
1831     }
1832     return total;
1833 }
1834
/* Total used bytes of all RAM blocks that are not ignored. */
uint64_t ram_bytes_total(void)
{
    const bool count_ignored = false;

    return ram_bytes_total_common(count_ignored);
}
1839
1840 static void xbzrle_load_setup(void)
1841 {
1842     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1843 }
1844
1845 static void xbzrle_load_cleanup(void)
1846 {
1847     g_free(XBZRLE.decoded_buf);
1848     XBZRLE.decoded_buf = NULL;
1849 }
1850
1851 static void ram_state_cleanup(RAMState **rsp)
1852 {
1853     if (*rsp) {
1854         migration_page_queue_free(*rsp);
1855         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1856         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1857         g_free(*rsp);
1858         *rsp = NULL;
1859     }
1860 }
1861
1862 static void xbzrle_cleanup(void)
1863 {
1864     XBZRLE_cache_lock();
1865     if (XBZRLE.cache) {
1866         cache_fini(XBZRLE.cache);
1867         g_free(XBZRLE.encoded_buf);
1868         g_free(XBZRLE.current_buf);
1869         g_free(XBZRLE.zero_target_page);
1870         XBZRLE.cache = NULL;
1871         XBZRLE.encoded_buf = NULL;
1872         XBZRLE.current_buf = NULL;
1873         XBZRLE.zero_target_page = NULL;
1874     }
1875     XBZRLE_cache_unlock();
1876 }
1877
1878 static void ram_save_cleanup(void *opaque)
1879 {
1880     RAMState **rsp = opaque;
1881     RAMBlock *block;
1882
1883     /* caller have hold iothread lock or is in a bh, so there is
1884      * no writing race against the migration bitmap
1885      */
1886     memory_global_dirty_log_stop();
1887
1888     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1889         g_free(block->clear_bmap);
1890         block->clear_bmap = NULL;
1891         g_free(block->bmap);
1892         block->bmap = NULL;
1893     }
1894
1895     xbzrle_cleanup();
1896     compress_threads_save_cleanup();
1897     ram_state_cleanup(rsp);
1898 }
1899
1900 static void ram_state_reset(RAMState *rs)
1901 {
1902     rs->last_seen_block = NULL;
1903     rs->last_sent_block = NULL;
1904     rs->last_page = 0;
1905     rs->last_version = ram_list.version;
1906     rs->ram_bulk_stage = true;
1907     rs->fpo_enabled = false;
1908 }
1909
/*
 * Time budget, in milliseconds, for the page-sending loop of
 * ram_save_iterate(); the loop compares its elapsed time against this
 * value every 64 iterations and stops once it is exceeded.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
1911
/*
 * Debug helper: print the bitmap @todump to stderr, 128 bits per line,
 * as a string of '1' (set) and '.' (clear) prefixed by the index of the
 * first bit of the line.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; lines consisting only of that value are not printed.
 *
 * @todump: bitmap to dump; a NULL bitmap is silently ignored
 * @expected: the uninteresting bit value
 * @pages: number of bits of @todump to examine
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    const unsigned long max_linelen = 128;
    char linebuf[129];
    unsigned long cur;

    /* There is no fallback bitmap to dump, so NULL means nothing to do */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += max_linelen) {
        /* The last line may be shorter than a full line */
        unsigned long remaining = pages - cur;
        unsigned long linelen = remaining < max_linelen ? remaining
                                                        : max_linelen;
        unsigned long curb;
        bool found = false;

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }
    }
}
1945
1946 /* **** functions for postcopy ***** */
1947
1948 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1949 {
1950     struct RAMBlock *block;
1951
1952     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1953         unsigned long *bitmap = block->bmap;
1954         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1955         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1956
1957         while (run_start < range) {
1958             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1959             ram_discard_range(block->idstr,
1960                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1961                               ((ram_addr_t)(run_end - run_start))
1962                                 << TARGET_PAGE_BITS);
1963             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1964         }
1965     }
1966 }
1967
1968 /**
1969  * postcopy_send_discard_bm_ram: discard a RAMBlock
1970  *
1971  * Returns zero on success
1972  *
1973  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1974  *
1975  * @ms: current migration state
1976  * @block: RAMBlock to discard
1977  */
1978 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1979 {
1980     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1981     unsigned long current;
1982     unsigned long *bitmap = block->bmap;
1983
1984     for (current = 0; current < end; ) {
1985         unsigned long one = find_next_bit(bitmap, end, current);
1986         unsigned long zero, discard_length;
1987
1988         if (one >= end) {
1989             break;
1990         }
1991
1992         zero = find_next_zero_bit(bitmap, end, one + 1);
1993
1994         if (zero >= end) {
1995             discard_length = end - one;
1996         } else {
1997             discard_length = zero - one;
1998         }
1999         postcopy_discard_send_range(ms, one, discard_length);
2000         current = one + discard_length;
2001     }
2002
2003     return 0;
2004 }
2005
2006 /**
2007  * postcopy_each_ram_send_discard: discard all RAMBlocks
2008  *
2009  * Returns 0 for success or negative for error
2010  *
2011  * Utility for the outgoing postcopy code.
2012  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2013  *   passing it bitmap indexes and name.
2014  * (qemu_ram_foreach_block ends up passing unscaled lengths
2015  *  which would mean postcopy code would have to deal with target page)
2016  *
2017  * @ms: current migration state
2018  */
2019 static int postcopy_each_ram_send_discard(MigrationState *ms)
2020 {
2021     struct RAMBlock *block;
2022     int ret;
2023
2024     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2025         postcopy_discard_send_init(ms, block->idstr);
2026
2027         /*
2028          * Postcopy sends chunks of bitmap over the wire, but it
2029          * just needs indexes at this point, avoids it having
2030          * target page specific code.
2031          */
2032         ret = postcopy_send_discard_bm_ram(ms, block);
2033         postcopy_discard_send_finish(ms);
2034         if (ret) {
2035             return ret;
2036         }
2037     }
2038
2039     return 0;
2040 }
2041
2042 /**
2043  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2044  *
2045  * Helper for postcopy_chunk_hostpages; it's called twice to
2046  * canonicalize the two bitmaps, that are similar, but one is
2047  * inverted.
2048  *
2049  * Postcopy requires that all target pages in a hostpage are dirty or
2050  * clean, not a mix.  This function canonicalizes the bitmaps.
2051  *
2052  * @ms: current migration state
2053  * @block: block that contains the page we want to canonicalize
2054  */
2055 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2056 {
2057     RAMState *rs = ram_state;
2058     unsigned long *bitmap = block->bmap;
2059     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2060     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2061     unsigned long run_start;
2062
2063     if (block->page_size == TARGET_PAGE_SIZE) {
2064         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2065         return;
2066     }
2067
2068     /* Find a dirty page */
2069     run_start = find_next_bit(bitmap, pages, 0);
2070
2071     while (run_start < pages) {
2072
2073         /*
2074          * If the start of this run of pages is in the middle of a host
2075          * page, then we need to fixup this host page.
2076          */
2077         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2078             /* Find the end of this run */
2079             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2080             /*
2081              * If the end isn't at the start of a host page, then the
2082              * run doesn't finish at the end of a host page
2083              * and we need to discard.
2084              */
2085         }
2086
2087         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2088             unsigned long page;
2089             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2090                                                              host_ratio);
2091             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2092
2093             /* Clean up the bitmap */
2094             for (page = fixup_start_addr;
2095                  page < fixup_start_addr + host_ratio; page++) {
2096                 /*
2097                  * Remark them as dirty, updating the count for any pages
2098                  * that weren't previously dirty.
2099                  */
2100                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2101             }
2102         }
2103
2104         /* Find the next dirty page for the next iteration */
2105         run_start = find_next_bit(bitmap, pages, run_start);
2106     }
2107 }
2108
2109 /**
2110  * postcopy_chunk_hostpages: discard any partially sent host page
2111  *
2112  * Utility for the outgoing postcopy code.
2113  *
2114  * Discard any partially sent host-page size chunks, mark any partially
2115  * dirty host-page size chunks as all dirty.  In this case the host-page
2116  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2117  *
2118  * Returns zero on success
2119  *
2120  * @ms: current migration state
2121  * @block: block we want to work with
2122  */
2123 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2124 {
2125     postcopy_discard_send_init(ms, block->idstr);
2126
2127     /*
2128      * Ensure that all partially dirty host pages are made fully dirty.
2129      */
2130     postcopy_chunk_hostpages_pass(ms, block);
2131
2132     postcopy_discard_send_finish(ms);
2133     return 0;
2134 }
2135
2136 /**
2137  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2138  *
2139  * Returns zero on success
2140  *
2141  * Transmit the set of pages to be discarded after precopy to the target
2142  * these are pages that:
2143  *     a) Have been previously transmitted but are now dirty again
2144  *     b) Pages that have never been transmitted, this ensures that
2145  *        any pages on the destination that have been mapped by background
2146  *        tasks get discarded (transparent huge pages is the specific concern)
2147  * Hopefully this is pretty sparse
2148  *
2149  * @ms: current migration state
2150  */
2151 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2152 {
2153     RAMState *rs = ram_state;
2154     RAMBlock *block;
2155     int ret;
2156
2157     RCU_READ_LOCK_GUARD();
2158
2159     /* This should be our last sync, the src is now paused */
2160     migration_bitmap_sync(rs);
2161
2162     /* Easiest way to make sure we don't resume in the middle of a host-page */
2163     rs->last_seen_block = NULL;
2164     rs->last_sent_block = NULL;
2165     rs->last_page = 0;
2166
2167     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2168         /* Deal with TPS != HPS and huge pages */
2169         ret = postcopy_chunk_hostpages(ms, block);
2170         if (ret) {
2171             return ret;
2172         }
2173
2174 #ifdef DEBUG_POSTCOPY
2175         ram_debug_dump_bitmap(block->bmap, true,
2176                               block->used_length >> TARGET_PAGE_BITS);
2177 #endif
2178     }
2179     trace_ram_postcopy_send_discard_bitmap();
2180
2181     return postcopy_each_ram_send_discard(ms);
2182 }
2183
2184 /**
2185  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2186  *
2187  * Returns zero on success
2188  *
2189  * @rbname: name of the RAMBlock of the request. NULL means the
2190  *          same that last one.
2191  * @start: RAMBlock starting page
2192  * @length: RAMBlock size
2193  */
2194 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2195 {
2196     trace_ram_discard_range(rbname, start, length);
2197
2198     RCU_READ_LOCK_GUARD();
2199     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2200
2201     if (!rb) {
2202         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2203         return -1;
2204     }
2205
2206     /*
2207      * On source VM, we don't need to update the received bitmap since
2208      * we don't even have one.
2209      */
2210     if (rb->receivedmap) {
2211         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2212                      length >> qemu_target_page_bits());
2213     }
2214
2215     return ram_block_discard_range(rb, start, length);
2216 }
2217
2218 /*
2219  * For every allocation, we will try not to crash the VM if the
2220  * allocation failed.
2221  */
2222 static int xbzrle_init(void)
2223 {
2224     Error *local_err = NULL;
2225
2226     if (!migrate_use_xbzrle()) {
2227         return 0;
2228     }
2229
2230     XBZRLE_cache_lock();
2231
2232     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2233     if (!XBZRLE.zero_target_page) {
2234         error_report("%s: Error allocating zero page", __func__);
2235         goto err_out;
2236     }
2237
2238     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2239                               TARGET_PAGE_SIZE, &local_err);
2240     if (!XBZRLE.cache) {
2241         error_report_err(local_err);
2242         goto free_zero_page;
2243     }
2244
2245     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2246     if (!XBZRLE.encoded_buf) {
2247         error_report("%s: Error allocating encoded_buf", __func__);
2248         goto free_cache;
2249     }
2250
2251     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2252     if (!XBZRLE.current_buf) {
2253         error_report("%s: Error allocating current_buf", __func__);
2254         goto free_encoded_buf;
2255     }
2256
2257     /* We are all good */
2258     XBZRLE_cache_unlock();
2259     return 0;
2260
2261 free_encoded_buf:
2262     g_free(XBZRLE.encoded_buf);
2263     XBZRLE.encoded_buf = NULL;
2264 free_cache:
2265     cache_fini(XBZRLE.cache);
2266     XBZRLE.cache = NULL;
2267 free_zero_page:
2268     g_free(XBZRLE.zero_target_page);
2269     XBZRLE.zero_target_page = NULL;
2270 err_out:
2271     XBZRLE_cache_unlock();
2272     return -ENOMEM;
2273 }
2274
2275 static int ram_state_init(RAMState **rsp)
2276 {
2277     *rsp = g_try_new0(RAMState, 1);
2278
2279     if (!*rsp) {
2280         error_report("%s: Init ramstate fail", __func__);
2281         return -1;
2282     }
2283
2284     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2285     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2286     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2287
2288     /*
2289      * Count the total number of pages used by ram blocks not including any
2290      * gaps due to alignment or unplugs.
2291      * This must match with the initial values of dirty bitmap.
2292      */
2293     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2294     ram_state_reset(*rsp);
2295
2296     return 0;
2297 }
2298
2299 static void ram_list_init_bitmaps(void)
2300 {
2301     MigrationState *ms = migrate_get_current();
2302     RAMBlock *block;
2303     unsigned long pages;
2304     uint8_t shift;
2305
2306     /* Skip setting bitmap if there is no RAM */
2307     if (ram_bytes_total()) {
2308         shift = ms->clear_bitmap_shift;
2309         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2310             error_report("clear_bitmap_shift (%u) too big, using "
2311                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2312             shift = CLEAR_BITMAP_SHIFT_MAX;
2313         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2314             error_report("clear_bitmap_shift (%u) too small, using "
2315                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2316             shift = CLEAR_BITMAP_SHIFT_MIN;
2317         }
2318
2319         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2320             pages = block->max_length >> TARGET_PAGE_BITS;
2321             /*
2322              * The initial dirty bitmap for migration must be set with all
2323              * ones to make sure we'll migrate every guest RAM page to
2324              * destination.
2325              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2326              * new migration after a failed migration, ram_list.
2327              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2328              * guest memory.
2329              */
2330             block->bmap = bitmap_new(pages);
2331             bitmap_set(block->bmap, 0, pages);
2332             block->clear_bmap_shift = shift;
2333             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2334         }
2335     }
2336 }
2337
2338 static void ram_init_bitmaps(RAMState *rs)
2339 {
2340     /* For memory_global_dirty_log_start below.  */
2341     qemu_mutex_lock_iothread();
2342     qemu_mutex_lock_ramlist();
2343
2344     WITH_RCU_READ_LOCK_GUARD() {
2345         ram_list_init_bitmaps();
2346         memory_global_dirty_log_start();
2347         migration_bitmap_sync_precopy(rs);
2348     }
2349     qemu_mutex_unlock_ramlist();
2350     qemu_mutex_unlock_iothread();
2351 }
2352
2353 static int ram_init_all(RAMState **rsp)
2354 {
2355     if (ram_state_init(rsp)) {
2356         return -1;
2357     }
2358
2359     if (xbzrle_init()) {
2360         ram_state_cleanup(rsp);
2361         return -1;
2362     }
2363
2364     ram_init_bitmaps(*rsp);
2365
2366     return 0;
2367 }
2368
2369 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2370 {
2371     RAMBlock *block;
2372     uint64_t pages = 0;
2373
2374     /*
2375      * Postcopy is not using xbzrle/compression, so no need for that.
2376      * Also, since source are already halted, we don't need to care
2377      * about dirty page logging as well.
2378      */
2379
2380     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2381         pages += bitmap_count_one(block->bmap,
2382                                   block->used_length >> TARGET_PAGE_BITS);
2383     }
2384
2385     /* This may not be aligned with current bitmaps. Recalculate. */
2386     rs->migration_dirty_pages = pages;
2387
2388     rs->last_seen_block = NULL;
2389     rs->last_sent_block = NULL;
2390     rs->last_page = 0;
2391     rs->last_version = ram_list.version;
2392     /*
2393      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2394      * matter what we have sent.
2395      */
2396     rs->ram_bulk_stage = false;
2397
2398     /* Update RAMState cache of output QEMUFile */
2399     rs->f = out;
2400
2401     trace_ram_state_resume_prepare(pages);
2402 }
2403
2404 /*
2405  * This function clears bits of the free pages reported by the caller from the
2406  * migration dirty bitmap. @addr is the host address corresponding to the
2407  * start of the continuous guest free pages, and @len is the total bytes of
2408  * those pages.
2409  */
2410 void qemu_guest_free_page_hint(void *addr, size_t len)
2411 {
2412     RAMBlock *block;
2413     ram_addr_t offset;
2414     size_t used_len, start, npages;
2415     MigrationState *s = migrate_get_current();
2416
2417     /* This function is currently expected to be used during live migration */
2418     if (!migration_is_setup_or_active(s->state)) {
2419         return;
2420     }
2421
2422     for (; len > 0; len -= used_len, addr += used_len) {
2423         block = qemu_ram_block_from_host(addr, false, &offset);
2424         if (unlikely(!block || offset >= block->used_length)) {
2425             /*
2426              * The implementation might not support RAMBlock resize during
2427              * live migration, but it could happen in theory with future
2428              * updates. So we add a check here to capture that case.
2429              */
2430             error_report_once("%s unexpected error", __func__);
2431             return;
2432         }
2433
2434         if (len <= block->used_length - offset) {
2435             used_len = len;
2436         } else {
2437             used_len = block->used_length - offset;
2438         }
2439
2440         start = offset >> TARGET_PAGE_BITS;
2441         npages = used_len >> TARGET_PAGE_BITS;
2442
2443         qemu_mutex_lock(&ram_state->bitmap_mutex);
2444         ram_state->migration_dirty_pages -=
2445                       bitmap_count_one_with_offset(block->bmap, start, npages);
2446         bitmap_clear(block->bmap, start, npages);
2447         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2448     }
2449 }
2450
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
2456  */
2457
2458 /**
2459  * ram_save_setup: Setup RAM for migration
2460  *
2461  * Returns zero to indicate success and negative for error
2462  *
2463  * @f: QEMUFile where to send the data
2464  * @opaque: RAMState pointer
2465  */
2466 static int ram_save_setup(QEMUFile *f, void *opaque)
2467 {
2468     RAMState **rsp = opaque;
2469     RAMBlock *block;
2470
2471     if (compress_threads_save_setup()) {
2472         return -1;
2473     }
2474
2475     /* migration has already setup the bitmap, reuse it. */
2476     if (!migration_in_colo_state()) {
2477         if (ram_init_all(rsp) != 0) {
2478             compress_threads_save_cleanup();
2479             return -1;
2480         }
2481     }
2482     (*rsp)->f = f;
2483
2484     WITH_RCU_READ_LOCK_GUARD() {
2485         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2486
2487         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2488             qemu_put_byte(f, strlen(block->idstr));
2489             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2490             qemu_put_be64(f, block->used_length);
2491             if (migrate_postcopy_ram() && block->page_size !=
2492                                           qemu_host_page_size) {
2493                 qemu_put_be64(f, block->page_size);
2494             }
2495             if (migrate_ignore_shared()) {
2496                 qemu_put_be64(f, block->mr->addr);
2497             }
2498         }
2499     }
2500
2501     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2502     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2503
2504     multifd_send_sync_main(f);
2505     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2506     qemu_fflush(f);
2507
2508     return 0;
2509 }
2510
2511 /**
2512  * ram_save_iterate: iterative stage for migration
2513  *
2514  * Returns zero to indicate success and negative for error
2515  *
2516  * @f: QEMUFile where to send the data
2517  * @opaque: RAMState pointer
2518  */
2519 static int ram_save_iterate(QEMUFile *f, void *opaque)
2520 {
2521     RAMState **temp = opaque;
2522     RAMState *rs = *temp;
2523     int ret = 0;
2524     int i;
2525     int64_t t0;
2526     int done = 0;
2527
2528     if (blk_mig_bulk_active()) {
2529         /* Avoid transferring ram during bulk phase of block migration as
2530          * the bulk phase will usually take a long time and transferring
2531          * ram updates during that time is pointless. */
2532         goto out;
2533     }
2534
2535     WITH_RCU_READ_LOCK_GUARD() {
2536         if (ram_list.version != rs->last_version) {
2537             ram_state_reset(rs);
2538         }
2539
2540         /* Read version before ram_list.blocks */
2541         smp_rmb();
2542
2543         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2544
2545         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2546         i = 0;
2547         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2548                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2549             int pages;
2550
2551             if (qemu_file_get_error(f)) {
2552                 break;
2553             }
2554
2555             pages = ram_find_and_save_block(rs, false);
2556             /* no more pages to sent */
2557             if (pages == 0) {
2558                 done = 1;
2559                 break;
2560             }
2561
2562             if (pages < 0) {
2563                 qemu_file_set_error(f, pages);
2564                 break;
2565             }
2566
2567             rs->target_page_count += pages;
2568
2569             /*
2570              * During postcopy, it is necessary to make sure one whole host
2571              * page is sent in one chunk.
2572              */
2573             if (migrate_postcopy_ram()) {
2574                 flush_compressed_data(rs);
2575             }
2576
2577             /*
2578              * we want to check in the 1st loop, just in case it was the 1st
2579              * time and we had to sync the dirty bitmap.
2580              * qemu_clock_get_ns() is a bit expensive, so we only check each
2581              * some iterations
2582              */
2583             if ((i & 63) == 0) {
2584                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2585                               1000000;
2586                 if (t1 > MAX_WAIT) {
2587                     trace_ram_save_iterate_big_wait(t1, i);
2588                     break;
2589                 }
2590             }
2591             i++;
2592         }
2593     }
2594
2595     /*
2596      * Must occur before EOS (or any QEMUFile operation)
2597      * because of RDMA protocol.
2598      */
2599     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2600
2601 out:
2602     if (ret >= 0
2603         && migration_is_setup_or_active(migrate_get_current()->state)) {
2604         multifd_send_sync_main(rs->f);
2605         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2606         qemu_fflush(f);
2607         ram_counters.transferred += 8;
2608
2609         ret = qemu_file_get_error(f);
2610     }
2611     if (ret < 0) {
2612         return ret;
2613     }
2614
2615     return done;
2616 }
2617
2618 /**
2619  * ram_save_complete: function called to send the remaining amount of ram
2620  *
2621  * Returns zero to indicate success or negative on error
2622  *
2623  * Called with iothread lock
2624  *
2625  * @f: QEMUFile where to send the data
2626  * @opaque: RAMState pointer
2627  */
2628 static int ram_save_complete(QEMUFile *f, void *opaque)
2629 {
2630     RAMState **temp = opaque;
2631     RAMState *rs = *temp;
2632     int ret = 0;
2633
2634     WITH_RCU_READ_LOCK_GUARD() {
2635         if (!migration_in_postcopy()) {
2636             migration_bitmap_sync_precopy(rs);
2637         }
2638
2639         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2640
2641         /* try transferring iterative blocks of memory */
2642
2643         /* flush all remaining blocks regardless of rate limiting */
2644         while (true) {
2645             int pages;
2646
2647             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2648             /* no more blocks to sent */
2649             if (pages == 0) {
2650                 break;
2651             }
2652             if (pages < 0) {
2653                 ret = pages;
2654                 break;
2655             }
2656         }
2657
2658         flush_compressed_data(rs);
2659         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2660     }
2661
2662     if (ret >= 0) {
2663         multifd_send_sync_main(rs->f);
2664         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2665         qemu_fflush(f);
2666     }
2667
2668     return ret;
2669 }
2670
2671 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2672                              uint64_t *res_precopy_only,
2673                              uint64_t *res_compatible,
2674                              uint64_t *res_postcopy_only)
2675 {
2676     RAMState **temp = opaque;
2677     RAMState *rs = *temp;
2678     uint64_t remaining_size;
2679
2680     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2681
2682     if (!migration_in_postcopy() &&
2683         remaining_size < max_size) {
2684         qemu_mutex_lock_iothread();
2685         WITH_RCU_READ_LOCK_GUARD() {
2686             migration_bitmap_sync_precopy(rs);
2687         }
2688         qemu_mutex_unlock_iothread();
2689         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2690     }
2691
2692     if (migrate_postcopy_ram()) {
2693         /* We can do postcopy, and all the data is postcopiable */
2694         *res_compatible += remaining_size;
2695     } else {
2696         *res_precopy_only += remaining_size;
2697     }
2698 }
2699
2700 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2701 {
2702     unsigned int xh_len;
2703     int xh_flags;
2704     uint8_t *loaded_data;
2705
2706     /* extract RLE header */
2707     xh_flags = qemu_get_byte(f);
2708     xh_len = qemu_get_be16(f);
2709
2710     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2711         error_report("Failed to load XBZRLE page - wrong compression!");
2712         return -1;
2713     }
2714
2715     if (xh_len > TARGET_PAGE_SIZE) {
2716         error_report("Failed to load XBZRLE page - len overflow!");
2717         return -1;
2718     }
2719     loaded_data = XBZRLE.decoded_buf;
2720     /* load data and decode */
2721     /* it can change loaded_data to point to an internal buffer */
2722     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2723
2724     /* decode RLE */
2725     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2726                              TARGET_PAGE_SIZE) == -1) {
2727         error_report("Failed to load XBZRLE page - decode error!");
2728         return -1;
2729     }
2730
2731     return 0;
2732 }
2733
2734 /**
2735  * ram_block_from_stream: read a RAMBlock id from the migration stream
2736  *
2737  * Must be called from within a rcu critical section.
2738  *
2739  * Returns a pointer from within the RCU-protected ram_list.
2740  *
2741  * @f: QEMUFile where to read the data from
2742  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2743  */
2744 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2745 {
2746     static RAMBlock *block;
2747     char id[256];
2748     uint8_t len;
2749
2750     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2751         if (!block) {
2752             error_report("Ack, bad migration stream!");
2753             return NULL;
2754         }
2755         return block;
2756     }
2757
2758     len = qemu_get_byte(f);
2759     qemu_get_buffer(f, (uint8_t *)id, len);
2760     id[len] = 0;
2761
2762     block = qemu_ram_block_by_name(id);
2763     if (!block) {
2764         error_report("Can't find block %s", id);
2765         return NULL;
2766     }
2767
2768     if (ramblock_is_ignored(block)) {
2769         error_report("block %s should not be migrated !", id);
2770         return NULL;
2771     }
2772
2773     return block;
2774 }
2775
2776 static inline void *host_from_ram_block_offset(RAMBlock *block,
2777                                                ram_addr_t offset)
2778 {
2779     if (!offset_in_ramblock(block, offset)) {
2780         return NULL;
2781     }
2782
2783     return block->host + offset;
2784 }
2785
2786 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2787                              ram_addr_t offset, bool record_bitmap)
2788 {
2789     if (!offset_in_ramblock(block, offset)) {
2790         return NULL;
2791     }
2792     if (!block->colo_cache) {
2793         error_report("%s: colo_cache is NULL in block :%s",
2794                      __func__, block->idstr);
2795         return NULL;
2796     }
2797
2798     /*
2799     * During colo checkpoint, we need bitmap of these migrated pages.
2800     * It help us to decide which pages in ram cache should be flushed
2801     * into VM's RAM later.
2802     */
2803     if (record_bitmap &&
2804         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2805         ram_state->migration_dirty_pages++;
2806     }
2807     return block->colo_cache + offset;
2808 }
2809
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when @ch is zero and the range already
 * reads as all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already zero: skip the write */
        return;
    }
    memset(host, ch, size);
}
2826
2827 /* return the size after decompression, or negative value on error */
2828 static int
2829 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2830                      const uint8_t *source, size_t source_len)
2831 {
2832     int err;
2833
2834     err = inflateReset(stream);
2835     if (err != Z_OK) {
2836         return -1;
2837     }
2838
2839     stream->avail_in = source_len;
2840     stream->next_in = (uint8_t *)source;
2841     stream->avail_out = dest_len;
2842     stream->next_out = dest;
2843
2844     err = inflate(stream, Z_NO_FLUSH);
2845     if (err != Z_STREAM_END) {
2846         return -1;
2847     }
2848
2849     return stream->total_out;
2850 }
2851
2852 static void *do_data_decompress(void *opaque)
2853 {
2854     DecompressParam *param = opaque;
2855     unsigned long pagesize;
2856     uint8_t *des;
2857     int len, ret;
2858
2859     qemu_mutex_lock(&param->mutex);
2860     while (!param->quit) {
2861         if (param->des) {
2862             des = param->des;
2863             len = param->len;
2864             param->des = 0;
2865             qemu_mutex_unlock(&param->mutex);
2866
2867             pagesize = TARGET_PAGE_SIZE;
2868
2869             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2870                                        param->compbuf, len);
2871             if (ret < 0 && migrate_get_current()->decompress_error_check) {
2872                 error_report("decompress data failed");
2873                 qemu_file_set_error(decomp_file, ret);
2874             }
2875
2876             qemu_mutex_lock(&decomp_done_lock);
2877             param->done = true;
2878             qemu_cond_signal(&decomp_done_cond);
2879             qemu_mutex_unlock(&decomp_done_lock);
2880
2881             qemu_mutex_lock(&param->mutex);
2882         } else {
2883             qemu_cond_wait(&param->cond, &param->mutex);
2884         }
2885     }
2886     qemu_mutex_unlock(&param->mutex);
2887
2888     return NULL;
2889 }
2890
2891 static int wait_for_decompress_done(void)
2892 {
2893     int idx, thread_count;
2894
2895     if (!migrate_use_compression()) {
2896         return 0;
2897     }
2898
2899     thread_count = migrate_decompress_threads();
2900     qemu_mutex_lock(&decomp_done_lock);
2901     for (idx = 0; idx < thread_count; idx++) {
2902         while (!decomp_param[idx].done) {
2903             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2904         }
2905     }
2906     qemu_mutex_unlock(&decomp_done_lock);
2907     return qemu_file_get_error(decomp_file);
2908 }
2909
2910 static void compress_threads_load_cleanup(void)
2911 {
2912     int i, thread_count;
2913
2914     if (!migrate_use_compression()) {
2915         return;
2916     }
2917     thread_count = migrate_decompress_threads();
2918     for (i = 0; i < thread_count; i++) {
2919         /*
2920          * we use it as a indicator which shows if the thread is
2921          * properly init'd or not
2922          */
2923         if (!decomp_param[i].compbuf) {
2924             break;
2925         }
2926
2927         qemu_mutex_lock(&decomp_param[i].mutex);
2928         decomp_param[i].quit = true;
2929         qemu_cond_signal(&decomp_param[i].cond);
2930         qemu_mutex_unlock(&decomp_param[i].mutex);
2931     }
2932     for (i = 0; i < thread_count; i++) {
2933         if (!decomp_param[i].compbuf) {
2934             break;
2935         }
2936
2937         qemu_thread_join(decompress_threads + i);
2938         qemu_mutex_destroy(&decomp_param[i].mutex);
2939         qemu_cond_destroy(&decomp_param[i].cond);
2940         inflateEnd(&decomp_param[i].stream);
2941         g_free(decomp_param[i].compbuf);
2942         decomp_param[i].compbuf = NULL;
2943     }
2944     g_free(decompress_threads);
2945     g_free(decomp_param);
2946     decompress_threads = NULL;
2947     decomp_param = NULL;
2948     decomp_file = NULL;
2949 }
2950
2951 static int compress_threads_load_setup(QEMUFile *f)
2952 {
2953     int i, thread_count;
2954
2955     if (!migrate_use_compression()) {
2956         return 0;
2957     }
2958
2959     thread_count = migrate_decompress_threads();
2960     decompress_threads = g_new0(QemuThread, thread_count);
2961     decomp_param = g_new0(DecompressParam, thread_count);
2962     qemu_mutex_init(&decomp_done_lock);
2963     qemu_cond_init(&decomp_done_cond);
2964     decomp_file = f;
2965     for (i = 0; i < thread_count; i++) {
2966         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2967             goto exit;
2968         }
2969
2970         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2971         qemu_mutex_init(&decomp_param[i].mutex);
2972         qemu_cond_init(&decomp_param[i].cond);
2973         decomp_param[i].done = true;
2974         decomp_param[i].quit = false;
2975         qemu_thread_create(decompress_threads + i, "decompress",
2976                            do_data_decompress, decomp_param + i,
2977                            QEMU_THREAD_JOINABLE);
2978     }
2979     return 0;
2980 exit:
2981     compress_threads_load_cleanup();
2982     return -1;
2983 }
2984
2985 static void decompress_data_with_multi_threads(QEMUFile *f,
2986                                                void *host, int len)
2987 {
2988     int idx, thread_count;
2989
2990     thread_count = migrate_decompress_threads();
2991     qemu_mutex_lock(&decomp_done_lock);
2992     while (true) {
2993         for (idx = 0; idx < thread_count; idx++) {
2994             if (decomp_param[idx].done) {
2995                 decomp_param[idx].done = false;
2996                 qemu_mutex_lock(&decomp_param[idx].mutex);
2997                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2998                 decomp_param[idx].des = host;
2999                 decomp_param[idx].len = len;
3000                 qemu_cond_signal(&decomp_param[idx].cond);
3001                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3002                 break;
3003             }
3004         }
3005         if (idx < thread_count) {
3006             break;
3007         } else {
3008             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3009         }
3010     }
3011     qemu_mutex_unlock(&decomp_done_lock);
3012 }
3013
3014 /*
3015  * colo cache: this is for secondary VM, we cache the whole
3016  * memory of the secondary VM, it is need to hold the global lock
3017  * to call this helper.
3018  */
3019 int colo_init_ram_cache(void)
3020 {
3021     RAMBlock *block;
3022
3023     WITH_RCU_READ_LOCK_GUARD() {
3024         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3025             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3026                                                     NULL,
3027                                                     false);
3028             if (!block->colo_cache) {
3029                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3030                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3031                              block->used_length);
3032                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3033                     if (block->colo_cache) {
3034                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3035                         block->colo_cache = NULL;
3036                     }
3037                 }
3038                 return -errno;
3039             }
3040         }
3041     }
3042
3043     /*
3044     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3045     * with to decide which page in cache should be flushed into SVM's RAM. Here
3046     * we use the same name 'ram_bitmap' as for migration.
3047     */
3048     if (ram_bytes_total()) {
3049         RAMBlock *block;
3050
3051         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3052             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3053             block->bmap = bitmap_new(pages);
3054         }
3055     }
3056
3057     ram_state_init(&ram_state);
3058     return 0;
3059 }
3060
3061 /* TODO: duplicated with ram_init_bitmaps */
3062 void colo_incoming_start_dirty_log(void)
3063 {
3064     RAMBlock *block = NULL;
3065     /* For memory_global_dirty_log_start below. */
3066     qemu_mutex_lock_iothread();
3067     qemu_mutex_lock_ramlist();
3068
3069     memory_global_dirty_log_sync();
3070     WITH_RCU_READ_LOCK_GUARD() {
3071         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3072             ramblock_sync_dirty_bitmap(ram_state, block);
3073             /* Discard this dirty bitmap record */
3074             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3075         }
3076         memory_global_dirty_log_start();
3077     }
3078     ram_state->migration_dirty_pages = 0;
3079     qemu_mutex_unlock_ramlist();
3080     qemu_mutex_unlock_iothread();
3081 }
3082
3083 /* It is need to hold the global lock to call this helper */
3084 void colo_release_ram_cache(void)
3085 {
3086     RAMBlock *block;
3087
3088     memory_global_dirty_log_stop();
3089     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3090         g_free(block->bmap);
3091         block->bmap = NULL;
3092     }
3093
3094     WITH_RCU_READ_LOCK_GUARD() {
3095         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3096             if (block->colo_cache) {
3097                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3098                 block->colo_cache = NULL;
3099             }
3100         }
3101     }
3102     ram_state_cleanup(&ram_state);
3103 }
3104
3105 /**
3106  * ram_load_setup: Setup RAM for migration incoming side
3107  *
3108  * Returns zero to indicate success and negative for error
3109  *
3110  * @f: QEMUFile where to receive the data
3111  * @opaque: RAMState pointer
3112  */
3113 static int ram_load_setup(QEMUFile *f, void *opaque)
3114 {
3115     if (compress_threads_load_setup(f)) {
3116         return -1;
3117     }
3118
3119     xbzrle_load_setup();
3120     ramblock_recv_map_init();
3121
3122     return 0;
3123 }
3124
3125 static int ram_load_cleanup(void *opaque)
3126 {
3127     RAMBlock *rb;
3128
3129     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3130         qemu_ram_block_writeback(rb);
3131     }
3132
3133     xbzrle_load_cleanup();
3134     compress_threads_load_cleanup();
3135
3136     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3137         g_free(rb->receivedmap);
3138         rb->receivedmap = NULL;
3139     }
3140
3141     return 0;
3142 }
3143
3144 /**
3145  * ram_postcopy_incoming_init: allocate postcopy data structures
3146  *
3147  * Returns 0 for success and negative if there was one error
3148  *
3149  * @mis: current migration incoming state
3150  *
3151  * Allocate data structures etc needed by incoming migration with
3152  * postcopy-ram. postcopy-ram's similarly names
3153  * postcopy_ram_incoming_init does the work.
3154  */
3155 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3156 {
3157     return postcopy_ram_incoming_init(mis);
3158 }
3159
3160 /**
3161  * ram_load_postcopy: load a page in postcopy case
3162  *
3163  * Returns 0 for success or -errno in case of error
3164  *
3165  * Called in postcopy mode by ram_load().
3166  * rcu_read_lock is taken prior to this being called.
3167  *
3168  * @f: QEMUFile where to send the data
3169  */
3170 static int ram_load_postcopy(QEMUFile *f)
3171 {
3172     int flags = 0, ret = 0;
3173     bool place_needed = false;
3174     bool matches_target_page_size = false;
3175     MigrationIncomingState *mis = migration_incoming_get_current();
3176     /* Temporary page that is later 'placed' */
3177     void *postcopy_host_page = mis->postcopy_tmp_page;
3178     void *this_host = NULL;
3179     bool all_zero = true;
3180     int target_pages = 0;
3181
3182     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3183         ram_addr_t addr;
3184         void *host = NULL;
3185         void *page_buffer = NULL;
3186         void *place_source = NULL;
3187         RAMBlock *block = NULL;
3188         uint8_t ch;
3189         int len;
3190
3191         addr = qemu_get_be64(f);
3192
3193         /*
3194          * If qemu file error, we should stop here, and then "addr"
3195          * may be invalid
3196          */
3197         ret = qemu_file_get_error(f);
3198         if (ret) {
3199             break;
3200         }
3201
3202         flags = addr & ~TARGET_PAGE_MASK;
3203         addr &= TARGET_PAGE_MASK;
3204
3205         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3206         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3207                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3208             block = ram_block_from_stream(f, flags);
3209
3210             host = host_from_ram_block_offset(block, addr);
3211             if (!host) {
3212                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3213                 ret = -EINVAL;
3214                 break;
3215             }
3216             target_pages++;
3217             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3218             /*
3219              * Postcopy requires that we place whole host pages atomically;
3220              * these may be huge pages for RAMBlocks that are backed by
3221              * hugetlbfs.
3222              * To make it atomic, the data is read into a temporary page
3223              * that's moved into place later.
3224              * The migration protocol uses,  possibly smaller, target-pages
3225              * however the source ensures it always sends all the components
3226              * of a host page in one chunk.
3227              */
3228             page_buffer = postcopy_host_page +
3229                           ((uintptr_t)host & (block->page_size - 1));
3230             if (target_pages == 1) {
3231                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3232                                                     block->page_size);
3233             } else {
3234                 /* not the 1st TP within the HP */
3235                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3236                     (uintptr_t)this_host) {
3237                     error_report("Non-same host page %p/%p",
3238                                   host, this_host);
3239                     ret = -EINVAL;
3240                     break;
3241                 }
3242             }
3243
3244             /*
3245              * If it's the last part of a host page then we place the host
3246              * page
3247              */
3248             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3249                 place_needed = true;
3250             }
3251             place_source = postcopy_host_page;
3252         }
3253
3254         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3255         case RAM_SAVE_FLAG_ZERO:
3256             ch = qemu_get_byte(f);
3257             /*
3258              * Can skip to set page_buffer when
3259              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3260              */
3261             if (ch || !matches_target_page_size) {
3262                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3263             }
3264             if (ch) {
3265                 all_zero = false;
3266             }
3267             break;
3268
3269         case RAM_SAVE_FLAG_PAGE:
3270             all_zero = false;
3271             if (!matches_target_page_size) {
3272                 /* For huge pages, we always use temporary buffer */
3273                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3274             } else {
3275                 /*
3276                  * For small pages that matches target page size, we
3277                  * avoid the qemu_file copy.  Instead we directly use
3278                  * the buffer of QEMUFile to place the page.  Note: we
3279                  * cannot do any QEMUFile operation before using that
3280                  * buffer to make sure the buffer is valid when
3281                  * placing the page.
3282                  */
3283                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3284                                          TARGET_PAGE_SIZE);
3285             }
3286             break;
3287         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3288             all_zero = false;
3289             len = qemu_get_be32(f);
3290             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3291                 error_report("Invalid compressed data length: %d", len);
3292                 ret = -EINVAL;
3293                 break;
3294             }
3295             decompress_data_with_multi_threads(f, page_buffer, len);
3296             break;
3297
3298         case RAM_SAVE_FLAG_EOS:
3299             /* normal exit */
3300             multifd_recv_sync_main();
3301             break;
3302         default:
3303             error_report("Unknown combination of migration flags: 0x%x"
3304                          " (postcopy mode)", flags);
3305             ret = -EINVAL;
3306             break;
3307         }
3308
3309         /* Got the whole host page, wait for decompress before placing. */
3310         if (place_needed) {
3311             ret |= wait_for_decompress_done();
3312         }
3313
3314         /* Detect for any possible file errors */
3315         if (!ret && qemu_file_get_error(f)) {
3316             ret = qemu_file_get_error(f);
3317         }
3318
3319         if (!ret && place_needed) {
3320             /* This gets called at the last target page in the host page */
3321             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3322                                                        block->page_size);
3323
3324             if (all_zero) {
3325                 ret = postcopy_place_page_zero(mis, place_dest,
3326                                                block);
3327             } else {
3328                 ret = postcopy_place_page(mis, place_dest,
3329                                           place_source, block);
3330             }
3331             place_needed = false;
3332             target_pages = 0;
3333             /* Assume we have a zero page until we detect something different */
3334             all_zero = true;
3335         }
3336     }
3337
3338     return ret;
3339 }
3340
3341 static bool postcopy_is_advised(void)
3342 {
3343     PostcopyState ps = postcopy_state_get();
3344     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3345 }
3346
3347 static bool postcopy_is_running(void)
3348 {
3349     PostcopyState ps = postcopy_state_get();
3350     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3351 }
3352
3353 /*
3354  * Flush content of RAM cache into SVM's memory.
3355  * Only flush the pages that be dirtied by PVM or SVM or both.
3356  */
3357 void colo_flush_ram_cache(void)
3358 {
3359     RAMBlock *block = NULL;
3360     void *dst_host;
3361     void *src_host;
3362     unsigned long offset = 0;
3363
3364     memory_global_dirty_log_sync();
3365     WITH_RCU_READ_LOCK_GUARD() {
3366         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3367             ramblock_sync_dirty_bitmap(ram_state, block);
3368         }
3369     }
3370
3371     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3372     WITH_RCU_READ_LOCK_GUARD() {
3373         block = QLIST_FIRST_RCU(&ram_list.blocks);
3374
3375         while (block) {
3376             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3377
3378             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3379                 >= block->used_length) {
3380                 offset = 0;
3381                 block = QLIST_NEXT_RCU(block, next);
3382             } else {
3383                 migration_bitmap_clear_dirty(ram_state, block, offset);
3384                 dst_host = block->host
3385                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3386                 src_host = block->colo_cache
3387                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3388                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3389             }
3390         }
3391     }
3392     trace_colo_flush_ram_cache_end();
3393 }
3394
3395 /**
3396  * ram_load_precopy: load pages in precopy case
3397  *
3398  * Returns 0 for success or -errno in case of error
3399  *
3400  * Called in precopy mode by ram_load().
3401  * rcu_read_lock is taken prior to this being called.
3402  *
3403  * @f: QEMUFile where to send the data
3404  */
3405 static int ram_load_precopy(QEMUFile *f)
3406 {
3407     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3408     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3409     bool postcopy_advised = postcopy_is_advised();
3410     if (!migrate_use_compression()) {
3411         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3412     }
3413
3414     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3415         ram_addr_t addr, total_ram_bytes;
3416         void *host = NULL, *host_bak = NULL;
3417         uint8_t ch;
3418
3419         /*
3420          * Yield periodically to let main loop run, but an iteration of
3421          * the main loop is expensive, so do it each some iterations
3422          */
3423         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3424             aio_co_schedule(qemu_get_current_aio_context(),
3425                             qemu_coroutine_self());
3426             qemu_coroutine_yield();
3427         }
3428         i++;
3429
3430         addr = qemu_get_be64(f);
3431         flags = addr & ~TARGET_PAGE_MASK;
3432         addr &= TARGET_PAGE_MASK;
3433
3434         if (flags & invalid_flags) {
3435             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3436                 error_report("Received an unexpected compressed page");
3437             }
3438
3439             ret = -EINVAL;
3440             break;
3441         }
3442
3443         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3444                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3445             RAMBlock *block = ram_block_from_stream(f, flags);
3446
3447             host = host_from_ram_block_offset(block, addr);
3448             /*
3449              * After going into COLO stage, we should not load the page
3450              * into SVM's memory directly, we put them into colo_cache firstly.
3451              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3452              * Previously, we copied all these memory in preparing stage of COLO
3453              * while we need to stop VM, which is a time-consuming process.
3454              * Here we optimize it by a trick, back-up every page while in
3455              * migration process while COLO is enabled, though it affects the
3456              * speed of the migration, but it obviously reduce the downtime of
3457              * back-up all SVM'S memory in COLO preparing stage.
3458              */
3459             if (migration_incoming_colo_enabled()) {
3460                 if (migration_incoming_in_colo_state()) {
3461                     /* In COLO stage, put all pages into cache temporarily */
3462                     host = colo_cache_from_block_offset(block, addr, true);
3463                 } else {
3464                    /*
3465                     * In migration stage but before COLO stage,
3466                     * Put all pages into both cache and SVM's memory.
3467                     */
3468                     host_bak = colo_cache_from_block_offset(block, addr, false);
3469                 }
3470             }
3471             if (!host) {
3472                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3473                 ret = -EINVAL;
3474                 break;
3475             }
3476             if (!migration_incoming_in_colo_state()) {
3477                 ramblock_recv_bitmap_set(block, host);
3478             }
3479
3480             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3481         }
3482
3483         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3484         case RAM_SAVE_FLAG_MEM_SIZE:
3485             /* Synchronize RAM block list */
3486             total_ram_bytes = addr;
3487             while (!ret && total_ram_bytes) {
3488                 RAMBlock *block;
3489                 char id[256];
3490                 ram_addr_t length;
3491
3492                 len = qemu_get_byte(f);
3493                 qemu_get_buffer(f, (uint8_t *)id, len);
3494                 id[len] = 0;
3495                 length = qemu_get_be64(f);
3496
3497                 block = qemu_ram_block_by_name(id);
3498                 if (block && !qemu_ram_is_migratable(block)) {
3499                     error_report("block %s should not be migrated !", id);
3500                     ret = -EINVAL;
3501                 } else if (block) {
3502                     if (length != block->used_length) {
3503                         Error *local_err = NULL;
3504
3505                         ret = qemu_ram_resize(block, length,
3506                                               &local_err);
3507                         if (local_err) {
3508                             error_report_err(local_err);
3509                         }
3510                     }
3511                     /* For postcopy we need to check hugepage sizes match */
3512                     if (postcopy_advised &&
3513                         block->page_size != qemu_host_page_size) {
3514                         uint64_t remote_page_size = qemu_get_be64(f);
3515                         if (remote_page_size != block->page_size) {
3516                             error_report("Mismatched RAM page size %s "
3517                                          "(local) %zd != %" PRId64,
3518                                          id, block->page_size,
3519                                          remote_page_size);
3520                             ret = -EINVAL;
3521                         }
3522                     }
3523                     if (migrate_ignore_shared()) {
3524                         hwaddr addr = qemu_get_be64(f);
3525                         if (ramblock_is_ignored(block) &&
3526                             block->mr->addr != addr) {
3527                             error_report("Mismatched GPAs for block %s "
3528                                          "%" PRId64 "!= %" PRId64,
3529                                          id, (uint64_t)addr,
3530                                          (uint64_t)block->mr->addr);
3531                             ret = -EINVAL;
3532                         }
3533                     }
3534                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3535                                           block->idstr);
3536                 } else {
3537                     error_report("Unknown ramblock \"%s\", cannot "
3538                                  "accept migration", id);
3539                     ret = -EINVAL;
3540                 }
3541
3542                 total_ram_bytes -= length;
3543             }
3544             break;
3545
3546         case RAM_SAVE_FLAG_ZERO:
3547             ch = qemu_get_byte(f);
3548             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3549             break;
3550
3551         case RAM_SAVE_FLAG_PAGE:
3552             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3553             break;
3554
3555         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3556             len = qemu_get_be32(f);
3557             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3558                 error_report("Invalid compressed data length: %d", len);
3559                 ret = -EINVAL;
3560                 break;
3561             }
3562             decompress_data_with_multi_threads(f, host, len);
3563             break;
3564
3565         case RAM_SAVE_FLAG_XBZRLE:
3566             if (load_xbzrle(f, addr, host) < 0) {
3567                 error_report("Failed to decompress XBZRLE page at "
3568                              RAM_ADDR_FMT, addr);
3569                 ret = -EINVAL;
3570                 break;
3571             }
3572             break;
3573         case RAM_SAVE_FLAG_EOS:
3574             /* normal exit */
3575             multifd_recv_sync_main();
3576             break;
3577         default:
3578             if (flags & RAM_SAVE_FLAG_HOOK) {
3579                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3580             } else {
3581                 error_report("Unknown combination of migration flags: 0x%x",
3582                              flags);
3583                 ret = -EINVAL;
3584             }
3585         }
3586         if (!ret) {
3587             ret = qemu_file_get_error(f);
3588         }
3589         if (!ret && host_bak) {
3590             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3591         }
3592     }
3593
3594     ret |= wait_for_decompress_done();
3595     return ret;
3596 }
3597
3598 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3599 {
3600     int ret = 0;
3601     static uint64_t seq_iter;
3602     /*
3603      * If system is running in postcopy mode, page inserts to host memory must
3604      * be atomic
3605      */
3606     bool postcopy_running = postcopy_is_running();
3607
3608     seq_iter++;
3609
3610     if (version_id != 4) {
3611         return -EINVAL;
3612     }
3613
3614     /*
3615      * This RCU critical section can be very long running.
3616      * When RCU reclaims in the code start to become numerous,
3617      * it will be necessary to reduce the granularity of this
3618      * critical section.
3619      */
3620     WITH_RCU_READ_LOCK_GUARD() {
3621         if (postcopy_running) {
3622             ret = ram_load_postcopy(f);
3623         } else {
3624             ret = ram_load_precopy(f);
3625         }
3626     }
3627     trace_ram_load_complete(ret, seq_iter);
3628
3629     return ret;
3630 }
3631
3632 static bool ram_has_postcopy(void *opaque)
3633 {
3634     RAMBlock *rb;
3635     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3636         if (ramblock_is_pmem(rb)) {
3637             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3638                          "is not supported now!", rb->idstr, rb->host);
3639             return false;
3640         }
3641     }
3642
3643     return migrate_postcopy_ram();
3644 }
3645
3646 /* Sync all the dirty bitmap with destination VM.  */
3647 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3648 {
3649     RAMBlock *block;
3650     QEMUFile *file = s->to_dst_file;
3651     int ramblock_count = 0;
3652
3653     trace_ram_dirty_bitmap_sync_start();
3654
3655     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3656         qemu_savevm_send_recv_bitmap(file, block->idstr);
3657         trace_ram_dirty_bitmap_request(block->idstr);
3658         ramblock_count++;
3659     }
3660
3661     trace_ram_dirty_bitmap_sync_wait();
3662
3663     /* Wait until all the ramblocks' dirty bitmap synced */
3664     while (ramblock_count--) {
3665         qemu_sem_wait(&s->rp_state.rp_sem);
3666     }
3667
3668     trace_ram_dirty_bitmap_sync_complete();
3669
3670     return 0;
3671 }
3672
/*
 * Post rp_state.rp_sem once. ram_dirty_bitmap_sync_all() waits on this
 * same semaphore once for every bitmap request it has sent, so each post
 * releases one of those waits.
 */
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
3677
3678 /*
3679  * Read the received bitmap, revert it as the initial dirty bitmap.
3680  * This is only used when the postcopy migration is paused but wants
3681  * to resume from a middle point.
3682  */
3683 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3684 {
3685     int ret = -EINVAL;
3686     QEMUFile *file = s->rp_state.from_dst_file;
3687     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3688     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3689     uint64_t size, end_mark;
3690
3691     trace_ram_dirty_bitmap_reload_begin(block->idstr);
3692
3693     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3694         error_report("%s: incorrect state %s", __func__,
3695                      MigrationStatus_str(s->state));
3696         return -EINVAL;
3697     }
3698
3699     /*
3700      * Note: see comments in ramblock_recv_bitmap_send() on why we
3701      * need the endianness conversion, and the paddings.
3702      */
3703     local_size = ROUND_UP(local_size, 8);
3704
3705     /* Add paddings */
3706     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3707
3708     size = qemu_get_be64(file);
3709
3710     /* The size of the bitmap should match with our ramblock */
3711     if (size != local_size) {
3712         error_report("%s: ramblock '%s' bitmap size mismatch "
3713                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3714                      block->idstr, size, local_size);
3715         ret = -EINVAL;
3716         goto out;
3717     }
3718
3719     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3720     end_mark = qemu_get_be64(file);
3721
3722     ret = qemu_file_get_error(file);
3723     if (ret || size != local_size) {
3724         error_report("%s: read bitmap failed for ramblock '%s': %d"
3725                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3726                      __func__, block->idstr, ret, local_size, size);
3727         ret = -EIO;
3728         goto out;
3729     }
3730
3731     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3732         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
3733                      __func__, block->idstr, end_mark);
3734         ret = -EINVAL;
3735         goto out;
3736     }
3737
3738     /*
3739      * Endianness conversion. We are during postcopy (though paused).
3740      * The dirty bitmap won't change. We can directly modify it.
3741      */
3742     bitmap_from_le(block->bmap, le_bitmap, nbits);
3743
3744     /*
3745      * What we received is "received bitmap". Revert it as the initial
3746      * dirty bitmap for this ramblock.
3747      */
3748     bitmap_complement(block->bmap, block->bmap, nbits);
3749
3750     trace_ram_dirty_bitmap_reload_complete(block->idstr);
3751
3752     /*
3753      * We succeeded to sync bitmap for current ramblock. If this is
3754      * the last one to sync, we need to notify the main send thread.
3755      */
3756     ram_dirty_bitmap_reload_notify(s);
3757
3758     ret = 0;
3759 out:
3760     g_free(le_bitmap);
3761     return ret;
3762 }
3763
3764 static int ram_resume_prepare(MigrationState *s, void *opaque)
3765 {
3766     RAMState *rs = *(RAMState **)opaque;
3767     int ret;
3768
3769     ret = ram_dirty_bitmap_sync_all(s, rs);
3770     if (ret) {
3771         return ret;
3772     }
3773
3774     ram_state_resume_prepare(rs, s->to_dst_file);
3775
3776     return 0;
3777 }
3778
3779 static SaveVMHandlers savevm_ram_handlers = {
3780     .save_setup = ram_save_setup,
3781     .save_live_iterate = ram_save_iterate,
3782     .save_live_complete_postcopy = ram_save_complete,
3783     .save_live_complete_precopy = ram_save_complete,
3784     .has_postcopy = ram_has_postcopy,
3785     .save_live_pending = ram_save_pending,
3786     .load_state = ram_load,
3787     .save_cleanup = ram_save_cleanup,
3788     .load_setup = ram_load_setup,
3789     .load_cleanup = ram_load_cleanup,
3790     .resume_prepare = ram_resume_prepare,
3791 };
3792
/*
 * Initialise the XBZRLE lock and register savevm_ram_handlers under the
 * name "ram".
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    /*
     * NOTE(review): the 4 passed here is presumably the section version,
     * matching the "version_id != 4" check in ram_load(), and &ram_state
     * presumably becomes the handlers' opaque argument (ram_resume_prepare
     * dereferences opaque as RAMState **) - confirm against
     * register_savevm_live().
     */
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
}