migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
62
63 /***********************************************************/
64 /* ram save/restore */
65
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67  * worked for pages that were filled with the same char. We switched
68  * it to only search for the zero value, and renamed it to avoid
69  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
70  */
71
72 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO 0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE 0x08
76 #define RAM_SAVE_FLAG_EOS 0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE 0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
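/*
 * Illustrative note (numbers are examples, not part of the stream
 * definition): these flags travel in the low bits of the page offset
 * written by save_page_header().  Offsets are TARGET_PAGE_SIZE aligned,
 * so the low bits are free.  E.g. a normal page at offset 0x2000 of the
 * block we already announced would be sent as the single be64 value
 * 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE = 0x2028.
 */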
81
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
83 {
84 return buffer_is_zero(p, size);
85 }
86
87 XBZRLECacheStats xbzrle_counters;
88
89 /* struct contains the XBZRLE cache and a static page
90 used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
104
105 static void XBZRLE_cache_lock(void)
106 {
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
109 }
110 }
111
112 static void XBZRLE_cache_unlock(void)
113 {
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
116 }
117 }
118
119 /**
120 * xbzrle_cache_resize: resize the xbzrle cache
121 *
122 * This function is called from migrate_params_apply in the main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock.
126 *
127 * Returns 0 for success or -1 for error
128 *
129 * @new_size: new cache size
130 * @errp: set *errp to the reason if the check failed
131 */
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
133 {
134 PageCache *new_cache;
135 int64_t ret = 0;
136
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
142 }
143
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
147 }
148
149 XBZRLE_cache_lock();
150
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
156 }
157
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
160 }
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
164 }
165
166 bool ramblock_is_ignored(RAMBlock *block)
167 {
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
170 }
171
172 #undef RAMBLOCK_FOREACH
173
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
175 {
176 RAMBlock *block;
177 int ret = 0;
178
179 RCU_READ_LOCK_GUARD();
180
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
185 }
186 }
187 return ret;
188 }
189
190 static void ramblock_recv_map_init(void)
191 {
192 RAMBlock *rb;
193
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 }
198 }
199
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
201 {
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
204 }
205
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
207 {
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
209 }
210
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
212 {
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
214 }
215
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
218 {
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
222 }
223
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
225
226 /*
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
228 *
229 * Returns >0 if success with sent bytes, or <0 if error.
230 */
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
233 {
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
237
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
241 }
242
243 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
244
245 /*
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see the comment
248 * below). So extend it a bit beforehand.
249 */
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
251
252 /*
253 * Always use little endian when sending the bitmap. This is
254 * required so it still works when source and destination VMs do
255 * not use the same endianness. (Note: big endian won't work.)
256 */
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
258
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
261
262 /*
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
267 */
268 size = ROUND_UP(size, 8);
269
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
272 /*
273 * Mark as an end, in case the middle part is screwed up due to
274 * some "mysterious" reason.
275 */
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
278
279 g_free(le_bitmap);
280
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
283 }
284
285 return size + sizeof(size);
286 }
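/*
 * Illustrative wire layout produced by the function above, assuming a
 * 1 GiB block with 4 KiB target pages (example numbers only):
 *
 *   be64 size        = 32768   (262144 bits, rounded up to 8 bytes)
 *   32768 bytes      little-endian receivedmap
 *   be64 end marker  = 0x0123456789abcdef
 *
 * The returned byte count covers only the size field plus the bitmap,
 * not the trailing end marker.
 */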
287
288 /*
289 * An outstanding page request, on the source, having been received
290 * and queued
291 */
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
296
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
298 };
299
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have dirty too many pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
331
332 /* compression statistics since the beginning of the period */
334 /* number of times there was no free thread to compress data */
335 uint64_t compress_thread_busy_prev;
336 /* amount of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* amount of compressed pages */
338 uint64_t compress_pages_prev;
339
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
353 };
354 typedef struct RAMState RAMState;
355
356 static RAMState *ram_state;
357
358 static NotifierWithReturnList precopy_notifier_list;
359
360 void precopy_infrastructure_init(void)
361 {
362 notifier_with_return_list_init(&precopy_notifier_list);
363 }
364
365 void precopy_add_notifier(NotifierWithReturn *n)
366 {
367 notifier_with_return_list_add(&precopy_notifier_list, n);
368 }
369
370 void precopy_remove_notifier(NotifierWithReturn *n)
371 {
372 notifier_with_return_remove(n);
373 }
374
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
376 {
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
380
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 }
383
384 uint64_t ram_bytes_remaining(void)
385 {
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
387 0;
388 }
389
390 MigrationStats ram_counters;
391
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
400 };
401 typedef struct PageSearchStatus PageSearchStatus;
402
403 CompressionStats compression_counters;
404
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
414
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
418 };
419 typedef struct CompressParam CompressParam;
420
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
430 };
431 typedef struct DecompressParam DecompressParam;
432
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437 * comp_done_lock is used together with comp_done_cond.
438 */
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
443
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
449
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
452
453 static void *do_data_compress(void *opaque)
454 {
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
459
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
467
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
470
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
476
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
480 }
481 }
482 qemu_mutex_unlock(&param->mutex);
483
484 return NULL;
485 }
486
487 static void compress_threads_save_cleanup(void)
488 {
489 int i, thread_count;
490
491 if (!migrate_use_compression() || !comp_param) {
492 return;
493 }
494
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
497 /*
498 * we use it as an indicator of whether the thread is
499 * properly initialized or not
500 */
501 if (!comp_param[i].file) {
502 break;
503 }
504
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
509
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
517 }
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
524 }
525
526 static int compress_threads_save_setup(void)
527 {
528 int i, thread_count;
529
530 if (!migrate_use_compression()) {
531 return 0;
532 }
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
542 }
543
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
548 }
549
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
552 */
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
561 }
562 return 0;
563
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
567 }
568
569 /**
570 * save_page_header: write page header to wire
571 *
572 * If this is the 1st block, it also writes the block identification
573 *
574 * Returns the number of bytes written
575 *
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page
579 * the lower bits contain flags
580 */
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
583 {
584 size_t size, len;
585
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
588 }
589 qemu_put_be64(f, offset);
590 size = 8;
591
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
598 }
599 return size;
600 }
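/*
 * Illustrative header layouts (the block name is only an example):
 *
 *   First page of block "pc.ram":
 *     be64(offset | flags)       8 bytes  (RAM_SAVE_FLAG_CONTINUE clear)
 *     byte  strlen("pc.ram") = 6 1 byte
 *     bytes "pc.ram"             6 bytes  -> size = 15
 *
 *   Subsequent pages of the same block:
 *     be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)  -> size = 8
 */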
601
602 /**
603 * mig_throttle_guest_down: throttle down the guest
604 *
605 * Reduce the amount of guest CPU execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
610 */
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
613 {
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
619
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
622
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by the guest, which would
632 * make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
637 }
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
639 }
640 }
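/*
 * Worked example for the tailslow path above, with illustrative numbers:
 * if the current throttle is 20% (cpu_now = 80) and the guest dirtied
 * twice as many bytes as the threshold, then cpu_ideal = 80 * 0.5 = 40,
 * so throttle_inc = MIN(80 - 40, pct_increment).  Assuming a
 * cpu-throttle-increment of 10, the throttle moves from 20% to 30%,
 * capped at max-cpu-throttle.
 */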
641
642 /**
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
644 *
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
647 *
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
653 */
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
655 {
656 if (!rs->xbzrle_enabled) {
657 return;
658 }
659
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
664 }
665
666 #define ENCODING_FLAG_XBZRLE 0x1
667
668 /**
669 * save_xbzrle_page: compress and send current page
670 *
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
674 *
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
681 */
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
685 {
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
688
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
700 }
701 }
702 return -1;
703 }
704
705 /*
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
709 *
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
715 */
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
718
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
721
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
726
727 /*
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
730 */
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
733 /*
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
737 */
738 *current_data = prev_cached_page;
739 }
740
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
749 }
750
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
758 /*
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
762 */
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
765
766 return 1;
767 }
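/*
 * Illustrative accounting for the XBZRLE path above (example numbers):
 * for encoded_len = 100 with RAM_SAVE_FLAG_CONTINUE set, the wire cost
 * is 8 (header) + 1 (ENCODING_FLAG_XBZRLE) + 2 (be16 length) + 100 =
 * 111 bytes; ram_counters.transferred grows by 111, while
 * xbzrle_counters.bytes grows by 111 - 8 = 103.
 */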
768
769 /**
770 * migration_bitmap_find_dirty: find the next dirty page from start
771 *
772 * Returns the page offset within memory region of the start of a dirty page
773 *
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
777 */
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
781 {
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
784
785 if (ramblock_is_ignored(rb)) {
786 return size;
787 }
788
789 return find_next_bit(bitmap, size, start);
790 }
791
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
795 {
796 bool ret;
797
798 /*
799 * Clear the dirty bitmap if needed. This _must_ be called before we
800 * send any of the pages in the chunk, because we need to make sure
801 * we can capture further page content changes when we sync the dirty
802 * log the next time. So as long as we are going to send any of
803 * the pages in the chunk, we clear the remote dirty bitmap for all
804 * of them. Clearing it earlier won't be a problem, but too late will.
805 */
806 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
807 uint8_t shift = rb->clear_bmap_shift;
808 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
809 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
810
811 /*
812 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It
813 * can make things easier since the start address of the
814 * small chunk will always be aligned to 64 pages, so the
815 * bitmap will always be aligned to unsigned long. We should
816 * even be able to remove this restriction, but it is simply
817 * kept here.
818 */
819 assert(shift >= 6);
820 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
821 memory_region_clear_dirty_bitmap(rb->mr, start, size);
822 }
823
824 ret = test_and_clear_bit(page, rb->bmap);
825
826 if (ret) {
827 rs->migration_dirty_pages--;
828 }
829
830 return ret;
831 }
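/*
 * Illustrative example of the clear_bmap chunking above (the shift value
 * is just an example): with 4 KiB target pages and clear_bmap_shift = 18,
 * size = 1 << (12 + 18) = 1 GiB and start is rounded down to the 1 GiB
 * boundary containing the page, so the first page sent from each 1 GiB
 * chunk triggers a single memory_region_clear_dirty_bitmap() call
 * covering the whole chunk.
 */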
832
833 /* Called with RCU critical section */
834 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
835 {
836 uint64_t new_dirty_pages =
837 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
838
839 rs->migration_dirty_pages += new_dirty_pages;
840 rs->num_dirty_pages_period += new_dirty_pages;
841 }
842
843 /**
844 * ram_pagesize_summary: calculate all the pagesizes of a VM
845 *
846 * Returns a summary bitmap of the page sizes of all RAMBlocks
847 *
848 * For VMs with just normal pages this is equivalent to the host page
849 * size. If it has some huge pages, then it is the OR of all the
850 * different page sizes.
851 */
852 uint64_t ram_pagesize_summary(void)
853 {
854 RAMBlock *block;
855 uint64_t summary = 0;
856
857 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
858 summary |= block->page_size;
859 }
860
861 return summary;
862 }
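/*
 * Example (illustrative sizes): a VM backed only by 4 KiB pages yields
 * a summary of 0x1000; adding a RAMBlock backed by 2 MiB hugepages
 * yields 0x1000 | 0x200000 = 0x201000, since every page size is a
 * power of two.
 */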
863
864 uint64_t ram_get_total_transferred_pages(void)
865 {
866 return ram_counters.normal + ram_counters.duplicate +
867 compression_counters.pages + xbzrle_counters.pages;
868 }
869
870 static void migration_update_rates(RAMState *rs, int64_t end_time)
871 {
872 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
873 double compressed_size;
874
875 /* calculate period counters */
876 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
877 / (end_time - rs->time_last_bitmap_sync);
878
879 if (!page_count) {
880 return;
881 }
882
883 if (migrate_use_xbzrle()) {
884 double encoded_size, unencoded_size;
885
886 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
887 rs->xbzrle_cache_miss_prev) / page_count;
888 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
889 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
890 TARGET_PAGE_SIZE;
891 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
892 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
893 xbzrle_counters.encoding_rate = 0;
894 } else {
895 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
896 }
897 rs->xbzrle_pages_prev = xbzrle_counters.pages;
898 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
899 }
900
901 if (migrate_use_compression()) {
902 compression_counters.busy_rate = (double)(compression_counters.busy -
903 rs->compress_thread_busy_prev) / page_count;
904 rs->compress_thread_busy_prev = compression_counters.busy;
905
906 compressed_size = compression_counters.compressed_size -
907 rs->compressed_size_prev;
908 if (compressed_size) {
909 double uncompressed_size = (compression_counters.pages -
910 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
911
912 /* Compression-Ratio = Uncompressed-size / Compressed-size */
913 compression_counters.compression_rate =
914 uncompressed_size / compressed_size;
915
916 rs->compress_pages_prev = compression_counters.pages;
917 rs->compressed_size_prev = compression_counters.compressed_size;
918 }
919 }
920 }
921
922 static void migration_trigger_throttle(RAMState *rs)
923 {
924 MigrationState *s = migrate_get_current();
925 uint64_t threshold = s->parameters.throttle_trigger_threshold;
926
927 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
928 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
929 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
930
931 /* During block migration the auto-converge logic incorrectly detects
932 * that ram migration makes no progress. Avoid this by disabling the
933 * throttling logic during the bulk phase of block migration. */
934 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
935 /* The following detection logic can be refined later. For now:
936 Check to see if the ratio between dirtied bytes and the approx.
937 amount of bytes that just got transferred since the last time
938 we were in this routine reaches the threshold. If that happens
939 twice, start or increase throttling. */
940
941 if ((bytes_dirty_period > bytes_dirty_threshold) &&
942 (++rs->dirty_rate_high_cnt >= 2)) {
943 trace_migration_throttle();
944 rs->dirty_rate_high_cnt = 0;
945 mig_throttle_guest_down(bytes_dirty_period,
946 bytes_dirty_threshold);
947 }
948 }
949 }
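/*
 * Worked example for the trigger above (illustrative numbers): with a
 * throttle-trigger-threshold of 50 and 1 GiB transferred during the
 * last period, bytes_dirty_threshold = 1 GiB * 50 / 100 = 512 MiB.  If
 * the guest dirtied more than that during two consecutive periods,
 * mig_throttle_guest_down() is invoked.
 */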
950
951 static void migration_bitmap_sync(RAMState *rs)
952 {
953 RAMBlock *block;
954 int64_t end_time;
955
956 ram_counters.dirty_sync_count++;
957
958 if (!rs->time_last_bitmap_sync) {
959 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
960 }
961
962 trace_migration_bitmap_sync_start();
963 memory_global_dirty_log_sync();
964
965 qemu_mutex_lock(&rs->bitmap_mutex);
966 WITH_RCU_READ_LOCK_GUARD() {
967 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
968 ramblock_sync_dirty_bitmap(rs, block);
969 }
970 ram_counters.remaining = ram_bytes_remaining();
971 }
972 qemu_mutex_unlock(&rs->bitmap_mutex);
973
974 memory_global_after_dirty_log_sync();
975 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
976
977 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
978
979 /* more than 1 second = 1000 milliseconds */
980 if (end_time > rs->time_last_bitmap_sync + 1000) {
981 migration_trigger_throttle(rs);
982
983 migration_update_rates(rs, end_time);
984
985 rs->target_page_count_prev = rs->target_page_count;
986
987 /* reset period counters */
988 rs->time_last_bitmap_sync = end_time;
989 rs->num_dirty_pages_period = 0;
990 rs->bytes_xfer_prev = ram_counters.transferred;
991 }
992 if (migrate_use_events()) {
993 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
994 }
995 }
996
997 static void migration_bitmap_sync_precopy(RAMState *rs)
998 {
999 Error *local_err = NULL;
1000
1001 /*
1002 * The current notifier usage is just an optimization for migration, so we
1003 * don't stop the normal migration process in the error case.
1004 */
1005 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1006 error_report_err(local_err);
1007 local_err = NULL;
1008 }
1009
1010 migration_bitmap_sync(rs);
1011
1012 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1013 error_report_err(local_err);
1014 }
1015 }
1016
1017 /**
1018 * save_zero_page_to_file: send the zero page to the file
1019 *
1020 * Returns the size of data written to the file, 0 means the page is not
1021 * a zero page
1022 *
1023 * @rs: current RAM state
1024 * @file: the file where the data is saved
1025 * @block: block that contains the page we want to send
1026 * @offset: offset inside the block for the page
1027 */
1028 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1029 RAMBlock *block, ram_addr_t offset)
1030 {
1031 uint8_t *p = block->host + offset;
1032 int len = 0;
1033
1034 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1035 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1036 qemu_put_byte(file, 0);
1037 len += 1;
1038 }
1039 return len;
1040 }
1041
1042 /**
1043 * save_zero_page: send the zero page to the stream
1044 *
1045 * Returns the number of pages written.
1046 *
1047 * @rs: current RAM state
1048 * @block: block that contains the page we want to send
1049 * @offset: offset inside the block for the page
1050 */
1051 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1052 {
1053 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1054
1055 if (len) {
1056 ram_counters.duplicate++;
1057 ram_counters.transferred += len;
1058 return 1;
1059 }
1060 return -1;
1061 }
1062
1063 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1064 {
1065 if (!migrate_release_ram() || !migration_in_postcopy()) {
1066 return;
1067 }
1068
1069 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1070 }
1071
1072 /*
1073 * @pages: the number of pages written by the control path,
1074 * < 0 - error
1075 * > 0 - number of pages written
1076 *
1077 * Return true if the page has been saved, otherwise false is returned.
1078 */
1079 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1080 int *pages)
1081 {
1082 uint64_t bytes_xmit = 0;
1083 int ret;
1084
1085 *pages = -1;
1086 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1087 &bytes_xmit);
1088 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1089 return false;
1090 }
1091
1092 if (bytes_xmit) {
1093 ram_counters.transferred += bytes_xmit;
1094 *pages = 1;
1095 }
1096
1097 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1098 return true;
1099 }
1100
1101 if (bytes_xmit > 0) {
1102 ram_counters.normal++;
1103 } else if (bytes_xmit == 0) {
1104 ram_counters.duplicate++;
1105 }
1106
1107 return true;
1108 }
1109
1110 /*
1111 * directly send the page to the stream
1112 *
1113 * Returns the number of pages written.
1114 *
1115 * @rs: current RAM state
1116 * @block: block that contains the page we want to send
1117 * @offset: offset inside the block for the page
1118 * @buf: the page to be sent
1119 * @async: send the page asynchronously
1120 */
1121 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1122 uint8_t *buf, bool async)
1123 {
1124 ram_counters.transferred += save_page_header(rs, rs->f, block,
1125 offset | RAM_SAVE_FLAG_PAGE);
1126 if (async) {
1127 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1128 migrate_release_ram() &
1129 migration_in_postcopy());
1130 } else {
1131 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1132 }
1133 ram_counters.transferred += TARGET_PAGE_SIZE;
1134 ram_counters.normal++;
1135 return 1;
1136 }
1137
1138 /**
1139 * ram_save_page: send the given page to the stream
1140 *
1141 * Returns the number of pages written.
1142 * < 0 - error
1143 * >=0 - Number of pages written - this might legally be 0
1144 * if xbzrle noticed the page was the same.
1145 *
1146 * @rs: current RAM state
1147 * @pss: data about the page we want to send
1149 * @last_stage: if we are at the completion stage
1150 */
1151 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1152 {
1153 int pages = -1;
1154 uint8_t *p;
1155 bool send_async = true;
1156 RAMBlock *block = pss->block;
1157 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1158 ram_addr_t current_addr = block->offset + offset;
1159
1160 p = block->host + offset;
1161 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1162
1163 XBZRLE_cache_lock();
1164 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1165 pages = save_xbzrle_page(rs, &p, current_addr, block,
1166 offset, last_stage);
1167 if (!last_stage) {
1168 /* Can't send this cached data async, since the cache page
1169 * might get updated before it gets to the wire
1170 */
1171 send_async = false;
1172 }
1173 }
1174
1175 /* XBZRLE overflow or normal page */
1176 if (pages == -1) {
1177 pages = save_normal_page(rs, block, offset, p, send_async);
1178 }
1179
1180 XBZRLE_cache_unlock();
1181
1182 return pages;
1183 }
1184
1185 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1186 ram_addr_t offset)
1187 {
1188 if (multifd_queue_page(rs->f, block, offset) < 0) {
1189 return -1;
1190 }
1191 ram_counters.normal++;
1192
1193 return 1;
1194 }
1195
1196 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1197 ram_addr_t offset, uint8_t *source_buf)
1198 {
1199 RAMState *rs = ram_state;
1200 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1201 bool zero_page = false;
1202 int ret;
1203
1204 if (save_zero_page_to_file(rs, f, block, offset)) {
1205 zero_page = true;
1206 goto exit;
1207 }
1208
1209 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1210
1211 /*
1212 * copy it to an internal buffer to avoid it being modified by the VM,
1213 * so that we can catch errors during compression and
1214 * decompression
1215 */
1216 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1217 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1218 if (ret < 0) {
1219 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1220 error_report("compressed data failed!");
1221 return false;
1222 }
1223
1224 exit:
1225 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1226 return zero_page;
1227 }
1228
1229 static void
1230 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1231 {
1232 ram_counters.transferred += bytes_xmit;
1233
1234 if (param->zero_page) {
1235 ram_counters.duplicate++;
1236 return;
1237 }
1238
1239 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1240 compression_counters.compressed_size += bytes_xmit - 8;
1241 compression_counters.pages++;
1242 }
1243
1244 static bool save_page_use_compression(RAMState *rs);
1245
1246 static void flush_compressed_data(RAMState *rs)
1247 {
1248 int idx, len, thread_count;
1249
1250 if (!save_page_use_compression(rs)) {
1251 return;
1252 }
1253 thread_count = migrate_compress_threads();
1254
1255 qemu_mutex_lock(&comp_done_lock);
1256 for (idx = 0; idx < thread_count; idx++) {
1257 while (!comp_param[idx].done) {
1258 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1259 }
1260 }
1261 qemu_mutex_unlock(&comp_done_lock);
1262
1263 for (idx = 0; idx < thread_count; idx++) {
1264 qemu_mutex_lock(&comp_param[idx].mutex);
1265 if (!comp_param[idx].quit) {
1266 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1267 /*
1268 * it's safe to fetch zero_page without holding comp_done_lock
1269 * as there is no further request submitted to the thread,
1270 * i.e., the thread should be waiting for a request at this point.
1271 */
1272 update_compress_thread_counts(&comp_param[idx], len);
1273 }
1274 qemu_mutex_unlock(&comp_param[idx].mutex);
1275 }
1276 }
1277
1278 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1279 ram_addr_t offset)
1280 {
1281 param->block = block;
1282 param->offset = offset;
1283 }
1284
1285 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1286 ram_addr_t offset)
1287 {
1288 int idx, thread_count, bytes_xmit = -1, pages = -1;
1289 bool wait = migrate_compress_wait_thread();
1290
1291 thread_count = migrate_compress_threads();
1292 qemu_mutex_lock(&comp_done_lock);
1293 retry:
1294 for (idx = 0; idx < thread_count; idx++) {
1295 if (comp_param[idx].done) {
1296 comp_param[idx].done = false;
1297 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1298 qemu_mutex_lock(&comp_param[idx].mutex);
1299 set_compress_params(&comp_param[idx], block, offset);
1300 qemu_cond_signal(&comp_param[idx].cond);
1301 qemu_mutex_unlock(&comp_param[idx].mutex);
1302 pages = 1;
1303 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1304 break;
1305 }
1306 }
1307
1308 /*
1309 * wait for the free thread if the user specifies 'compress-wait-thread',
1310 * otherwise we will post the page out in the main thread as a normal page.
1311 */
1312 if (pages < 0 && wait) {
1313 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1314 goto retry;
1315 }
1316 qemu_mutex_unlock(&comp_done_lock);
1317
1318 return pages;
1319 }
1320
1321 /**
1322 * find_dirty_block: find the next dirty page and update any state
1323 * associated with the search process.
1324 *
1325 * Returns true if a page is found
1326 *
1327 * @rs: current RAM state
1328 * @pss: data about the state of the current dirty page scan
1329 * @again: set to false if the search has scanned the whole of RAM
1330 */
1331 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1332 {
1333 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1334 if (pss->complete_round && pss->block == rs->last_seen_block &&
1335 pss->page >= rs->last_page) {
1336 /*
1337 * We've been once around the RAM and haven't found anything.
1338 * Give up.
1339 */
1340 *again = false;
1341 return false;
1342 }
1343 if (!offset_in_ramblock(pss->block,
1344 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1345 /* Didn't find anything in this RAM Block */
1346 pss->page = 0;
1347 pss->block = QLIST_NEXT_RCU(pss->block, next);
1348 if (!pss->block) {
1349 /*
1350 * If memory migration starts over, we will meet a dirtied page
1351 * which may still exist in the compression threads' ring, so we
1352 * should flush the compressed data to make sure the new page
1353 * is not overwritten by the old one in the destination.
1354 *
1355 * Also, if xbzrle is on, stop using the data compression at this
1356 * point. In theory, xbzrle can do better than compression.
1357 */
1358 flush_compressed_data(rs);
1359
1360 /* Hit the end of the list */
1361 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1362 /* Flag that we've looped */
1363 pss->complete_round = true;
1364 /* After the first round, enable XBZRLE. */
1365 if (migrate_use_xbzrle()) {
1366 rs->xbzrle_enabled = true;
1367 }
1368 }
1369 /* Didn't find anything this time, but try again on the new block */
1370 *again = true;
1371 return false;
1372 } else {
1373 /* Can go around again, but... */
1374 *again = true;
1375 /* We've found something so probably don't need to */
1376 return true;
1377 }
1378 }
1379
1380 /**
1381 * unqueue_page: gets a page off the queue
1382 *
1383 * Helper for 'get_queued_page' - gets a page off the queue
1384 *
1385 * Returns the block of the page (or NULL if none available)
1386 *
1387 * @rs: current RAM state
1388 * @offset: used to return the offset within the RAMBlock
1389 */
1390 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1391 {
1392 RAMBlock *block = NULL;
1393
1394 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1395 return NULL;
1396 }
1397
1398 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1399 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1400 struct RAMSrcPageRequest *entry =
1401 QSIMPLEQ_FIRST(&rs->src_page_requests);
1402 block = entry->rb;
1403 *offset = entry->offset;
1404
1405 if (entry->len > TARGET_PAGE_SIZE) {
1406 entry->len -= TARGET_PAGE_SIZE;
1407 entry->offset += TARGET_PAGE_SIZE;
1408 } else {
1409 memory_region_unref(block->mr);
1410 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1411 g_free(entry);
1412 migration_consume_urgent_request();
1413 }
1414 }
1415
1416 return block;
1417 }
1418
1419 #if defined(__linux__)
1420 /**
1421 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1422 * is found, return RAM block pointer and page offset
1423 *
1424 * Returns pointer to the RAMBlock containing faulting page,
1425 * NULL if no write faults are pending
1426 *
1427 * @rs: current RAM state
1428 * @offset: page offset from the beginning of the block
1429 */
1430 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1431 {
1432 struct uffd_msg uffd_msg;
1433 void *page_address;
1434 RAMBlock *block;
1435 int res;
1436
1437 if (!migrate_background_snapshot()) {
1438 return NULL;
1439 }
1440
1441 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1442 if (res <= 0) {
1443 return NULL;
1444 }
1445
1446 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1447 block = qemu_ram_block_from_host(page_address, false, offset);
1448 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1449 return block;
1450 }
1451
1452 /**
1453 * ram_save_release_protection: release UFFD write protection after
1454 * a range of pages has been saved
1455 *
1456 * @rs: current RAM state
1457 * @pss: page-search-status structure
1458 * @start_page: index of the first page in the range relative to pss->block
1459 *
1460 * Returns 0 on success, negative value in case of an error
1461 */
1462 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1463 unsigned long start_page)
1464 {
1465 int res = 0;
1466
1467 /* Check if page is from UFFD-managed region. */
1468 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1469 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1470 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1471
1472 /* Flush async buffers before un-protect. */
1473 qemu_fflush(rs->f);
1474 /* Un-protect memory range. */
1475 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1476 false, false);
1477 }
1478
1479 return res;
1480 }
1481
1482 /* ram_write_tracking_available: check if kernel supports required UFFD features
1483 *
1484 * Returns true if it does, false otherwise
1485 */
1486 bool ram_write_tracking_available(void)
1487 {
1488 uint64_t uffd_features;
1489 int res;
1490
1491 res = uffd_query_features(&uffd_features);
1492 return (res == 0 &&
1493 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1494 }
1495
1496 /* ram_write_tracking_compatible: check if guest configuration is
1497 * compatible with 'write-tracking'
1498 *
1499 * Returns true if compatible, false otherwise
1500 */
1501 bool ram_write_tracking_compatible(void)
1502 {
1503 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1504 int uffd_fd;
1505 RAMBlock *block;
1506 bool ret = false;
1507
1508 /* Open UFFD file descriptor */
1509 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1510 if (uffd_fd < 0) {
1511 return false;
1512 }
1513
1514 RCU_READ_LOCK_GUARD();
1515
1516 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1517 uint64_t uffd_ioctls;
1518
1519 /* Nothing to do with read-only and MMIO-writable regions */
1520 if (block->mr->readonly || block->mr->rom_device) {
1521 continue;
1522 }
1523 /* Try to register block memory via UFFD-IO to track writes */
1524 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1525 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1526 goto out;
1527 }
1528 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1529 goto out;
1530 }
1531 }
1532 ret = true;
1533
1534 out:
1535 uffd_close_fd(uffd_fd);
1536 return ret;
1537 }
1538
1539 /*
1540 * ram_block_populate_pages: populate memory in the RAM block by reading
1541 * a byte from the beginning of each page.
1542 *
1543 * Since it's solely used for userfault_fd WP feature, here we just
1544 * hardcode page size to qemu_real_host_page_size.
1545 *
1546 * @block: RAM block to populate
1547 */
1548 static void ram_block_populate_pages(RAMBlock *block)
1549 {
1550 char *ptr = (char *) block->host;
1551
1552 for (ram_addr_t offset = 0; offset < block->used_length;
1553 offset += qemu_real_host_page_size) {
1554 char tmp = *(ptr + offset);
1555
1556 /* Don't optimize the read out */
1557 asm volatile("" : "+r" (tmp));
1558 }
1559 }
1560
1561 /*
1562 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1563 */
1564 void ram_write_tracking_prepare(void)
1565 {
1566 RAMBlock *block;
1567
1568 RCU_READ_LOCK_GUARD();
1569
1570 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1571 /* Nothing to do with read-only and MMIO-writable regions */
1572 if (block->mr->readonly || block->mr->rom_device) {
1573 continue;
1574 }
1575
1576 /*
1577 * Populate pages of the RAM block before enabling userfault_fd
1578 * write protection.
1579 *
1580 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1581 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1582 * pages with pte_none() entries in page table.
1583 */
1584 ram_block_populate_pages(block);
1585 }
1586 }
1587
1588 /*
1589 * ram_write_tracking_start: start UFFD-WP memory tracking
1590 *
1591 * Returns 0 for success or negative value in case of error
1592 */
1593 int ram_write_tracking_start(void)
1594 {
1595 int uffd_fd;
1596 RAMState *rs = ram_state;
1597 RAMBlock *block;
1598
1599 /* Open UFFD file descriptor */
1600 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1601 if (uffd_fd < 0) {
1602 return uffd_fd;
1603 }
1604 rs->uffdio_fd = uffd_fd;
1605
1606 RCU_READ_LOCK_GUARD();
1607
1608 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1609 /* Nothing to do with read-only and MMIO-writable regions */
1610 if (block->mr->readonly || block->mr->rom_device) {
1611 continue;
1612 }
1613
1614 /* Register block memory with UFFD to track writes */
1615 if (uffd_register_memory(rs->uffdio_fd, block->host,
1616 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1617 goto fail;
1618 }
1619 /* Apply UFFD write protection to the block memory range */
1620 if (uffd_change_protection(rs->uffdio_fd, block->host,
1621 block->max_length, true, false)) {
1622 goto fail;
1623 }
1624 block->flags |= RAM_UF_WRITEPROTECT;
1625 memory_region_ref(block->mr);
1626
1627 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1628 block->host, block->max_length);
1629 }
1630
1631 return 0;
1632
1633 fail:
1634 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1635
1636 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1637 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1638 continue;
1639 }
1640 /*
1641 * In case some memory block failed to be write-protected
1642 * remove protection and unregister all succeeded RAM blocks
1643 */
1644 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1645 false, false);
1646 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1647 /* Cleanup flags and remove reference */
1648 block->flags &= ~RAM_UF_WRITEPROTECT;
1649 memory_region_unref(block->mr);
1650 }
1651
1652 uffd_close_fd(uffd_fd);
1653 rs->uffdio_fd = -1;
1654 return -1;
1655 }
1656
1657 /**
1658 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1659 */
1660 void ram_write_tracking_stop(void)
1661 {
1662 RAMState *rs = ram_state;
1663 RAMBlock *block;
1664
1665 RCU_READ_LOCK_GUARD();
1666
1667 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1668 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1669 continue;
1670 }
1671 /* Remove protection and unregister all affected RAM blocks */
1672 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1673 false, false);
1674 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1675
1676 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1677 block->host, block->max_length);
1678
1679 /* Cleanup flags and remove reference */
1680 block->flags &= ~RAM_UF_WRITEPROTECT;
1681 memory_region_unref(block->mr);
1682 }
1683
1684 /* Finally close UFFD file descriptor */
1685 uffd_close_fd(rs->uffdio_fd);
1686 rs->uffdio_fd = -1;
1687 }
1688
1689 #else
1690 /* No target OS support, stubs just fail or ignore */
1691
1692 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1693 {
1694 (void) rs;
1695 (void) offset;
1696
1697 return NULL;
1698 }
1699
1700 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1701 unsigned long start_page)
1702 {
1703 (void) rs;
1704 (void) pss;
1705 (void) start_page;
1706
1707 return 0;
1708 }
1709
1710 bool ram_write_tracking_available(void)
1711 {
1712 return false;
1713 }
1714
1715 bool ram_write_tracking_compatible(void)
1716 {
1717 assert(0);
1718 return false;
1719 }
1720
1721 int ram_write_tracking_start(void)
1722 {
1723 assert(0);
1724 return -1;
1725 }
1726
1727 void ram_write_tracking_stop(void)
1728 {
1729 assert(0);
1730 }
1731 #endif /* defined(__linux__) */
1732
1733 /**
1734 * get_queued_page: unqueue a page from the postcopy requests
1735 *
1736 * Skips pages that are already sent (!dirty)
1737 *
1738 * Returns true if a queued page is found
1739 *
1740 * @rs: current RAM state
1741 * @pss: data about the state of the current dirty page scan
1742 */
1743 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1744 {
1745 RAMBlock *block;
1746 ram_addr_t offset;
1747 bool dirty;
1748
1749 do {
1750 block = unqueue_page(rs, &offset);
1751 /*
1752 * We're sending this page, and since it's postcopy nothing else
1753 * will dirty it, and we must make sure it doesn't get sent again
1754 * even if this queue request was received after the background
1755 * search already sent it.
1756 */
1757 if (block) {
1758 unsigned long page;
1759
1760 page = offset >> TARGET_PAGE_BITS;
1761 dirty = test_bit(page, block->bmap);
1762 if (!dirty) {
1763 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1764 page);
1765 } else {
1766 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1767 }
1768 }
1769
1770 } while (block && !dirty);
1771
1772 if (!block) {
1773 /*
1774 * Poll write faults too if background snapshot is enabled; that's
1775 * when vcpus may be blocked by the write-protected pages.
1776 */
1777 block = poll_fault_page(rs, &offset);
1778 }
1779
1780 if (block) {
1781 /*
1782 * We want the background search to continue from the queued page
1783 * since the guest is likely to want other pages near to the page
1784 * it just requested.
1785 */
1786 pss->block = block;
1787 pss->page = offset >> TARGET_PAGE_BITS;
1788
1789 /*
1790 * This unqueued page would break the "one round" check, even if it
1791 * is really rare.
1792 */
1793 pss->complete_round = false;
1794 }
1795
1796 return !!block;
1797 }
1798
1799 /**
1800 * migration_page_queue_free: drop any remaining pages in the ram
1801 * request queue
1802 *
1803 * It should be empty at the end anyway, but in error cases there may
1804 * be some left. In case any pages are left, we drop them.
1805 *
1806 */
1807 static void migration_page_queue_free(RAMState *rs)
1808 {
1809 struct RAMSrcPageRequest *mspr, *next_mspr;
1810 /* This queue should generally be empty - but in the case of a failed
1811 * migration it might have some droppings in it.
1812 */
1813 RCU_READ_LOCK_GUARD();
1814 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1815 memory_region_unref(mspr->rb->mr);
1816 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1817 g_free(mspr);
1818 }
1819 }
1820
1821 /**
1822 * ram_save_queue_pages: queue the page for transmission
1823 *
1824 * A request from postcopy destination for example.
1825 *
1826 * Returns zero on success or negative on error
1827 *
1828 * @rbname: Name of the RAMBlock of the request. NULL means the
1829 * same as the last one.
1830 * @start: starting address from the start of the RAMBlock
1831 * @len: length (in bytes) to send
1832 */
1833 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1834 {
1835 RAMBlock *ramblock;
1836 RAMState *rs = ram_state;
1837
1838 ram_counters.postcopy_requests++;
1839 RCU_READ_LOCK_GUARD();
1840
1841 if (!rbname) {
1842 /* Reuse last RAMBlock */
1843 ramblock = rs->last_req_rb;
1844
1845 if (!ramblock) {
1846 /*
1847 * Shouldn't happen, we can't reuse the last RAMBlock if
1848 * it's the 1st request.
1849 */
1850 error_report("ram_save_queue_pages no previous block");
1851 return -1;
1852 }
1853 } else {
1854 ramblock = qemu_ram_block_by_name(rbname);
1855
1856 if (!ramblock) {
1857 /* We shouldn't be asked for a non-existent RAMBlock */
1858 error_report("ram_save_queue_pages no block '%s'", rbname);
1859 return -1;
1860 }
1861 rs->last_req_rb = ramblock;
1862 }
1863 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1864 if (!offset_in_ramblock(ramblock, start + len - 1)) {
1865 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1866 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1867 __func__, start, len, ramblock->used_length);
1868 return -1;
1869 }
1870
1871 struct RAMSrcPageRequest *new_entry =
1872 g_malloc0(sizeof(struct RAMSrcPageRequest));
1873 new_entry->rb = ramblock;
1874 new_entry->offset = start;
1875 new_entry->len = len;
1876
1877 memory_region_ref(ramblock->mr);
1878 qemu_mutex_lock(&rs->src_page_req_mutex);
1879 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1880 migration_make_urgent_request();
1881 qemu_mutex_unlock(&rs->src_page_req_mutex);
1882
1883 return 0;
1884 }
1885
1886 static bool save_page_use_compression(RAMState *rs)
1887 {
1888 if (!migrate_use_compression()) {
1889 return false;
1890 }
1891
1892 /*
1893 * If xbzrle is enabled (e.g., after the first round of migration), stop
1894 * using data compression. In theory, xbzrle can do better than
1895 * compression.
1896 */
1897 if (rs->xbzrle_enabled) {
1898 return false;
1899 }
1900
1901 return true;
1902 }
1903
1904 /*
1905 * try to compress the page before posting it out, return true if the page
1906 * has been properly handled by compression, otherwise needs other
1907 * paths to handle it
1908 */
1909 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1910 {
1911 if (!save_page_use_compression(rs)) {
1912 return false;
1913 }
1914
1915 /*
1916 * When starting a new block, the first page of the block should be
1917 * sent out before other pages in the same block, and all the pages
1918 * in the last block should have been sent out already. Keeping this
1919 * order is important, because the 'cont' flag is used to avoid
1920 * resending the block name.
1921 *
1922 * We post the first page as a normal page as compression will take
1923 * a lot of CPU resources.
1924 */
1925 if (block != rs->last_sent_block) {
1926 flush_compressed_data(rs);
1927 return false;
1928 }
1929
1930 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1931 return true;
1932 }
1933
1934 compression_counters.busy++;
1935 return false;
1936 }
1937
1938 /**
1939 * ram_save_target_page: save one target page
1940 *
1941 * Returns the number of pages written
1942 *
1943 * @rs: current RAM state
1944 * @pss: data about the page we want to send
1945 * @last_stage: if we are at the completion stage
1946 */
1947 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1948 bool last_stage)
1949 {
1950 RAMBlock *block = pss->block;
1951 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1952 int res;
1953
1954 if (control_save_page(rs, block, offset, &res)) {
1955 return res;
1956 }
1957
1958 if (save_compress_page(rs, block, offset)) {
1959 return 1;
1960 }
1961
1962 res = save_zero_page(rs, block, offset);
1963 if (res > 0) {
1964 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1965 * page would be stale
1966 */
1967 if (!save_page_use_compression(rs)) {
1968 XBZRLE_cache_lock();
1969 xbzrle_cache_zero_page(rs, block->offset + offset);
1970 XBZRLE_cache_unlock();
1971 }
1972 ram_release_pages(block->idstr, offset, res);
1973 return res;
1974 }
1975
1976 /*
1977 * Do not use multifd for:
1978 * 1. Compression, as the first page in a new block should be posted out
1979 * before sending the compressed page
1980 * 2. Postcopy, as one whole host page should be placed atomically
1981 */
1982 if (!save_page_use_compression(rs) && migrate_use_multifd()
1983 && !migration_in_postcopy()) {
1984 return ram_save_multifd_page(rs, block, offset);
1985 }
1986
1987 return ram_save_page(rs, pss, last_stage);
1988 }
1989
1990 /**
1991 * ram_save_host_page: save a whole host page
1992 *
1993 * Starting at *offset send pages up to the end of the current host
1994 * page. It's valid for the initial offset to point into the middle of
1995 * a host page, in which case the remainder of the host page is sent.
1996 * Only dirty target pages are sent. Note that the host page size may
1997 * be a huge page for this block.
1998 * The saving stops at the boundary of the used_length of the block
1999 * if the RAMBlock isn't a multiple of the host page size.
2000 *
2001 * Returns the number of pages written or negative on error
2002 *
2003 * @rs: current RAM state
2004 * @pss: data about the page we want to send
2006 * @last_stage: if we are at the completion stage
2007 */
2008 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2009 bool last_stage)
2010 {
2011 int tmppages, pages = 0;
2012 size_t pagesize_bits =
2013 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2014 unsigned long hostpage_boundary =
2015 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2016 unsigned long start_page = pss->page;
2017 int res;
2018
2019 if (ramblock_is_ignored(pss->block)) {
2020 error_report("block %s should not be migrated !", pss->block->idstr);
2021 return 0;
2022 }
2023
2024 do {
2025 /* Check if the page is dirty and, if so, send it */
2026 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2027 tmppages = ram_save_target_page(rs, pss, last_stage);
2028 if (tmppages < 0) {
2029 return tmppages;
2030 }
2031
2032 pages += tmppages;
2033 /*
2034 * Allow rate limiting to happen in the middle of huge pages if
2035 * something is sent in the current iteration.
2036 */
2037 if (pagesize_bits > 1 && tmppages > 0) {
2038 migration_rate_limit();
2039 }
2040 }
2041 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2042 } while ((pss->page < hostpage_boundary) &&
2043 offset_in_ramblock(pss->block,
2044 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2045 /* The offset we leave with is the min boundary of host page and block */
2046 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2047
2048 res = ram_save_release_protection(rs, pss, start_page);
2049 return (res < 0 ? res : pages);
2050 }
2051
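/*
 * A minimal worked example of the hostpage_boundary arithmetic used
 * above, assuming a 2MiB hugepage-backed RAMBlock and 4KiB target
 * pages (both values are assumptions for illustration only).
 */
#if 0
static void example_hostpage_boundary(void)
{
    /* 512 target pages per host page */
    size_t pagesize_bits = (2 * 1024 * 1024) / 4096;
    unsigned long page = 1000;     /* current target page index */
    unsigned long boundary = QEMU_ALIGN_UP(page + 1, pagesize_bits);

    /*
     * boundary == 1024: target pages 1000..1023 belong to the same
     * 2MiB host page, so they are all sent before moving on.
     */
    assert(boundary == 1024);
}
#endif
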
2052 /**
2053 * ram_find_and_save_block: finds a dirty page and sends it to f
2054 *
2055 * Called within an RCU critical section.
2056 *
2057 * Returns the number of pages written where zero means no dirty pages,
2058 * or negative on error
2059 *
2060 * @rs: current RAM state
2061 * @last_stage: if we are at the completion stage
2062 *
2063 * On systems where host-page-size > target-page-size it will send all the
2064 * pages in a host page that are dirty.
2065 */
2066
2067 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2068 {
2069 PageSearchStatus pss;
2070 int pages = 0;
2071 bool again, found;
2072
2073 /* No dirty pages when there is zero RAM */
2074 if (!ram_bytes_total()) {
2075 return pages;
2076 }
2077
2078 pss.block = rs->last_seen_block;
2079 pss.page = rs->last_page;
2080 pss.complete_round = false;
2081
2082 if (!pss.block) {
2083 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2084 }
2085
2086 do {
2087 again = true;
2088 found = get_queued_page(rs, &pss);
2089
2090 if (!found) {
2091 /* priority queue empty, so just search for something dirty */
2092 found = find_dirty_block(rs, &pss, &again);
2093 }
2094
2095 if (found) {
2096 pages = ram_save_host_page(rs, &pss, last_stage);
2097 }
2098 } while (!pages && again);
2099
2100 rs->last_seen_block = pss.block;
2101 rs->last_page = pss.page;
2102
2103 return pages;
2104 }
2105
2106 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2107 {
2108 uint64_t pages = size / TARGET_PAGE_SIZE;
2109
2110 if (zero) {
2111 ram_counters.duplicate += pages;
2112 } else {
2113 ram_counters.normal += pages;
2114 ram_counters.transferred += size;
2115 qemu_update_position(f, size);
2116 }
2117 }
2118
2119 static uint64_t ram_bytes_total_common(bool count_ignored)
2120 {
2121 RAMBlock *block;
2122 uint64_t total = 0;
2123
2124 RCU_READ_LOCK_GUARD();
2125
2126 if (count_ignored) {
2127 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2128 total += block->used_length;
2129 }
2130 } else {
2131 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2132 total += block->used_length;
2133 }
2134 }
2135 return total;
2136 }
2137
2138 uint64_t ram_bytes_total(void)
2139 {
2140 return ram_bytes_total_common(false);
2141 }
2142
2143 static void xbzrle_load_setup(void)
2144 {
2145 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 }
2147
2148 static void xbzrle_load_cleanup(void)
2149 {
2150 g_free(XBZRLE.decoded_buf);
2151 XBZRLE.decoded_buf = NULL;
2152 }
2153
2154 static void ram_state_cleanup(RAMState **rsp)
2155 {
2156 if (*rsp) {
2157 migration_page_queue_free(*rsp);
2158 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2159 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2160 g_free(*rsp);
2161 *rsp = NULL;
2162 }
2163 }
2164
2165 static void xbzrle_cleanup(void)
2166 {
2167 XBZRLE_cache_lock();
2168 if (XBZRLE.cache) {
2169 cache_fini(XBZRLE.cache);
2170 g_free(XBZRLE.encoded_buf);
2171 g_free(XBZRLE.current_buf);
2172 g_free(XBZRLE.zero_target_page);
2173 XBZRLE.cache = NULL;
2174 XBZRLE.encoded_buf = NULL;
2175 XBZRLE.current_buf = NULL;
2176 XBZRLE.zero_target_page = NULL;
2177 }
2178 XBZRLE_cache_unlock();
2179 }
2180
2181 static void ram_save_cleanup(void *opaque)
2182 {
2183 RAMState **rsp = opaque;
2184 RAMBlock *block;
2185
2186 /* We don't use dirty log with background snapshots */
2187 if (!migrate_background_snapshot()) {
2188 /* The caller holds the iothread lock or is in a BH, so there is
2189 * no write race against the migration bitmap
2190 */
2191 memory_global_dirty_log_stop();
2192 }
2193
2194 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2195 g_free(block->clear_bmap);
2196 block->clear_bmap = NULL;
2197 g_free(block->bmap);
2198 block->bmap = NULL;
2199 }
2200
2201 xbzrle_cleanup();
2202 compress_threads_save_cleanup();
2203 ram_state_cleanup(rsp);
2204 }
2205
2206 static void ram_state_reset(RAMState *rs)
2207 {
2208 rs->last_seen_block = NULL;
2209 rs->last_sent_block = NULL;
2210 rs->last_page = 0;
2211 rs->last_version = ram_list.version;
2212 rs->xbzrle_enabled = false;
2213 }
2214
2215 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2216
2217 /*
2218 * 'expected' is the value you expect the bitmap mostly to be full
2219 * of; it won't bother printing lines that are all this value.
2220 * If 'todump' is null the migration bitmap is dumped.
2221 */
2222 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2223 unsigned long pages)
2224 {
2225 int64_t cur;
2226 int64_t linelen = 128;
2227 char linebuf[129];
2228
2229 for (cur = 0; cur < pages; cur += linelen) {
2230 int64_t curb;
2231 bool found = false;
2232 /*
2233 * Last line; catch the case where the line length
2234 * is longer than remaining ram
2235 */
2236 if (cur + linelen > pages) {
2237 linelen = pages - cur;
2238 }
2239 for (curb = 0; curb < linelen; curb++) {
2240 bool thisbit = test_bit(cur + curb, todump);
2241 linebuf[curb] = thisbit ? '1' : '.';
2242 found = found || (thisbit != expected);
2243 }
2244 if (found) {
2245 linebuf[curb] = '\0';
2246 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2247 }
2248 }
2249 }
2250
2251 /* **** functions for postcopy ***** */
2252
2253 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2254 {
2255 struct RAMBlock *block;
2256
2257 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2258 unsigned long *bitmap = block->bmap;
2259 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2260 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2261
2262 while (run_start < range) {
2263 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2264 ram_discard_range(block->idstr,
2265 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2266 ((ram_addr_t)(run_end - run_start))
2267 << TARGET_PAGE_BITS);
2268 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2269 }
2270 }
2271 }
2272
2273 /**
2274 * postcopy_send_discard_bm_ram: discard a RAMBlock
2275 *
2276 * Returns zero on success
2277 *
2278 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2279 *
2280 * @ms: current migration state
2281 * @block: RAMBlock to discard
2282 */
2283 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2284 {
2285 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2286 unsigned long current;
2287 unsigned long *bitmap = block->bmap;
2288
2289 for (current = 0; current < end; ) {
2290 unsigned long one = find_next_bit(bitmap, end, current);
2291 unsigned long zero, discard_length;
2292
2293 if (one >= end) {
2294 break;
2295 }
2296
2297 zero = find_next_zero_bit(bitmap, end, one + 1);
2298
2299 if (zero >= end) {
2300 discard_length = end - one;
2301 } else {
2302 discard_length = zero - one;
2303 }
2304 postcopy_discard_send_range(ms, one, discard_length);
2305 current = one + discard_length;
2306 }
2307
2308 return 0;
2309 }
2310
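/*
 * Worked example of the run extraction above (sketch only): with a
 * 16-page bitmap whose bits 2..6 are set (0x7c), the loop finds a
 * single run and sends one discard of (start = 2, length = 5).
 */
#if 0
static void example_extract_one_run(void)
{
    unsigned long bitmap[1] = { 0x7cUL };  /* target pages 2..6 dirty */
    unsigned long end = 16;

    unsigned long one = find_next_bit(bitmap, end, 0);              /* 2 */
    unsigned long zero = find_next_zero_bit(bitmap, end, one + 1);  /* 7 */
    unsigned long discard_length = zero - one;                      /* 5 */

    assert(one == 2 && discard_length == 5);
}
#endif
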
2311 /**
2312 * postcopy_each_ram_send_discard: discard all RAMBlocks
2313 *
2314 * Returns 0 for success or negative for error
2315 *
2316 * Utility for the outgoing postcopy code.
2317 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2318 * passing it bitmap indexes and name.
2319 * (qemu_ram_foreach_block ends up passing unscaled lengths
2320 * which would mean postcopy code would have to deal with target page)
2321 *
2322 * @ms: current migration state
2323 */
2324 static int postcopy_each_ram_send_discard(MigrationState *ms)
2325 {
2326 struct RAMBlock *block;
2327 int ret;
2328
2329 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2330 postcopy_discard_send_init(ms, block->idstr);
2331
2332 /*
2333 * Postcopy sends chunks of bitmap over the wire, but it
2334 * just needs indexes at this point, which avoids it having
2335 * target page specific code.
2336 */
2337 ret = postcopy_send_discard_bm_ram(ms, block);
2338 postcopy_discard_send_finish(ms);
2339 if (ret) {
2340 return ret;
2341 }
2342 }
2343
2344 return 0;
2345 }
2346
2347 /**
2348 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2349 *
2350 * Helper for postcopy_chunk_hostpages; it is called once per RAMBlock
2351 * to canonicalize that block's dirty bitmap.
2352 *
2353 * Postcopy requires that all target pages in a host page are dirty or
2354 * clean, not a mix. This function canonicalizes the bitmap so that
2355 * partially dirty host pages become fully dirty.
2356 *
2357 * @ms: current migration state
2358 * @block: block that contains the page we want to canonicalize
2359 */
2360 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2361 {
2362 RAMState *rs = ram_state;
2363 unsigned long *bitmap = block->bmap;
2364 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2365 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2366 unsigned long run_start;
2367
2368 if (block->page_size == TARGET_PAGE_SIZE) {
2369 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2370 return;
2371 }
2372
2373 /* Find a dirty page */
2374 run_start = find_next_bit(bitmap, pages, 0);
2375
2376 while (run_start < pages) {
2377
2378 /*
2379 * If the start of this run of pages is in the middle of a host
2380 * page, then we need to fixup this host page.
2381 */
2382 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2383 /* Find the end of this run */
2384 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2385 /*
2386 * If the end isn't at the start of a host page, then the
2387 * run doesn't finish at the end of a host page
2388 * and we need to discard.
2389 */
2390 }
2391
2392 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2393 unsigned long page;
2394 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2395 host_ratio);
2396 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2397
2398 /* Clean up the bitmap */
2399 for (page = fixup_start_addr;
2400 page < fixup_start_addr + host_ratio; page++) {
2401 /*
2402 * Remark them as dirty, updating the count for any pages
2403 * that weren't previously dirty.
2404 */
2405 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2406 }
2407 }
2408
2409 /* Find the next dirty page for the next iteration */
2410 run_start = find_next_bit(bitmap, pages, run_start);
2411 }
2412 }
2413
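/*
 * Worked example of the fixup above (sketch only), assuming
 * host_ratio == 4, e.g. 16KiB host pages with 4KiB target pages:
 * a dirty run starting at target page 6 causes the whole host page
 * (target pages 4..7) to be re-marked dirty.
 */
#if 0
static void example_canonicalize_one_run(void)
{
    unsigned int host_ratio = 4;
    unsigned long run_start = 6;   /* starts in the middle of a host page */

    unsigned long fixup_start = QEMU_ALIGN_DOWN(run_start, host_ratio); /* 4 */
    unsigned long next_start = QEMU_ALIGN_UP(run_start, host_ratio);    /* 8 */

    /*
     * Target pages fixup_start .. fixup_start + host_ratio - 1 (4..7)
     * each get test_and_set_bit(), so no host page is left half dirty.
     */
    assert(fixup_start == 4 && next_start == 8);
}
#endif
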
2414 /**
2415 * postcopy_chunk_hostpages: discard any partially sent host page
2416 *
2417 * Utility for the outgoing postcopy code.
2418 *
2419 * Discard any partially sent host-page size chunks, mark any partially
2420 * dirty host-page size chunks as all dirty. In this case the host-page
2421 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2422 *
2423 * Returns zero on success
2424 *
2425 * @ms: current migration state
2426 * @block: block we want to work with
2427 */
2428 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2429 {
2430 postcopy_discard_send_init(ms, block->idstr);
2431
2432 /*
2433 * Ensure that all partially dirty host pages are made fully dirty.
2434 */
2435 postcopy_chunk_hostpages_pass(ms, block);
2436
2437 postcopy_discard_send_finish(ms);
2438 return 0;
2439 }
2440
2441 /**
2442 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2443 *
2444 * Returns zero on success
2445 *
2446 * Transmit the set of pages to be discarded after precopy to the target;
2447 * these are pages that:
2448 * a) have been previously transmitted but are now dirty again
2449 * b) have never been transmitted; this ensures that any pages on the
2450 * destination that have been mapped by background tasks get
2451 * discarded (transparent huge pages are the specific concern)
2452 * Hopefully this is pretty sparse.
2453 *
2454 * @ms: current migration state
2455 */
2456 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2457 {
2458 RAMState *rs = ram_state;
2459 RAMBlock *block;
2460 int ret;
2461
2462 RCU_READ_LOCK_GUARD();
2463
2464 /* This should be our last sync, the src is now paused */
2465 migration_bitmap_sync(rs);
2466
2467 /* Easiest way to make sure we don't resume in the middle of a host-page */
2468 rs->last_seen_block = NULL;
2469 rs->last_sent_block = NULL;
2470 rs->last_page = 0;
2471
2472 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2473 /* Deal with TPS != HPS and huge pages */
2474 ret = postcopy_chunk_hostpages(ms, block);
2475 if (ret) {
2476 return ret;
2477 }
2478
2479 #ifdef DEBUG_POSTCOPY
2480 ram_debug_dump_bitmap(block->bmap, true,
2481 block->used_length >> TARGET_PAGE_BITS);
2482 #endif
2483 }
2484 trace_ram_postcopy_send_discard_bitmap();
2485
2486 return postcopy_each_ram_send_discard(ms);
2487 }
2488
2489 /**
2490 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2491 *
2492 * Returns zero on success
2493 *
2494 * @rbname: name of the RAMBlock of the request. NULL means the
2495 * same as the last one.
2496 * @start: starting offset (in bytes) within the RAMBlock
2497 * @length: length (in bytes) to discard
2498 */
2499 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2500 {
2501 trace_ram_discard_range(rbname, start, length);
2502
2503 RCU_READ_LOCK_GUARD();
2504 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2505
2506 if (!rb) {
2507 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2508 return -1;
2509 }
2510
2511 /*
2512 * On source VM, we don't need to update the received bitmap since
2513 * we don't even have one.
2514 */
2515 if (rb->receivedmap) {
2516 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2517 length >> qemu_target_page_bits());
2518 }
2519
2520 return ram_block_discard_range(rb, start, length);
2521 }
2522
2523 /*
2524 * For every allocation below, try not to crash the VM if the
2525 * allocation fails.
2526 */
2527 static int xbzrle_init(void)
2528 {
2529 Error *local_err = NULL;
2530
2531 if (!migrate_use_xbzrle()) {
2532 return 0;
2533 }
2534
2535 XBZRLE_cache_lock();
2536
2537 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2538 if (!XBZRLE.zero_target_page) {
2539 error_report("%s: Error allocating zero page", __func__);
2540 goto err_out;
2541 }
2542
2543 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2544 TARGET_PAGE_SIZE, &local_err);
2545 if (!XBZRLE.cache) {
2546 error_report_err(local_err);
2547 goto free_zero_page;
2548 }
2549
2550 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2551 if (!XBZRLE.encoded_buf) {
2552 error_report("%s: Error allocating encoded_buf", __func__);
2553 goto free_cache;
2554 }
2555
2556 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2557 if (!XBZRLE.current_buf) {
2558 error_report("%s: Error allocating current_buf", __func__);
2559 goto free_encoded_buf;
2560 }
2561
2562 /* We are all good */
2563 XBZRLE_cache_unlock();
2564 return 0;
2565
2566 free_encoded_buf:
2567 g_free(XBZRLE.encoded_buf);
2568 XBZRLE.encoded_buf = NULL;
2569 free_cache:
2570 cache_fini(XBZRLE.cache);
2571 XBZRLE.cache = NULL;
2572 free_zero_page:
2573 g_free(XBZRLE.zero_target_page);
2574 XBZRLE.zero_target_page = NULL;
2575 err_out:
2576 XBZRLE_cache_unlock();
2577 return -ENOMEM;
2578 }
2579
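/*
 * Sketch of the fallible-allocation idiom used in xbzrle_init() above:
 * g_try_malloc0() returns NULL on failure, so migration setup can fail
 * gracefully instead of aborting the whole VM as plain g_malloc() would.
 * The helper name is hypothetical.
 */
#if 0
static uint8_t *example_try_alloc_page_buffer(void)
{
    uint8_t *buf = g_try_malloc0(TARGET_PAGE_SIZE);

    if (!buf) {
        error_report("%s: out of memory", __func__);
        return NULL;   /* the caller turns this into a migration error */
    }
    return buf;
}
#endif
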
2580 static int ram_state_init(RAMState **rsp)
2581 {
2582 *rsp = g_try_new0(RAMState, 1);
2583
2584 if (!*rsp) {
2585 error_report("%s: Init ramstate fail", __func__);
2586 return -1;
2587 }
2588
2589 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2590 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2591 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2592
2593 /*
2594 * Count the total number of pages used by RAM blocks, not including any
2595 * gaps due to alignment or unplugs.
2596 * This must match the initial values of the dirty bitmap.
2597 */
2598 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2599 ram_state_reset(*rsp);
2600
2601 return 0;
2602 }
2603
2604 static void ram_list_init_bitmaps(void)
2605 {
2606 MigrationState *ms = migrate_get_current();
2607 RAMBlock *block;
2608 unsigned long pages;
2609 uint8_t shift;
2610
2611 /* Skip setting bitmap if there is no RAM */
2612 if (ram_bytes_total()) {
2613 shift = ms->clear_bitmap_shift;
2614 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2615 error_report("clear_bitmap_shift (%u) too big, using "
2616 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2617 shift = CLEAR_BITMAP_SHIFT_MAX;
2618 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2619 error_report("clear_bitmap_shift (%u) too small, using "
2620 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2621 shift = CLEAR_BITMAP_SHIFT_MIN;
2622 }
2623
2624 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2625 pages = block->max_length >> TARGET_PAGE_BITS;
2626 /*
2627 * The initial dirty bitmap for migration must be set with all
2628 * ones to make sure we'll migrate every guest RAM page to the
2629 * destination.
2630 * Here we set RAMBlock.bmap all to 1 because when restarting a
2631 * new migration after a failed one, ram_list.
2632 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2633 * guest memory.
2634 */
2635 block->bmap = bitmap_new(pages);
2636 bitmap_set(block->bmap, 0, pages);
2637 block->clear_bmap_shift = shift;
2638 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2639 }
2640 }
2641 }
2642
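/*
 * Sketch of the clear-bitmap sizing above, assuming each clear_bmap bit
 * covers 2^shift target pages: for a 4GiB block with 4KiB target pages
 * (1048576 pages) and shift == 18, only 4 bits are needed.
 */
#if 0
static void example_clear_bmap_bits(void)
{
    unsigned long pages = (4ULL * 1024 * 1024 * 1024) / 4096;  /* 1048576 */
    uint8_t shift = 18;                 /* one bit per 262144 target pages */
    unsigned long bits = DIV_ROUND_UP(pages, 1UL << shift);

    assert(bits == 4);
}
#endif
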
2643 static void ram_init_bitmaps(RAMState *rs)
2644 {
2645 /* For memory_global_dirty_log_start below. */
2646 qemu_mutex_lock_iothread();
2647 qemu_mutex_lock_ramlist();
2648
2649 WITH_RCU_READ_LOCK_GUARD() {
2650 ram_list_init_bitmaps();
2651 /* We don't use dirty log with background snapshots */
2652 if (!migrate_background_snapshot()) {
2653 memory_global_dirty_log_start();
2654 migration_bitmap_sync_precopy(rs);
2655 }
2656 }
2657 qemu_mutex_unlock_ramlist();
2658 qemu_mutex_unlock_iothread();
2659 }
2660
2661 static int ram_init_all(RAMState **rsp)
2662 {
2663 if (ram_state_init(rsp)) {
2664 return -1;
2665 }
2666
2667 if (xbzrle_init()) {
2668 ram_state_cleanup(rsp);
2669 return -1;
2670 }
2671
2672 ram_init_bitmaps(*rsp);
2673
2674 return 0;
2675 }
2676
2677 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2678 {
2679 RAMBlock *block;
2680 uint64_t pages = 0;
2681
2682 /*
2683 * Postcopy is not using xbzrle/compression, so no need for that.
2684 * Also, since the source is already halted, we don't need to care
2685 * about dirty page logging either.
2686 */
2687
2688 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2689 pages += bitmap_count_one(block->bmap,
2690 block->used_length >> TARGET_PAGE_BITS);
2691 }
2692
2693 /* This may not be aligned with current bitmaps. Recalculate. */
2694 rs->migration_dirty_pages = pages;
2695
2696 ram_state_reset(rs);
2697
2698 /* Update RAMState cache of output QEMUFile */
2699 rs->f = out;
2700
2701 trace_ram_state_resume_prepare(pages);
2702 }
2703
2704 /*
2705 * This function clears bits of the free pages reported by the caller from the
2706 * migration dirty bitmap. @addr is the host address corresponding to the
2707 * start of the contiguous guest free pages, and @len is the total size in
2708 * bytes of those pages.
2709 */
2710 void qemu_guest_free_page_hint(void *addr, size_t len)
2711 {
2712 RAMBlock *block;
2713 ram_addr_t offset;
2714 size_t used_len, start, npages;
2715 MigrationState *s = migrate_get_current();
2716
2717 /* This function is currently expected to be used during live migration */
2718 if (!migration_is_setup_or_active(s->state)) {
2719 return;
2720 }
2721
2722 for (; len > 0; len -= used_len, addr += used_len) {
2723 block = qemu_ram_block_from_host(addr, false, &offset);
2724 if (unlikely(!block || offset >= block->used_length)) {
2725 /*
2726 * The implementation might not support RAMBlock resize during
2727 * live migration, but it could happen in theory with future
2728 * updates. So we add a check here to capture that case.
2729 */
2730 error_report_once("%s unexpected error", __func__);
2731 return;
2732 }
2733
2734 if (len <= block->used_length - offset) {
2735 used_len = len;
2736 } else {
2737 used_len = block->used_length - offset;
2738 }
2739
2740 start = offset >> TARGET_PAGE_BITS;
2741 npages = used_len >> TARGET_PAGE_BITS;
2742
2743 qemu_mutex_lock(&ram_state->bitmap_mutex);
2744 ram_state->migration_dirty_pages -=
2745 bitmap_count_one_with_offset(block->bmap, start, npages);
2746 bitmap_clear(block->bmap, start, npages);
2747 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2748 }
2749 }
2750
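/*
 * Worked example of the range conversion above (sketch only), assuming
 * 4KiB target pages (TARGET_PAGE_BITS == 12): a 64KiB free-page hint
 * starting 2MiB into a block clears 16 dirty bits starting at bit 512.
 */
#if 0
static void example_free_page_hint_range(void)
{
    ram_addr_t offset = 2 * 1024 * 1024;   /* hint offset within the block */
    size_t used_len = 64 * 1024;           /* part of the hint in this block */

    size_t start = offset >> 12;           /* first page index to clear */
    size_t npages = used_len >> 12;        /* number of pages to clear */

    assert(start == 512 && npages == 16);
}
#endif
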
2751 /*
2752 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2753 * long-running RCU critical section. When RCU reclaims in the code
2754 * start to become numerous it will be necessary to reduce the
2755 * granularity of these critical sections.
2756 */
2757
2758 /**
2759 * ram_save_setup: Setup RAM for migration
2760 *
2761 * Returns zero to indicate success and negative for error
2762 *
2763 * @f: QEMUFile where to send the data
2764 * @opaque: RAMState pointer
2765 */
2766 static int ram_save_setup(QEMUFile *f, void *opaque)
2767 {
2768 RAMState **rsp = opaque;
2769 RAMBlock *block;
2770
2771 if (compress_threads_save_setup()) {
2772 return -1;
2773 }
2774
2775 /* migration has already set up the bitmap, reuse it. */
2776 if (!migration_in_colo_state()) {
2777 if (ram_init_all(rsp) != 0) {
2778 compress_threads_save_cleanup();
2779 return -1;
2780 }
2781 }
2782 (*rsp)->f = f;
2783
2784 WITH_RCU_READ_LOCK_GUARD() {
2785 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2786
2787 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2788 qemu_put_byte(f, strlen(block->idstr));
2789 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2790 qemu_put_be64(f, block->used_length);
2791 if (migrate_postcopy_ram() && block->page_size !=
2792 qemu_host_page_size) {
2793 qemu_put_be64(f, block->page_size);
2794 }
2795 if (migrate_ignore_shared()) {
2796 qemu_put_be64(f, block->mr->addr);
2797 }
2798 }
2799 }
2800
2801 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2802 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2803
2804 multifd_send_sync_main(f);
2805 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2806 qemu_fflush(f);
2807
2808 return 0;
2809 }
2810
2811 /**
2812 * ram_save_iterate: iterative stage for migration
2813 *
2814 * Returns zero to indicate success and negative for error
2815 *
2816 * @f: QEMUFile where to send the data
2817 * @opaque: RAMState pointer
2818 */
2819 static int ram_save_iterate(QEMUFile *f, void *opaque)
2820 {
2821 RAMState **temp = opaque;
2822 RAMState *rs = *temp;
2823 int ret = 0;
2824 int i;
2825 int64_t t0;
2826 int done = 0;
2827
2828 if (blk_mig_bulk_active()) {
2829 /* Avoid transferring ram during bulk phase of block migration as
2830 * the bulk phase will usually take a long time and transferring
2831 * ram updates during that time is pointless. */
2832 goto out;
2833 }
2834
2835 /*
2836 * We'll hold this lock for a while, but that's okay for two reasons.
2837 * Firstly, the only other thread that could take it is the one calling
2838 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2839 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2840 * guarantees that we'll release it on a regular basis.
2841 */
2842 qemu_mutex_lock(&rs->bitmap_mutex);
2843 WITH_RCU_READ_LOCK_GUARD() {
2844 if (ram_list.version != rs->last_version) {
2845 ram_state_reset(rs);
2846 }
2847
2848 /* Read version before ram_list.blocks */
2849 smp_rmb();
2850
2851 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2852
2853 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2854 i = 0;
2855 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2856 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2857 int pages;
2858
2859 if (qemu_file_get_error(f)) {
2860 break;
2861 }
2862
2863 pages = ram_find_and_save_block(rs, false);
2864 /* no more pages to send */
2865 if (pages == 0) {
2866 done = 1;
2867 break;
2868 }
2869
2870 if (pages < 0) {
2871 qemu_file_set_error(f, pages);
2872 break;
2873 }
2874
2875 rs->target_page_count += pages;
2876
2877 /*
2878 * During postcopy, it is necessary to make sure one whole host
2879 * page is sent in one chunk.
2880 */
2881 if (migrate_postcopy_ram()) {
2882 flush_compressed_data(rs);
2883 }
2884
2885 /*
2886 * We want to check in the 1st loop, just in case it was the 1st
2887 * time and we had to sync the dirty bitmap.
2888 * qemu_clock_get_ns() is a bit expensive, so we only check it every
2889 * few iterations
2890 */
2891 if ((i & 63) == 0) {
2892 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2893 1000000;
2894 if (t1 > MAX_WAIT) {
2895 trace_ram_save_iterate_big_wait(t1, i);
2896 break;
2897 }
2898 }
2899 i++;
2900 }
2901 }
2902 qemu_mutex_unlock(&rs->bitmap_mutex);
2903
2904 /*
2905 * Must occur before EOS (or any QEMUFile operation)
2906 * because of RDMA protocol.
2907 */
2908 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2909
2910 out:
2911 if (ret >= 0
2912 && migration_is_setup_or_active(migrate_get_current()->state)) {
2913 multifd_send_sync_main(rs->f);
2914 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2915 qemu_fflush(f);
2916 ram_counters.transferred += 8;
2917
2918 ret = qemu_file_get_error(f);
2919 }
2920 if (ret < 0) {
2921 return ret;
2922 }
2923
2924 return done;
2925 }
2926
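/*
 * Minimal sketch of the "check the clock only every 64 iterations"
 * pattern used in ram_save_iterate() above; the mask and MAX_WAIT
 * mirror the loop, the rest is illustrative scaffolding.
 */
#if 0
static void example_bounded_send_loop(void)
{
    int64_t t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    int i = 0;

    for (;;) {
        /* ... send one batch of pages here ... */

        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                          1000000;
            if (t1 > MAX_WAIT) {
                break;   /* yield back to the migration thread's main loop */
            }
        }
        i++;
    }
}
#endif
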
2927 /**
2928 * ram_save_complete: function called to send the remaining amount of ram
2929 *
2930 * Returns zero to indicate success or negative on error
2931 *
2932 * Called with iothread lock
2933 *
2934 * @f: QEMUFile where to send the data
2935 * @opaque: RAMState pointer
2936 */
2937 static int ram_save_complete(QEMUFile *f, void *opaque)
2938 {
2939 RAMState **temp = opaque;
2940 RAMState *rs = *temp;
2941 int ret = 0;
2942
2943 WITH_RCU_READ_LOCK_GUARD() {
2944 if (!migration_in_postcopy()) {
2945 migration_bitmap_sync_precopy(rs);
2946 }
2947
2948 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2949
2950 /* try transferring iterative blocks of memory */
2951
2952 /* flush all remaining blocks regardless of rate limiting */
2953 while (true) {
2954 int pages;
2955
2956 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2957 /* no more blocks to send */
2958 if (pages == 0) {
2959 break;
2960 }
2961 if (pages < 0) {
2962 ret = pages;
2963 break;
2964 }
2965 }
2966
2967 flush_compressed_data(rs);
2968 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2969 }
2970
2971 if (ret >= 0) {
2972 multifd_send_sync_main(rs->f);
2973 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2974 qemu_fflush(f);
2975 }
2976
2977 return ret;
2978 }
2979
2980 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2981 uint64_t *res_precopy_only,
2982 uint64_t *res_compatible,
2983 uint64_t *res_postcopy_only)
2984 {
2985 RAMState **temp = opaque;
2986 RAMState *rs = *temp;
2987 uint64_t remaining_size;
2988
2989 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2990
2991 if (!migration_in_postcopy() &&
2992 remaining_size < max_size) {
2993 qemu_mutex_lock_iothread();
2994 WITH_RCU_READ_LOCK_GUARD() {
2995 migration_bitmap_sync_precopy(rs);
2996 }
2997 qemu_mutex_unlock_iothread();
2998 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2999 }
3000
3001 if (migrate_postcopy_ram()) {
3002 /* We can do postcopy, and all the data is postcopiable */
3003 *res_compatible += remaining_size;
3004 } else {
3005 *res_precopy_only += remaining_size;
3006 }
3007 }
3008
3009 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3010 {
3011 unsigned int xh_len;
3012 int xh_flags;
3013 uint8_t *loaded_data;
3014
3015 /* extract RLE header */
3016 xh_flags = qemu_get_byte(f);
3017 xh_len = qemu_get_be16(f);
3018
3019 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3020 error_report("Failed to load XBZRLE page - wrong compression!");
3021 return -1;
3022 }
3023
3024 if (xh_len > TARGET_PAGE_SIZE) {
3025 error_report("Failed to load XBZRLE page - len overflow!");
3026 return -1;
3027 }
3028 loaded_data = XBZRLE.decoded_buf;
3029 /* load data and decode */
3030 /* it can change loaded_data to point to an internal buffer */
3031 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3032
3033 /* decode RLE */
3034 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3035 TARGET_PAGE_SIZE) == -1) {
3036 error_report("Failed to load XBZRLE page - decode error!");
3037 return -1;
3038 }
3039
3040 return 0;
3041 }
3042
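/*
 * Sketch of the on-wire record that load_xbzrle() above expects after
 * the usual addr/flags word: one header byte, a big-endian 16-bit
 * length, then the encoded delta. 'encoded_buf' and 'encoded_len'
 * stand in for the output of the sender's XBZRLE encoder and are
 * assumptions for illustration.
 */
#if 0
static void example_put_xbzrle_record(QEMUFile *f, const uint8_t *encoded_buf,
                                      uint16_t encoded_len)
{
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);   /* read back as xh_flags */
    qemu_put_be16(f, encoded_len);            /* xh_len, <= TARGET_PAGE_SIZE */
    qemu_put_buffer(f, encoded_buf, encoded_len);
}
#endif
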
3043 /**
3044 * ram_block_from_stream: read a RAMBlock id from the migration stream
3045 *
3046 * Must be called from within a rcu critical section.
3047 *
3048 * Returns a pointer from within the RCU-protected ram_list.
3049 *
3050 * @f: QEMUFile where to read the data from
3051 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3052 */
3053 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3054 {
3055 static RAMBlock *block;
3056 char id[256];
3057 uint8_t len;
3058
3059 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3060 if (!block) {
3061 error_report("Ack, bad migration stream!");
3062 return NULL;
3063 }
3064 return block;
3065 }
3066
3067 len = qemu_get_byte(f);
3068 qemu_get_buffer(f, (uint8_t *)id, len);
3069 id[len] = 0;
3070
3071 block = qemu_ram_block_by_name(id);
3072 if (!block) {
3073 error_report("Can't find block %s", id);
3074 return NULL;
3075 }
3076
3077 if (ramblock_is_ignored(block)) {
3078 error_report("block %s should not be migrated !", id);
3079 return NULL;
3080 }
3081
3082 return block;
3083 }
3084
3085 static inline void *host_from_ram_block_offset(RAMBlock *block,
3086 ram_addr_t offset)
3087 {
3088 if (!offset_in_ramblock(block, offset)) {
3089 return NULL;
3090 }
3091
3092 return block->host + offset;
3093 }
3094
3095 static void *host_page_from_ram_block_offset(RAMBlock *block,
3096 ram_addr_t offset)
3097 {
3098 /* Note: Explicitly no check against offset_in_ramblock(). */
3099 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3100 block->page_size);
3101 }
3102
3103 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3104 ram_addr_t offset)
3105 {
3106 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3107 }
3108
3109 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3110 ram_addr_t offset, bool record_bitmap)
3111 {
3112 if (!offset_in_ramblock(block, offset)) {
3113 return NULL;
3114 }
3115 if (!block->colo_cache) {
3116 error_report("%s: colo_cache is NULL in block :%s",
3117 __func__, block->idstr);
3118 return NULL;
3119 }
3120
3121 /*
3122 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3123 * It helps us decide which pages in the RAM cache should be flushed
3124 * into the VM's RAM later.
3125 */
3126 if (record_bitmap &&
3127 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3128 ram_state->migration_dirty_pages++;
3129 }
3130 return block->colo_cache + offset;
3131 }
3132
3133 /**
3134 * ram_handle_compressed: handle the zero page case
3135 *
3136 * If a page (or a whole RDMA chunk) has been
3137 * determined to be zero, then zap it.
3138 *
3139 * @host: host address for the zero page
3140 * @ch: what the page is filled from. We only support zero
3141 * @size: size of the zero page
3142 */
3143 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3144 {
3145 if (ch != 0 || !is_zero_range(host, size)) {
3146 memset(host, ch, size);
3147 }
3148 }
3149
3150 /* return the size after decompression, or negative value on error */
3151 static int
3152 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3153 const uint8_t *source, size_t source_len)
3154 {
3155 int err;
3156
3157 err = inflateReset(stream);
3158 if (err != Z_OK) {
3159 return -1;
3160 }
3161
3162 stream->avail_in = source_len;
3163 stream->next_in = (uint8_t *)source;
3164 stream->avail_out = dest_len;
3165 stream->next_out = dest;
3166
3167 err = inflate(stream, Z_NO_FLUSH);
3168 if (err != Z_STREAM_END) {
3169 return -1;
3170 }
3171
3172 return stream->total_out;
3173 }
3174
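/*
 * Sketch of a matching compression step using plain zlib, only to show
 * what qemu_uncompress_data() above undoes; the real sender uses
 * per-thread deflate streams rather than compress2(). The helper name
 * is hypothetical.
 */
#if 0
static ssize_t example_compress_page(uint8_t *dst, size_t dst_size,
                                     const uint8_t *page, size_t page_size)
{
    uLongf out_len = dst_size;   /* should be >= compressBound(page_size) */

    if (compress2(dst, &out_len, page, page_size, Z_BEST_SPEED) != Z_OK) {
        return -1;
    }
    return out_len;              /* becomes 'source_len' on the load side */
}
#endif
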
3175 static void *do_data_decompress(void *opaque)
3176 {
3177 DecompressParam *param = opaque;
3178 unsigned long pagesize;
3179 uint8_t *des;
3180 int len, ret;
3181
3182 qemu_mutex_lock(&param->mutex);
3183 while (!param->quit) {
3184 if (param->des) {
3185 des = param->des;
3186 len = param->len;
3187 param->des = 0;
3188 qemu_mutex_unlock(&param->mutex);
3189
3190 pagesize = TARGET_PAGE_SIZE;
3191
3192 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3193 param->compbuf, len);
3194 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3195 error_report("decompress data failed");
3196 qemu_file_set_error(decomp_file, ret);
3197 }
3198
3199 qemu_mutex_lock(&decomp_done_lock);
3200 param->done = true;
3201 qemu_cond_signal(&decomp_done_cond);
3202 qemu_mutex_unlock(&decomp_done_lock);
3203
3204 qemu_mutex_lock(&param->mutex);
3205 } else {
3206 qemu_cond_wait(&param->cond, &param->mutex);
3207 }
3208 }
3209 qemu_mutex_unlock(&param->mutex);
3210
3211 return NULL;
3212 }
3213
3214 static int wait_for_decompress_done(void)
3215 {
3216 int idx, thread_count;
3217
3218 if (!migrate_use_compression()) {
3219 return 0;
3220 }
3221
3222 thread_count = migrate_decompress_threads();
3223 qemu_mutex_lock(&decomp_done_lock);
3224 for (idx = 0; idx < thread_count; idx++) {
3225 while (!decomp_param[idx].done) {
3226 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3227 }
3228 }
3229 qemu_mutex_unlock(&decomp_done_lock);
3230 return qemu_file_get_error(decomp_file);
3231 }
3232
3233 static void compress_threads_load_cleanup(void)
3234 {
3235 int i, thread_count;
3236
3237 if (!migrate_use_compression()) {
3238 return;
3239 }
3240 thread_count = migrate_decompress_threads();
3241 for (i = 0; i < thread_count; i++) {
3242 /*
3243 * we use it as an indicator of whether the thread is
3244 * properly initialized or not
3245 */
3246 if (!decomp_param[i].compbuf) {
3247 break;
3248 }
3249
3250 qemu_mutex_lock(&decomp_param[i].mutex);
3251 decomp_param[i].quit = true;
3252 qemu_cond_signal(&decomp_param[i].cond);
3253 qemu_mutex_unlock(&decomp_param[i].mutex);
3254 }
3255 for (i = 0; i < thread_count; i++) {
3256 if (!decomp_param[i].compbuf) {
3257 break;
3258 }
3259
3260 qemu_thread_join(decompress_threads + i);
3261 qemu_mutex_destroy(&decomp_param[i].mutex);
3262 qemu_cond_destroy(&decomp_param[i].cond);
3263 inflateEnd(&decomp_param[i].stream);
3264 g_free(decomp_param[i].compbuf);
3265 decomp_param[i].compbuf = NULL;
3266 }
3267 g_free(decompress_threads);
3268 g_free(decomp_param);
3269 decompress_threads = NULL;
3270 decomp_param = NULL;
3271 decomp_file = NULL;
3272 }
3273
3274 static int compress_threads_load_setup(QEMUFile *f)
3275 {
3276 int i, thread_count;
3277
3278 if (!migrate_use_compression()) {
3279 return 0;
3280 }
3281
3282 thread_count = migrate_decompress_threads();
3283 decompress_threads = g_new0(QemuThread, thread_count);
3284 decomp_param = g_new0(DecompressParam, thread_count);
3285 qemu_mutex_init(&decomp_done_lock);
3286 qemu_cond_init(&decomp_done_cond);
3287 decomp_file = f;
3288 for (i = 0; i < thread_count; i++) {
3289 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3290 goto exit;
3291 }
3292
3293 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3294 qemu_mutex_init(&decomp_param[i].mutex);
3295 qemu_cond_init(&decomp_param[i].cond);
3296 decomp_param[i].done = true;
3297 decomp_param[i].quit = false;
3298 qemu_thread_create(decompress_threads + i, "decompress",
3299 do_data_decompress, decomp_param + i,
3300 QEMU_THREAD_JOINABLE);
3301 }
3302 return 0;
3303 exit:
3304 compress_threads_load_cleanup();
3305 return -1;
3306 }
3307
3308 static void decompress_data_with_multi_threads(QEMUFile *f,
3309 void *host, int len)
3310 {
3311 int idx, thread_count;
3312
3313 thread_count = migrate_decompress_threads();
3314 QEMU_LOCK_GUARD(&decomp_done_lock);
3315 while (true) {
3316 for (idx = 0; idx < thread_count; idx++) {
3317 if (decomp_param[idx].done) {
3318 decomp_param[idx].done = false;
3319 qemu_mutex_lock(&decomp_param[idx].mutex);
3320 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3321 decomp_param[idx].des = host;
3322 decomp_param[idx].len = len;
3323 qemu_cond_signal(&decomp_param[idx].cond);
3324 qemu_mutex_unlock(&decomp_param[idx].mutex);
3325 break;
3326 }
3327 }
3328 if (idx < thread_count) {
3329 break;
3330 } else {
3331 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3332 }
3333 }
3334 }
3335
3336 static void colo_init_ram_state(void)
3337 {
3338 ram_state_init(&ram_state);
3339 }
3340
3341 /*
3342 * colo cache: this is for the secondary VM, we cache the whole
3343 * memory of the secondary VM; the global lock needs to be held
3344 * to call this helper.
3345 */
3346 int colo_init_ram_cache(void)
3347 {
3348 RAMBlock *block;
3349
3350 WITH_RCU_READ_LOCK_GUARD() {
3351 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3352 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3353 NULL, false, false);
3354 if (!block->colo_cache) {
3355 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3356 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3357 block->used_length);
3358 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3359 if (block->colo_cache) {
3360 qemu_anon_ram_free(block->colo_cache, block->used_length);
3361 block->colo_cache = NULL;
3362 }
3363 }
3364 return -errno;
3365 }
3366 }
3367 }
3368
3369 /*
3370 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3371 * to decide which pages in the cache should be flushed into the SVM's RAM.
3372 * Here we use the same name 'ram_bitmap' as for migration.
3373 */
3374 if (ram_bytes_total()) {
3375 RAMBlock *block;
3376
3377 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3378 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3379 block->bmap = bitmap_new(pages);
3380 }
3381 }
3382
3383 colo_init_ram_state();
3384 return 0;
3385 }
3386
3387 /* TODO: duplicated with ram_init_bitmaps */
3388 void colo_incoming_start_dirty_log(void)
3389 {
3390 RAMBlock *block = NULL;
3391 /* For memory_global_dirty_log_start below. */
3392 qemu_mutex_lock_iothread();
3393 qemu_mutex_lock_ramlist();
3394
3395 memory_global_dirty_log_sync();
3396 WITH_RCU_READ_LOCK_GUARD() {
3397 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3398 ramblock_sync_dirty_bitmap(ram_state, block);
3399 /* Discard this dirty bitmap record */
3400 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3401 }
3402 memory_global_dirty_log_start();
3403 }
3404 ram_state->migration_dirty_pages = 0;
3405 qemu_mutex_unlock_ramlist();
3406 qemu_mutex_unlock_iothread();
3407 }
3408
3409 /* The global lock needs to be held to call this helper */
3410 void colo_release_ram_cache(void)
3411 {
3412 RAMBlock *block;
3413
3414 memory_global_dirty_log_stop();
3415 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416 g_free(block->bmap);
3417 block->bmap = NULL;
3418 }
3419
3420 WITH_RCU_READ_LOCK_GUARD() {
3421 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3422 if (block->colo_cache) {
3423 qemu_anon_ram_free(block->colo_cache, block->used_length);
3424 block->colo_cache = NULL;
3425 }
3426 }
3427 }
3428 ram_state_cleanup(&ram_state);
3429 }
3430
3431 /**
3432 * ram_load_setup: Setup RAM for migration incoming side
3433 *
3434 * Returns zero to indicate success and negative for error
3435 *
3436 * @f: QEMUFile where to receive the data
3437 * @opaque: RAMState pointer
3438 */
3439 static int ram_load_setup(QEMUFile *f, void *opaque)
3440 {
3441 if (compress_threads_load_setup(f)) {
3442 return -1;
3443 }
3444
3445 xbzrle_load_setup();
3446 ramblock_recv_map_init();
3447
3448 return 0;
3449 }
3450
3451 static int ram_load_cleanup(void *opaque)
3452 {
3453 RAMBlock *rb;
3454
3455 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3456 qemu_ram_block_writeback(rb);
3457 }
3458
3459 xbzrle_load_cleanup();
3460 compress_threads_load_cleanup();
3461
3462 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3463 g_free(rb->receivedmap);
3464 rb->receivedmap = NULL;
3465 }
3466
3467 return 0;
3468 }
3469
3470 /**
3471 * ram_postcopy_incoming_init: allocate postcopy data structures
3472 *
3473 * Returns 0 for success and negative if there was one error
3474 *
3475 * @mis: current migration incoming state
3476 *
3477 * Allocate data structures etc needed by incoming migration with
3478 * postcopy-ram. postcopy-ram's similarly named
3479 * postcopy_ram_incoming_init does the work.
3480 */
3481 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3482 {
3483 return postcopy_ram_incoming_init(mis);
3484 }
3485
3486 /**
3487 * ram_load_postcopy: load a page in postcopy case
3488 *
3489 * Returns 0 for success or -errno in case of error
3490 *
3491 * Called in postcopy mode by ram_load().
3492 * rcu_read_lock is taken prior to this being called.
3493 *
3494 * @f: QEMUFile to receive the data from
3495 */
3496 static int ram_load_postcopy(QEMUFile *f)
3497 {
3498 int flags = 0, ret = 0;
3499 bool place_needed = false;
3500 bool matches_target_page_size = false;
3501 MigrationIncomingState *mis = migration_incoming_get_current();
3502 /* Temporary page that is later 'placed' */
3503 void *postcopy_host_page = mis->postcopy_tmp_page;
3504 void *host_page = NULL;
3505 bool all_zero = true;
3506 int target_pages = 0;
3507
3508 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3509 ram_addr_t addr;
3510 void *page_buffer = NULL;
3511 void *place_source = NULL;
3512 RAMBlock *block = NULL;
3513 uint8_t ch;
3514 int len;
3515
3516 addr = qemu_get_be64(f);
3517
3518 /*
3519 * If there is a QEMUFile error, we should stop here; "addr"
3520 * may be invalid
3521 */
3522 ret = qemu_file_get_error(f);
3523 if (ret) {
3524 break;
3525 }
3526
3527 flags = addr & ~TARGET_PAGE_MASK;
3528 addr &= TARGET_PAGE_MASK;
3529
3530 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3531 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3532 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3533 block = ram_block_from_stream(f, flags);
3534 if (!block) {
3535 ret = -EINVAL;
3536 break;
3537 }
3538
3539 /*
3540 * Relying on used_length is racy and can result in false positives.
3541 * We might place pages beyond used_length in case RAM was shrunk
3542 * while in postcopy, which is fine - trying to place via
3543 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3544 */
3545 if (!block->host || addr >= block->postcopy_length) {
3546 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3547 ret = -EINVAL;
3548 break;
3549 }
3550 target_pages++;
3551 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3552 /*
3553 * Postcopy requires that we place whole host pages atomically;
3554 * these may be huge pages for RAMBlocks that are backed by
3555 * hugetlbfs.
3556 * To make it atomic, the data is read into a temporary page
3557 * that's moved into place later.
3558 * The migration protocol uses, possibly smaller, target pages;
3559 * however, the source ensures it always sends all the components
3560 * of a host page in one chunk.
3561 */
3562 page_buffer = postcopy_host_page +
3563 host_page_offset_from_ram_block_offset(block, addr);
3564 /* If all TP are zero then we can optimise the place */
3565 if (target_pages == 1) {
3566 host_page = host_page_from_ram_block_offset(block, addr);
3567 } else if (host_page != host_page_from_ram_block_offset(block,
3568 addr)) {
3569 /* not the 1st TP within the HP */
3570 error_report("Non-same host page %p/%p", host_page,
3571 host_page_from_ram_block_offset(block, addr));
3572 ret = -EINVAL;
3573 break;
3574 }
3575
3576 /*
3577 * If it's the last part of a host page then we place the host
3578 * page
3579 */
3580 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3581 place_needed = true;
3582 }
3583 place_source = postcopy_host_page;
3584 }
3585
3586 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3587 case RAM_SAVE_FLAG_ZERO:
3588 ch = qemu_get_byte(f);
3589 /*
3590 * We can skip setting page_buffer when
3591 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3592 */
3593 if (ch || !matches_target_page_size) {
3594 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3595 }
3596 if (ch) {
3597 all_zero = false;
3598 }
3599 break;
3600
3601 case RAM_SAVE_FLAG_PAGE:
3602 all_zero = false;
3603 if (!matches_target_page_size) {
3604 /* For huge pages, we always use a temporary buffer */
3605 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3606 } else {
3607 /*
3608 * For small pages that match the target page size, we
3609 * avoid the qemu_file copy. Instead we directly use
3610 * the buffer of QEMUFile to place the page. Note: we
3611 * cannot do any QEMUFile operation before using that
3612 * buffer to make sure the buffer is valid when
3613 * placing the page.
3614 */
3615 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3616 TARGET_PAGE_SIZE);
3617 }
3618 break;
3619 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3620 all_zero = false;
3621 len = qemu_get_be32(f);
3622 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3623 error_report("Invalid compressed data length: %d", len);
3624 ret = -EINVAL;
3625 break;
3626 }
3627 decompress_data_with_multi_threads(f, page_buffer, len);
3628 break;
3629
3630 case RAM_SAVE_FLAG_EOS:
3631 /* normal exit */
3632 multifd_recv_sync_main();
3633 break;
3634 default:
3635 error_report("Unknown combination of migration flags: 0x%x"
3636 " (postcopy mode)", flags);
3637 ret = -EINVAL;
3638 break;
3639 }
3640
3641 /* Got the whole host page, wait for decompress before placing. */
3642 if (place_needed) {
3643 ret |= wait_for_decompress_done();
3644 }
3645
3646 /* Detect for any possible file errors */
3647 if (!ret && qemu_file_get_error(f)) {
3648 ret = qemu_file_get_error(f);
3649 }
3650
3651 if (!ret && place_needed) {
3652 if (all_zero) {
3653 ret = postcopy_place_page_zero(mis, host_page, block);
3654 } else {
3655 ret = postcopy_place_page(mis, host_page, place_source,
3656 block);
3657 }
3658 place_needed = false;
3659 target_pages = 0;
3660 /* Assume we have a zero page until we detect something different */
3661 all_zero = true;
3662 }
3663 }
3664
3665 return ret;
3666 }
3667
3668 static bool postcopy_is_advised(void)
3669 {
3670 PostcopyState ps = postcopy_state_get();
3671 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3672 }
3673
3674 static bool postcopy_is_running(void)
3675 {
3676 PostcopyState ps = postcopy_state_get();
3677 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3678 }
3679
3680 /*
3681 * Flush the content of the RAM cache into the SVM's memory.
3682 * Only flush pages that have been dirtied by the PVM, the SVM, or both.
3683 */
3684 void colo_flush_ram_cache(void)
3685 {
3686 RAMBlock *block = NULL;
3687 void *dst_host;
3688 void *src_host;
3689 unsigned long offset = 0;
3690
3691 memory_global_dirty_log_sync();
3692 qemu_mutex_lock(&ram_state->bitmap_mutex);
3693 WITH_RCU_READ_LOCK_GUARD() {
3694 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3695 ramblock_sync_dirty_bitmap(ram_state, block);
3696 }
3697 }
3698
3699 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3700 WITH_RCU_READ_LOCK_GUARD() {
3701 block = QLIST_FIRST_RCU(&ram_list.blocks);
3702
3703 while (block) {
3704 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3705
3706 if (!offset_in_ramblock(block,
3707 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3708 offset = 0;
3709 block = QLIST_NEXT_RCU(block, next);
3710 } else {
3711 migration_bitmap_clear_dirty(ram_state, block, offset);
3712 dst_host = block->host
3713 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3714 src_host = block->colo_cache
3715 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3716 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3717 }
3718 }
3719 }
3720 trace_colo_flush_ram_cache_end();
3721 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3722 }
3723
3724 /**
3725 * ram_load_precopy: load pages in precopy case
3726 *
3727 * Returns 0 for success or -errno in case of error
3728 *
3729 * Called in precopy mode by ram_load().
3730 * rcu_read_lock is taken prior to this being called.
3731 *
3732 * @f: QEMUFile to receive the data from
3733 */
3734 static int ram_load_precopy(QEMUFile *f)
3735 {
3736 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3737 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3738 bool postcopy_advised = postcopy_is_advised();
3739 if (!migrate_use_compression()) {
3740 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3741 }
3742
3743 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3744 ram_addr_t addr, total_ram_bytes;
3745 void *host = NULL, *host_bak = NULL;
3746 uint8_t ch;
3747
3748 /*
3749 * Yield periodically to let the main loop run, but an iteration of
3750 * the main loop is expensive, so only do it every few iterations
3751 */
3752 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3753 aio_co_schedule(qemu_get_current_aio_context(),
3754 qemu_coroutine_self());
3755 qemu_coroutine_yield();
3756 }
3757 i++;
3758
3759 addr = qemu_get_be64(f);
3760 flags = addr & ~TARGET_PAGE_MASK;
3761 addr &= TARGET_PAGE_MASK;
3762
3763 if (flags & invalid_flags) {
3764 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3765 error_report("Received an unexpected compressed page");
3766 }
3767
3768 ret = -EINVAL;
3769 break;
3770 }
3771
3772 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3773 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3774 RAMBlock *block = ram_block_from_stream(f, flags);
3775
3776 host = host_from_ram_block_offset(block, addr);
3777 /*
3778 * After going into the COLO stage, we should not load the page
3779 * into the SVM's memory directly; we put it into colo_cache first.
3780 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3781 * Previously, we copied all this memory in the COLO preparing stage
3782 * while the VM had to be stopped, which is a time-consuming process.
3783 * Here we optimize it with a trick: back up every page during the
3784 * migration process while COLO is enabled. Although this affects the
3785 * speed of the migration, it obviously reduces the downtime of
3786 * backing up all of the SVM's memory in the COLO preparing stage.
3787 */
3788 if (migration_incoming_colo_enabled()) {
3789 if (migration_incoming_in_colo_state()) {
3790 /* In COLO stage, put all pages into cache temporarily */
3791 host = colo_cache_from_block_offset(block, addr, true);
3792 } else {
3793 /*
3794 * In migration stage but before COLO stage,
3795 * Put all pages into both cache and SVM's memory.
3796 */
3797 host_bak = colo_cache_from_block_offset(block, addr, false);
3798 }
3799 }
3800 if (!host) {
3801 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3802 ret = -EINVAL;
3803 break;
3804 }
3805 if (!migration_incoming_in_colo_state()) {
3806 ramblock_recv_bitmap_set(block, host);
3807 }
3808
3809 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3810 }
3811
3812 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3813 case RAM_SAVE_FLAG_MEM_SIZE:
3814 /* Synchronize RAM block list */
3815 total_ram_bytes = addr;
3816 while (!ret && total_ram_bytes) {
3817 RAMBlock *block;
3818 char id[256];
3819 ram_addr_t length;
3820
3821 len = qemu_get_byte(f);
3822 qemu_get_buffer(f, (uint8_t *)id, len);
3823 id[len] = 0;
3824 length = qemu_get_be64(f);
3825
3826 block = qemu_ram_block_by_name(id);
3827 if (block && !qemu_ram_is_migratable(block)) {
3828 error_report("block %s should not be migrated !", id);
3829 ret = -EINVAL;
3830 } else if (block) {
3831 if (length != block->used_length) {
3832 Error *local_err = NULL;
3833
3834 ret = qemu_ram_resize(block, length,
3835 &local_err);
3836 if (local_err) {
3837 error_report_err(local_err);
3838 }
3839 }
3840 /* For postcopy we need to check hugepage sizes match */
3841 if (postcopy_advised && migrate_postcopy_ram() &&
3842 block->page_size != qemu_host_page_size) {
3843 uint64_t remote_page_size = qemu_get_be64(f);
3844 if (remote_page_size != block->page_size) {
3845 error_report("Mismatched RAM page size %s "
3846 "(local) %zd != %" PRId64,
3847 id, block->page_size,
3848 remote_page_size);
3849 ret = -EINVAL;
3850 }
3851 }
3852 if (migrate_ignore_shared()) {
3853 hwaddr addr = qemu_get_be64(f);
3854 if (ramblock_is_ignored(block) &&
3855 block->mr->addr != addr) {
3856 error_report("Mismatched GPAs for block %s "
3857 "%" PRId64 "!= %" PRId64,
3858 id, (uint64_t)addr,
3859 (uint64_t)block->mr->addr);
3860 ret = -EINVAL;
3861 }
3862 }
3863 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3864 block->idstr);
3865 } else {
3866 error_report("Unknown ramblock \"%s\", cannot "
3867 "accept migration", id);
3868 ret = -EINVAL;
3869 }
3870
3871 total_ram_bytes -= length;
3872 }
3873 break;
3874
3875 case RAM_SAVE_FLAG_ZERO:
3876 ch = qemu_get_byte(f);
3877 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3878 break;
3879
3880 case RAM_SAVE_FLAG_PAGE:
3881 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3882 break;
3883
3884 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3885 len = qemu_get_be32(f);
3886 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3887 error_report("Invalid compressed data length: %d", len);
3888 ret = -EINVAL;
3889 break;
3890 }
3891 decompress_data_with_multi_threads(f, host, len);
3892 break;
3893
3894 case RAM_SAVE_FLAG_XBZRLE:
3895 if (load_xbzrle(f, addr, host) < 0) {
3896 error_report("Failed to decompress XBZRLE page at "
3897 RAM_ADDR_FMT, addr);
3898 ret = -EINVAL;
3899 break;
3900 }
3901 break;
3902 case RAM_SAVE_FLAG_EOS:
3903 /* normal exit */
3904 multifd_recv_sync_main();
3905 break;
3906 default:
3907 if (flags & RAM_SAVE_FLAG_HOOK) {
3908 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3909 } else {
3910 error_report("Unknown combination of migration flags: 0x%x",
3911 flags);
3912 ret = -EINVAL;
3913 }
3914 }
3915 if (!ret) {
3916 ret = qemu_file_get_error(f);
3917 }
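/*
 * When COLO is enabled but we are not yet in the COLO stage, host
 * points at the SVM's memory and host_bak at its colo_cache backup;
 * mirror the page we just loaded into the backup as well.
 */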
3918 if (!ret && host_bak) {
3919 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3920 }
3921 }
3922
3923 ret |= wait_for_decompress_done();
3924 return ret;
3925 }
3926
3927 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3928 {
3929 int ret = 0;
3930 static uint64_t seq_iter;
3931 /*
3932 * If the system is running in postcopy mode, page inserts into host
3933 * memory must be atomic.
3934 */
3935 bool postcopy_running = postcopy_is_running();
3936
3937 seq_iter++;
3938
3939 if (version_id != 4) {
3940 return -EINVAL;
3941 }
3942
3943 /*
3944 * This RCU critical section can be very long running.
3945 * If RCU reclamation in this code ever becomes frequent,
3946 * it will be necessary to reduce the granularity of this
3947 * critical section.
3948 */
3949 WITH_RCU_READ_LOCK_GUARD() {
3950 if (postcopy_running) {
3951 ret = ram_load_postcopy(f);
3952 } else {
3953 ret = ram_load_precopy(f);
3954 }
3955 }
3956 trace_ram_load_complete(ret, seq_iter);
3957
3958 return ret;
3959 }
3960
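/*
 * Postcopy is reported as unsupported when any migratable RAM block is
 * backed by persistent memory (nvdimm); otherwise it follows the
 * postcopy-ram capability.
 */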
3961 static bool ram_has_postcopy(void *opaque)
3962 {
3963 RAMBlock *rb;
3964 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3965 if (ramblock_is_pmem(rb)) {
3966 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3967 "is not supported now!", rb->idstr, rb->host);
3968 return false;
3969 }
3970 }
3971
3972 return migrate_postcopy_ram();
3973 }
3974
3975 /* Sync all the dirty bitmaps with the destination VM. */
3976 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3977 {
3978 RAMBlock *block;
3979 QEMUFile *file = s->to_dst_file;
3980 int ramblock_count = 0;
3981
3982 trace_ram_dirty_bitmap_sync_start();
3983
3984 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3985 qemu_savevm_send_recv_bitmap(file, block->idstr);
3986 trace_ram_dirty_bitmap_request(block->idstr);
3987 ramblock_count++;
3988 }
3989
3990 trace_ram_dirty_bitmap_sync_wait();
3991
3992 /* Wait until all the ramblocks' dirty bitmaps have been synced */
3993 while (ramblock_count--) {
3994 qemu_sem_wait(&s->rp_state.rp_sem);
3995 }
3996
3997 trace_ram_dirty_bitmap_sync_complete();
3998
3999 return 0;
4000 }
4001
4002 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4003 {
4004 qemu_sem_post(&s->rp_state.rp_sem);
4005 }
4006
4007 /*
4008 * Read the received bitmap and invert it to form the initial dirty bitmap.
4009 * This is only used when a postcopy migration has been paused and wants
4010 * to resume from a middle point.
4011 */
4012 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4013 {
4014 int ret = -EINVAL;
4015 QEMUFile *file = s->rp_state.from_dst_file;
4016 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4017 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4018 uint64_t size, end_mark;
4019
4020 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4021
4022 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4023 error_report("%s: incorrect state %s", __func__,
4024 MigrationStatus_str(s->state));
4025 return -EINVAL;
4026 }
4027
4028 /*
4029 * Note: see comments in ramblock_recv_bitmap_send() on why we
4030 * need the endianness conversion and the padding.
4031 */
4032 local_size = ROUND_UP(local_size, 8);
4033
4034 /* Add padding */
4035 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4036
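/*
 * The bitmap is framed on the wire as: a be64 size in bytes (which must
 * match local_size), the bitmap itself in little-endian layout padded
 * to a multiple of 8 bytes, and a be64 end mark
 * (RAMBLOCK_RECV_BITMAP_ENDING).
 */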
4037 size = qemu_get_be64(file);
4038
4039 /* The size of the bitmap should match that of our ramblock */
4040 if (size != local_size) {
4041 error_report("%s: ramblock '%s' bitmap size mismatch "
4042 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4043 block->idstr, size, local_size);
4044 ret = -EINVAL;
4045 goto out;
4046 }
4047
4048 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4049 end_mark = qemu_get_be64(file);
4050
4051 ret = qemu_file_get_error(file);
4052 if (ret || size != local_size) {
4053 error_report("%s: read bitmap failed for ramblock '%s': %d"
4054 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4055 __func__, block->idstr, ret, local_size, size);
4056 ret = -EIO;
4057 goto out;
4058 }
4059
4060 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4061 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4062 __func__, block->idstr, end_mark);
4063 ret = -EINVAL;
4064 goto out;
4065 }
4066
4067 /*
4068 * Endianness conversion. We are in postcopy (though paused), so
4069 * the dirty bitmap won't change and we can modify it directly.
4070 */
4071 bitmap_from_le(block->bmap, le_bitmap, nbits);
4072
4073 /*
4074 * What we received is the "received bitmap": invert it to obtain the
4075 * initial dirty bitmap for this ramblock (pages not yet received).
4076 */
4077 bitmap_complement(block->bmap, block->bmap, nbits);
4078
4079 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4080
4081 /*
4082 * We have successfully synced the bitmap for this ramblock. If it is
4083 * the last one to sync, we need to notify the main send thread.
4084 */
4085 ram_dirty_bitmap_reload_notify(s);
4086
4087 ret = 0;
4088 out:
4089 g_free(le_bitmap);
4090 return ret;
4091 }
4092
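/*
 * Source-side hook for resuming a paused postcopy migration: pull the
 * destination's received bitmaps to rebuild our dirty bitmaps, then
 * prepare the RAM state for the new outgoing stream.
 */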
4093 static int ram_resume_prepare(MigrationState *s, void *opaque)
4094 {
4095 RAMState *rs = *(RAMState **)opaque;
4096 int ret;
4097
4098 ret = ram_dirty_bitmap_sync_all(s, rs);
4099 if (ret) {
4100 return ret;
4101 }
4102
4103 ram_state_resume_prepare(rs, s->to_dst_file);
4104
4105 return 0;
4106 }
4107
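/*
 * Callback table that plugs RAM migration into the generic savevm
 * framework; registered for the "ram" section in ram_mig_init() below.
 */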
4108 static SaveVMHandlers savevm_ram_handlers = {
4109 .save_setup = ram_save_setup,
4110 .save_live_iterate = ram_save_iterate,
4111 .save_live_complete_postcopy = ram_save_complete,
4112 .save_live_complete_precopy = ram_save_complete,
4113 .has_postcopy = ram_has_postcopy,
4114 .save_live_pending = ram_save_pending,
4115 .load_state = ram_load,
4116 .save_cleanup = ram_save_cleanup,
4117 .load_setup = ram_load_setup,
4118 .load_cleanup = ram_load_cleanup,
4119 .resume_prepare = ram_resume_prepare,
4120 };
4121
4122 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4123 size_t old_size, size_t new_size)
4124 {
4125 PostcopyState ps = postcopy_state_get();
4126 ram_addr_t offset;
4127 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4128 Error *err = NULL;
4129
4130 if (ramblock_is_ignored(rb)) {
4131 return;
4132 }
4133
4134 if (!migration_is_idle()) {
4135 /*
4136 * Precopy code on the source cannot deal with the size of RAM blocks
4137 * changing at random points in time; in particular, once the RAM
4138 * block sizes have been sent in the migration stream, they must no
4139 * longer change. Abort and indicate a proper reason.
4140 */
4141 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4142 migrate_set_error(migrate_get_current(), err);
4143 error_free(err);
4144 migration_cancel();
4145 }
4146
4147 switch (ps) {
4148 case POSTCOPY_INCOMING_ADVISE:
4149 /*
4150 * Update what ram_postcopy_incoming_init()->init_range() does at the
4151 * time postcopy was advised. Syncing RAM blocks with the source will
4152 * result in RAM resizes.
4153 */
4154 if (old_size < new_size) {
4155 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4156 error_report("RAM block '%s' discard of resized RAM failed",
4157 rb->idstr);
4158 }
4159 }
4160 rb->postcopy_length = new_size;
4161 break;
4162 case POSTCOPY_INCOMING_NONE:
4163 case POSTCOPY_INCOMING_RUNNING:
4164 case POSTCOPY_INCOMING_END:
4165 /*
4166 * Once our guest is running, postcopy no longer cares about
4167 * resizes. When growing, the new memory was not available on the
4168 * source, so no handling is needed.
4169 */
4170 break;
4171 default:
4172 error_report("RAM block '%s' resized during postcopy state: %d",
4173 rb->idstr, ps);
4174 exit(-1);
4175 }
4176 }
4177
4178 static RAMBlockNotifier ram_mig_ram_notifier = {
4179 .ram_block_resized = ram_mig_ram_block_resized,
4180 };
4181
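/*
 * Module entry point: initializes the XBZRLE lock, registers the "ram"
 * live-migration section (stream version 4, matching the version_id
 * check in ram_load()), and adds the RAM block resize notifier above.
 */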
4182 void ram_mig_init(void)
4183 {
4184 qemu_mutex_init(&XBZRLE.lock);
4185 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4186 ram_block_notifier_add(&ram_mig_ram_notifier);
4187 }