migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "io/channel-null.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/error.h"
  46 #include "qapi/qapi-types-migration.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59 #include "sysemu/runstate.h"
  60
  61 #include "hw/boards.h" /* for machine_dump_guest_core() */
  62
  63 #if defined(__linux__)
  64 #include "qemu/userfaultfd.h"
  65 #endif /* defined(__linux__) */
  66
  67 /***********************************************************/
  68 /* ram save/restore */
  69
  70 /*
  71  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  72  * worked for pages that were filled with the same char.  We switched
  73  * it to only search for the zero value.  And to avoid confusion with
  74  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  75  */
  76 /*
  77  * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
  78  */
  79 #define RAM_SAVE_FLAG_FULL     0x01
  80 #define RAM_SAVE_FLAG_ZERO     0x02
  81 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  82 #define RAM_SAVE_FLAG_PAGE     0x08
  83 #define RAM_SAVE_FLAG_EOS      0x10
  84 #define RAM_SAVE_FLAG_CONTINUE 0x20
  85 #define RAM_SAVE_FLAG_XBZRLE   0x40
  86 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
  87 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  88 /* We can't use any flag that is bigger than 0x200 */
  89
/*
 * XBZRLE encoder entry point.  It defaults to the generic
 * xbzrle_encode_buffer; when CONFIG_AVX512BW_OPT is defined the
 * constructor below may replace it with the AVX512BW variant.
 */
int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
     uint8_t *, int) = xbzrle_encode_buffer;
#if defined(CONFIG_AVX512BW_OPT)
#include "qemu/cpuid.h"
/*
 * Constructor: switch xbzrle_encode_buffer_func to
 * xbzrle_encode_buffer_avx512 when CPUID reports OSXSAVE, AVX and
 * AVX512BW and XCR0 shows that the needed register state is enabled.
 */
static void __attribute__((constructor)) init_cpu_flag(void)
{
    unsigned max = __get_cpuid_max(0, NULL);
    int a, b, c, d;
    if (max >= 1) {
        __cpuid(1, a, b, c, d);
         /* We must check that AVX is not just available, but usable.  */
        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
            int bv;
            /* xgetbv with ECX = 0: the EAX half of the result lands in bv */
            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
            /* Leaf 7, subleaf 0: the AVX512BW bit is tested in b below */
            __cpuid_count(7, 0, a, b, c, d);
           /* 0xe6:
            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
            *                    and ZMM16-ZMM31 state are enabled by OS)
            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
            */
            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
                xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
            }
        }
    }
}
#endif
 117
 118 XBZRLECacheStats xbzrle_counters;
 119
/*
 * State of the search for pages to send.  RAMState holds one of these
 * per channel in its pss[] array.
 */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile    *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (a page index, not a byte address) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
    /* Whether we're sending a host page */
    bool          host_page_sending;
    /*
     * The start/end of the current host page.  Invalid if
     * host_page_sending==false.
     */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;
 139
/*
 * Global XBZRLE state: the page cache together with the scratch
 * buffers used while encoding and decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 155
 156 static void XBZRLE_cache_lock(void)
 157 {
 158     if (migrate_use_xbzrle()) {
 159         qemu_mutex_lock(&XBZRLE.lock);
 160     }
 161 }
 162
 163 static void XBZRLE_cache_unlock(void)
 164 {
 165     if (migrate_use_xbzrle()) {
 166         qemu_mutex_unlock(&XBZRLE.lock);
 167     }
 168 }
 169
 170 /**
 171  * xbzrle_cache_resize: resize the xbzrle cache
 172  *
 173  * This function is called from migrate_params_apply in main
 174  * thread, possibly while a migration is in progress.  A running
 175  * migration may be using the cache and might finish during this call,
 176  * hence changes to the cache are protected by XBZRLE.lock().
 177  *
 178  * Returns 0 for success or -1 for error
 179  *
 180  * @new_size: new cache size
 181  * @errp: set *errp if the check failed, with reason
 182  */
 183 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 184 {
 185     PageCache *new_cache;
 186     int64_t ret = 0;
 187
 188     /* Check for truncation */
 189     if (new_size != (size_t)new_size) {
 190         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 191                    "exceeding address space");
 192         return -1;
 193     }
 194
 195     if (new_size == migrate_xbzrle_cache_size()) {
 196         /* nothing to do */
 197         return 0;
 198     }
 199
 200     XBZRLE_cache_lock();
 201
 202     if (XBZRLE.cache != NULL) {
 203         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 204         if (!new_cache) {
 205             ret = -1;
 206             goto out;
 207         }
 208
 209         cache_fini(XBZRLE.cache);
 210         XBZRLE.cache = new_cache;
 211     }
 212 out:
 213     XBZRLE_cache_unlock();
 214     return ret;
 215 }
 216
 217 static bool postcopy_preempt_active(void)
 218 {
 219     return migrate_postcopy_preempt() && migration_in_postcopy();
 220 }
 221
 222 bool ramblock_is_ignored(RAMBlock *block)
 223 {
 224     return !qemu_ram_is_migratable(block) ||
 225            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 226 }
 227
 228 #undef RAMBLOCK_FOREACH
 229
 230 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 231 {
 232     RAMBlock *block;
 233     int ret = 0;
 234
 235     RCU_READ_LOCK_GUARD();
 236
 237     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 238         ret = func(block, opaque);
 239         if (ret) {
 240             break;
 241         }
 242     }
 243     return ret;
 244 }
 245
 246 static void ramblock_recv_map_init(void)
 247 {
 248     RAMBlock *rb;
 249
 250     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 251         assert(!rb->receivedmap);
 252         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 253     }
 254 }
 255
 256 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 257 {
 258     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 259                     rb->receivedmap);
 260 }
 261
 262 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 263 {
 264     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 265 }
 266
 267 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 268 {
 269     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 270 }
 271
 272 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 273                                     size_t nr)
 274 {
 275     bitmap_set_atomic(rb->receivedmap,
 276                       ramblock_recv_bitmap_offset(host_addr, rb),
 277                       nr);
 278 }
 279
 280 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 281
 282 /*
 283  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 284  *
 285  * Returns >0 if success with sent bytes, or <0 if error.
 286  */
 287 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 288                                   const char *block_name)
 289 {
 290     RAMBlock *block = qemu_ram_block_by_name(block_name);
 291     unsigned long *le_bitmap, nbits;
 292     uint64_t size;
 293
 294     if (!block) {
 295         error_report("%s: invalid block name: %s", __func__, block_name);
 296         return -1;
 297     }
 298
 299     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 300
 301     /*
 302      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 303      * machines we may need 4 more bytes for padding (see below
 304      * comment). So extend it a bit before hand.
 305      */
 306     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 307
 308     /*
 309      * Always use little endian when sending the bitmap. This is
 310      * required that when source and destination VMs are not using the
 311      * same endianness. (Note: big endian won't work.)
 312      */
 313     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 314
 315     /* Size of the bitmap, in bytes */
 316     size = DIV_ROUND_UP(nbits, 8);
 317
 318     /*
 319      * size is always aligned to 8 bytes for 64bit machines, but it
 320      * may not be true for 32bit machines. We need this padding to
 321      * make sure the migration can survive even between 32bit and
 322      * 64bit machines.
 323      */
 324     size = ROUND_UP(size, 8);
 325
 326     qemu_put_be64(file, size);
 327     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 328     /*
 329      * Mark as an end, in case the middle part is screwed up due to
 330      * some "mysterious" reason.
 331      */
 332     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 333     qemu_fflush(file);
 334
 335     g_free(le_bitmap);
 336
 337     if (qemu_file_get_error(file)) {
 338         return qemu_file_get_error(file);
 339     }
 340
 341     return size + sizeof(size);
 342 }
 343
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* NOTE(review): presumably the block the request refers to - confirm */
    RAMBlock *rb;
    /* NOTE(review): presumably the requested range within rb - confirm */
    hwaddr    offset;
    hwaddr    len;

    /* Linkage in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 355
/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels used when sending
     * pages.  Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have seen too many dirty pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 420
 421 static RAMState *ram_state;
 422
 423 static NotifierWithReturnList precopy_notifier_list;
 424
 425 /* Whether postcopy has queued requests? */
 426 static bool postcopy_has_request(RAMState *rs)
 427 {
 428     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 429 }
 430
 431 void precopy_infrastructure_init(void)
 432 {
 433     notifier_with_return_list_init(&precopy_notifier_list);
 434 }
 435
 436 void precopy_add_notifier(NotifierWithReturn *n)
 437 {
 438     notifier_with_return_list_add(&precopy_notifier_list, n);
 439 }
 440
 441 void precopy_remove_notifier(NotifierWithReturn *n)
 442 {
 443     notifier_with_return_remove(n);
 444 }
 445
 446 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 447 {
 448     PrecopyNotifyData pnd;
 449     pnd.reason = reason;
 450     pnd.errp = errp;
 451
 452     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 453 }
 454
 455 uint64_t ram_bytes_remaining(void)
 456 {
 457     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 458                        0;
 459 }
 460
 461 /*
 462  * NOTE: not all stats in ram_counters are used in reality.  See comments
 463  * for struct MigrationAtomicStats.  The ultimate result of ram migration
 464  * counters will be a merged version with both ram_counters and the atomic
 465  * fields in ram_atomic_counters.
 466  */
 467 MigrationStats ram_counters;
 468 MigrationAtomicStats ram_atomic_counters;
 469
 470 void ram_transferred_add(uint64_t bytes)
 471 {
 472     if (runstate_is_running()) {
 473         ram_counters.precopy_bytes += bytes;
 474     } else if (migration_in_postcopy()) {
 475         stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
 476     } else {
 477         ram_counters.downtime_bytes += bytes;
 478     }
 479     stat64_add(&ram_atomic_counters.transferred, bytes);
 480 }
 481
 482 void dirty_sync_missed_zero_copy(void)
 483 {
 484     ram_counters.dirty_sync_missed_zero_copy++;
 485 }
 486
/* Table of RAM migration hooks. */
struct MigrationOps {
    /* Hook taking the RAM state and the page search status of a channel */
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;
 491
 492 MigrationOps *migration_ops;
 493
 494 CompressionStats compression_counters;
 495
/* State of one compression worker thread (see do_data_compress()). */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, when a page is finished */
    bool done;
    /* Set under mutex to ask the worker to leave its loop */
    bool quit;
    /* Result of do_compress_ram_page() for the last page handled */
    bool zero_page;
    /* Dummy output file used to buffer the compressed data */
    QEMUFile *file;
    /* mutex and cond guard block/offset/quit and wake the worker */
    QemuMutex mutex;
    QemuCond cond;
    /* Pending page to compress; block == NULL means no work queued */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 511
/*
 * State for decompression work.  NOTE(review): presumably one instance
 * per decompression thread, mirroring CompressParam; the code using it
 * is outside this part of the file - confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 523
 524 static CompressParam *comp_param;
 525 static QemuThread *compress_threads;
 526 /* comp_done_cond is used to wake up the migration thread when
 527  * one of the compression threads has finished the compression.
 528  * comp_done_lock is used to co-work with comp_done_cond.
 529  */
 530 static QemuMutex comp_done_lock;
 531 static QemuCond comp_done_cond;
 532
 533 static QEMUFile *decomp_file;
 534 static DecompressParam *decomp_param;
 535 static QemuThread *decompress_threads;
 536 static QemuMutex decomp_done_lock;
 537 static QemuCond decomp_done_cond;
 538
 539 static int ram_save_host_page_urgent(PageSearchStatus *pss);
 540
 541 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 542                                  ram_addr_t offset, uint8_t *source_buf);
 543
 544 /* NOTE: page is the PFN not real ram_addr_t. */
 545 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 546 {
 547     pss->block = rb;
 548     pss->page = page;
 549     pss->complete_round = false;
 550 }
 551
 552 /*
 553  * Check whether two PSSs are actively sending the same page.  Return true
 554  * if it is, false otherwise.
 555  */
 556 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 557 {
 558     return pss1->host_page_sending && pss2->host_page_sending &&
 559         (pss1->host_page_start == pss2->host_page_start);
 560 }
 561
/*
 * Thread body of a compression worker.
 *
 * @opaque: the CompressParam owned by this worker
 *
 * Loops until param->quit is set.  Whenever param->block is non-NULL the
 * page is taken, compressed with param->mutex dropped, and completion is
 * announced through comp_done_cond.  Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Take the request and clear the slot while holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* Compress without holding param->mutex */
            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result and signal comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            /* Nothing queued: sleep until param->cond is signalled */
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 595
 596 static void compress_threads_save_cleanup(void)
 597 {
 598     int i, thread_count;
 599
 600     if (!migrate_use_compression() || !comp_param) {
 601         return;
 602     }
 603
 604     thread_count = migrate_compress_threads();
 605     for (i = 0; i < thread_count; i++) {
 606         /*
 607          * we use it as a indicator which shows if the thread is
 608          * properly init'd or not
 609          */
 610         if (!comp_param[i].file) {
 611             break;
 612         }
 613
 614         qemu_mutex_lock(&comp_param[i].mutex);
 615         comp_param[i].quit = true;
 616         qemu_cond_signal(&comp_param[i].cond);
 617         qemu_mutex_unlock(&comp_param[i].mutex);
 618
 619         qemu_thread_join(compress_threads + i);
 620         qemu_mutex_destroy(&comp_param[i].mutex);
 621         qemu_cond_destroy(&comp_param[i].cond);
 622         deflateEnd(&comp_param[i].stream);
 623         g_free(comp_param[i].originbuf);
 624         qemu_fclose(comp_param[i].file);
 625         comp_param[i].file = NULL;
 626     }
 627     qemu_mutex_destroy(&comp_done_lock);
 628     qemu_cond_destroy(&comp_done_cond);
 629     g_free(compress_threads);
 630     g_free(comp_param);
 631     compress_threads = NULL;
 632     comp_param = NULL;
 633 }
 634
 635 static int compress_threads_save_setup(void)
 636 {
 637     int i, thread_count;
 638
 639     if (!migrate_use_compression()) {
 640         return 0;
 641     }
 642     thread_count = migrate_compress_threads();
 643     compress_threads = g_new0(QemuThread, thread_count);
 644     comp_param = g_new0(CompressParam, thread_count);
 645     qemu_cond_init(&comp_done_cond);
 646     qemu_mutex_init(&comp_done_lock);
 647     for (i = 0; i < thread_count; i++) {
 648         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 649         if (!comp_param[i].originbuf) {
 650             goto exit;
 651         }
 652
 653         if (deflateInit(&comp_param[i].stream,
 654                         migrate_compress_level()) != Z_OK) {
 655             g_free(comp_param[i].originbuf);
 656             goto exit;
 657         }
 658
 659         /* comp_param[i].file is just used as a dummy buffer to save data,
 660          * set its ops to empty.
 661          */
 662         comp_param[i].file = qemu_file_new_output(
 663             QIO_CHANNEL(qio_channel_null_new()));
 664         comp_param[i].done = true;
 665         comp_param[i].quit = false;
 666         qemu_mutex_init(&comp_param[i].mutex);
 667         qemu_cond_init(&comp_param[i].cond);
 668         qemu_thread_create(compress_threads + i, "compress",
 669                            do_data_compress, comp_param + i,
 670                            QEMU_THREAD_JOINABLE);
 671     }
 672     return 0;
 673
 674 exit:
 675     compress_threads_save_cleanup();
 676     return -1;
 677 }
 678
 679 /**
 680  * save_page_header: write page header to wire
 681  *
 682  * If this is the 1st block, it also writes the block identification
 683  *
 684  * Returns the number of bytes written
 685  *
 686  * @pss: current PSS channel status
 687  * @block: block that contains the page we want to send
 688  * @offset: offset inside the block for the page
 689  *          in the lower bits, it contains flags
 690  */
 691 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
 692                                RAMBlock *block, ram_addr_t offset)
 693 {
 694     size_t size, len;
 695     bool same_block = (block == pss->last_sent_block);
 696
 697     if (same_block) {
 698         offset |= RAM_SAVE_FLAG_CONTINUE;
 699     }
 700     qemu_put_be64(f, offset);
 701     size = 8;
 702
 703     if (!same_block) {
 704         len = strlen(block->idstr);
 705         qemu_put_byte(f, len);
 706         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 707         size += 1 + len;
 708         pss->last_sent_block = block;
 709     }
 710     return size;
 711 }
 712
 713 /**
 714  * mig_throttle_guest_down: throttle down the guest
 715  *
 716  * Reduce amount of guest cpu execution to hopefully slow down memory
 717  * writes. If guest dirty memory rate is reduced below the rate at
 718  * which we can transfer pages to the destination then we should be
 719  * able to complete migration. Some workloads dirty memory way too
 720  * fast and will not effectively converge, even with auto-converge.
 721  */
 722 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 723                                     uint64_t bytes_dirty_threshold)
 724 {
 725     MigrationState *s = migrate_get_current();
 726     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 727     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 728     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 729     int pct_max = s->parameters.max_cpu_throttle;
 730
 731     uint64_t throttle_now = cpu_throttle_get_percentage();
 732     uint64_t cpu_now, cpu_ideal, throttle_inc;
 733
 734     /* We have not started throttling yet. Let's start it. */
 735     if (!cpu_throttle_active()) {
 736         cpu_throttle_set(pct_initial);
 737     } else {
 738         /* Throttling already on, just increase the rate */
 739         if (!pct_tailslow) {
 740             throttle_inc = pct_increment;
 741         } else {
 742             /* Compute the ideal CPU percentage used by Guest, which may
 743              * make the dirty rate match the dirty rate threshold. */
 744             cpu_now = 100 - throttle_now;
 745             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 746                         bytes_dirty_period);
 747             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 748         }
 749         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 750     }
 751 }
 752
 753 void mig_throttle_counter_reset(void)
 754 {
 755     RAMState *rs = ram_state;
 756
 757     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 758     rs->num_dirty_pages_period = 0;
 759     rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
 760 }
 761
 762 /**
 763  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 764  *
 765  * @rs: current RAM state
 766  * @current_addr: address for the zero page
 767  *
 768  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 769  * The important thing is that a stale (not-yet-0'd) page be replaced
 770  * by the new data.
 771  * As a bonus, if the page wasn't in the cache it gets added so that
 772  * when a small write is made into the 0'd page it gets XBZRLE sent.
 773  */
 774 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 775 {
 776     /* We don't care if this fails to allocate a new cache page
 777      * as long as it updated an old one */
 778     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 779                  ram_counters.dirty_sync_count);
 780 }
 781
 782 #define ENCODING_FLAG_XBZRLE 0x1
 783
 784 /**
 785  * save_xbzrle_page: compress and send current page
 786  *
 787  * Returns: 1 means that we wrote the page
 788  *          0 means that page is identical to the one already sent
 789  *          -1 means that xbzrle would be longer than normal
 790  *
 791  * @rs: current RAM state
 792  * @pss: current PSS channel
 793  * @current_data: pointer to the address of the page contents
 794  * @current_addr: addr of the page
 795  * @block: block that contains the page we want to send
 796  * @offset: offset inside the block for the page
 797  */
 798 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 799                             uint8_t **current_data, ram_addr_t current_addr,
 800                             RAMBlock *block, ram_addr_t offset)
 801 {
 802     int encoded_len = 0, bytes_xbzrle;
 803     uint8_t *prev_cached_page;
 804     QEMUFile *file = pss->pss_channel;
 805
 806     if (!cache_is_cached(XBZRLE.cache, current_addr,
 807                          ram_counters.dirty_sync_count)) {
 808         xbzrle_counters.cache_miss++;
 809         if (!rs->last_stage) {
 810             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 811                              ram_counters.dirty_sync_count) == -1) {
 812                 return -1;
 813             } else {
 814                 /* update *current_data when the page has been
 815                    inserted into cache */
 816                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 817             }
 818         }
 819         return -1;
 820     }
 821
 822     /*
 823      * Reaching here means the page has hit the xbzrle cache, no matter what
 824      * encoding result it is (normal encoding, overflow or skipping the page),
 825      * count the page as encoded. This is used to calculate the encoding rate.
 826      *
 827      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 828      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 829      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 830      * skipped page included. In this way, the encoding rate can tell if the
 831      * guest page is good for xbzrle encoding.
 832      */
 833     xbzrle_counters.pages++;
 834     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 835
 836     /* save current buffer into memory */
 837     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 838
 839     /* XBZRLE encoding (if there is no overflow) */
 840     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
 841                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 842                                             TARGET_PAGE_SIZE);
 843
 844     /*
 845      * Update the cache contents, so that it corresponds to the data
 846      * sent, in all cases except where we skip the page.
 847      */
 848     if (!rs->last_stage && encoded_len != 0) {
 849         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 850         /*
 851          * In the case where we couldn't compress, ensure that the caller
 852          * sends the data from the cache, since the guest might have
 853          * changed the RAM since we copied it.
 854          */
 855         *current_data = prev_cached_page;
 856     }
 857
 858     if (encoded_len == 0) {
 859         trace_save_xbzrle_page_skipping();
 860         return 0;
 861     } else if (encoded_len == -1) {
 862         trace_save_xbzrle_page_overflow();
 863         xbzrle_counters.overflow++;
 864         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 865         return -1;
 866     }
 867
 868     /* Send XBZRLE based compressed page */
 869     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
 870                                     offset | RAM_SAVE_FLAG_XBZRLE);
 871     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 872     qemu_put_be16(file, encoded_len);
 873     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 874     bytes_xbzrle += encoded_len + 1 + 2;
 875     /*
 876      * Like compressed_size (please see update_compress_thread_counts),
 877      * the xbzrle encoded bytes don't count the 8 byte header with
 878      * RAM_SAVE_FLAG_CONTINUE.
 879      */
 880     xbzrle_counters.bytes += bytes_xbzrle - 8;
 881     ram_transferred_add(bytes_xbzrle);
 882
 883     return 1;
 884 }
 885
 886 /**
 887  * pss_find_next_dirty: find the next dirty page of current ramblock
 888  *
 889  * This function updates pss->page to point to the next dirty page index
 890  * within the ramblock to migrate, or the end of ramblock when nothing
 891  * found.  Note that when pss->host_page_sending==true it means we're
 892  * during sending a host page, so we won't look for dirty page that is
 893  * outside the host page boundary.
 894  *
 895  * @pss: the current page search status
 896  */
 897 static void pss_find_next_dirty(PageSearchStatus *pss)
 898 {
 899     RAMBlock *rb = pss->block;
 900     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 901     unsigned long *bitmap = rb->bmap;
 902
 903     if (ramblock_is_ignored(rb)) {
 904         /* Points directly to the end, so we know no dirty page */
 905         pss->page = size;
 906         return;
 907     }
 908
 909     /*
 910      * If during sending a host page, only look for dirty pages within the
 911      * current host page being send.
 912      */
 913     if (pss->host_page_sending) {
 914         assert(pss->host_page_end);
 915         size = MIN(size, pss->host_page_end);
 916     }
 917
 918     pss->page = find_next_bit(bitmap, size, pss->page);
 919 }
 920
 921 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 922                                                        unsigned long page)
 923 {
 924     uint8_t shift;
 925     hwaddr size, start;
 926
 927     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 928         return;
 929     }
 930
 931     shift = rb->clear_bmap_shift;
 932     /*
 933      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 934      * can make things easier sometimes since then start address
 935      * of the small chunk will always be 64 pages aligned so the
 936      * bitmap will always be aligned to unsigned long. We should
 937      * even be able to remove this restriction but I'm simply
 938      * keeping it.
 939      */
 940     assert(shift >= 6);
 941
 942     size = 1ULL << (TARGET_PAGE_BITS + shift);
 943     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 944     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 945     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 946 }
 947
 948 static void
 949 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 950                                                  unsigned long start,
 951                                                  unsigned long npages)
 952 {
 953     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 954     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 955     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 956
 957     /*
 958      * Clear pages from start to start + npages - 1, so the end boundary is
 959      * exclusive.
 960      */
 961     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 962         migration_clear_memory_region_dirty_bitmap(rb, i);
 963     }
 964 }
 965
 966 /*
 967  * colo_bitmap_find_diry:find contiguous dirty pages from start
 968  *
 969  * Returns the page offset within memory region of the start of the contiguout
 970  * dirty page
 971  *
 972  * @rs: current RAM state
 973  * @rb: RAMBlock where to search for dirty pages
 974  * @start: page where we start the search
 975  * @num: the number of contiguous dirty pages
 976  */
 977 static inline
 978 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 979                                      unsigned long start, unsigned long *num)
 980 {
 981     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 982     unsigned long *bitmap = rb->bmap;
 983     unsigned long first, next;
 984
 985     *num = 0;
 986
 987     if (ramblock_is_ignored(rb)) {
 988         return size;
 989     }
 990
 991     first = find_next_bit(bitmap, size, start);
 992     if (first >= size) {
 993         return first;
 994     }
 995     next = find_next_zero_bit(bitmap, size, first + 1);
 996     assert(next >= first);
 997     *num = next - first;
 998     return first;
 999 }
1000
1001 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1002                                                 RAMBlock *rb,
1003                                                 unsigned long page)
1004 {
1005     bool ret;
1006
1007     /*
1008      * Clear dirty bitmap if needed.  This _must_ be called before we
1009      * send any of the page in the chunk because we need to make sure
1010      * we can capture further page content changes when we sync dirty
1011      * log the next time.  So as long as we are going to send any of
1012      * the page in the chunk we clear the remote dirty bitmap for all.
1013      * Clearing it earlier won't be a problem, but too late will.
1014      */
1015     migration_clear_memory_region_dirty_bitmap(rb, page);
1016
1017     ret = test_and_clear_bit(page, rb->bmap);
1018     if (ret) {
1019         rs->migration_dirty_pages--;
1020     }
1021
1022     return ret;
1023 }
1024
1025 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1026                                        void *opaque)
1027 {
1028     const hwaddr offset = section->offset_within_region;
1029     const hwaddr size = int128_get64(section->size);
1030     const unsigned long start = offset >> TARGET_PAGE_BITS;
1031     const unsigned long npages = size >> TARGET_PAGE_BITS;
1032     RAMBlock *rb = section->mr->ram_block;
1033     uint64_t *cleared_bits = opaque;
1034
1035     /*
1036      * We don't grab ram_state->bitmap_mutex because we expect to run
1037      * only when starting migration or during postcopy recovery where
1038      * we don't have concurrent access.
1039      */
1040     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1041         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1042     }
1043     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1044     bitmap_clear(rb->bmap, start, npages);
1045 }
1046
1047 /*
1048  * Exclude all dirty pages from migration that fall into a discarded range as
1049  * managed by a RamDiscardManager responsible for the mapped memory region of
1050  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1051  *
1052  * Discarded pages ("logically unplugged") have undefined content and must
1053  * not get migrated, because even reading these pages for migration might
1054  * result in undesired behavior.
1055  *
1056  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1057  *
1058  * Note: The result is only stable while migrating (precopy/postcopy).
1059  */
1060 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1061 {
1062     uint64_t cleared_bits = 0;
1063
1064     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1065         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1066         MemoryRegionSection section = {
1067             .mr = rb->mr,
1068             .offset_within_region = 0,
1069             .size = int128_make64(qemu_ram_get_used_length(rb)),
1070         };
1071
1072         ram_discard_manager_replay_discarded(rdm, &section,
1073                                              dirty_bitmap_clear_section,
1074                                              &cleared_bits);
1075     }
1076     return cleared_bits;
1077 }
1078
1079 /*
1080  * Check if a host-page aligned page falls into a discarded range as managed by
1081  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1082  *
1083  * Note: The result is only stable while migrating (precopy/postcopy).
1084  */
1085 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1086 {
1087     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1088         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1089         MemoryRegionSection section = {
1090             .mr = rb->mr,
1091             .offset_within_region = start,
1092             .size = int128_make64(qemu_ram_pagesize(rb)),
1093         };
1094
1095         return !ram_discard_manager_is_populated(rdm, &section);
1096     }
1097     return false;
1098 }
1099
1100 /* Called with RCU critical section */
1101 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1102 {
1103     uint64_t new_dirty_pages =
1104         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1105
1106     rs->migration_dirty_pages += new_dirty_pages;
1107     rs->num_dirty_pages_period += new_dirty_pages;
1108 }
1109
1110 /**
1111  * ram_pagesize_summary: calculate all the pagesizes of a VM
1112  *
1113  * Returns a summary bitmap of the page sizes of all RAMBlocks
1114  *
1115  * For VMs with just normal pages this is equivalent to the host page
1116  * size. If it's got some huge pages then it's the OR of all the
1117  * different page sizes.
1118  */
1119 uint64_t ram_pagesize_summary(void)
1120 {
1121     RAMBlock *block;
1122     uint64_t summary = 0;
1123
1124     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1125         summary |= block->page_size;
1126     }
1127
1128     return summary;
1129 }
1130
1131 uint64_t ram_get_total_transferred_pages(void)
1132 {
1133     return  stat64_get(&ram_atomic_counters.normal) +
1134         stat64_get(&ram_atomic_counters.duplicate) +
1135         compression_counters.pages + xbzrle_counters.pages;
1136 }
1137
1138 static void migration_update_rates(RAMState *rs, int64_t end_time)
1139 {
1140     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1141     double compressed_size;
1142
1143     /* calculate period counters */
1144     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1145                 / (end_time - rs->time_last_bitmap_sync);
1146
1147     if (!page_count) {
1148         return;
1149     }
1150
1151     if (migrate_use_xbzrle()) {
1152         double encoded_size, unencoded_size;
1153
1154         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1155             rs->xbzrle_cache_miss_prev) / page_count;
1156         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1157         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1158                          TARGET_PAGE_SIZE;
1159         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1160         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1161             xbzrle_counters.encoding_rate = 0;
1162         } else {
1163             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1164         }
1165         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1166         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1167     }
1168
1169     if (migrate_use_compression()) {
1170         compression_counters.busy_rate = (double)(compression_counters.busy -
1171             rs->compress_thread_busy_prev) / page_count;
1172         rs->compress_thread_busy_prev = compression_counters.busy;
1173
1174         compressed_size = compression_counters.compressed_size -
1175                           rs->compressed_size_prev;
1176         if (compressed_size) {
1177             double uncompressed_size = (compression_counters.pages -
1178                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1179
1180             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1181             compression_counters.compression_rate =
1182                                         uncompressed_size / compressed_size;
1183
1184             rs->compress_pages_prev = compression_counters.pages;
1185             rs->compressed_size_prev = compression_counters.compressed_size;
1186         }
1187     }
1188 }
1189
1190 static void migration_trigger_throttle(RAMState *rs)
1191 {
1192     MigrationState *s = migrate_get_current();
1193     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1194     uint64_t bytes_xfer_period =
1195         stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1196     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1197     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1198
1199     /* During block migration the auto-converge logic incorrectly detects
1200      * that ram migration makes no progress. Avoid this by disabling the
1201      * throttling logic during the bulk phase of block migration. */
1202     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1203         /* The following detection logic can be refined later. For now:
1204            Check to see if the ratio between dirtied bytes and the approx.
1205            amount of bytes that just got transferred since the last time
1206            we were in this routine reaches the threshold. If that happens
1207            twice, start or increase throttling. */
1208
1209         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1210             (++rs->dirty_rate_high_cnt >= 2)) {
1211             trace_migration_throttle();
1212             rs->dirty_rate_high_cnt = 0;
1213             mig_throttle_guest_down(bytes_dirty_period,
1214                                     bytes_dirty_threshold);
1215         }
1216     }
1217 }
1218
1219 static void migration_bitmap_sync(RAMState *rs)
1220 {
1221     RAMBlock *block;
1222     int64_t end_time;
1223
1224     ram_counters.dirty_sync_count++;
1225
1226     if (!rs->time_last_bitmap_sync) {
1227         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1228     }
1229
1230     trace_migration_bitmap_sync_start();
1231     memory_global_dirty_log_sync();
1232
1233     qemu_mutex_lock(&rs->bitmap_mutex);
1234     WITH_RCU_READ_LOCK_GUARD() {
1235         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1236             ramblock_sync_dirty_bitmap(rs, block);
1237         }
1238         ram_counters.remaining = ram_bytes_remaining();
1239     }
1240     qemu_mutex_unlock(&rs->bitmap_mutex);
1241
1242     memory_global_after_dirty_log_sync();
1243     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1244
1245     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1246
1247     /* more than 1 second = 1000 millisecons */
1248     if (end_time > rs->time_last_bitmap_sync + 1000) {
1249         migration_trigger_throttle(rs);
1250
1251         migration_update_rates(rs, end_time);
1252
1253         rs->target_page_count_prev = rs->target_page_count;
1254
1255         /* reset period counters */
1256         rs->time_last_bitmap_sync = end_time;
1257         rs->num_dirty_pages_period = 0;
1258         rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
1259     }
1260     if (migrate_use_events()) {
1261         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1262     }
1263 }
1264
1265 static void migration_bitmap_sync_precopy(RAMState *rs)
1266 {
1267     Error *local_err = NULL;
1268
1269     /*
1270      * The current notifier usage is just an optimization to migration, so we
1271      * don't stop the normal migration process in the error case.
1272      */
1273     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1274         error_report_err(local_err);
1275         local_err = NULL;
1276     }
1277
1278     migration_bitmap_sync(rs);
1279
1280     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1281         error_report_err(local_err);
1282     }
1283 }
1284
1285 void ram_release_page(const char *rbname, uint64_t offset)
1286 {
1287     if (!migrate_release_ram() || !migration_in_postcopy()) {
1288         return;
1289     }
1290
1291     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1292 }
1293
1294 /**
1295  * save_zero_page_to_file: send the zero page to the file
1296  *
1297  * Returns the size of data written to the file, 0 means the page is not
1298  * a zero page
1299  *
1300  * @pss: current PSS channel
1301  * @block: block that contains the page we want to send
1302  * @offset: offset inside the block for the page
1303  */
1304 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1305                                   RAMBlock *block, ram_addr_t offset)
1306 {
1307     uint8_t *p = block->host + offset;
1308     int len = 0;
1309
1310     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1311         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1312         qemu_put_byte(file, 0);
1313         len += 1;
1314         ram_release_page(block->idstr, offset);
1315     }
1316     return len;
1317 }
1318
1319 /**
1320  * save_zero_page: send the zero page to the stream
1321  *
1322  * Returns the number of pages written.
1323  *
1324  * @pss: current PSS channel
1325  * @block: block that contains the page we want to send
1326  * @offset: offset inside the block for the page
1327  */
1328 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1329                           ram_addr_t offset)
1330 {
1331     int len = save_zero_page_to_file(pss, f, block, offset);
1332
1333     if (len) {
1334         stat64_add(&ram_atomic_counters.duplicate, 1);
1335         ram_transferred_add(len);
1336         return 1;
1337     }
1338     return -1;
1339 }
1340
1341 /*
1342  * @pages: the number of pages written by the control path,
1343  *        < 0 - error
1344  *        > 0 - number of pages written
1345  *
1346  * Return true if the pages has been saved, otherwise false is returned.
1347  */
1348 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1349                               ram_addr_t offset, int *pages)
1350 {
1351     uint64_t bytes_xmit = 0;
1352     int ret;
1353
1354     *pages = -1;
1355     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1356                                 TARGET_PAGE_SIZE, &bytes_xmit);
1357     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1358         return false;
1359     }
1360
1361     if (bytes_xmit) {
1362         ram_transferred_add(bytes_xmit);
1363         *pages = 1;
1364     }
1365
1366     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1367         return true;
1368     }
1369
1370     if (bytes_xmit > 0) {
1371         stat64_add(&ram_atomic_counters.normal, 1);
1372     } else if (bytes_xmit == 0) {
1373         stat64_add(&ram_atomic_counters.duplicate, 1);
1374     }
1375
1376     return true;
1377 }
1378
1379 /*
1380  * directly send the page to the stream
1381  *
1382  * Returns the number of pages written.
1383  *
1384  * @pss: current PSS channel
1385  * @block: block that contains the page we want to send
1386  * @offset: offset inside the block for the page
1387  * @buf: the page to be sent
1388  * @async: send to page asyncly
1389  */
1390 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1391                             ram_addr_t offset, uint8_t *buf, bool async)
1392 {
1393     QEMUFile *file = pss->pss_channel;
1394
1395     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1396                                          offset | RAM_SAVE_FLAG_PAGE));
1397     if (async) {
1398         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1399                               migrate_release_ram() &&
1400                               migration_in_postcopy());
1401     } else {
1402         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1403     }
1404     ram_transferred_add(TARGET_PAGE_SIZE);
1405     stat64_add(&ram_atomic_counters.normal, 1);
1406     return 1;
1407 }
1408
1409 /**
1410  * ram_save_page: send the given page to the stream
1411  *
1412  * Returns the number of pages written.
1413  *          < 0 - error
1414  *          >=0 - Number of pages written - this might legally be 0
1415  *                if xbzrle noticed the page was the same.
1416  *
1417  * @rs: current RAM state
1418  * @block: block that contains the page we want to send
1419  * @offset: offset inside the block for the page
1420  */
1421 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1422 {
1423     int pages = -1;
1424     uint8_t *p;
1425     bool send_async = true;
1426     RAMBlock *block = pss->block;
1427     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1428     ram_addr_t current_addr = block->offset + offset;
1429
1430     p = block->host + offset;
1431     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1432
1433     XBZRLE_cache_lock();
1434     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1435         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1436                                  block, offset);
1437         if (!rs->last_stage) {
1438             /* Can't send this cached data async, since the cache page
1439              * might get updated before it gets to the wire
1440              */
1441             send_async = false;
1442         }
1443     }
1444
1445     /* XBZRLE overflow or normal page */
1446     if (pages == -1) {
1447         pages = save_normal_page(pss, block, offset, p, send_async);
1448     }
1449
1450     XBZRLE_cache_unlock();
1451
1452     return pages;
1453 }
1454
1455 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1456                                  ram_addr_t offset)
1457 {
1458     if (multifd_queue_page(file, block, offset) < 0) {
1459         return -1;
1460     }
1461     stat64_add(&ram_atomic_counters.normal, 1);
1462
1463     return 1;
1464 }
1465
1466 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1467                                  ram_addr_t offset, uint8_t *source_buf)
1468 {
1469     RAMState *rs = ram_state;
1470     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1471     uint8_t *p = block->host + offset;
1472     int ret;
1473
1474     if (save_zero_page_to_file(pss, f, block, offset)) {
1475         return true;
1476     }
1477
1478     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1479
1480     /*
1481      * copy it to a internal buffer to avoid it being modified by VM
1482      * so that we can catch up the error during compression and
1483      * decompression
1484      */
1485     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1486     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1487     if (ret < 0) {
1488         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1489         error_report("compressed data failed!");
1490     }
1491     return false;
1492 }
1493
1494 static void
1495 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1496 {
1497     ram_transferred_add(bytes_xmit);
1498
1499     if (param->zero_page) {
1500         stat64_add(&ram_atomic_counters.duplicate, 1);
1501         return;
1502     }
1503
1504     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1505     compression_counters.compressed_size += bytes_xmit - 8;
1506     compression_counters.pages++;
1507 }
1508
1509 static bool save_page_use_compression(RAMState *rs);
1510
1511 static void flush_compressed_data(RAMState *rs)
1512 {
1513     MigrationState *ms = migrate_get_current();
1514     int idx, len, thread_count;
1515
1516     if (!save_page_use_compression(rs)) {
1517         return;
1518     }
1519     thread_count = migrate_compress_threads();
1520
1521     qemu_mutex_lock(&comp_done_lock);
1522     for (idx = 0; idx < thread_count; idx++) {
1523         while (!comp_param[idx].done) {
1524             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1525         }
1526     }
1527     qemu_mutex_unlock(&comp_done_lock);
1528
1529     for (idx = 0; idx < thread_count; idx++) {
1530         qemu_mutex_lock(&comp_param[idx].mutex);
1531         if (!comp_param[idx].quit) {
1532             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1533             /*
1534              * it's safe to fetch zero_page without holding comp_done_lock
1535              * as there is no further request submitted to the thread,
1536              * i.e, the thread should be waiting for a request at this point.
1537              */
1538             update_compress_thread_counts(&comp_param[idx], len);
1539         }
1540         qemu_mutex_unlock(&comp_param[idx].mutex);
1541     }
1542 }
1543
1544 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1545                                        ram_addr_t offset)
1546 {
1547     param->block = block;
1548     param->offset = offset;
1549 }
1550
1551 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1552 {
1553     int idx, thread_count, bytes_xmit = -1, pages = -1;
1554     bool wait = migrate_compress_wait_thread();
1555     MigrationState *ms = migrate_get_current();
1556
1557     thread_count = migrate_compress_threads();
1558     qemu_mutex_lock(&comp_done_lock);
1559 retry:
1560     for (idx = 0; idx < thread_count; idx++) {
1561         if (comp_param[idx].done) {
1562             comp_param[idx].done = false;
1563             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1564                                             comp_param[idx].file);
1565             qemu_mutex_lock(&comp_param[idx].mutex);
1566             set_compress_params(&comp_param[idx], block, offset);
1567             qemu_cond_signal(&comp_param[idx].cond);
1568             qemu_mutex_unlock(&comp_param[idx].mutex);
1569             pages = 1;
1570             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1571             break;
1572         }
1573     }
1574
1575     /*
1576      * wait for the free thread if the user specifies 'compress-wait-thread',
1577      * otherwise we will post the page out in the main thread as normal page.
1578      */
1579     if (pages < 0 && wait) {
1580         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1581         goto retry;
1582     }
1583     qemu_mutex_unlock(&comp_done_lock);
1584
1585     return pages;
1586 }
1587
1588 #define PAGE_ALL_CLEAN 0
1589 #define PAGE_TRY_AGAIN 1
1590 #define PAGE_DIRTY_FOUND 2
1591 /**
1592  * find_dirty_block: find the next dirty page and update any state
1593  * associated with the search process.
1594  *
1595  * Returns:
1596  *         PAGE_ALL_CLEAN: no dirty page found, give up
1597  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1598  *         PAGE_DIRTY_FOUND: dirty page found
1599  *
1600  * @rs: current RAM state
1601  * @pss: data about the state of the current dirty page scan
1602  * @again: set to false if the search has scanned the whole of RAM
1603  */
1604 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1605 {
1606     /* Update pss->page for the next dirty bit in ramblock */
1607     pss_find_next_dirty(pss);
1608
1609     if (pss->complete_round && pss->block == rs->last_seen_block &&
1610         pss->page >= rs->last_page) {
1611         /*
1612          * We've been once around the RAM and haven't found anything.
1613          * Give up.
1614          */
1615         return PAGE_ALL_CLEAN;
1616     }
1617     if (!offset_in_ramblock(pss->block,
1618                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1619         /* Didn't find anything in this RAM Block */
1620         pss->page = 0;
1621         pss->block = QLIST_NEXT_RCU(pss->block, next);
1622         if (!pss->block) {
1623             /*
1624              * If memory migration starts over, we will meet a dirtied page
1625              * which may still exists in compression threads's ring, so we
1626              * should flush the compressed data to make sure the new page
1627              * is not overwritten by the old one in the destination.
1628              *
1629              * Also If xbzrle is on, stop using the data compression at this
1630              * point. In theory, xbzrle can do better than compression.
1631              */
1632             flush_compressed_data(rs);
1633
1634             /* Hit the end of the list */
1635             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1636             /* Flag that we've looped */
1637             pss->complete_round = true;
1638             /* After the first round, enable XBZRLE. */
1639             if (migrate_use_xbzrle()) {
1640                 rs->xbzrle_enabled = true;
1641             }
1642         }
1643         /* Didn't find anything this time, but try again on the new block */
1644         return PAGE_TRY_AGAIN;
1645     } else {
1646         /* We've found something */
1647         return PAGE_DIRTY_FOUND;
1648     }
1649 }
1650
1651 /**
1652  * unqueue_page: gets a page of the queue
1653  *
1654  * Helper for 'get_queued_page' - gets a page off the queue
1655  *
1656  * Returns the block of the page (or NULL if none available)
1657  *
1658  * @rs: current RAM state
1659  * @offset: used to return the offset within the RAMBlock
1660  */
1661 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1662 {
1663     struct RAMSrcPageRequest *entry;
1664     RAMBlock *block = NULL;
1665
1666     if (!postcopy_has_request(rs)) {
1667         return NULL;
1668     }
1669
1670     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1671
1672     /*
1673      * This should _never_ change even after we take the lock, because no one
1674      * should be taking anything off the request list other than us.
1675      */
1676     assert(postcopy_has_request(rs));
1677
1678     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1679     block = entry->rb;
1680     *offset = entry->offset;
1681
1682     if (entry->len > TARGET_PAGE_SIZE) {
1683         entry->len -= TARGET_PAGE_SIZE;
1684         entry->offset += TARGET_PAGE_SIZE;
1685     } else {
1686         memory_region_unref(block->mr);
1687         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1688         g_free(entry);
1689         migration_consume_urgent_request();
1690     }
1691
1692     return block;
1693 }
1694
1695 #if defined(__linux__)
1696 /**
1697  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1698  *   is found, return RAM block pointer and page offset
1699  *
1700  * Returns pointer to the RAMBlock containing faulting page,
1701  *   NULL if no write faults are pending
1702  *
1703  * @rs: current RAM state
1704  * @offset: page offset from the beginning of the block
1705  */
1706 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1707 {
1708     struct uffd_msg uffd_msg;
1709     void *page_address;
1710     RAMBlock *block;
1711     int res;
1712
1713     if (!migrate_background_snapshot()) {
1714         return NULL;
1715     }
1716
1717     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1718     if (res <= 0) {
1719         return NULL;
1720     }
1721
1722     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1723     block = qemu_ram_block_from_host(page_address, false, offset);
1724     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1725     return block;
1726 }
1727
1728 /**
1729  * ram_save_release_protection: release UFFD write protection after
1730  *   a range of pages has been saved
1731  *
1732  * @rs: current RAM state
1733  * @pss: page-search-status structure
1734  * @start_page: index of the first page in the range relative to pss->block
1735  *
1736  * Returns 0 on success, negative value in case of an error
1737 */
1738 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1739         unsigned long start_page)
1740 {
1741     int res = 0;
1742
1743     /* Check if page is from UFFD-managed region. */
1744     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1745         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1746         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1747
1748         /* Flush async buffers before un-protect. */
1749         qemu_fflush(pss->pss_channel);
1750         /* Un-protect memory range. */
1751         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1752                 false, false);
1753     }
1754
1755     return res;
1756 }
1757
1758 /* ram_write_tracking_available: check if kernel supports required UFFD features
1759  *
1760  * Returns true if supports, false otherwise
1761  */
1762 bool ram_write_tracking_available(void)
1763 {
1764     uint64_t uffd_features;
1765     int res;
1766
1767     res = uffd_query_features(&uffd_features);
1768     return (res == 0 &&
1769             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1770 }
1771
1772 /* ram_write_tracking_compatible: check if guest configuration is
1773  *   compatible with 'write-tracking'
1774  *
1775  * Returns true if compatible, false otherwise
1776  */
1777 bool ram_write_tracking_compatible(void)
1778 {
1779     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1780     int uffd_fd;
1781     RAMBlock *block;
1782     bool ret = false;
1783
1784     /* Open UFFD file descriptor */
1785     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1786     if (uffd_fd < 0) {
1787         return false;
1788     }
1789
1790     RCU_READ_LOCK_GUARD();
1791
1792     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1793         uint64_t uffd_ioctls;
1794
1795         /* Nothing to do with read-only and MMIO-writable regions */
1796         if (block->mr->readonly || block->mr->rom_device) {
1797             continue;
1798         }
1799         /* Try to register block memory via UFFD-IO to track writes */
1800         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1801                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1802             goto out;
1803         }
1804         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1805             goto out;
1806         }
1807     }
1808     ret = true;
1809
1810 out:
1811     uffd_close_fd(uffd_fd);
1812     return ret;
1813 }
1814
1815 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1816                                        ram_addr_t size)
1817 {
1818     const ram_addr_t end = offset + size;
1819
1820     /*
1821      * We read one byte of each page; this will preallocate page tables if
1822      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1823      * where no page was populated yet. This might require adaption when
1824      * supporting other mappings, like shmem.
1825      */
1826     for (; offset < end; offset += block->page_size) {
1827         char tmp = *((char *)block->host + offset);
1828
1829         /* Don't optimize the read out */
1830         asm volatile("" : "+r" (tmp));
1831     }
1832 }
1833
1834 static inline int populate_read_section(MemoryRegionSection *section,
1835                                         void *opaque)
1836 {
1837     const hwaddr size = int128_get64(section->size);
1838     hwaddr offset = section->offset_within_region;
1839     RAMBlock *block = section->mr->ram_block;
1840
1841     populate_read_range(block, offset, size);
1842     return 0;
1843 }
1844
1845 /*
1846  * ram_block_populate_read: preallocate page tables and populate pages in the
1847  *   RAM block by reading a byte of each page.
1848  *
1849  * Since it's solely used for userfault_fd WP feature, here we just
1850  *   hardcode page size to qemu_real_host_page_size.
1851  *
1852  * @block: RAM block to populate
1853  */
1854 static void ram_block_populate_read(RAMBlock *rb)
1855 {
1856     /*
1857      * Skip populating all pages that fall into a discarded range as managed by
1858      * a RamDiscardManager responsible for the mapped memory region of the
1859      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1860      * must not get populated automatically. We don't have to track
1861      * modifications via userfaultfd WP reliably, because these pages will
1862      * not be part of the migration stream either way -- see
1863      * ramblock_dirty_bitmap_exclude_discarded_pages().
1864      *
1865      * Note: The result is only stable while migrating (precopy/postcopy).
1866      */
1867     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1868         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1869         MemoryRegionSection section = {
1870             .mr = rb->mr,
1871             .offset_within_region = 0,
1872             .size = rb->mr->size,
1873         };
1874
1875         ram_discard_manager_replay_populated(rdm, &section,
1876                                              populate_read_section, NULL);
1877     } else {
1878         populate_read_range(rb, 0, rb->used_length);
1879     }
1880 }
1881
1882 /*
1883  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1884  */
1885 void ram_write_tracking_prepare(void)
1886 {
1887     RAMBlock *block;
1888
1889     RCU_READ_LOCK_GUARD();
1890
1891     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1892         /* Nothing to do with read-only and MMIO-writable regions */
1893         if (block->mr->readonly || block->mr->rom_device) {
1894             continue;
1895         }
1896
1897         /*
1898          * Populate pages of the RAM block before enabling userfault_fd
1899          * write protection.
1900          *
1901          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1902          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1903          * pages with pte_none() entries in page table.
1904          */
1905         ram_block_populate_read(block);
1906     }
1907 }
1908
1909 static inline int uffd_protect_section(MemoryRegionSection *section,
1910                                        void *opaque)
1911 {
1912     const hwaddr size = int128_get64(section->size);
1913     const hwaddr offset = section->offset_within_region;
1914     RAMBlock *rb = section->mr->ram_block;
1915     int uffd_fd = (uintptr_t)opaque;
1916
1917     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1918                                   false);
1919 }
1920
1921 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1922 {
1923     assert(rb->flags & RAM_UF_WRITEPROTECT);
1924
1925     /* See ram_block_populate_read() */
1926     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1927         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1928         MemoryRegionSection section = {
1929             .mr = rb->mr,
1930             .offset_within_region = 0,
1931             .size = rb->mr->size,
1932         };
1933
1934         return ram_discard_manager_replay_populated(rdm, &section,
1935                                                     uffd_protect_section,
1936                                                     (void *)(uintptr_t)uffd_fd);
1937     }
1938     return uffd_change_protection(uffd_fd, rb->host,
1939                                   rb->used_length, true, false);
1940 }
1941
1942 /*
1943  * ram_write_tracking_start: start UFFD-WP memory tracking
1944  *
1945  * Returns 0 for success or negative value in case of error
1946  */
1947 int ram_write_tracking_start(void)
1948 {
1949     int uffd_fd;
1950     RAMState *rs = ram_state;
1951     RAMBlock *block;
1952
1953     /* Open UFFD file descriptor */
1954     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1955     if (uffd_fd < 0) {
1956         return uffd_fd;
1957     }
1958     rs->uffdio_fd = uffd_fd;
1959
1960     RCU_READ_LOCK_GUARD();
1961
1962     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1963         /* Nothing to do with read-only and MMIO-writable regions */
1964         if (block->mr->readonly || block->mr->rom_device) {
1965             continue;
1966         }
1967
1968         /* Register block memory with UFFD to track writes */
1969         if (uffd_register_memory(rs->uffdio_fd, block->host,
1970                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1971             goto fail;
1972         }
1973         block->flags |= RAM_UF_WRITEPROTECT;
1974         memory_region_ref(block->mr);
1975
1976         /* Apply UFFD write protection to the block memory range */
1977         if (ram_block_uffd_protect(block, uffd_fd)) {
1978             goto fail;
1979         }
1980
1981         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1982                 block->host, block->max_length);
1983     }
1984
1985     return 0;
1986
1987 fail:
1988     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1989
1990     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1991         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1992             continue;
1993         }
1994         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1995         /* Cleanup flags and remove reference */
1996         block->flags &= ~RAM_UF_WRITEPROTECT;
1997         memory_region_unref(block->mr);
1998     }
1999
2000     uffd_close_fd(uffd_fd);
2001     rs->uffdio_fd = -1;
2002     return -1;
2003 }
2004
2005 /**
2006  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2007  */
2008 void ram_write_tracking_stop(void)
2009 {
2010     RAMState *rs = ram_state;
2011     RAMBlock *block;
2012
2013     RCU_READ_LOCK_GUARD();
2014
2015     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2016         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2017             continue;
2018         }
2019         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2020
2021         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2022                 block->host, block->max_length);
2023
2024         /* Cleanup flags and remove reference */
2025         block->flags &= ~RAM_UF_WRITEPROTECT;
2026         memory_region_unref(block->mr);
2027     }
2028
2029     /* Finally close UFFD file descriptor */
2030     uffd_close_fd(rs->uffdio_fd);
2031     rs->uffdio_fd = -1;
2032 }
2033
2034 #else
2035 /* No target OS support, stubs just fail or ignore */
2036
2037 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2038 {
2039     (void) rs;
2040     (void) offset;
2041
2042     return NULL;
2043 }
2044
2045 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2046         unsigned long start_page)
2047 {
2048     (void) rs;
2049     (void) pss;
2050     (void) start_page;
2051
2052     return 0;
2053 }
2054
/* Stub for targets without OS support: write tracking is never available. */
bool ram_write_tracking_available(void)
{
    return false;
}
2059
/*
 * Stub for targets without OS support: asserts if it is ever called;
 * returns false if assertions are compiled out.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
2065
/*
 * Stub for targets without OS support: asserts if it is ever called;
 * returns -1 if assertions are compiled out.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
2071
/* Stub for targets without OS support: asserts if it is ever called. */
void ram_write_tracking_stop(void)
{
    assert(0);
}
2076 #endif /* defined(__linux__) */
2077
2078 /**
2079  * get_queued_page: unqueue a page from the postcopy requests
2080  *
2081  * Skips pages that are already sent (!dirty)
2082  *
2083  * Returns true if a queued page is found
2084  *
2085  * @rs: current RAM state
2086  * @pss: data about the state of the current dirty page scan
2087  */
2088 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2089 {
2090     RAMBlock  *block;
2091     ram_addr_t offset;
2092     bool dirty;
2093
2094     do {
2095         block = unqueue_page(rs, &offset);
2096         /*
2097          * We're sending this page, and since it's postcopy nothing else
2098          * will dirty it, and we must make sure it doesn't get sent again
2099          * even if this queue request was received after the background
2100          * search already sent it.
2101          */
2102         if (block) {
2103             unsigned long page;
2104
2105             page = offset >> TARGET_PAGE_BITS;
2106             dirty = test_bit(page, block->bmap);
2107             if (!dirty) {
2108                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2109                                                 page);
2110             } else {
2111                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2112             }
2113         }
2114
2115     } while (block && !dirty);
2116
2117     if (!block) {
2118         /*
2119          * Poll write faults too if background snapshot is enabled; that's
2120          * when we have vcpus got blocked by the write protected pages.
2121          */
2122         block = poll_fault_page(rs, &offset);
2123     }
2124
2125     if (block) {
2126         /*
2127          * We want the background search to continue from the queued page
2128          * since the guest is likely to want other pages near to the page
2129          * it just requested.
2130          */
2131         pss->block = block;
2132         pss->page = offset >> TARGET_PAGE_BITS;
2133
2134         /*
2135          * This unqueued page would break the "one round" check, even is
2136          * really rare.
2137          */
2138         pss->complete_round = false;
2139     }
2140
2141     return !!block;
2142 }
2143
2144 /**
2145  * migration_page_queue_free: drop any remaining pages in the ram
2146  * request queue
2147  *
2148  * It should be empty at the end anyway, but in error cases there may
2149  * be some left.  in case that there is any page left, we drop it.
2150  *
2151  */
2152 static void migration_page_queue_free(RAMState *rs)
2153 {
2154     struct RAMSrcPageRequest *mspr, *next_mspr;
2155     /* This queue generally should be empty - but in the case of a failed
2156      * migration might have some droppings in.
2157      */
2158     RCU_READ_LOCK_GUARD();
2159     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2160         memory_region_unref(mspr->rb->mr);
2161         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2162         g_free(mspr);
2163     }
2164 }
2165
2166 /**
2167  * ram_save_queue_pages: queue the page for transmission
2168  *
2169  * A request from postcopy destination for example.
2170  *
2171  * Returns zero on success or negative on error
2172  *
2173  * @rbname: Name of the RAMBLock of the request. NULL means the
2174  *          same that last one.
2175  * @start: starting address from the start of the RAMBlock
2176  * @len: length (in bytes) to send
2177  */
2178 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2179 {
2180     RAMBlock *ramblock;
2181     RAMState *rs = ram_state;
2182
2183     ram_counters.postcopy_requests++;
2184     RCU_READ_LOCK_GUARD();
2185
2186     if (!rbname) {
2187         /* Reuse last RAMBlock */
2188         ramblock = rs->last_req_rb;
2189
2190         if (!ramblock) {
2191             /*
2192              * Shouldn't happen, we can't reuse the last RAMBlock if
2193              * it's the 1st request.
2194              */
2195             error_report("ram_save_queue_pages no previous block");
2196             return -1;
2197         }
2198     } else {
2199         ramblock = qemu_ram_block_by_name(rbname);
2200
2201         if (!ramblock) {
2202             /* We shouldn't be asked for a non-existent RAMBlock */
2203             error_report("ram_save_queue_pages no block '%s'", rbname);
2204             return -1;
2205         }
2206         rs->last_req_rb = ramblock;
2207     }
2208     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2209     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2210         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2211                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2212                      __func__, start, len, ramblock->used_length);
2213         return -1;
2214     }
2215
2216     /*
2217      * When with postcopy preempt, we send back the page directly in the
2218      * rp-return thread.
2219      */
2220     if (postcopy_preempt_active()) {
2221         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2222         size_t page_size = qemu_ram_pagesize(ramblock);
2223         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2224         int ret = 0;
2225
2226         qemu_mutex_lock(&rs->bitmap_mutex);
2227
2228         pss_init(pss, ramblock, page_start);
2229         /*
2230          * Always use the preempt channel, and make sure it's there.  It's
2231          * safe to access without lock, because when rp-thread is running
2232          * we should be the only one who operates on the qemufile
2233          */
2234         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2235         assert(pss->pss_channel);
2236
2237         /*
2238          * It must be either one or multiple of host page size.  Just
2239          * assert; if something wrong we're mostly split brain anyway.
2240          */
2241         assert(len % page_size == 0);
2242         while (len) {
2243             if (ram_save_host_page_urgent(pss)) {
2244                 error_report("%s: ram_save_host_page_urgent() failed: "
2245                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2246                              __func__, ramblock->idstr, start);
2247                 ret = -1;
2248                 break;
2249             }
2250             /*
2251              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2252              * will automatically be moved and point to the next host page
2253              * we're going to send, so no need to update here.
2254              *
2255              * Normally QEMU never sends >1 host page in requests, so
2256              * logically we don't even need that as the loop should only
2257              * run once, but just to be consistent.
2258              */
2259             len -= page_size;
2260         };
2261         qemu_mutex_unlock(&rs->bitmap_mutex);
2262
2263         return ret;
2264     }
2265
2266     struct RAMSrcPageRequest *new_entry =
2267         g_new0(struct RAMSrcPageRequest, 1);
2268     new_entry->rb = ramblock;
2269     new_entry->offset = start;
2270     new_entry->len = len;
2271
2272     memory_region_ref(ramblock->mr);
2273     qemu_mutex_lock(&rs->src_page_req_mutex);
2274     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2275     migration_make_urgent_request();
2276     qemu_mutex_unlock(&rs->src_page_req_mutex);
2277
2278     return 0;
2279 }
2280
2281 static bool save_page_use_compression(RAMState *rs)
2282 {
2283     if (!migrate_use_compression()) {
2284         return false;
2285     }
2286
2287     /*
2288      * If xbzrle is enabled (e.g., after first round of migration), stop
2289      * using the data compression. In theory, xbzrle can do better than
2290      * compression.
2291      */
2292     if (rs->xbzrle_enabled) {
2293         return false;
2294     }
2295
2296     return true;
2297 }
2298
2299 /*
2300  * try to compress the page before posting it out, return true if the page
2301  * has been properly handled by compression, otherwise needs other
2302  * paths to handle it
2303  */
2304 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2305                                RAMBlock *block, ram_addr_t offset)
2306 {
2307     if (!save_page_use_compression(rs)) {
2308         return false;
2309     }
2310
2311     /*
2312      * When starting the process of a new block, the first page of
2313      * the block should be sent out before other pages in the same
2314      * block, and all the pages in last block should have been sent
2315      * out, keeping this order is important, because the 'cont' flag
2316      * is used to avoid resending the block name.
2317      *
2318      * We post the fist page as normal page as compression will take
2319      * much CPU resource.
2320      */
2321     if (block != pss->last_sent_block) {
2322         flush_compressed_data(rs);
2323         return false;
2324     }
2325
2326     if (compress_page_with_multi_thread(block, offset) > 0) {
2327         return true;
2328     }
2329
2330     compression_counters.busy++;
2331     return false;
2332 }
2333
2334 /**
2335  * ram_save_target_page_legacy: save one target page
2336  *
2337  * Returns the number of pages written
2338  *
2339  * @rs: current RAM state
2340  * @pss: data about the page we want to send
2341  */
2342 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2343 {
2344     RAMBlock *block = pss->block;
2345     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2346     int res;
2347
2348     if (control_save_page(pss, block, offset, &res)) {
2349         return res;
2350     }
2351
2352     if (save_compress_page(rs, pss, block, offset)) {
2353         return 1;
2354     }
2355
2356     res = save_zero_page(pss, pss->pss_channel, block, offset);
2357     if (res > 0) {
2358         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2359          * page would be stale
2360          */
2361         if (rs->xbzrle_enabled) {
2362             XBZRLE_cache_lock();
2363             xbzrle_cache_zero_page(rs, block->offset + offset);
2364             XBZRLE_cache_unlock();
2365         }
2366         return res;
2367     }
2368
2369     /*
2370      * Do not use multifd in postcopy as one whole host page should be
2371      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2372      * if host page size == guest page size the dest guest during run may
2373      * still see partially copied pages which is data corruption.
2374      */
2375     if (migrate_use_multifd() && !migration_in_postcopy()) {
2376         return ram_save_multifd_page(pss->pss_channel, block, offset);
2377     }
2378
2379     return ram_save_page(rs, pss);
2380 }
2381
2382 /* Should be called before sending a host page */
2383 static void pss_host_page_prepare(PageSearchStatus *pss)
2384 {
2385     /* How many guest pages are there in one host page? */
2386     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2387
2388     pss->host_page_sending = true;
2389     if (guest_pfns <= 1) {
2390         /*
2391          * This covers both when guest psize == host psize, or when guest
2392          * has larger psize than the host (guest_pfns==0).
2393          *
2394          * For the latter, we always send one whole guest page per
2395          * iteration of the host page (example: an Alpha VM on x86 host
2396          * will have guest psize 8K while host psize 4K).
2397          */
2398         pss->host_page_start = pss->page;
2399         pss->host_page_end = pss->page + 1;
2400     } else {
2401         /*
2402          * The host page spans over multiple guest pages, we send them
2403          * within the same host page iteration.
2404          */
2405         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2406         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2407     }
2408 }
2409
2410 /*
2411  * Whether the page pointed by PSS is within the host page being sent.
2412  * Must be called after a previous pss_host_page_prepare().
2413  */
2414 static bool pss_within_range(PageSearchStatus *pss)
2415 {
2416     ram_addr_t ram_addr;
2417
2418     assert(pss->host_page_sending);
2419
2420     /* Over host-page boundary? */
2421     if (pss->page >= pss->host_page_end) {
2422         return false;
2423     }
2424
2425     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2426
2427     return offset_in_ramblock(pss->block, ram_addr);
2428 }
2429
2430 static void pss_host_page_finish(PageSearchStatus *pss)
2431 {
2432     pss->host_page_sending = false;
2433     /* This is not needed, but just to reset it */
2434     pss->host_page_start = pss->host_page_end = 0;
2435 }
2436
2437 /*
2438  * Send an urgent host page specified by `pss'.  Need to be called with
2439  * bitmap_mutex held.
2440  *
2441  * Returns 0 if save host page succeeded, false otherwise.
2442  */
2443 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2444 {
2445     bool page_dirty, sent = false;
2446     RAMState *rs = ram_state;
2447     int ret = 0;
2448
2449     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2450     pss_host_page_prepare(pss);
2451
2452     /*
2453      * If precopy is sending the same page, let it be done in precopy, or
2454      * we could send the same page in two channels and none of them will
2455      * receive the whole page.
2456      */
2457     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2458         trace_postcopy_preempt_hit(pss->block->idstr,
2459                                    pss->page << TARGET_PAGE_BITS);
2460         return 0;
2461     }
2462
2463     do {
2464         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2465
2466         if (page_dirty) {
2467             /* Be strict to return code; it must be 1, or what else? */
2468             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2469                 error_report_once("%s: ram_save_target_page failed", __func__);
2470                 ret = -1;
2471                 goto out;
2472             }
2473             sent = true;
2474         }
2475         pss_find_next_dirty(pss);
2476     } while (pss_within_range(pss));
2477 out:
2478     pss_host_page_finish(pss);
2479     /* For urgent requests, flush immediately if sent */
2480     if (sent) {
2481         qemu_fflush(pss->pss_channel);
2482     }
2483     return ret;
2484 }
2485
2486 /**
2487  * ram_save_host_page: save a whole host page
2488  *
2489  * Starting at *offset send pages up to the end of the current host
2490  * page. It's valid for the initial offset to point into the middle of
2491  * a host page in which case the remainder of the hostpage is sent.
2492  * Only dirty target pages are sent. Note that the host page size may
2493  * be a huge page for this block.
2494  *
2495  * The saving stops at the boundary of the used_length of the block
2496  * if the RAMBlock isn't a multiple of the host page size.
2497  *
2498  * The caller must be with ram_state.bitmap_mutex held to call this
2499  * function.  Note that this function can temporarily release the lock, but
2500  * when the function is returned it'll make sure the lock is still held.
2501  *
2502  * Returns the number of pages written or negative on error
2503  *
2504  * @rs: current RAM state
2505  * @pss: data about the page we want to send
2506  */
2507 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2508 {
2509     bool page_dirty, preempt_active = postcopy_preempt_active();
2510     int tmppages, pages = 0;
2511     size_t pagesize_bits =
2512         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2513     unsigned long start_page = pss->page;
2514     int res;
2515
2516     if (ramblock_is_ignored(pss->block)) {
2517         error_report("block %s should not be migrated !", pss->block->idstr);
2518         return 0;
2519     }
2520
2521     /* Update host page boundary information */
2522     pss_host_page_prepare(pss);
2523
2524     do {
2525         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2526
2527         /* Check the pages is dirty and if it is send it */
2528         if (page_dirty) {
2529             /*
2530              * Properly yield the lock only in postcopy preempt mode
2531              * because both migration thread and rp-return thread can
2532              * operate on the bitmaps.
2533              */
2534             if (preempt_active) {
2535                 qemu_mutex_unlock(&rs->bitmap_mutex);
2536             }
2537             tmppages = migration_ops->ram_save_target_page(rs, pss);
2538             if (tmppages >= 0) {
2539                 pages += tmppages;
2540                 /*
2541                  * Allow rate limiting to happen in the middle of huge pages if
2542                  * something is sent in the current iteration.
2543                  */
2544                 if (pagesize_bits > 1 && tmppages > 0) {
2545                     migration_rate_limit();
2546                 }
2547             }
2548             if (preempt_active) {
2549                 qemu_mutex_lock(&rs->bitmap_mutex);
2550             }
2551         } else {
2552             tmppages = 0;
2553         }
2554
2555         if (tmppages < 0) {
2556             pss_host_page_finish(pss);
2557             return tmppages;
2558         }
2559
2560         pss_find_next_dirty(pss);
2561     } while (pss_within_range(pss));
2562
2563     pss_host_page_finish(pss);
2564
2565     res = ram_save_release_protection(rs, pss, start_page);
2566     return (res < 0 ? res : pages);
2567 }
2568
2569 /**
2570  * ram_find_and_save_block: finds a dirty page and sends it to f
2571  *
2572  * Called within an RCU critical section.
2573  *
2574  * Returns the number of pages written where zero means no dirty pages,
2575  * or negative on error
2576  *
2577  * @rs: current RAM state
2578  *
2579  * On systems where host-page-size > target-page-size it will send all the
2580  * pages in a host page that are dirty.
2581  */
2582 static int ram_find_and_save_block(RAMState *rs)
2583 {
2584     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2585     int pages = 0;
2586
2587     /* No dirty page as there is zero RAM */
2588     if (!rs->ram_bytes_total) {
2589         return pages;
2590     }
2591
2592     /*
2593      * Always keep last_seen_block/last_page valid during this procedure,
2594      * because find_dirty_block() relies on these values (e.g., we compare
2595      * last_seen_block with pss.block to see whether we searched all the
2596      * ramblocks) to detect the completion of migration.  Having NULL value
2597      * of last_seen_block can conditionally cause below loop to run forever.
2598      */
2599     if (!rs->last_seen_block) {
2600         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2601         rs->last_page = 0;
2602     }
2603
2604     pss_init(pss, rs->last_seen_block, rs->last_page);
2605
2606     while (true){
2607         if (!get_queued_page(rs, pss)) {
2608             /* priority queue empty, so just search for something dirty */
2609             int res = find_dirty_block(rs, pss);
2610             if (res != PAGE_DIRTY_FOUND) {
2611                 if (res == PAGE_ALL_CLEAN) {
2612                     break;
2613                 } else if (res == PAGE_TRY_AGAIN) {
2614                     continue;
2615                 }
2616             }
2617         }
2618         pages = ram_save_host_page(rs, pss);
2619         if (pages) {
2620             break;
2621         }
2622     }
2623
2624     rs->last_seen_block = pss->block;
2625     rs->last_page = pss->page;
2626
2627     return pages;
2628 }
2629
2630 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2631 {
2632     uint64_t pages = size / TARGET_PAGE_SIZE;
2633
2634     if (zero) {
2635         stat64_add(&ram_atomic_counters.duplicate, pages);
2636     } else {
2637         stat64_add(&ram_atomic_counters.normal, pages);
2638         ram_transferred_add(size);
2639         qemu_file_credit_transfer(f, size);
2640     }
2641 }
2642
2643 static uint64_t ram_bytes_total_with_ignored(void)
2644 {
2645     RAMBlock *block;
2646     uint64_t total = 0;
2647
2648     RCU_READ_LOCK_GUARD();
2649
2650     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2651         total += block->used_length;
2652     }
2653     return total;
2654 }
2655
2656 uint64_t ram_bytes_total(void)
2657 {
2658     RAMBlock *block;
2659     uint64_t total = 0;
2660
2661     RCU_READ_LOCK_GUARD();
2662
2663     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2664         total += block->used_length;
2665     }
2666     return total;
2667 }
2668
2669 static void xbzrle_load_setup(void)
2670 {
2671     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2672 }
2673
2674 static void xbzrle_load_cleanup(void)
2675 {
2676     g_free(XBZRLE.decoded_buf);
2677     XBZRLE.decoded_buf = NULL;
2678 }
2679
2680 static void ram_state_cleanup(RAMState **rsp)
2681 {
2682     if (*rsp) {
2683         migration_page_queue_free(*rsp);
2684         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2685         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2686         g_free(*rsp);
2687         *rsp = NULL;
2688     }
2689 }
2690
2691 static void xbzrle_cleanup(void)
2692 {
2693     XBZRLE_cache_lock();
2694     if (XBZRLE.cache) {
2695         cache_fini(XBZRLE.cache);
2696         g_free(XBZRLE.encoded_buf);
2697         g_free(XBZRLE.current_buf);
2698         g_free(XBZRLE.zero_target_page);
2699         XBZRLE.cache = NULL;
2700         XBZRLE.encoded_buf = NULL;
2701         XBZRLE.current_buf = NULL;
2702         XBZRLE.zero_target_page = NULL;
2703     }
2704     XBZRLE_cache_unlock();
2705 }
2706
2707 static void ram_save_cleanup(void *opaque)
2708 {
2709     RAMState **rsp = opaque;
2710     RAMBlock *block;
2711
2712     /* We don't use dirty log with background snapshots */
2713     if (!migrate_background_snapshot()) {
2714         /* caller have hold iothread lock or is in a bh, so there is
2715          * no writing race against the migration bitmap
2716          */
2717         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2718             /*
2719              * do not stop dirty log without starting it, since
2720              * memory_global_dirty_log_stop will assert that
2721              * memory_global_dirty_log_start/stop used in pairs
2722              */
2723             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2724         }
2725     }
2726
2727     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2728         g_free(block->clear_bmap);
2729         block->clear_bmap = NULL;
2730         g_free(block->bmap);
2731         block->bmap = NULL;
2732     }
2733
2734     xbzrle_cleanup();
2735     compress_threads_save_cleanup();
2736     ram_state_cleanup(rsp);
2737     g_free(migration_ops);
2738     migration_ops = NULL;
2739 }
2740
2741 static void ram_state_reset(RAMState *rs)
2742 {
2743     int i;
2744
2745     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2746         rs->pss[i].last_sent_block = NULL;
2747     }
2748
2749     rs->last_seen_block = NULL;
2750     rs->last_page = 0;
2751     rs->last_version = ram_list.version;
2752     rs->xbzrle_enabled = false;
2753 }
2754
/*
 * Time budget, in milliseconds, for the send loop of ram_save_iterate();
 * the elapsed time is compared against it once every 64 iterations.
 * Half buffered_file limit.
 */
#define MAX_WAIT 50
2756
2757 /* **** functions for postcopy ***** */
2758
2759 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2760 {
2761     struct RAMBlock *block;
2762
2763     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2764         unsigned long *bitmap = block->bmap;
2765         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2766         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2767
2768         while (run_start < range) {
2769             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2770             ram_discard_range(block->idstr,
2771                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2772                               ((ram_addr_t)(run_end - run_start))
2773                                 << TARGET_PAGE_BITS);
2774             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2775         }
2776     }
2777 }
2778
2779 /**
2780  * postcopy_send_discard_bm_ram: discard a RAMBlock
2781  *
2782  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2783  *
2784  * @ms: current migration state
2785  * @block: RAMBlock to discard
2786  */
2787 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2788 {
2789     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2790     unsigned long current;
2791     unsigned long *bitmap = block->bmap;
2792
2793     for (current = 0; current < end; ) {
2794         unsigned long one = find_next_bit(bitmap, end, current);
2795         unsigned long zero, discard_length;
2796
2797         if (one >= end) {
2798             break;
2799         }
2800
2801         zero = find_next_zero_bit(bitmap, end, one + 1);
2802
2803         if (zero >= end) {
2804             discard_length = end - one;
2805         } else {
2806             discard_length = zero - one;
2807         }
2808         postcopy_discard_send_range(ms, one, discard_length);
2809         current = one + discard_length;
2810     }
2811 }
2812
2813 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2814
2815 /**
2816  * postcopy_each_ram_send_discard: discard all RAMBlocks
2817  *
2818  * Utility for the outgoing postcopy code.
2819  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2820  *   passing it bitmap indexes and name.
2821  * (qemu_ram_foreach_block ends up passing unscaled lengths
2822  *  which would mean postcopy code would have to deal with target page)
2823  *
2824  * @ms: current migration state
2825  */
2826 static void postcopy_each_ram_send_discard(MigrationState *ms)
2827 {
2828     struct RAMBlock *block;
2829
2830     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2831         postcopy_discard_send_init(ms, block->idstr);
2832
2833         /*
2834          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2835          * host-page size chunks, mark any partially dirty host-page size
2836          * chunks as all dirty.  In this case the host-page is the host-page
2837          * for the particular RAMBlock, i.e. it might be a huge page.
2838          */
2839         postcopy_chunk_hostpages_pass(ms, block);
2840
2841         /*
2842          * Postcopy sends chunks of bitmap over the wire, but it
2843          * just needs indexes at this point, avoids it having
2844          * target page specific code.
2845          */
2846         postcopy_send_discard_bm_ram(ms, block);
2847         postcopy_discard_send_finish(ms);
2848     }
2849 }
2850
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Called from postcopy_each_ram_send_discard() for each RAMBlock.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the block's dirty bitmap:
 * whenever a run of dirty target pages starts or ends in the middle of a
 * host page, every target page of that host page is marked dirty and
 * migration_dirty_pages is increased by the number of newly set bits.
 *
 * Nothing is done when the block's page size equals TARGET_PAGE_SIZE.
 *
 * @ms: current migration state
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    /* Number of target pages per host page of this block */
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If the run starts on a host page boundary its start is fine, so
         * move on to the end of the run.  Otherwise run_start is left in
         * the middle of a host page and that page is fixed up below.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
        }

        /*
         * run_start is now either an unaligned start of a run or the end
         * of a run.  If it isn't on a host page boundary, the host page
         * containing it is only partially dirty and needs fixing up.
         */
        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            /* Resume the scan at the start of the following host page */
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Mark the whole host page dirty */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
2917
2918 /**
2919  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2920  *
2921  * Transmit the set of pages to be discarded after precopy to the target
2922  * these are pages that:
2923  *     a) Have been previously transmitted but are now dirty again
2924  *     b) Pages that have never been transmitted, this ensures that
2925  *        any pages on the destination that have been mapped by background
2926  *        tasks get discarded (transparent huge pages is the specific concern)
2927  * Hopefully this is pretty sparse
2928  *
2929  * @ms: current migration state
2930  */
2931 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2932 {
2933     RAMState *rs = ram_state;
2934
2935     RCU_READ_LOCK_GUARD();
2936
2937     /* This should be our last sync, the src is now paused */
2938     migration_bitmap_sync(rs);
2939
2940     /* Easiest way to make sure we don't resume in the middle of a host-page */
2941     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2942     rs->last_seen_block = NULL;
2943     rs->last_page = 0;
2944
2945     postcopy_each_ram_send_discard(ms);
2946
2947     trace_ram_postcopy_send_discard_bitmap();
2948 }
2949
2950 /**
2951  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2952  *
2953  * Returns zero on success
2954  *
2955  * @rbname: name of the RAMBlock of the request. NULL means the
2956  *          same that last one.
2957  * @start: RAMBlock starting page
2958  * @length: RAMBlock size
2959  */
2960 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2961 {
2962     trace_ram_discard_range(rbname, start, length);
2963
2964     RCU_READ_LOCK_GUARD();
2965     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2966
2967     if (!rb) {
2968         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2969         return -1;
2970     }
2971
2972     /*
2973      * On source VM, we don't need to update the received bitmap since
2974      * we don't even have one.
2975      */
2976     if (rb->receivedmap) {
2977         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2978                      length >> qemu_target_page_bits());
2979     }
2980
2981     return ram_block_discard_range(rb, start, length);
2982 }
2983
2984 /*
2985  * For every allocation, we will try not to crash the VM if the
2986  * allocation failed.
2987  */
2988 static int xbzrle_init(void)
2989 {
2990     Error *local_err = NULL;
2991
2992     if (!migrate_use_xbzrle()) {
2993         return 0;
2994     }
2995
2996     XBZRLE_cache_lock();
2997
2998     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2999     if (!XBZRLE.zero_target_page) {
3000         error_report("%s: Error allocating zero page", __func__);
3001         goto err_out;
3002     }
3003
3004     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3005                               TARGET_PAGE_SIZE, &local_err);
3006     if (!XBZRLE.cache) {
3007         error_report_err(local_err);
3008         goto free_zero_page;
3009     }
3010
3011     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3012     if (!XBZRLE.encoded_buf) {
3013         error_report("%s: Error allocating encoded_buf", __func__);
3014         goto free_cache;
3015     }
3016
3017     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3018     if (!XBZRLE.current_buf) {
3019         error_report("%s: Error allocating current_buf", __func__);
3020         goto free_encoded_buf;
3021     }
3022
3023     /* We are all good */
3024     XBZRLE_cache_unlock();
3025     return 0;
3026
3027 free_encoded_buf:
3028     g_free(XBZRLE.encoded_buf);
3029     XBZRLE.encoded_buf = NULL;
3030 free_cache:
3031     cache_fini(XBZRLE.cache);
3032     XBZRLE.cache = NULL;
3033 free_zero_page:
3034     g_free(XBZRLE.zero_target_page);
3035     XBZRLE.zero_target_page = NULL;
3036 err_out:
3037     XBZRLE_cache_unlock();
3038     return -ENOMEM;
3039 }
3040
3041 static int ram_state_init(RAMState **rsp)
3042 {
3043     *rsp = g_try_new0(RAMState, 1);
3044
3045     if (!*rsp) {
3046         error_report("%s: Init ramstate fail", __func__);
3047         return -1;
3048     }
3049
3050     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3051     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3052     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3053     (*rsp)->ram_bytes_total = ram_bytes_total();
3054
3055     /*
3056      * Count the total number of pages used by ram blocks not including any
3057      * gaps due to alignment or unplugs.
3058      * This must match with the initial values of dirty bitmap.
3059      */
3060     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3061     ram_state_reset(*rsp);
3062
3063     return 0;
3064 }
3065
3066 static void ram_list_init_bitmaps(void)
3067 {
3068     MigrationState *ms = migrate_get_current();
3069     RAMBlock *block;
3070     unsigned long pages;
3071     uint8_t shift;
3072
3073     /* Skip setting bitmap if there is no RAM */
3074     if (ram_bytes_total()) {
3075         shift = ms->clear_bitmap_shift;
3076         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3077             error_report("clear_bitmap_shift (%u) too big, using "
3078                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3079             shift = CLEAR_BITMAP_SHIFT_MAX;
3080         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3081             error_report("clear_bitmap_shift (%u) too small, using "
3082                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3083             shift = CLEAR_BITMAP_SHIFT_MIN;
3084         }
3085
3086         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3087             pages = block->max_length >> TARGET_PAGE_BITS;
3088             /*
3089              * The initial dirty bitmap for migration must be set with all
3090              * ones to make sure we'll migrate every guest RAM page to
3091              * destination.
3092              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3093              * new migration after a failed migration, ram_list.
3094              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3095              * guest memory.
3096              */
3097             block->bmap = bitmap_new(pages);
3098             bitmap_set(block->bmap, 0, pages);
3099             block->clear_bmap_shift = shift;
3100             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3101         }
3102     }
3103 }
3104
3105 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3106 {
3107     unsigned long pages;
3108     RAMBlock *rb;
3109
3110     RCU_READ_LOCK_GUARD();
3111
3112     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3113             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3114             rs->migration_dirty_pages -= pages;
3115     }
3116 }
3117
3118 static void ram_init_bitmaps(RAMState *rs)
3119 {
3120     /* For memory_global_dirty_log_start below.  */
3121     qemu_mutex_lock_iothread();
3122     qemu_mutex_lock_ramlist();
3123
3124     WITH_RCU_READ_LOCK_GUARD() {
3125         ram_list_init_bitmaps();
3126         /* We don't use dirty log with background snapshots */
3127         if (!migrate_background_snapshot()) {
3128             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3129             migration_bitmap_sync_precopy(rs);
3130         }
3131     }
3132     qemu_mutex_unlock_ramlist();
3133     qemu_mutex_unlock_iothread();
3134
3135     /*
3136      * After an eventual first bitmap sync, fixup the initial bitmap
3137      * containing all 1s to exclude any discarded pages from migration.
3138      */
3139     migration_bitmap_clear_discarded_pages(rs);
3140 }
3141
3142 static int ram_init_all(RAMState **rsp)
3143 {
3144     if (ram_state_init(rsp)) {
3145         return -1;
3146     }
3147
3148     if (xbzrle_init()) {
3149         ram_state_cleanup(rsp);
3150         return -1;
3151     }
3152
3153     ram_init_bitmaps(*rsp);
3154
3155     return 0;
3156 }
3157
3158 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3159 {
3160     RAMBlock *block;
3161     uint64_t pages = 0;
3162
3163     /*
3164      * Postcopy is not using xbzrle/compression, so no need for that.
3165      * Also, since source are already halted, we don't need to care
3166      * about dirty page logging as well.
3167      */
3168
3169     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3170         pages += bitmap_count_one(block->bmap,
3171                                   block->used_length >> TARGET_PAGE_BITS);
3172     }
3173
3174     /* This may not be aligned with current bitmaps. Recalculate. */
3175     rs->migration_dirty_pages = pages;
3176
3177     ram_state_reset(rs);
3178
3179     /* Update RAMState cache of output QEMUFile */
3180     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3181
3182     trace_ram_state_resume_prepare(pages);
3183 }
3184
3185 /*
3186  * This function clears bits of the free pages reported by the caller from the
3187  * migration dirty bitmap. @addr is the host address corresponding to the
3188  * start of the continuous guest free pages, and @len is the total bytes of
3189  * those pages.
3190  */
3191 void qemu_guest_free_page_hint(void *addr, size_t len)
3192 {
3193     RAMBlock *block;
3194     ram_addr_t offset;
3195     size_t used_len, start, npages;
3196     MigrationState *s = migrate_get_current();
3197
3198     /* This function is currently expected to be used during live migration */
3199     if (!migration_is_setup_or_active(s->state)) {
3200         return;
3201     }
3202
3203     for (; len > 0; len -= used_len, addr += used_len) {
3204         block = qemu_ram_block_from_host(addr, false, &offset);
3205         if (unlikely(!block || offset >= block->used_length)) {
3206             /*
3207              * The implementation might not support RAMBlock resize during
3208              * live migration, but it could happen in theory with future
3209              * updates. So we add a check here to capture that case.
3210              */
3211             error_report_once("%s unexpected error", __func__);
3212             return;
3213         }
3214
3215         if (len <= block->used_length - offset) {
3216             used_len = len;
3217         } else {
3218             used_len = block->used_length - offset;
3219         }
3220
3221         start = offset >> TARGET_PAGE_BITS;
3222         npages = used_len >> TARGET_PAGE_BITS;
3223
3224         qemu_mutex_lock(&ram_state->bitmap_mutex);
3225         /*
3226          * The skipped free pages are equavalent to be sent from clear_bmap's
3227          * perspective, so clear the bits from the memory region bitmap which
3228          * are initially set. Otherwise those skipped pages will be sent in
3229          * the next round after syncing from the memory region bitmap.
3230          */
3231         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3232         ram_state->migration_dirty_pages -=
3233                       bitmap_count_one_with_offset(block->bmap, start, npages);
3234         bitmap_clear(block->bmap, start, npages);
3235         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3236     }
3237 }
3238
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
3245
3246 /**
3247  * ram_save_setup: Setup RAM for migration
3248  *
3249  * Returns zero to indicate success and negative for error
3250  *
3251  * @f: QEMUFile where to send the data
3252  * @opaque: RAMState pointer
3253  */
3254 static int ram_save_setup(QEMUFile *f, void *opaque)
3255 {
3256     RAMState **rsp = opaque;
3257     RAMBlock *block;
3258     int ret;
3259
3260     if (compress_threads_save_setup()) {
3261         return -1;
3262     }
3263
3264     /* migration has already setup the bitmap, reuse it. */
3265     if (!migration_in_colo_state()) {
3266         if (ram_init_all(rsp) != 0) {
3267             compress_threads_save_cleanup();
3268             return -1;
3269         }
3270     }
3271     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3272
3273     WITH_RCU_READ_LOCK_GUARD() {
3274         qemu_put_be64(f, ram_bytes_total_with_ignored()
3275                          | RAM_SAVE_FLAG_MEM_SIZE);
3276
3277         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3278             qemu_put_byte(f, strlen(block->idstr));
3279             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3280             qemu_put_be64(f, block->used_length);
3281             if (migrate_postcopy_ram() && block->page_size !=
3282                                           qemu_host_page_size) {
3283                 qemu_put_be64(f, block->page_size);
3284             }
3285             if (migrate_ignore_shared()) {
3286                 qemu_put_be64(f, block->mr->addr);
3287             }
3288         }
3289     }
3290
3291     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3292     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3293
3294     migration_ops = g_malloc0(sizeof(MigrationOps));
3295     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3296     ret =  multifd_send_sync_main(f);
3297     if (ret < 0) {
3298         return ret;
3299     }
3300
3301     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3302     qemu_fflush(f);
3303
3304     return 0;
3305 }
3306
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the rate limit is hit (and no postcopy request
 * is pending), the file reports an error, no pages are left, or the
 * MAX_WAIT time budget is exceeded.
 *
 * Returns negative on error, 1 if ram_find_and_save_block() reported that
 * there are no more pages to send, and 0 otherwise.
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll hold this lock for a little while, but it's okay for two
     * reasons.  Firstly, the only other thread that may take it is the
     * one calling qemu_guest_free_page_hint(), which should be rare;
     * secondly, see MAX_WAIT (if curious, further see commit
     * 4508bd9ed8053ce) below, which guarantees that we release it on a
     * regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM list changed since the last pass: restart the search */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Pending postcopy requests are served even when rate limited */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check
             * once every 64 iterations.
             */
            if ((i & 63) == 0) {
                /* Elapsed time since t0, converted from ns to ms */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
        if (ret < 0) {
            return ret;
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8 bytes of the EOS marker */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3426
3427 /**
3428  * ram_save_complete: function called to send the remaining amount of ram
3429  *
3430  * Returns zero to indicate success or negative on error
3431  *
3432  * Called with iothread lock
3433  *
3434  * @f: QEMUFile where to send the data
3435  * @opaque: RAMState pointer
3436  */
3437 static int ram_save_complete(QEMUFile *f, void *opaque)
3438 {
3439     RAMState **temp = opaque;
3440     RAMState *rs = *temp;
3441     int ret = 0;
3442
3443     rs->last_stage = !migration_in_colo_state();
3444
3445     WITH_RCU_READ_LOCK_GUARD() {
3446         if (!migration_in_postcopy()) {
3447             migration_bitmap_sync_precopy(rs);
3448         }
3449
3450         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3451
3452         /* try transferring iterative blocks of memory */
3453
3454         /* flush all remaining blocks regardless of rate limiting */
3455         qemu_mutex_lock(&rs->bitmap_mutex);
3456         while (true) {
3457             int pages;
3458
3459             pages = ram_find_and_save_block(rs);
3460             /* no more blocks to sent */
3461             if (pages == 0) {
3462                 break;
3463             }
3464             if (pages < 0) {
3465                 ret = pages;
3466                 break;
3467             }
3468         }
3469         qemu_mutex_unlock(&rs->bitmap_mutex);
3470
3471         flush_compressed_data(rs);
3472         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3473     }
3474
3475     if (ret < 0) {
3476         return ret;
3477     }
3478
3479     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3480     if (ret < 0) {
3481         return ret;
3482     }
3483
3484     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3485     qemu_fflush(f);
3486
3487     return 0;
3488 }
3489
3490 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3491                                        uint64_t *can_postcopy)
3492 {
3493     RAMState **temp = opaque;
3494     RAMState *rs = *temp;
3495
3496     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3497
3498     if (migrate_postcopy_ram()) {
3499         /* We can do postcopy, and all the data is postcopiable */
3500         *can_postcopy += remaining_size;
3501     } else {
3502         *must_precopy += remaining_size;
3503     }
3504 }
3505
3506 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3507                                     uint64_t *can_postcopy)
3508 {
3509     MigrationState *s = migrate_get_current();
3510     RAMState **temp = opaque;
3511     RAMState *rs = *temp;
3512
3513     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3514
3515     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3516         qemu_mutex_lock_iothread();
3517         WITH_RCU_READ_LOCK_GUARD() {
3518             migration_bitmap_sync_precopy(rs);
3519         }
3520         qemu_mutex_unlock_iothread();
3521         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3522     }
3523
3524     if (migrate_postcopy_ram()) {
3525         /* We can do postcopy, and all the data is postcopiable */
3526         *can_postcopy += remaining_size;
3527     } else {
3528         *must_precopy += remaining_size;
3529     }
3530 }
3531
3532 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3533 {
3534     unsigned int xh_len;
3535     int xh_flags;
3536     uint8_t *loaded_data;
3537
3538     /* extract RLE header */
3539     xh_flags = qemu_get_byte(f);
3540     xh_len = qemu_get_be16(f);
3541
3542     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3543         error_report("Failed to load XBZRLE page - wrong compression!");
3544         return -1;
3545     }
3546
3547     if (xh_len > TARGET_PAGE_SIZE) {
3548         error_report("Failed to load XBZRLE page - len overflow!");
3549         return -1;
3550     }
3551     loaded_data = XBZRLE.decoded_buf;
3552     /* load data and decode */
3553     /* it can change loaded_data to point to an internal buffer */
3554     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3555
3556     /* decode RLE */
3557     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3558                              TARGET_PAGE_SIZE) == -1) {
3559         error_report("Failed to load XBZRLE page - decode error!");
3560         return -1;
3561     }
3562
3563     return 0;
3564 }
3565
3566 /**
3567  * ram_block_from_stream: read a RAMBlock id from the migration stream
3568  *
3569  * Must be called from within a rcu critical section.
3570  *
3571  * Returns a pointer from within the RCU-protected ram_list.
3572  *
3573  * @mis: the migration incoming state pointer
3574  * @f: QEMUFile where to read the data from
3575  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3576  * @channel: the channel we're using
3577  */
3578 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3579                                               QEMUFile *f, int flags,
3580                                               int channel)
3581 {
3582     RAMBlock *block = mis->last_recv_block[channel];
3583     char id[256];
3584     uint8_t len;
3585
3586     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3587         if (!block) {
3588             error_report("Ack, bad migration stream!");
3589             return NULL;
3590         }
3591         return block;
3592     }
3593
3594     len = qemu_get_byte(f);
3595     qemu_get_buffer(f, (uint8_t *)id, len);
3596     id[len] = 0;
3597
3598     block = qemu_ram_block_by_name(id);
3599     if (!block) {
3600         error_report("Can't find block %s", id);
3601         return NULL;
3602     }
3603
3604     if (ramblock_is_ignored(block)) {
3605         error_report("block %s should not be migrated !", id);
3606         return NULL;
3607     }
3608
3609     mis->last_recv_block[channel] = block;
3610
3611     return block;
3612 }
3613
3614 static inline void *host_from_ram_block_offset(RAMBlock *block,
3615                                                ram_addr_t offset)
3616 {
3617     if (!offset_in_ramblock(block, offset)) {
3618         return NULL;
3619     }
3620
3621     return block->host + offset;
3622 }
3623
3624 static void *host_page_from_ram_block_offset(RAMBlock *block,
3625                                              ram_addr_t offset)
3626 {
3627     /* Note: Explicitly no check against offset_in_ramblock(). */
3628     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3629                                    block->page_size);
3630 }
3631
3632 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3633                                                          ram_addr_t offset)
3634 {
3635     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3636 }
3637
3638 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3639                              ram_addr_t offset, bool record_bitmap)
3640 {
3641     if (!offset_in_ramblock(block, offset)) {
3642         return NULL;
3643     }
3644     if (!block->colo_cache) {
3645         error_report("%s: colo_cache is NULL in block :%s",
3646                      __func__, block->idstr);
3647         return NULL;
3648     }
3649
3650     /*
3651     * During colo checkpoint, we need bitmap of these migrated pages.
3652     * It help us to decide which pages in ram cache should be flushed
3653     * into VM's RAM later.
3654     */
3655     if (record_bitmap &&
3656         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3657         ram_state->migration_dirty_pages++;
3658     }
3659     return block->colo_cache + offset;
3660 }
3661
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fill @size bytes at @host with @ch.  The write is skipped when @ch is
 * zero and buffer_is_zero() already reports the range as zero.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && buffer_is_zero(host, size)) {
        /* Already holds the requested content; nothing to write. */
        return;
    }

    memset(host, ch, size);
}
3678
3679 /* return the size after decompression, or negative value on error */
3680 static int
3681 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3682                      const uint8_t *source, size_t source_len)
3683 {
3684     int err;
3685
3686     err = inflateReset(stream);
3687     if (err != Z_OK) {
3688         return -1;
3689     }
3690
3691     stream->avail_in = source_len;
3692     stream->next_in = (uint8_t *)source;
3693     stream->avail_out = dest_len;
3694     stream->next_out = dest;
3695
3696     err = inflate(stream, Z_NO_FLUSH);
3697     if (err != Z_STREAM_END) {
3698         return -1;
3699     }
3700
3701     return stream->total_out;
3702 }
3703
3704 static void *do_data_decompress(void *opaque)
3705 {
3706     DecompressParam *param = opaque;
3707     unsigned long pagesize;
3708     uint8_t *des;
3709     int len, ret;
3710
3711     qemu_mutex_lock(&param->mutex);
3712     while (!param->quit) {
3713         if (param->des) {
3714             des = param->des;
3715             len = param->len;
3716             param->des = 0;
3717             qemu_mutex_unlock(&param->mutex);
3718
3719             pagesize = TARGET_PAGE_SIZE;
3720
3721             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3722                                        param->compbuf, len);
3723             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3724                 error_report("decompress data failed");
3725                 qemu_file_set_error(decomp_file, ret);
3726             }
3727
3728             qemu_mutex_lock(&decomp_done_lock);
3729             param->done = true;
3730             qemu_cond_signal(&decomp_done_cond);
3731             qemu_mutex_unlock(&decomp_done_lock);
3732
3733             qemu_mutex_lock(&param->mutex);
3734         } else {
3735             qemu_cond_wait(&param->cond, &param->mutex);
3736         }
3737     }
3738     qemu_mutex_unlock(&param->mutex);
3739
3740     return NULL;
3741 }
3742
3743 static int wait_for_decompress_done(void)
3744 {
3745     int idx, thread_count;
3746
3747     if (!migrate_use_compression()) {
3748         return 0;
3749     }
3750
3751     thread_count = migrate_decompress_threads();
3752     qemu_mutex_lock(&decomp_done_lock);
3753     for (idx = 0; idx < thread_count; idx++) {
3754         while (!decomp_param[idx].done) {
3755             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3756         }
3757     }
3758     qemu_mutex_unlock(&decomp_done_lock);
3759     return qemu_file_get_error(decomp_file);
3760 }
3761
3762 static void compress_threads_load_cleanup(void)
3763 {
3764     int i, thread_count;
3765
3766     if (!migrate_use_compression()) {
3767         return;
3768     }
3769     thread_count = migrate_decompress_threads();
3770     for (i = 0; i < thread_count; i++) {
3771         /*
3772          * we use it as a indicator which shows if the thread is
3773          * properly init'd or not
3774          */
3775         if (!decomp_param[i].compbuf) {
3776             break;
3777         }
3778
3779         qemu_mutex_lock(&decomp_param[i].mutex);
3780         decomp_param[i].quit = true;
3781         qemu_cond_signal(&decomp_param[i].cond);
3782         qemu_mutex_unlock(&decomp_param[i].mutex);
3783     }
3784     for (i = 0; i < thread_count; i++) {
3785         if (!decomp_param[i].compbuf) {
3786             break;
3787         }
3788
3789         qemu_thread_join(decompress_threads + i);
3790         qemu_mutex_destroy(&decomp_param[i].mutex);
3791         qemu_cond_destroy(&decomp_param[i].cond);
3792         inflateEnd(&decomp_param[i].stream);
3793         g_free(decomp_param[i].compbuf);
3794         decomp_param[i].compbuf = NULL;
3795     }
3796     g_free(decompress_threads);
3797     g_free(decomp_param);
3798     decompress_threads = NULL;
3799     decomp_param = NULL;
3800     decomp_file = NULL;
3801 }
3802
3803 static int compress_threads_load_setup(QEMUFile *f)
3804 {
3805     int i, thread_count;
3806
3807     if (!migrate_use_compression()) {
3808         return 0;
3809     }
3810
3811     thread_count = migrate_decompress_threads();
3812     decompress_threads = g_new0(QemuThread, thread_count);
3813     decomp_param = g_new0(DecompressParam, thread_count);
3814     qemu_mutex_init(&decomp_done_lock);
3815     qemu_cond_init(&decomp_done_cond);
3816     decomp_file = f;
3817     for (i = 0; i < thread_count; i++) {
3818         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3819             goto exit;
3820         }
3821
3822         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3823         qemu_mutex_init(&decomp_param[i].mutex);
3824         qemu_cond_init(&decomp_param[i].cond);
3825         decomp_param[i].done = true;
3826         decomp_param[i].quit = false;
3827         qemu_thread_create(decompress_threads + i, "decompress",
3828                            do_data_decompress, decomp_param + i,
3829                            QEMU_THREAD_JOINABLE);
3830     }
3831     return 0;
3832 exit:
3833     compress_threads_load_cleanup();
3834     return -1;
3835 }
3836
3837 static void decompress_data_with_multi_threads(QEMUFile *f,
3838                                                void *host, int len)
3839 {
3840     int idx, thread_count;
3841
3842     thread_count = migrate_decompress_threads();
3843     QEMU_LOCK_GUARD(&decomp_done_lock);
3844     while (true) {
3845         for (idx = 0; idx < thread_count; idx++) {
3846             if (decomp_param[idx].done) {
3847                 decomp_param[idx].done = false;
3848                 qemu_mutex_lock(&decomp_param[idx].mutex);
3849                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3850                 decomp_param[idx].des = host;
3851                 decomp_param[idx].len = len;
3852                 qemu_cond_signal(&decomp_param[idx].cond);
3853                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3854                 break;
3855             }
3856         }
3857         if (idx < thread_count) {
3858             break;
3859         } else {
3860             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3861         }
3862     }
3863 }
3864
3865 static void colo_init_ram_state(void)
3866 {
3867     ram_state_init(&ram_state);
3868 }
3869
3870 /*
3871  * colo cache: this is for secondary VM, we cache the whole
3872  * memory of the secondary VM, it is need to hold the global lock
3873  * to call this helper.
3874  */
3875 int colo_init_ram_cache(void)
3876 {
3877     RAMBlock *block;
3878
3879     WITH_RCU_READ_LOCK_GUARD() {
3880         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3881             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3882                                                     NULL, false, false);
3883             if (!block->colo_cache) {
3884                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3885                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3886                              block->used_length);
3887                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3888                     if (block->colo_cache) {
3889                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3890                         block->colo_cache = NULL;
3891                     }
3892                 }
3893                 return -errno;
3894             }
3895             if (!machine_dump_guest_core(current_machine)) {
3896                 qemu_madvise(block->colo_cache, block->used_length,
3897                              QEMU_MADV_DONTDUMP);
3898             }
3899         }
3900     }
3901
3902     /*
3903     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3904     * with to decide which page in cache should be flushed into SVM's RAM. Here
3905     * we use the same name 'ram_bitmap' as for migration.
3906     */
3907     if (ram_bytes_total()) {
3908         RAMBlock *block;
3909
3910         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3911             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3912             block->bmap = bitmap_new(pages);
3913         }
3914     }
3915
3916     colo_init_ram_state();
3917     return 0;
3918 }
3919
3920 /* TODO: duplicated with ram_init_bitmaps */
3921 void colo_incoming_start_dirty_log(void)
3922 {
3923     RAMBlock *block = NULL;
3924     /* For memory_global_dirty_log_start below. */
3925     qemu_mutex_lock_iothread();
3926     qemu_mutex_lock_ramlist();
3927
3928     memory_global_dirty_log_sync();
3929     WITH_RCU_READ_LOCK_GUARD() {
3930         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3931             ramblock_sync_dirty_bitmap(ram_state, block);
3932             /* Discard this dirty bitmap record */
3933             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3934         }
3935         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3936     }
3937     ram_state->migration_dirty_pages = 0;
3938     qemu_mutex_unlock_ramlist();
3939     qemu_mutex_unlock_iothread();
3940 }
3941
3942 /* It is need to hold the global lock to call this helper */
3943 void colo_release_ram_cache(void)
3944 {
3945     RAMBlock *block;
3946
3947     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3948     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3949         g_free(block->bmap);
3950         block->bmap = NULL;
3951     }
3952
3953     WITH_RCU_READ_LOCK_GUARD() {
3954         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3955             if (block->colo_cache) {
3956                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3957                 block->colo_cache = NULL;
3958             }
3959         }
3960     }
3961     ram_state_cleanup(&ram_state);
3962 }
3963
3964 /**
3965  * ram_load_setup: Setup RAM for migration incoming side
3966  *
3967  * Returns zero to indicate success and negative for error
3968  *
3969  * @f: QEMUFile where to receive the data
3970  * @opaque: RAMState pointer
3971  */
3972 static int ram_load_setup(QEMUFile *f, void *opaque)
3973 {
3974     if (compress_threads_load_setup(f)) {
3975         return -1;
3976     }
3977
3978     xbzrle_load_setup();
3979     ramblock_recv_map_init();
3980
3981     return 0;
3982 }
3983
3984 static int ram_load_cleanup(void *opaque)
3985 {
3986     RAMBlock *rb;
3987
3988     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3989         qemu_ram_block_writeback(rb);
3990     }
3991
3992     xbzrle_load_cleanup();
3993     compress_threads_load_cleanup();
3994
3995     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3996         g_free(rb->receivedmap);
3997         rb->receivedmap = NULL;
3998     }
3999
4000     return 0;
4001 }
4002
4003 /**
4004  * ram_postcopy_incoming_init: allocate postcopy data structures
4005  *
4006  * Returns 0 for success and negative if there was one error
4007  *
4008  * @mis: current migration incoming state
4009  *
4010  * Allocate data structures etc needed by incoming migration with
4011  * postcopy-ram. postcopy-ram's similarly names
4012  * postcopy_ram_incoming_init does the work.
4013  */
4014 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4015 {
4016     return postcopy_ram_incoming_init(mis);
4017 }
4018
4019 /**
4020  * ram_load_postcopy: load a page in postcopy case
4021  *
4022  * Returns 0 for success or -errno in case of error
4023  *
4024  * Called in postcopy mode by ram_load().
4025  * rcu_read_lock is taken prior to this being called.
4026  *
4027  * @f: QEMUFile where to send the data
4028  * @channel: the channel to use for loading
4029  */
4030 int ram_load_postcopy(QEMUFile *f, int channel)
4031 {
4032     int flags = 0, ret = 0;
4033     bool place_needed = false;
4034     bool matches_target_page_size = false;
4035     MigrationIncomingState *mis = migration_incoming_get_current();
4036     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4037
4038     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4039         ram_addr_t addr;
4040         void *page_buffer = NULL;
4041         void *place_source = NULL;
4042         RAMBlock *block = NULL;
4043         uint8_t ch;
4044         int len;
4045
4046         addr = qemu_get_be64(f);
4047
4048         /*
4049          * If qemu file error, we should stop here, and then "addr"
4050          * may be invalid
4051          */
4052         ret = qemu_file_get_error(f);
4053         if (ret) {
4054             break;
4055         }
4056
4057         flags = addr & ~TARGET_PAGE_MASK;
4058         addr &= TARGET_PAGE_MASK;
4059
4060         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4061         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4062                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4063             block = ram_block_from_stream(mis, f, flags, channel);
4064             if (!block) {
4065                 ret = -EINVAL;
4066                 break;
4067             }
4068
4069             /*
4070              * Relying on used_length is racy and can result in false positives.
4071              * We might place pages beyond used_length in case RAM was shrunk
4072              * while in postcopy, which is fine - trying to place via
4073              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4074              */
4075             if (!block->host || addr >= block->postcopy_length) {
4076                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4077                 ret = -EINVAL;
4078                 break;
4079             }
4080             tmp_page->target_pages++;
4081             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4082             /*
4083              * Postcopy requires that we place whole host pages atomically;
4084              * these may be huge pages for RAMBlocks that are backed by
4085              * hugetlbfs.
4086              * To make it atomic, the data is read into a temporary page
4087              * that's moved into place later.
4088              * The migration protocol uses,  possibly smaller, target-pages
4089              * however the source ensures it always sends all the components
4090              * of a host page in one chunk.
4091              */
4092             page_buffer = tmp_page->tmp_huge_page +
4093                           host_page_offset_from_ram_block_offset(block, addr);
4094             /* If all TP are zero then we can optimise the place */
4095             if (tmp_page->target_pages == 1) {
4096                 tmp_page->host_addr =
4097                     host_page_from_ram_block_offset(block, addr);
4098             } else if (tmp_page->host_addr !=
4099                        host_page_from_ram_block_offset(block, addr)) {
4100                 /* not the 1st TP within the HP */
4101                 error_report("Non-same host page detected on channel %d: "
4102                              "Target host page %p, received host page %p "
4103                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4104                              channel, tmp_page->host_addr,
4105                              host_page_from_ram_block_offset(block, addr),
4106                              block->idstr, addr, tmp_page->target_pages);
4107                 ret = -EINVAL;
4108                 break;
4109             }
4110
4111             /*
4112              * If it's the last part of a host page then we place the host
4113              * page
4114              */
4115             if (tmp_page->target_pages ==
4116                 (block->page_size / TARGET_PAGE_SIZE)) {
4117                 place_needed = true;
4118             }
4119             place_source = tmp_page->tmp_huge_page;
4120         }
4121
4122         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4123         case RAM_SAVE_FLAG_ZERO:
4124             ch = qemu_get_byte(f);
4125             /*
4126              * Can skip to set page_buffer when
4127              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4128              */
4129             if (ch || !matches_target_page_size) {
4130                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4131             }
4132             if (ch) {
4133                 tmp_page->all_zero = false;
4134             }
4135             break;
4136
4137         case RAM_SAVE_FLAG_PAGE:
4138             tmp_page->all_zero = false;
4139             if (!matches_target_page_size) {
4140                 /* For huge pages, we always use temporary buffer */
4141                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4142             } else {
4143                 /*
4144                  * For small pages that matches target page size, we
4145                  * avoid the qemu_file copy.  Instead we directly use
4146                  * the buffer of QEMUFile to place the page.  Note: we
4147                  * cannot do any QEMUFile operation before using that
4148                  * buffer to make sure the buffer is valid when
4149                  * placing the page.
4150                  */
4151                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4152                                          TARGET_PAGE_SIZE);
4153             }
4154             break;
4155         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4156             tmp_page->all_zero = false;
4157             len = qemu_get_be32(f);
4158             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4159                 error_report("Invalid compressed data length: %d", len);
4160                 ret = -EINVAL;
4161                 break;
4162             }
4163             decompress_data_with_multi_threads(f, page_buffer, len);
4164             break;
4165
4166         case RAM_SAVE_FLAG_EOS:
4167             /* normal exit */
4168             multifd_recv_sync_main();
4169             break;
4170         default:
4171             error_report("Unknown combination of migration flags: 0x%x"
4172                          " (postcopy mode)", flags);
4173             ret = -EINVAL;
4174             break;
4175         }
4176
4177         /* Got the whole host page, wait for decompress before placing. */
4178         if (place_needed) {
4179             ret |= wait_for_decompress_done();
4180         }
4181
4182         /* Detect for any possible file errors */
4183         if (!ret && qemu_file_get_error(f)) {
4184             ret = qemu_file_get_error(f);
4185         }
4186
4187         if (!ret && place_needed) {
4188             if (tmp_page->all_zero) {
4189                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4190             } else {
4191                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4192                                           place_source, block);
4193             }
4194             place_needed = false;
4195             postcopy_temp_page_reset(tmp_page);
4196         }
4197     }
4198
4199     return ret;
4200 }
4201
4202 static bool postcopy_is_running(void)
4203 {
4204     PostcopyState ps = postcopy_state_get();
4205     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4206 }
4207
4208 /*
4209  * Flush content of RAM cache into SVM's memory.
4210  * Only flush the pages that be dirtied by PVM or SVM or both.
4211  */
4212 void colo_flush_ram_cache(void)
4213 {
4214     RAMBlock *block = NULL;
4215     void *dst_host;
4216     void *src_host;
4217     unsigned long offset = 0;
4218
4219     memory_global_dirty_log_sync();
4220     WITH_RCU_READ_LOCK_GUARD() {
4221         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4222             ramblock_sync_dirty_bitmap(ram_state, block);
4223         }
4224     }
4225
4226     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4227     WITH_RCU_READ_LOCK_GUARD() {
4228         block = QLIST_FIRST_RCU(&ram_list.blocks);
4229
4230         while (block) {
4231             unsigned long num = 0;
4232
4233             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4234             if (!offset_in_ramblock(block,
4235                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4236                 offset = 0;
4237                 num = 0;
4238                 block = QLIST_NEXT_RCU(block, next);
4239             } else {
4240                 unsigned long i = 0;
4241
4242                 for (i = 0; i < num; i++) {
4243                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4244                 }
4245                 dst_host = block->host
4246                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4247                 src_host = block->colo_cache
4248                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4249                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4250                 offset += num;
4251             }
4252         }
4253     }
4254     trace_colo_flush_ram_cache_end();
4255 }
4256
4257 /**
4258  * ram_load_precopy: load pages in precopy case
4259  *
4260  * Returns 0 for success or -errno in case of error
4261  *
4262  * Called in precopy mode by ram_load().
4263  * rcu_read_lock is taken prior to this being called.
4264  *
4265  * @f: QEMUFile where to send the data
4266  */
4267 static int ram_load_precopy(QEMUFile *f)
4268 {
4269     MigrationIncomingState *mis = migration_incoming_get_current();
4270     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4271     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4272     bool postcopy_advised = migration_incoming_postcopy_advised();
4273     if (!migrate_use_compression()) {
4274         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4275     }
4276
4277     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4278         ram_addr_t addr, total_ram_bytes;
4279         void *host = NULL, *host_bak = NULL;
4280         uint8_t ch;
4281
4282         /*
4283          * Yield periodically to let main loop run, but an iteration of
4284          * the main loop is expensive, so do it each some iterations
4285          */
4286         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4287             aio_co_schedule(qemu_get_current_aio_context(),
4288                             qemu_coroutine_self());
4289             qemu_coroutine_yield();
4290         }
4291         i++;
4292
4293         addr = qemu_get_be64(f);
4294         flags = addr & ~TARGET_PAGE_MASK;
4295         addr &= TARGET_PAGE_MASK;
4296
4297         if (flags & invalid_flags) {
4298             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4299                 error_report("Received an unexpected compressed page");
4300             }
4301
4302             ret = -EINVAL;
4303             break;
4304         }
4305
4306         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4307                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4308             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4309                                                     RAM_CHANNEL_PRECOPY);
4310
4311             host = host_from_ram_block_offset(block, addr);
4312             /*
4313              * After going into COLO stage, we should not load the page
4314              * into SVM's memory directly, we put them into colo_cache firstly.
4315              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4316              * Previously, we copied all these memory in preparing stage of COLO
4317              * while we need to stop VM, which is a time-consuming process.
4318              * Here we optimize it by a trick, back-up every page while in
4319              * migration process while COLO is enabled, though it affects the
4320              * speed of the migration, but it obviously reduce the downtime of
4321              * back-up all SVM'S memory in COLO preparing stage.
4322              */
4323             if (migration_incoming_colo_enabled()) {
4324                 if (migration_incoming_in_colo_state()) {
4325                     /* In COLO stage, put all pages into cache temporarily */
4326                     host = colo_cache_from_block_offset(block, addr, true);
4327                 } else {
4328                    /*
4329                     * In migration stage but before COLO stage,
4330                     * Put all pages into both cache and SVM's memory.
4331                     */
4332                     host_bak = colo_cache_from_block_offset(block, addr, false);
4333                 }
4334             }
4335             if (!host) {
4336                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4337                 ret = -EINVAL;
4338                 break;
4339             }
4340             if (!migration_incoming_in_colo_state()) {
4341                 ramblock_recv_bitmap_set(block, host);
4342             }
4343
4344             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4345         }
4346
4347         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4348         case RAM_SAVE_FLAG_MEM_SIZE:
4349             /* Synchronize RAM block list */
4350             total_ram_bytes = addr;
4351             while (!ret && total_ram_bytes) {
4352                 RAMBlock *block;
4353                 char id[256];
4354                 ram_addr_t length;
4355
4356                 len = qemu_get_byte(f);
4357                 qemu_get_buffer(f, (uint8_t *)id, len);
4358                 id[len] = 0;
4359                 length = qemu_get_be64(f);
4360
4361                 block = qemu_ram_block_by_name(id);
4362                 if (block && !qemu_ram_is_migratable(block)) {
4363                     error_report("block %s should not be migrated !", id);
4364                     ret = -EINVAL;
4365                 } else if (block) {
4366                     if (length != block->used_length) {
4367                         Error *local_err = NULL;
4368
4369                         ret = qemu_ram_resize(block, length,
4370                                               &local_err);
4371                         if (local_err) {
4372                             error_report_err(local_err);
4373                         }
4374                     }
4375                     /* For postcopy we need to check hugepage sizes match */
4376                     if (postcopy_advised && migrate_postcopy_ram() &&
4377                         block->page_size != qemu_host_page_size) {
4378                         uint64_t remote_page_size = qemu_get_be64(f);
4379                         if (remote_page_size != block->page_size) {
4380                             error_report("Mismatched RAM page size %s "
4381                                          "(local) %zd != %" PRId64,
4382                                          id, block->page_size,
4383                                          remote_page_size);
4384                             ret = -EINVAL;
4385                         }
4386                     }
4387                     if (migrate_ignore_shared()) {
4388                         hwaddr addr = qemu_get_be64(f);
4389                         if (ramblock_is_ignored(block) &&
4390                             block->mr->addr != addr) {
4391                             error_report("Mismatched GPAs for block %s "
4392                                          "%" PRId64 "!= %" PRId64,
4393                                          id, (uint64_t)addr,
4394                                          (uint64_t)block->mr->addr);
4395                             ret = -EINVAL;
4396                         }
4397                     }
4398                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4399                                           block->idstr);
4400                 } else {
4401                     error_report("Unknown ramblock \"%s\", cannot "
4402                                  "accept migration", id);
4403                     ret = -EINVAL;
4404                 }
4405
4406                 total_ram_bytes -= length;
4407             }
4408             break;
4409
4410         case RAM_SAVE_FLAG_ZERO:
4411             ch = qemu_get_byte(f);
4412             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4413             break;
4414
4415         case RAM_SAVE_FLAG_PAGE:
4416             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4417             break;
4418
4419         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4420             len = qemu_get_be32(f);
4421             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4422                 error_report("Invalid compressed data length: %d", len);
4423                 ret = -EINVAL;
4424                 break;
4425             }
4426             decompress_data_with_multi_threads(f, host, len);
4427             break;
4428
4429         case RAM_SAVE_FLAG_XBZRLE:
4430             if (load_xbzrle(f, addr, host) < 0) {
4431                 error_report("Failed to decompress XBZRLE page at "
4432                              RAM_ADDR_FMT, addr);
4433                 ret = -EINVAL;
4434                 break;
4435             }
4436             break;
4437         case RAM_SAVE_FLAG_EOS:
4438             /* normal exit */
4439             multifd_recv_sync_main();
4440             break;
4441         default:
4442             if (flags & RAM_SAVE_FLAG_HOOK) {
4443                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4444             } else {
4445                 error_report("Unknown combination of migration flags: 0x%x",
4446                              flags);
4447                 ret = -EINVAL;
4448             }
4449         }
4450         if (!ret) {
4451             ret = qemu_file_get_error(f);
4452         }
4453         if (!ret && host_bak) {
4454             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4455         }
4456     }
4457
4458     ret |= wait_for_decompress_done();
4459     return ret;
4460 }
4461
4462 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4463 {
4464     int ret = 0;
4465     static uint64_t seq_iter;
4466     /*
4467      * If system is running in postcopy mode, page inserts to host memory must
4468      * be atomic
4469      */
4470     bool postcopy_running = postcopy_is_running();
4471
4472     seq_iter++;
4473
4474     if (version_id != 4) {
4475         return -EINVAL;
4476     }
4477
4478     /*
4479      * This RCU critical section can be very long running.
4480      * When RCU reclaims in the code start to become numerous,
4481      * it will be necessary to reduce the granularity of this
4482      * critical section.
4483      */
4484     WITH_RCU_READ_LOCK_GUARD() {
4485         if (postcopy_running) {
4486             /*
4487              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4488              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4489              * service fast page faults.
4490              */
4491             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4492         } else {
4493             ret = ram_load_precopy(f);
4494         }
4495     }
4496     trace_ram_load_complete(ret, seq_iter);
4497
4498     return ret;
4499 }
4500
4501 static bool ram_has_postcopy(void *opaque)
4502 {
4503     RAMBlock *rb;
4504     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4505         if (ramblock_is_pmem(rb)) {
4506             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4507                          "is not supported now!", rb->idstr, rb->host);
4508             return false;
4509         }
4510     }
4511
4512     return migrate_postcopy_ram();
4513 }
4514
4515 /* Sync all the dirty bitmap with destination VM.  */
4516 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4517 {
4518     RAMBlock *block;
4519     QEMUFile *file = s->to_dst_file;
4520     int ramblock_count = 0;
4521
4522     trace_ram_dirty_bitmap_sync_start();
4523
4524     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4525         qemu_savevm_send_recv_bitmap(file, block->idstr);
4526         trace_ram_dirty_bitmap_request(block->idstr);
4527         ramblock_count++;
4528     }
4529
4530     trace_ram_dirty_bitmap_sync_wait();
4531
4532     /* Wait until all the ramblocks' dirty bitmap synced */
4533     while (ramblock_count--) {
4534         qemu_sem_wait(&s->rp_state.rp_sem);
4535     }
4536
4537     trace_ram_dirty_bitmap_sync_complete();
4538
4539     return 0;
4540 }
4541
4542 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4543 {
4544     qemu_sem_post(&s->rp_state.rp_sem);
4545 }
4546
4547 /*
4548  * Read the received bitmap, revert it as the initial dirty bitmap.
4549  * This is only used when the postcopy migration is paused but wants
4550  * to resume from a middle point.
4551  */
4552 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4553 {
4554     int ret = -EINVAL;
4555     /* from_dst_file is always valid because we're within rp_thread */
4556     QEMUFile *file = s->rp_state.from_dst_file;
4557     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4558     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4559     uint64_t size, end_mark;
4560
4561     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4562
4563     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4564         error_report("%s: incorrect state %s", __func__,
4565                      MigrationStatus_str(s->state));
4566         return -EINVAL;
4567     }
4568
4569     /*
4570      * Note: see comments in ramblock_recv_bitmap_send() on why we
4571      * need the endianness conversion, and the paddings.
4572      */
4573     local_size = ROUND_UP(local_size, 8);
4574
4575     /* Add paddings */
4576     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4577
4578     size = qemu_get_be64(file);
4579
4580     /* The size of the bitmap should match with our ramblock */
4581     if (size != local_size) {
4582         error_report("%s: ramblock '%s' bitmap size mismatch "
4583                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4584                      block->idstr, size, local_size);
4585         ret = -EINVAL;
4586         goto out;
4587     }
4588
4589     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4590     end_mark = qemu_get_be64(file);
4591
4592     ret = qemu_file_get_error(file);
4593     if (ret || size != local_size) {
4594         error_report("%s: read bitmap failed for ramblock '%s': %d"
4595                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4596                      __func__, block->idstr, ret, local_size, size);
4597         ret = -EIO;
4598         goto out;
4599     }
4600
4601     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4602         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4603                      __func__, block->idstr, end_mark);
4604         ret = -EINVAL;
4605         goto out;
4606     }
4607
4608     /*
4609      * Endianness conversion. We are during postcopy (though paused).
4610      * The dirty bitmap won't change. We can directly modify it.
4611      */
4612     bitmap_from_le(block->bmap, le_bitmap, nbits);
4613
4614     /*
4615      * What we received is "received bitmap". Revert it as the initial
4616      * dirty bitmap for this ramblock.
4617      */
4618     bitmap_complement(block->bmap, block->bmap, nbits);
4619
4620     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4621     ramblock_dirty_bitmap_clear_discarded_pages(block);
4622
4623     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4624     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4625
4626     /*
4627      * We succeeded to sync bitmap for current ramblock. If this is
4628      * the last one to sync, we need to notify the main send thread.
4629      */
4630     ram_dirty_bitmap_reload_notify(s);
4631
4632     ret = 0;
4633 out:
4634     g_free(le_bitmap);
4635     return ret;
4636 }
4637
4638 static int ram_resume_prepare(MigrationState *s, void *opaque)
4639 {
4640     RAMState *rs = *(RAMState **)opaque;
4641     int ret;
4642
4643     ret = ram_dirty_bitmap_sync_all(s, rs);
4644     if (ret) {
4645         return ret;
4646     }
4647
4648     ram_state_resume_prepare(rs, s->to_dst_file);
4649
4650     return 0;
4651 }
4652
4653 void postcopy_preempt_shutdown_file(MigrationState *s)
4654 {
4655     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4656     qemu_fflush(s->postcopy_qemufile_src);
4657 }
4658
4659 static SaveVMHandlers savevm_ram_handlers = {
4660     .save_setup = ram_save_setup,
4661     .save_live_iterate = ram_save_iterate,
4662     .save_live_complete_postcopy = ram_save_complete,
4663     .save_live_complete_precopy = ram_save_complete,
4664     .has_postcopy = ram_has_postcopy,
4665     .state_pending_exact = ram_state_pending_exact,
4666     .state_pending_estimate = ram_state_pending_estimate,
4667     .load_state = ram_load,
4668     .save_cleanup = ram_save_cleanup,
4669     .load_setup = ram_load_setup,
4670     .load_cleanup = ram_load_cleanup,
4671     .resume_prepare = ram_resume_prepare,
4672 };
4673
4674 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4675                                       size_t old_size, size_t new_size)
4676 {
4677     PostcopyState ps = postcopy_state_get();
4678     ram_addr_t offset;
4679     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4680     Error *err = NULL;
4681
4682     if (ramblock_is_ignored(rb)) {
4683         return;
4684     }
4685
4686     if (!migration_is_idle()) {
4687         /*
4688          * Precopy code on the source cannot deal with the size of RAM blocks
4689          * changing at random points in time - especially after sending the
4690          * RAM block sizes in the migration stream, they must no longer change.
4691          * Abort and indicate a proper reason.
4692          */
4693         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4694         migration_cancel(err);
4695         error_free(err);
4696     }
4697
4698     switch (ps) {
4699     case POSTCOPY_INCOMING_ADVISE:
4700         /*
4701          * Update what ram_postcopy_incoming_init()->init_range() does at the
4702          * time postcopy was advised. Syncing RAM blocks with the source will
4703          * result in RAM resizes.
4704          */
4705         if (old_size < new_size) {
4706             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4707                 error_report("RAM block '%s' discard of resized RAM failed",
4708                              rb->idstr);
4709             }
4710         }
4711         rb->postcopy_length = new_size;
4712         break;
4713     case POSTCOPY_INCOMING_NONE:
4714     case POSTCOPY_INCOMING_RUNNING:
4715     case POSTCOPY_INCOMING_END:
4716         /*
4717          * Once our guest is running, postcopy does no longer care about
4718          * resizes. When growing, the new memory was not available on the
4719          * source, no handler needed.
4720          */
4721         break;
4722     default:
4723         error_report("RAM block '%s' resized during postcopy state: %d",
4724                      rb->idstr, ps);
4725         exit(-1);
4726     }
4727 }
4728
4729 static RAMBlockNotifier ram_mig_ram_notifier = {
4730     .ram_block_resized = ram_mig_ram_block_resized,
4731 };
4732
4733 void ram_mig_init(void)
4734 {
4735     qemu_mutex_init(&XBZRLE.lock);
4736     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4737     ram_block_notifier_add(&ram_mig_ram_notifier);
4738 }