migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram-compress.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration-stats.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-types-migration.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qapi-commands-migration.h"
  50 #include "qapi/qmp/qerror.h"
  51 #include "trace.h"
  52 #include "exec/ram_addr.h"
  53 #include "exec/target_page.h"
  54 #include "qemu/rcu_queue.h"
  55 #include "migration/colo.h"
  56 #include "block.h"
  57 #include "sysemu/cpu-throttle.h"
  58 #include "savevm.h"
  59 #include "qemu/iov.h"
  60 #include "multifd.h"
  61 #include "sysemu/runstate.h"
  62 #include "rdma.h"
  63 #include "options.h"
  64 #include "sysemu/dirtylimit.h"
  65 #include "sysemu/kvm.h"
  66
  67 #include "hw/boards.h" /* for machine_dump_guest_core() */
  68
  69 #if defined(__linux__)
  70 #include "qemu/userfaultfd.h"
  71 #endif /* defined(__linux__) */
  72
  73 /***********************************************************/
  74 /* ram save/restore */
  75
/*
 * Flags OR'ed into the low bits of the 64-bit page offset that
 * save_page_header() puts on the wire.
 *
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
/*
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, so the bit can be reused now.
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Set by save_page_header() when the block is the same as the last one sent */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Set by save_xbzrle_page() on XBZRLE-encoded pages */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */
  96
/*
 * XBZRLE statistics.  In this file save_xbzrle_page() updates the
 * cache_miss, pages, overflow and bytes members.
 */
XBZRLECacheStats xbzrle_counters;
  98
  99 /* used by the search for pages to send */
 100 struct PageSearchStatus {
 101     /* The migration channel used for a specific host page */
 102     QEMUFile    *pss_channel;
 103     /* Last block from where we have sent data */
 104     RAMBlock *last_sent_block;
 105     /* Current block being searched */
 106     RAMBlock    *block;
 107     /* Current page to search from */
 108     unsigned long page;
 109     /* Set once we wrap around */
 110     bool         complete_round;
 111     /* Whether we're sending a host page */
 112     bool          host_page_sending;
 113     /* The start/end of current host page.  Invalid if host_page_sending==false */
 114     unsigned long host_page_start;
 115     unsigned long host_page_end;
 116 };
 117 typedef struct PageSearchStatus PageSearchStatus;
 118
/*
 * File-wide XBZRLE state: the page cache, its lock, and the scratch
 * buffers used while encoding/decoding pages.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of xbzrle_encode_buffer()) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (snapshot of the page being encoded) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken via XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 134
 135 static void XBZRLE_cache_lock(void)
 136 {
 137     if (migrate_xbzrle()) {
 138         qemu_mutex_lock(&XBZRLE.lock);
 139     }
 140 }
 141
 142 static void XBZRLE_cache_unlock(void)
 143 {
 144     if (migrate_xbzrle()) {
 145         qemu_mutex_unlock(&XBZRLE.lock);
 146     }
 147 }
 148
 149 /**
 150  * xbzrle_cache_resize: resize the xbzrle cache
 151  *
 152  * This function is called from migrate_params_apply in main
 153  * thread, possibly while a migration is in progress.  A running
 154  * migration may be using the cache and might finish during this call,
 155  * hence changes to the cache are protected by XBZRLE.lock().
 156  *
 157  * Returns 0 for success or -1 for error
 158  *
 159  * @new_size: new cache size
 160  * @errp: set *errp if the check failed, with reason
 161  */
 162 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 163 {
 164     PageCache *new_cache;
 165     int64_t ret = 0;
 166
 167     /* Check for truncation */
 168     if (new_size != (size_t)new_size) {
 169         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 170                    "exceeding address space");
 171         return -1;
 172     }
 173
 174     if (new_size == migrate_xbzrle_cache_size()) {
 175         /* nothing to do */
 176         return 0;
 177     }
 178
 179     XBZRLE_cache_lock();
 180
 181     if (XBZRLE.cache != NULL) {
 182         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 183         if (!new_cache) {
 184             ret = -1;
 185             goto out;
 186         }
 187
 188         cache_fini(XBZRLE.cache);
 189         XBZRLE.cache = new_cache;
 190     }
 191 out:
 192     XBZRLE_cache_unlock();
 193     return ret;
 194 }
 195
 196 static bool postcopy_preempt_active(void)
 197 {
 198     return migrate_postcopy_preempt() && migration_in_postcopy();
 199 }
 200
 201 bool migrate_ram_is_ignored(RAMBlock *block)
 202 {
 203     return !qemu_ram_is_migratable(block) ||
 204            (migrate_ignore_shared() && qemu_ram_is_shared(block)
 205                                     && qemu_ram_is_named_file(block));
 206 }
 207
 208 #undef RAMBLOCK_FOREACH
 209
 210 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 211 {
 212     RAMBlock *block;
 213     int ret = 0;
 214
 215     RCU_READ_LOCK_GUARD();
 216
 217     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 218         ret = func(block, opaque);
 219         if (ret) {
 220             break;
 221         }
 222     }
 223     return ret;
 224 }
 225
 226 static void ramblock_recv_map_init(void)
 227 {
 228     RAMBlock *rb;
 229
 230     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 231         assert(!rb->receivedmap);
 232         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 233     }
 234 }
 235
 236 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 237 {
 238     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 239                     rb->receivedmap);
 240 }
 241
 242 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 243 {
 244     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 245 }
 246
 247 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 248 {
 249     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 250 }
 251
 252 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 253                                     size_t nr)
 254 {
 255     bitmap_set_atomic(rb->receivedmap,
 256                       ramblock_recv_bitmap_offset(host_addr, rb),
 257                       nr);
 258 }
 259
 260 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 261
 262 /*
 263  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 264  *
 265  * Returns >0 if success with sent bytes, or <0 if error.
 266  */
 267 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 268                                   const char *block_name)
 269 {
 270     RAMBlock *block = qemu_ram_block_by_name(block_name);
 271     unsigned long *le_bitmap, nbits;
 272     uint64_t size;
 273
 274     if (!block) {
 275         error_report("%s: invalid block name: %s", __func__, block_name);
 276         return -1;
 277     }
 278
 279     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 280
 281     /*
 282      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 283      * machines we may need 4 more bytes for padding (see below
 284      * comment). So extend it a bit before hand.
 285      */
 286     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 287
 288     /*
 289      * Always use little endian when sending the bitmap. This is
 290      * required that when source and destination VMs are not using the
 291      * same endianness. (Note: big endian won't work.)
 292      */
 293     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 294
 295     /* Size of the bitmap, in bytes */
 296     size = DIV_ROUND_UP(nbits, 8);
 297
 298     /*
 299      * size is always aligned to 8 bytes for 64bit machines, but it
 300      * may not be true for 32bit machines. We need this padding to
 301      * make sure the migration can survive even between 32bit and
 302      * 64bit machines.
 303      */
 304     size = ROUND_UP(size, 8);
 305
 306     qemu_put_be64(file, size);
 307     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 308     /*
 309      * Mark as an end, in case the middle part is screwed up due to
 310      * some "mysterious" reason.
 311      */
 312     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 313     qemu_fflush(file);
 314
 315     g_free(le_bitmap);
 316
 317     if (qemu_file_get_error(file)) {
 318         return qemu_file_get_error(file);
 319     }
 320
 321     return size + sizeof(size);
 322 }
 323
/*
 * An outstanding page request, on the source, having been received
 * and queued.  Entries are linked into RAMState.src_page_requests.
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start of the range within rb - confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length of the range - confirm */
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 335
 336 /* State of RAM for migration */
 337 struct RAMState {
 338     /*
 339      * PageSearchStatus structures for the channels when send pages.
 340      * Protected by the bitmap_mutex.
 341      */
 342     PageSearchStatus pss[RAM_CHANNEL_MAX];
 343     /* UFFD file descriptor, used in 'write-tracking' migration */
 344     int uffdio_fd;
 345     /* total ram size in bytes */
 346     uint64_t ram_bytes_total;
 347     /* Last block that we have visited searching for dirty pages */
 348     RAMBlock *last_seen_block;
 349     /* Last dirty target page we have sent */
 350     ram_addr_t last_page;
 351     /* last ram version we have seen */
 352     uint32_t last_version;
 353     /* How many times we have dirty too many pages */
 354     int dirty_rate_high_cnt;
 355     /* these variables are used for bitmap sync */
 356     /* last time we did a full bitmap_sync */
 357     int64_t time_last_bitmap_sync;
 358     /* bytes transferred at start_time */
 359     uint64_t bytes_xfer_prev;
 360     /* number of dirty pages since start_time */
 361     uint64_t num_dirty_pages_period;
 362     /* xbzrle misses since the beginning of the period */
 363     uint64_t xbzrle_cache_miss_prev;
 364     /* Amount of xbzrle pages since the beginning of the period */
 365     uint64_t xbzrle_pages_prev;
 366     /* Amount of xbzrle encoded bytes since the beginning of the period */
 367     uint64_t xbzrle_bytes_prev;
 368     /* Are we really using XBZRLE (e.g., after the first round). */
 369     bool xbzrle_started;
 370     /* Are we on the last stage of migration */
 371     bool last_stage;
 372     /* compression statistics since the beginning of the period */
 373     /* amount of count that no free thread to compress data */
 374     uint64_t compress_thread_busy_prev;
 375     /* amount bytes after compression */
 376     uint64_t compressed_size_prev;
 377     /* amount of compressed pages */
 378     uint64_t compress_pages_prev;
 379
 380     /* total handled target pages at the beginning of period */
 381     uint64_t target_page_count_prev;
 382     /* total handled target pages since start */
 383     uint64_t target_page_count;
 384     /* number of dirty bits in the bitmap */
 385     uint64_t migration_dirty_pages;
 386     /*
 387      * Protects:
 388      * - dirty/clear bitmap
 389      * - migration_dirty_pages
 390      * - pss structures
 391      */
 392     QemuMutex bitmap_mutex;
 393     /* The RAMBlock used in the last src_page_requests */
 394     RAMBlock *last_req_rb;
 395     /* Queue of outstanding page requests from the destination */
 396     QemuMutex src_page_req_mutex;
 397     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 398
 399     /*
 400      * This is only used when postcopy is in recovery phase, to communicate
 401      * between the migration thread and the return path thread on dirty
 402      * bitmap synchronizations.  This field is unused in other stages of
 403      * RAM migration.
 404      */
 405     unsigned int postcopy_bmap_sync_requested;
 406 };
 407 typedef struct RAMState RAMState;
 408
/* The RAM migration state; may be NULL (see ram_bytes_remaining()). */
static RAMState *ram_state;
 410
/* Notifiers invoked by precopy_notify(); set up by precopy_infrastructure_init() */
static NotifierWithReturnList precopy_notifier_list;
 412
 413 /* Whether postcopy has queued requests? */
 414 static bool postcopy_has_request(RAMState *rs)
 415 {
 416     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 417 }
 418
/* Initialise the (empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 423
/* Register @n on the precopy notifier list. */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 428
/* Unregister notifier @n via notifier_with_return_remove(). */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 433
 434 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 435 {
 436     PrecopyNotifyData pnd;
 437     pnd.reason = reason;
 438     pnd.errp = errp;
 439
 440     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 441 }
 442
 443 uint64_t ram_bytes_remaining(void)
 444 {
 445     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 446                        0;
 447 }
 448
 449 void ram_transferred_add(uint64_t bytes)
 450 {
 451     if (runstate_is_running()) {
 452         stat64_add(&mig_stats.precopy_bytes, bytes);
 453     } else if (migration_in_postcopy()) {
 454         stat64_add(&mig_stats.postcopy_bytes, bytes);
 455     } else {
 456         stat64_add(&mig_stats.downtime_bytes, bytes);
 457     }
 458     stat64_add(&mig_stats.transferred, bytes);
 459 }
 460
 461 struct MigrationOps {
 462     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
 463 };
 464 typedef struct MigrationOps MigrationOps;
 465
/* Currently installed operations table (assigned outside this excerpt) */
MigrationOps *migration_ops;
 467
 468 static int ram_save_host_page_urgent(PageSearchStatus *pss);
 469
/*
 * Point @pss at page index @page of RAMBlock @rb and clear its
 * complete_round flag.  No other field of @pss is touched.
 *
 * NOTE: page is the PFN not real ram_addr_t.
 */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}
 477
 478 /*
 479  * Check whether two PSSs are actively sending the same page.  Return true
 480  * if it is, false otherwise.
 481  */
 482 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 483 {
 484     return pss1->host_page_sending && pss2->host_page_sending &&
 485         (pss1->host_page_start == pss2->host_page_start);
 486 }
 487
 488 /**
 489  * save_page_header: write page header to wire
 490  *
 491  * If this is the 1st block, it also writes the block identification
 492  *
 493  * Returns the number of bytes written
 494  *
 495  * @pss: current PSS channel status
 496  * @block: block that contains the page we want to send
 497  * @offset: offset inside the block for the page
 498  *          in the lower bits, it contains flags
 499  */
 500 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
 501                                RAMBlock *block, ram_addr_t offset)
 502 {
 503     size_t size, len;
 504     bool same_block = (block == pss->last_sent_block);
 505
 506     if (same_block) {
 507         offset |= RAM_SAVE_FLAG_CONTINUE;
 508     }
 509     qemu_put_be64(f, offset);
 510     size = 8;
 511
 512     if (!same_block) {
 513         len = strlen(block->idstr);
 514         qemu_put_byte(f, len);
 515         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 516         size += 1 + len;
 517         pss->last_sent_block = block;
 518     }
 519     return size;
 520 }
 521
 522 /**
 523  * mig_throttle_guest_down: throttle down the guest
 524  *
 525  * Reduce amount of guest cpu execution to hopefully slow down memory
 526  * writes. If guest dirty memory rate is reduced below the rate at
 527  * which we can transfer pages to the destination then we should be
 528  * able to complete migration. Some workloads dirty memory way too
 529  * fast and will not effectively converge, even with auto-converge.
 530  */
 531 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 532                                     uint64_t bytes_dirty_threshold)
 533 {
 534     uint64_t pct_initial = migrate_cpu_throttle_initial();
 535     uint64_t pct_increment = migrate_cpu_throttle_increment();
 536     bool pct_tailslow = migrate_cpu_throttle_tailslow();
 537     int pct_max = migrate_max_cpu_throttle();
 538
 539     uint64_t throttle_now = cpu_throttle_get_percentage();
 540     uint64_t cpu_now, cpu_ideal, throttle_inc;
 541
 542     /* We have not started throttling yet. Let's start it. */
 543     if (!cpu_throttle_active()) {
 544         cpu_throttle_set(pct_initial);
 545     } else {
 546         /* Throttling already on, just increase the rate */
 547         if (!pct_tailslow) {
 548             throttle_inc = pct_increment;
 549         } else {
 550             /* Compute the ideal CPU percentage used by Guest, which may
 551              * make the dirty rate match the dirty rate threshold. */
 552             cpu_now = 100 - throttle_now;
 553             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 554                         bytes_dirty_period);
 555             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 556         }
 557         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 558     }
 559 }
 560
 561 void mig_throttle_counter_reset(void)
 562 {
 563     RAMState *rs = ram_state;
 564
 565     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 566     rs->num_dirty_pages_period = 0;
 567     rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
 568 }
 569
 570 /**
 571  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 572  *
 573  * @rs: current RAM state
 574  * @current_addr: address for the zero page
 575  *
 576  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 577  * The important thing is that a stale (not-yet-0'd) page be replaced
 578  * by the new data.
 579  * As a bonus, if the page wasn't in the cache it gets added so that
 580  * when a small write is made into the 0'd page it gets XBZRLE sent.
 581  */
 582 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 583 {
 584     /* We don't care if this fails to allocate a new cache page
 585      * as long as it updated an old one */
 586     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 587                  stat64_get(&mig_stats.dirty_sync_count));
 588 }
 589
 590 #define ENCODING_FLAG_XBZRLE 0x1
 591
 592 /**
 593  * save_xbzrle_page: compress and send current page
 594  *
 595  * Returns: 1 means that we wrote the page
 596  *          0 means that page is identical to the one already sent
 597  *          -1 means that xbzrle would be longer than normal
 598  *
 599  * @rs: current RAM state
 600  * @pss: current PSS channel
 601  * @current_data: pointer to the address of the page contents
 602  * @current_addr: addr of the page
 603  * @block: block that contains the page we want to send
 604  * @offset: offset inside the block for the page
 605  */
 606 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 607                             uint8_t **current_data, ram_addr_t current_addr,
 608                             RAMBlock *block, ram_addr_t offset)
 609 {
 610     int encoded_len = 0, bytes_xbzrle;
 611     uint8_t *prev_cached_page;
 612     QEMUFile *file = pss->pss_channel;
 613     uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
 614
 615     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
 616         xbzrle_counters.cache_miss++;
 617         if (!rs->last_stage) {
 618             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 619                              generation) == -1) {
 620                 return -1;
 621             } else {
 622                 /* update *current_data when the page has been
 623                    inserted into cache */
 624                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 625             }
 626         }
 627         return -1;
 628     }
 629
 630     /*
 631      * Reaching here means the page has hit the xbzrle cache, no matter what
 632      * encoding result it is (normal encoding, overflow or skipping the page),
 633      * count the page as encoded. This is used to calculate the encoding rate.
 634      *
 635      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 636      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 637      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 638      * skipped page included. In this way, the encoding rate can tell if the
 639      * guest page is good for xbzrle encoding.
 640      */
 641     xbzrle_counters.pages++;
 642     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 643
 644     /* save current buffer into memory */
 645     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 646
 647     /* XBZRLE encoding (if there is no overflow) */
 648     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 649                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 650                                        TARGET_PAGE_SIZE);
 651
 652     /*
 653      * Update the cache contents, so that it corresponds to the data
 654      * sent, in all cases except where we skip the page.
 655      */
 656     if (!rs->last_stage && encoded_len != 0) {
 657         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 658         /*
 659          * In the case where we couldn't compress, ensure that the caller
 660          * sends the data from the cache, since the guest might have
 661          * changed the RAM since we copied it.
 662          */
 663         *current_data = prev_cached_page;
 664     }
 665
 666     if (encoded_len == 0) {
 667         trace_save_xbzrle_page_skipping();
 668         return 0;
 669     } else if (encoded_len == -1) {
 670         trace_save_xbzrle_page_overflow();
 671         xbzrle_counters.overflow++;
 672         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 673         return -1;
 674     }
 675
 676     /* Send XBZRLE based compressed page */
 677     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
 678                                     offset | RAM_SAVE_FLAG_XBZRLE);
 679     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 680     qemu_put_be16(file, encoded_len);
 681     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 682     bytes_xbzrle += encoded_len + 1 + 2;
 683     /*
 684      * Like compressed_size (please see update_compress_thread_counts),
 685      * the xbzrle encoded bytes don't count the 8 byte header with
 686      * RAM_SAVE_FLAG_CONTINUE.
 687      */
 688     xbzrle_counters.bytes += bytes_xbzrle - 8;
 689     ram_transferred_add(bytes_xbzrle);
 690
 691     return 1;
 692 }
 693
 694 /**
 695  * pss_find_next_dirty: find the next dirty page of current ramblock
 696  *
 697  * This function updates pss->page to point to the next dirty page index
 698  * within the ramblock to migrate, or the end of ramblock when nothing
 699  * found.  Note that when pss->host_page_sending==true it means we're
 700  * during sending a host page, so we won't look for dirty page that is
 701  * outside the host page boundary.
 702  *
 703  * @pss: the current page search status
 704  */
 705 static void pss_find_next_dirty(PageSearchStatus *pss)
 706 {
 707     RAMBlock *rb = pss->block;
 708     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 709     unsigned long *bitmap = rb->bmap;
 710
 711     if (migrate_ram_is_ignored(rb)) {
 712         /* Points directly to the end, so we know no dirty page */
 713         pss->page = size;
 714         return;
 715     }
 716
 717     /*
 718      * If during sending a host page, only look for dirty pages within the
 719      * current host page being send.
 720      */
 721     if (pss->host_page_sending) {
 722         assert(pss->host_page_end);
 723         size = MIN(size, pss->host_page_end);
 724     }
 725
 726     pss->page = find_next_bit(bitmap, size, pss->page);
 727 }
 728
 729 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 730                                                        unsigned long page)
 731 {
 732     uint8_t shift;
 733     hwaddr size, start;
 734
 735     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 736         return;
 737     }
 738
 739     shift = rb->clear_bmap_shift;
 740     /*
 741      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 742      * can make things easier sometimes since then start address
 743      * of the small chunk will always be 64 pages aligned so the
 744      * bitmap will always be aligned to unsigned long. We should
 745      * even be able to remove this restriction but I'm simply
 746      * keeping it.
 747      */
 748     assert(shift >= 6);
 749
 750     size = 1ULL << (TARGET_PAGE_BITS + shift);
 751     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 752     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 753     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 754 }
 755
 756 static void
 757 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 758                                                  unsigned long start,
 759                                                  unsigned long npages)
 760 {
 761     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 762     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 763     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 764
 765     /*
 766      * Clear pages from start to start + npages - 1, so the end boundary is
 767      * exclusive.
 768      */
 769     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 770         migration_clear_memory_region_dirty_bitmap(rb, i);
 771     }
 772 }
 773
 774 /*
 775  * colo_bitmap_find_diry:find contiguous dirty pages from start
 776  *
 777  * Returns the page offset within memory region of the start of the contiguout
 778  * dirty page
 779  *
 780  * @rs: current RAM state
 781  * @rb: RAMBlock where to search for dirty pages
 782  * @start: page where we start the search
 783  * @num: the number of contiguous dirty pages
 784  */
 785 static inline
 786 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 787                                      unsigned long start, unsigned long *num)
 788 {
 789     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 790     unsigned long *bitmap = rb->bmap;
 791     unsigned long first, next;
 792
 793     *num = 0;
 794
 795     if (migrate_ram_is_ignored(rb)) {
 796         return size;
 797     }
 798
 799     first = find_next_bit(bitmap, size, start);
 800     if (first >= size) {
 801         return first;
 802     }
 803     next = find_next_zero_bit(bitmap, size, first + 1);
 804     assert(next >= first);
 805     *num = next - first;
 806     return first;
 807 }
 808
 809 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 810                                                 RAMBlock *rb,
 811                                                 unsigned long page)
 812 {
 813     bool ret;
 814
 815     /*
 816      * Clear dirty bitmap if needed.  This _must_ be called before we
 817      * send any of the page in the chunk because we need to make sure
 818      * we can capture further page content changes when we sync dirty
 819      * log the next time.  So as long as we are going to send any of
 820      * the page in the chunk we clear the remote dirty bitmap for all.
 821      * Clearing it earlier won't be a problem, but too late will.
 822      */
 823     migration_clear_memory_region_dirty_bitmap(rb, page);
 824
 825     ret = test_and_clear_bit(page, rb->bmap);
 826     if (ret) {
 827         rs->migration_dirty_pages--;
 828     }
 829
 830     return ret;
 831 }
 832
 833 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 834                                        void *opaque)
 835 {
 836     const hwaddr offset = section->offset_within_region;
 837     const hwaddr size = int128_get64(section->size);
 838     const unsigned long start = offset >> TARGET_PAGE_BITS;
 839     const unsigned long npages = size >> TARGET_PAGE_BITS;
 840     RAMBlock *rb = section->mr->ram_block;
 841     uint64_t *cleared_bits = opaque;
 842
 843     /*
 844      * We don't grab ram_state->bitmap_mutex because we expect to run
 845      * only when starting migration or during postcopy recovery where
 846      * we don't have concurrent access.
 847      */
 848     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 849         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 850     }
 851     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 852     bitmap_clear(rb->bmap, start, npages);
 853 }
 854
 855 /*
 856  * Exclude all dirty pages from migration that fall into a discarded range as
 857  * managed by a RamDiscardManager responsible for the mapped memory region of
 858  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 859  *
 860  * Discarded pages ("logically unplugged") have undefined content and must
 861  * not get migrated, because even reading these pages for migration might
 862  * result in undesired behavior.
 863  *
 864  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 865  *
 866  * Note: The result is only stable while migrating (precopy/postcopy).
 867  */
 868 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 869 {
 870     uint64_t cleared_bits = 0;
 871
 872     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 873         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 874         MemoryRegionSection section = {
 875             .mr = rb->mr,
 876             .offset_within_region = 0,
 877             .size = int128_make64(qemu_ram_get_used_length(rb)),
 878         };
 879
 880         ram_discard_manager_replay_discarded(rdm, &section,
 881                                              dirty_bitmap_clear_section,
 882                                              &cleared_bits);
 883     }
 884     return cleared_bits;
 885 }
 886
 887 /*
 888  * Check if a host-page aligned page falls into a discarded range as managed by
 889  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 890  *
 891  * Note: The result is only stable while migrating (precopy/postcopy).
 892  */
 893 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 894 {
 895     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 896         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 897         MemoryRegionSection section = {
 898             .mr = rb->mr,
 899             .offset_within_region = start,
 900             .size = int128_make64(qemu_ram_pagesize(rb)),
 901         };
 902
 903         return !ram_discard_manager_is_populated(rdm, &section);
 904     }
 905     return false;
 906 }
 907
 908 /* Called with RCU critical section */
 909 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 910 {
 911     uint64_t new_dirty_pages =
 912         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 913
 914     rs->migration_dirty_pages += new_dirty_pages;
 915     rs->num_dirty_pages_period += new_dirty_pages;
 916 }
 917
 918 /**
 919  * ram_pagesize_summary: calculate all the pagesizes of a VM
 920  *
 921  * Returns a summary bitmap of the page sizes of all RAMBlocks
 922  *
 923  * For VMs with just normal pages this is equivalent to the host page
 924  * size. If it's got some huge pages then it's the OR of all the
 925  * different page sizes.
 926  */
 927 uint64_t ram_pagesize_summary(void)
 928 {
 929     RAMBlock *block;
 930     uint64_t summary = 0;
 931
 932     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 933         summary |= block->page_size;
 934     }
 935
 936     return summary;
 937 }
 938
 939 uint64_t ram_get_total_transferred_pages(void)
 940 {
 941     return stat64_get(&mig_stats.normal_pages) +
 942         stat64_get(&mig_stats.zero_pages) +
 943         compression_counters.pages + xbzrle_counters.pages;
 944 }
 945
 946 static void migration_update_rates(RAMState *rs, int64_t end_time)
 947 {
 948     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 949     double compressed_size;
 950
 951     /* calculate period counters */
 952     stat64_set(&mig_stats.dirty_pages_rate,
 953                rs->num_dirty_pages_period * 1000 /
 954                (end_time - rs->time_last_bitmap_sync));
 955
 956     if (!page_count) {
 957         return;
 958     }
 959
 960     if (migrate_xbzrle()) {
 961         double encoded_size, unencoded_size;
 962
 963         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 964             rs->xbzrle_cache_miss_prev) / page_count;
 965         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 966         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 967                          TARGET_PAGE_SIZE;
 968         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 969         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 970             xbzrle_counters.encoding_rate = 0;
 971         } else {
 972             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 973         }
 974         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 975         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 976     }
 977
 978     if (migrate_compress()) {
 979         compression_counters.busy_rate = (double)(compression_counters.busy -
 980             rs->compress_thread_busy_prev) / page_count;
 981         rs->compress_thread_busy_prev = compression_counters.busy;
 982
 983         compressed_size = compression_counters.compressed_size -
 984                           rs->compressed_size_prev;
 985         if (compressed_size) {
 986             double uncompressed_size = (compression_counters.pages -
 987                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 988
 989             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 990             compression_counters.compression_rate =
 991                                         uncompressed_size / compressed_size;
 992
 993             rs->compress_pages_prev = compression_counters.pages;
 994             rs->compressed_size_prev = compression_counters.compressed_size;
 995         }
 996     }
 997 }
 998
 999 /*
1000  * Enable dirty-limit to throttle down the guest
1001  */
1002 static void migration_dirty_limit_guest(void)
1003 {
1004     /*
1005      * dirty page rate quota for all vCPUs fetched from
1006      * migration parameter 'vcpu_dirty_limit'
1007      */
1008     static int64_t quota_dirtyrate;
1009     MigrationState *s = migrate_get_current();
1010
1011     /*
1012      * If dirty limit already enabled and migration parameter
1013      * vcpu-dirty-limit untouched.
1014      */
1015     if (dirtylimit_in_service() &&
1016         quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
1017         return;
1018     }
1019
1020     quota_dirtyrate = s->parameters.vcpu_dirty_limit;
1021
1022     /*
1023      * Set all vCPU a quota dirtyrate, note that the second
1024      * parameter will be ignored if setting all vCPU for the vm
1025      */
1026     qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
1027     trace_migration_dirty_limit_guest(quota_dirtyrate);
1028 }
1029
1030 static void migration_trigger_throttle(RAMState *rs)
1031 {
1032     uint64_t threshold = migrate_throttle_trigger_threshold();
1033     uint64_t bytes_xfer_period =
1034         stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
1035     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1036     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1037
1038     /* During block migration the auto-converge logic incorrectly detects
1039      * that ram migration makes no progress. Avoid this by disabling the
1040      * throttling logic during the bulk phase of block migration. */
1041     if (blk_mig_bulk_active()) {
1042         return;
1043     }
1044
1045     /*
1046      * The following detection logic can be refined later. For now:
1047      * Check to see if the ratio between dirtied bytes and the approx.
1048      * amount of bytes that just got transferred since the last time
1049      * we were in this routine reaches the threshold. If that happens
1050      * twice, start or increase throttling.
1051      */
1052     if ((bytes_dirty_period > bytes_dirty_threshold) &&
1053         (++rs->dirty_rate_high_cnt >= 2)) {
1054         rs->dirty_rate_high_cnt = 0;
1055         if (migrate_auto_converge()) {
1056             trace_migration_throttle();
1057             mig_throttle_guest_down(bytes_dirty_period,
1058                                     bytes_dirty_threshold);
1059         } else if (migrate_dirty_limit()) {
1060             migration_dirty_limit_guest();
1061         }
1062     }
1063 }
1064
1065 static void migration_bitmap_sync(RAMState *rs, bool last_stage)
1066 {
1067     RAMBlock *block;
1068     int64_t end_time;
1069
1070     stat64_add(&mig_stats.dirty_sync_count, 1);
1071
1072     if (!rs->time_last_bitmap_sync) {
1073         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1074     }
1075
1076     trace_migration_bitmap_sync_start();
1077     memory_global_dirty_log_sync(last_stage);
1078
1079     qemu_mutex_lock(&rs->bitmap_mutex);
1080     WITH_RCU_READ_LOCK_GUARD() {
1081         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1082             ramblock_sync_dirty_bitmap(rs, block);
1083         }
1084         stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1085     }
1086     qemu_mutex_unlock(&rs->bitmap_mutex);
1087
1088     memory_global_after_dirty_log_sync();
1089     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1090
1091     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1092
1093     /* more than 1 second = 1000 millisecons */
1094     if (end_time > rs->time_last_bitmap_sync + 1000) {
1095         migration_trigger_throttle(rs);
1096
1097         migration_update_rates(rs, end_time);
1098
1099         rs->target_page_count_prev = rs->target_page_count;
1100
1101         /* reset period counters */
1102         rs->time_last_bitmap_sync = end_time;
1103         rs->num_dirty_pages_period = 0;
1104         rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
1105     }
1106     if (migrate_events()) {
1107         uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1108         qapi_event_send_migration_pass(generation);
1109     }
1110 }
1111
1112 static void migration_bitmap_sync_precopy(RAMState *rs, bool last_stage)
1113 {
1114     Error *local_err = NULL;
1115
1116     /*
1117      * The current notifier usage is just an optimization to migration, so we
1118      * don't stop the normal migration process in the error case.
1119      */
1120     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1121         error_report_err(local_err);
1122         local_err = NULL;
1123     }
1124
1125     migration_bitmap_sync(rs, last_stage);
1126
1127     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1128         error_report_err(local_err);
1129     }
1130 }
1131
1132 void ram_release_page(const char *rbname, uint64_t offset)
1133 {
1134     if (!migrate_release_ram() || !migration_in_postcopy()) {
1135         return;
1136     }
1137
1138     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1139 }
1140
1141 /**
1142  * save_zero_page_to_file: send the zero page to the file
1143  *
1144  * Returns the size of data written to the file, 0 means the page is not
1145  * a zero page
1146  *
1147  * @pss: current PSS channel
1148  * @block: block that contains the page we want to send
1149  * @offset: offset inside the block for the page
1150  */
1151 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1152                                   RAMBlock *block, ram_addr_t offset)
1153 {
1154     uint8_t *p = block->host + offset;
1155     int len = 0;
1156
1157     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1158         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1159         qemu_put_byte(file, 0);
1160         len += 1;
1161         ram_release_page(block->idstr, offset);
1162     }
1163     return len;
1164 }
1165
1166 /**
1167  * save_zero_page: send the zero page to the stream
1168  *
1169  * Returns the number of pages written.
1170  *
1171  * @pss: current PSS channel
1172  * @block: block that contains the page we want to send
1173  * @offset: offset inside the block for the page
1174  */
1175 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1176                           ram_addr_t offset)
1177 {
1178     int len = save_zero_page_to_file(pss, f, block, offset);
1179
1180     if (len) {
1181         stat64_add(&mig_stats.zero_pages, 1);
1182         ram_transferred_add(len);
1183         return 1;
1184     }
1185     return -1;
1186 }
1187
1188 /*
1189  * @pages: the number of pages written by the control path,
1190  *        < 0 - error
1191  *        > 0 - number of pages written
1192  *
1193  * Return true if the pages has been saved, otherwise false is returned.
1194  */
1195 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1196                               ram_addr_t offset, int *pages)
1197 {
1198     int ret;
1199
1200     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1201                                 TARGET_PAGE_SIZE);
1202     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1203         return false;
1204     }
1205
1206     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1207         *pages = 1;
1208         return true;
1209     }
1210     *pages = ret;
1211     return true;
1212 }
1213
1214 /*
1215  * directly send the page to the stream
1216  *
1217  * Returns the number of pages written.
1218  *
1219  * @pss: current PSS channel
1220  * @block: block that contains the page we want to send
1221  * @offset: offset inside the block for the page
1222  * @buf: the page to be sent
1223  * @async: send to page asyncly
1224  */
1225 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1226                             ram_addr_t offset, uint8_t *buf, bool async)
1227 {
1228     QEMUFile *file = pss->pss_channel;
1229
1230     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1231                                          offset | RAM_SAVE_FLAG_PAGE));
1232     if (async) {
1233         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1234                               migrate_release_ram() &&
1235                               migration_in_postcopy());
1236     } else {
1237         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1238     }
1239     ram_transferred_add(TARGET_PAGE_SIZE);
1240     stat64_add(&mig_stats.normal_pages, 1);
1241     return 1;
1242 }
1243
1244 /**
1245  * ram_save_page: send the given page to the stream
1246  *
1247  * Returns the number of pages written.
1248  *          < 0 - error
1249  *          >=0 - Number of pages written - this might legally be 0
1250  *                if xbzrle noticed the page was the same.
1251  *
1252  * @rs: current RAM state
1253  * @block: block that contains the page we want to send
1254  * @offset: offset inside the block for the page
1255  */
1256 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1257 {
1258     int pages = -1;
1259     uint8_t *p;
1260     bool send_async = true;
1261     RAMBlock *block = pss->block;
1262     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1263     ram_addr_t current_addr = block->offset + offset;
1264
1265     p = block->host + offset;
1266     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1267
1268     XBZRLE_cache_lock();
1269     if (rs->xbzrle_started && !migration_in_postcopy()) {
1270         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1271                                  block, offset);
1272         if (!rs->last_stage) {
1273             /* Can't send this cached data async, since the cache page
1274              * might get updated before it gets to the wire
1275              */
1276             send_async = false;
1277         }
1278     }
1279
1280     /* XBZRLE overflow or normal page */
1281     if (pages == -1) {
1282         pages = save_normal_page(pss, block, offset, p, send_async);
1283     }
1284
1285     XBZRLE_cache_unlock();
1286
1287     return pages;
1288 }
1289
1290 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1291                                  ram_addr_t offset)
1292 {
1293     if (multifd_queue_page(file, block, offset) < 0) {
1294         return -1;
1295     }
1296     stat64_add(&mig_stats.normal_pages, 1);
1297
1298     return 1;
1299 }
1300
1301 static void
1302 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1303 {
1304     ram_transferred_add(bytes_xmit);
1305
1306     if (param->result == RES_ZEROPAGE) {
1307         stat64_add(&mig_stats.zero_pages, 1);
1308         return;
1309     }
1310
1311     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1312     compression_counters.compressed_size += bytes_xmit - 8;
1313     compression_counters.pages++;
1314 }
1315
1316 static bool save_page_use_compression(RAMState *rs);
1317
1318 static int send_queued_data(CompressParam *param)
1319 {
1320     PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1321     MigrationState *ms = migrate_get_current();
1322     QEMUFile *file = ms->to_dst_file;
1323     int len = 0;
1324
1325     RAMBlock *block = param->block;
1326     ram_addr_t offset = param->offset;
1327
1328     if (param->result == RES_NONE) {
1329         return 0;
1330     }
1331
1332     assert(block == pss->last_sent_block);
1333
1334     if (param->result == RES_ZEROPAGE) {
1335         assert(qemu_file_buffer_empty(param->file));
1336         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1337         qemu_put_byte(file, 0);
1338         len += 1;
1339         ram_release_page(block->idstr, offset);
1340     } else if (param->result == RES_COMPRESS) {
1341         assert(!qemu_file_buffer_empty(param->file));
1342         len += save_page_header(pss, file, block,
1343                                 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1344         len += qemu_put_qemu_file(file, param->file);
1345     } else {
1346         abort();
1347     }
1348
1349     update_compress_thread_counts(param, len);
1350
1351     return len;
1352 }
1353
1354 static void ram_flush_compressed_data(RAMState *rs)
1355 {
1356     if (!save_page_use_compression(rs)) {
1357         return;
1358     }
1359
1360     flush_compressed_data(send_queued_data);
1361 }
1362
1363 #define PAGE_ALL_CLEAN 0
1364 #define PAGE_TRY_AGAIN 1
1365 #define PAGE_DIRTY_FOUND 2
1366 /**
1367  * find_dirty_block: find the next dirty page and update any state
1368  * associated with the search process.
1369  *
1370  * Returns:
1371  *         <0: An error happened
1372  *         PAGE_ALL_CLEAN: no dirty page found, give up
1373  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1374  *         PAGE_DIRTY_FOUND: dirty page found
1375  *
1376  * @rs: current RAM state
1377  * @pss: data about the state of the current dirty page scan
1378  * @again: set to false if the search has scanned the whole of RAM
1379  */
1380 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1381 {
1382     /* Update pss->page for the next dirty bit in ramblock */
1383     pss_find_next_dirty(pss);
1384
1385     if (pss->complete_round && pss->block == rs->last_seen_block &&
1386         pss->page >= rs->last_page) {
1387         /*
1388          * We've been once around the RAM and haven't found anything.
1389          * Give up.
1390          */
1391         return PAGE_ALL_CLEAN;
1392     }
1393     if (!offset_in_ramblock(pss->block,
1394                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1395         /* Didn't find anything in this RAM Block */
1396         pss->page = 0;
1397         pss->block = QLIST_NEXT_RCU(pss->block, next);
1398         if (!pss->block) {
1399             if (migrate_multifd() &&
1400                 !migrate_multifd_flush_after_each_section()) {
1401                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1402                 int ret = multifd_send_sync_main(f);
1403                 if (ret < 0) {
1404                     return ret;
1405                 }
1406                 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1407                 qemu_fflush(f);
1408             }
1409             /*
1410              * If memory migration starts over, we will meet a dirtied page
1411              * which may still exists in compression threads's ring, so we
1412              * should flush the compressed data to make sure the new page
1413              * is not overwritten by the old one in the destination.
1414              *
1415              * Also If xbzrle is on, stop using the data compression at this
1416              * point. In theory, xbzrle can do better than compression.
1417              */
1418             ram_flush_compressed_data(rs);
1419
1420             /* Hit the end of the list */
1421             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1422             /* Flag that we've looped */
1423             pss->complete_round = true;
1424             /* After the first round, enable XBZRLE. */
1425             if (migrate_xbzrle()) {
1426                 rs->xbzrle_started = true;
1427             }
1428         }
1429         /* Didn't find anything this time, but try again on the new block */
1430         return PAGE_TRY_AGAIN;
1431     } else {
1432         /* We've found something */
1433         return PAGE_DIRTY_FOUND;
1434     }
1435 }
1436
1437 /**
1438  * unqueue_page: gets a page of the queue
1439  *
1440  * Helper for 'get_queued_page' - gets a page off the queue
1441  *
1442  * Returns the block of the page (or NULL if none available)
1443  *
1444  * @rs: current RAM state
1445  * @offset: used to return the offset within the RAMBlock
1446  */
1447 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1448 {
1449     struct RAMSrcPageRequest *entry;
1450     RAMBlock *block = NULL;
1451
1452     if (!postcopy_has_request(rs)) {
1453         return NULL;
1454     }
1455
1456     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1457
1458     /*
1459      * This should _never_ change even after we take the lock, because no one
1460      * should be taking anything off the request list other than us.
1461      */
1462     assert(postcopy_has_request(rs));
1463
1464     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1465     block = entry->rb;
1466     *offset = entry->offset;
1467
1468     if (entry->len > TARGET_PAGE_SIZE) {
1469         entry->len -= TARGET_PAGE_SIZE;
1470         entry->offset += TARGET_PAGE_SIZE;
1471     } else {
1472         memory_region_unref(block->mr);
1473         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1474         g_free(entry);
1475         migration_consume_urgent_request();
1476     }
1477
1478     return block;
1479 }
1480
1481 #if defined(__linux__)
1482 /**
1483  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1484  *   is found, return RAM block pointer and page offset
1485  *
1486  * Returns pointer to the RAMBlock containing faulting page,
1487  *   NULL if no write faults are pending
1488  *
1489  * @rs: current RAM state
1490  * @offset: page offset from the beginning of the block
1491  */
1492 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1493 {
1494     struct uffd_msg uffd_msg;
1495     void *page_address;
1496     RAMBlock *block;
1497     int res;
1498
1499     if (!migrate_background_snapshot()) {
1500         return NULL;
1501     }
1502
1503     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1504     if (res <= 0) {
1505         return NULL;
1506     }
1507
1508     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1509     block = qemu_ram_block_from_host(page_address, false, offset);
1510     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1511     return block;
1512 }
1513
1514 /**
1515  * ram_save_release_protection: release UFFD write protection after
1516  *   a range of pages has been saved
1517  *
1518  * @rs: current RAM state
1519  * @pss: page-search-status structure
1520  * @start_page: index of the first page in the range relative to pss->block
1521  *
1522  * Returns 0 on success, negative value in case of an error
1523 */
1524 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1525         unsigned long start_page)
1526 {
1527     int res = 0;
1528
1529     /* Check if page is from UFFD-managed region. */
1530     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1531         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1532         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1533
1534         /* Flush async buffers before un-protect. */
1535         qemu_fflush(pss->pss_channel);
1536         /* Un-protect memory range. */
1537         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1538                 false, false);
1539     }
1540
1541     return res;
1542 }
1543
1544 /* ram_write_tracking_available: check if kernel supports required UFFD features
1545  *
1546  * Returns true if supports, false otherwise
1547  */
1548 bool ram_write_tracking_available(void)
1549 {
1550     uint64_t uffd_features;
1551     int res;
1552
1553     res = uffd_query_features(&uffd_features);
1554     return (res == 0 &&
1555             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1556 }
1557
1558 /* ram_write_tracking_compatible: check if guest configuration is
1559  *   compatible with 'write-tracking'
1560  *
1561  * Returns true if compatible, false otherwise
1562  */
1563 bool ram_write_tracking_compatible(void)
1564 {
1565     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1566     int uffd_fd;
1567     RAMBlock *block;
1568     bool ret = false;
1569
1570     /* Open UFFD file descriptor */
1571     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1572     if (uffd_fd < 0) {
1573         return false;
1574     }
1575
1576     RCU_READ_LOCK_GUARD();
1577
1578     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1579         uint64_t uffd_ioctls;
1580
1581         /* Nothing to do with read-only and MMIO-writable regions */
1582         if (block->mr->readonly || block->mr->rom_device) {
1583             continue;
1584         }
1585         /* Try to register block memory via UFFD-IO to track writes */
1586         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1587                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1588             goto out;
1589         }
1590         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1591             goto out;
1592         }
1593     }
1594     ret = true;
1595
1596 out:
1597     uffd_close_fd(uffd_fd);
1598     return ret;
1599 }
1600
1601 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1602                                        ram_addr_t size)
1603 {
1604     const ram_addr_t end = offset + size;
1605
1606     /*
1607      * We read one byte of each page; this will preallocate page tables if
1608      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1609      * where no page was populated yet. This might require adaption when
1610      * supporting other mappings, like shmem.
1611      */
1612     for (; offset < end; offset += block->page_size) {
1613         char tmp = *((char *)block->host + offset);
1614
1615         /* Don't optimize the read out */
1616         asm volatile("" : "+r" (tmp));
1617     }
1618 }
1619
1620 static inline int populate_read_section(MemoryRegionSection *section,
1621                                         void *opaque)
1622 {
1623     const hwaddr size = int128_get64(section->size);
1624     hwaddr offset = section->offset_within_region;
1625     RAMBlock *block = section->mr->ram_block;
1626
1627     populate_read_range(block, offset, size);
1628     return 0;
1629 }
1630
1631 /*
1632  * ram_block_populate_read: preallocate page tables and populate pages in the
1633  *   RAM block by reading a byte of each page.
1634  *
1635  * Since it's solely used for userfault_fd WP feature, here we just
1636  *   hardcode page size to qemu_real_host_page_size.
1637  *
1638  * @block: RAM block to populate
1639  */
1640 static void ram_block_populate_read(RAMBlock *rb)
1641 {
1642     /*
1643      * Skip populating all pages that fall into a discarded range as managed by
1644      * a RamDiscardManager responsible for the mapped memory region of the
1645      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1646      * must not get populated automatically. We don't have to track
1647      * modifications via userfaultfd WP reliably, because these pages will
1648      * not be part of the migration stream either way -- see
1649      * ramblock_dirty_bitmap_exclude_discarded_pages().
1650      *
1651      * Note: The result is only stable while migrating (precopy/postcopy).
1652      */
1653     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1654         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1655         MemoryRegionSection section = {
1656             .mr = rb->mr,
1657             .offset_within_region = 0,
1658             .size = rb->mr->size,
1659         };
1660
1661         ram_discard_manager_replay_populated(rdm, &section,
1662                                              populate_read_section, NULL);
1663     } else {
1664         populate_read_range(rb, 0, rb->used_length);
1665     }
1666 }
1667
1668 /*
1669  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1670  */
1671 void ram_write_tracking_prepare(void)
1672 {
1673     RAMBlock *block;
1674
1675     RCU_READ_LOCK_GUARD();
1676
1677     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1678         /* Nothing to do with read-only and MMIO-writable regions */
1679         if (block->mr->readonly || block->mr->rom_device) {
1680             continue;
1681         }
1682
1683         /*
1684          * Populate pages of the RAM block before enabling userfault_fd
1685          * write protection.
1686          *
1687          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1688          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1689          * pages with pte_none() entries in page table.
1690          */
1691         ram_block_populate_read(block);
1692     }
1693 }
1694
1695 static inline int uffd_protect_section(MemoryRegionSection *section,
1696                                        void *opaque)
1697 {
1698     const hwaddr size = int128_get64(section->size);
1699     const hwaddr offset = section->offset_within_region;
1700     RAMBlock *rb = section->mr->ram_block;
1701     int uffd_fd = (uintptr_t)opaque;
1702
1703     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1704                                   false);
1705 }
1706
1707 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1708 {
1709     assert(rb->flags & RAM_UF_WRITEPROTECT);
1710
1711     /* See ram_block_populate_read() */
1712     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1713         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1714         MemoryRegionSection section = {
1715             .mr = rb->mr,
1716             .offset_within_region = 0,
1717             .size = rb->mr->size,
1718         };
1719
1720         return ram_discard_manager_replay_populated(rdm, &section,
1721                                                     uffd_protect_section,
1722                                                     (void *)(uintptr_t)uffd_fd);
1723     }
1724     return uffd_change_protection(uffd_fd, rb->host,
1725                                   rb->used_length, true, false);
1726 }
1727
1728 /*
1729  * ram_write_tracking_start: start UFFD-WP memory tracking
1730  *
1731  * Returns 0 for success or negative value in case of error
1732  */
1733 int ram_write_tracking_start(void)
1734 {
1735     int uffd_fd;
1736     RAMState *rs = ram_state;
1737     RAMBlock *block;
1738
1739     /* Open UFFD file descriptor */
1740     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1741     if (uffd_fd < 0) {
1742         return uffd_fd;
1743     }
1744     rs->uffdio_fd = uffd_fd;
1745
1746     RCU_READ_LOCK_GUARD();
1747
1748     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1749         /* Nothing to do with read-only and MMIO-writable regions */
1750         if (block->mr->readonly || block->mr->rom_device) {
1751             continue;
1752         }
1753
1754         /* Register block memory with UFFD to track writes */
1755         if (uffd_register_memory(rs->uffdio_fd, block->host,
1756                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1757             goto fail;
1758         }
1759         block->flags |= RAM_UF_WRITEPROTECT;
1760         memory_region_ref(block->mr);
1761
1762         /* Apply UFFD write protection to the block memory range */
1763         if (ram_block_uffd_protect(block, uffd_fd)) {
1764             goto fail;
1765         }
1766
1767         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1768                 block->host, block->max_length);
1769     }
1770
1771     return 0;
1772
1773 fail:
1774     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1775
1776     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1777         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1778             continue;
1779         }
1780         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1781         /* Cleanup flags and remove reference */
1782         block->flags &= ~RAM_UF_WRITEPROTECT;
1783         memory_region_unref(block->mr);
1784     }
1785
1786     uffd_close_fd(uffd_fd);
1787     rs->uffdio_fd = -1;
1788     return -1;
1789 }
1790
1791 /**
1792  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1793  */
1794 void ram_write_tracking_stop(void)
1795 {
1796     RAMState *rs = ram_state;
1797     RAMBlock *block;
1798
1799     RCU_READ_LOCK_GUARD();
1800
1801     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1802         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1803             continue;
1804         }
1805         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1806
1807         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1808                 block->host, block->max_length);
1809
1810         /* Cleanup flags and remove reference */
1811         block->flags &= ~RAM_UF_WRITEPROTECT;
1812         memory_region_unref(block->mr);
1813     }
1814
1815     /* Finally close UFFD file descriptor */
1816     uffd_close_fd(rs->uffdio_fd);
1817     rs->uffdio_fd = -1;
1818 }
1819
1820 #else
1821 /* No target OS support, stubs just fail or ignore */
1822
/*
 * Stub for builds where __linux__ is not defined: never reports a faulting
 * page, so it always returns NULL and leaves @offset untouched.
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    /* Parameters are intentionally unused in this build */
    (void) rs;
    (void) offset;

    return NULL;
}
1830
/*
 * Stub for builds where __linux__ is not defined: does nothing with its
 * arguments and always returns 0.
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    /* Parameters are intentionally unused in this build */
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}
1840
/* Stub for builds where __linux__ is not defined: always returns false */
bool ram_write_tracking_available(void)
{
    return false;
}
1845
/*
 * Stub for builds where __linux__ is not defined.  It asserts
 * unconditionally; the return statement after the assert yields false.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
1851
/*
 * Stub for builds where __linux__ is not defined.  It asserts
 * unconditionally; the return statement after the assert yields -1.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
1857
/*
 * Stub for builds where __linux__ is not defined.  It asserts
 * unconditionally.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
1862 #endif /* defined(__linux__) */
1863
1864 /**
1865  * get_queued_page: unqueue a page from the postcopy requests
1866  *
1867  * Skips pages that are already sent (!dirty)
1868  *
1869  * Returns true if a queued page is found
1870  *
1871  * @rs: current RAM state
1872  * @pss: data about the state of the current dirty page scan
1873  */
1874 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1875 {
1876     RAMBlock  *block;
1877     ram_addr_t offset;
1878     bool dirty;
1879
1880     do {
1881         block = unqueue_page(rs, &offset);
1882         /*
1883          * We're sending this page, and since it's postcopy nothing else
1884          * will dirty it, and we must make sure it doesn't get sent again
1885          * even if this queue request was received after the background
1886          * search already sent it.
1887          */
1888         if (block) {
1889             unsigned long page;
1890
1891             page = offset >> TARGET_PAGE_BITS;
1892             dirty = test_bit(page, block->bmap);
1893             if (!dirty) {
1894                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1895                                                 page);
1896             } else {
1897                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1898             }
1899         }
1900
1901     } while (block && !dirty);
1902
1903     if (!block) {
1904         /*
1905          * Poll write faults too if background snapshot is enabled; that's
1906          * when we have vcpus got blocked by the write protected pages.
1907          */
1908         block = poll_fault_page(rs, &offset);
1909     }
1910
1911     if (block) {
1912         /*
1913          * We want the background search to continue from the queued page
1914          * since the guest is likely to want other pages near to the page
1915          * it just requested.
1916          */
1917         pss->block = block;
1918         pss->page = offset >> TARGET_PAGE_BITS;
1919
1920         /*
1921          * This unqueued page would break the "one round" check, even is
1922          * really rare.
1923          */
1924         pss->complete_round = false;
1925     }
1926
1927     return !!block;
1928 }
1929
1930 /**
1931  * migration_page_queue_free: drop any remaining pages in the ram
1932  * request queue
1933  *
1934  * It should be empty at the end anyway, but in error cases there may
1935  * be some left.  in case that there is any page left, we drop it.
1936  *
1937  */
1938 static void migration_page_queue_free(RAMState *rs)
1939 {
1940     struct RAMSrcPageRequest *mspr, *next_mspr;
1941     /* This queue generally should be empty - but in the case of a failed
1942      * migration might have some droppings in.
1943      */
1944     RCU_READ_LOCK_GUARD();
1945     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1946         memory_region_unref(mspr->rb->mr);
1947         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1948         g_free(mspr);
1949     }
1950 }
1951
1952 /**
1953  * ram_save_queue_pages: queue the page for transmission
1954  *
1955  * A request from postcopy destination for example.
1956  *
1957  * Returns zero on success or negative on error
1958  *
1959  * @rbname: Name of the RAMBLock of the request. NULL means the
1960  *          same that last one.
1961  * @start: starting address from the start of the RAMBlock
1962  * @len: length (in bytes) to send
1963  */
1964 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1965 {
1966     RAMBlock *ramblock;
1967     RAMState *rs = ram_state;
1968
1969     stat64_add(&mig_stats.postcopy_requests, 1);
1970     RCU_READ_LOCK_GUARD();
1971
1972     if (!rbname) {
1973         /* Reuse last RAMBlock */
1974         ramblock = rs->last_req_rb;
1975
1976         if (!ramblock) {
1977             /*
1978              * Shouldn't happen, we can't reuse the last RAMBlock if
1979              * it's the 1st request.
1980              */
1981             error_report("ram_save_queue_pages no previous block");
1982             return -1;
1983         }
1984     } else {
1985         ramblock = qemu_ram_block_by_name(rbname);
1986
1987         if (!ramblock) {
1988             /* We shouldn't be asked for a non-existent RAMBlock */
1989             error_report("ram_save_queue_pages no block '%s'", rbname);
1990             return -1;
1991         }
1992         rs->last_req_rb = ramblock;
1993     }
1994     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1995     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1996         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1997                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1998                      __func__, start, len, ramblock->used_length);
1999         return -1;
2000     }
2001
2002     /*
2003      * When with postcopy preempt, we send back the page directly in the
2004      * rp-return thread.
2005      */
2006     if (postcopy_preempt_active()) {
2007         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2008         size_t page_size = qemu_ram_pagesize(ramblock);
2009         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2010         int ret = 0;
2011
2012         qemu_mutex_lock(&rs->bitmap_mutex);
2013
2014         pss_init(pss, ramblock, page_start);
2015         /*
2016          * Always use the preempt channel, and make sure it's there.  It's
2017          * safe to access without lock, because when rp-thread is running
2018          * we should be the only one who operates on the qemufile
2019          */
2020         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2021         assert(pss->pss_channel);
2022
2023         /*
2024          * It must be either one or multiple of host page size.  Just
2025          * assert; if something wrong we're mostly split brain anyway.
2026          */
2027         assert(len % page_size == 0);
2028         while (len) {
2029             if (ram_save_host_page_urgent(pss)) {
2030                 error_report("%s: ram_save_host_page_urgent() failed: "
2031                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2032                              __func__, ramblock->idstr, start);
2033                 ret = -1;
2034                 break;
2035             }
2036             /*
2037              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2038              * will automatically be moved and point to the next host page
2039              * we're going to send, so no need to update here.
2040              *
2041              * Normally QEMU never sends >1 host page in requests, so
2042              * logically we don't even need that as the loop should only
2043              * run once, but just to be consistent.
2044              */
2045             len -= page_size;
2046         };
2047         qemu_mutex_unlock(&rs->bitmap_mutex);
2048
2049         return ret;
2050     }
2051
2052     struct RAMSrcPageRequest *new_entry =
2053         g_new0(struct RAMSrcPageRequest, 1);
2054     new_entry->rb = ramblock;
2055     new_entry->offset = start;
2056     new_entry->len = len;
2057
2058     memory_region_ref(ramblock->mr);
2059     qemu_mutex_lock(&rs->src_page_req_mutex);
2060     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2061     migration_make_urgent_request();
2062     qemu_mutex_unlock(&rs->src_page_req_mutex);
2063
2064     return 0;
2065 }
2066
2067 static bool save_page_use_compression(RAMState *rs)
2068 {
2069     if (!migrate_compress()) {
2070         return false;
2071     }
2072
2073     /*
2074      * If xbzrle is enabled (e.g., after first round of migration), stop
2075      * using the data compression. In theory, xbzrle can do better than
2076      * compression.
2077      */
2078     if (rs->xbzrle_started) {
2079         return false;
2080     }
2081
2082     return true;
2083 }
2084
2085 /*
2086  * try to compress the page before posting it out, return true if the page
2087  * has been properly handled by compression, otherwise needs other
2088  * paths to handle it
2089  */
2090 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2091                                RAMBlock *block, ram_addr_t offset)
2092 {
2093     if (!save_page_use_compression(rs)) {
2094         return false;
2095     }
2096
2097     /*
2098      * When starting the process of a new block, the first page of
2099      * the block should be sent out before other pages in the same
2100      * block, and all the pages in last block should have been sent
2101      * out, keeping this order is important, because the 'cont' flag
2102      * is used to avoid resending the block name.
2103      *
2104      * We post the fist page as normal page as compression will take
2105      * much CPU resource.
2106      */
2107     if (block != pss->last_sent_block) {
2108         ram_flush_compressed_data(rs);
2109         return false;
2110     }
2111
2112     if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2113         return true;
2114     }
2115
2116     compression_counters.busy++;
2117     return false;
2118 }
2119
2120 /**
2121  * ram_save_target_page_legacy: save one target page
2122  *
2123  * Returns the number of pages written
2124  *
2125  * @rs: current RAM state
2126  * @pss: data about the page we want to send
2127  */
2128 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2129 {
2130     RAMBlock *block = pss->block;
2131     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2132     int res;
2133
2134     if (control_save_page(pss, block, offset, &res)) {
2135         return res;
2136     }
2137
2138     if (save_compress_page(rs, pss, block, offset)) {
2139         return 1;
2140     }
2141
2142     res = save_zero_page(pss, pss->pss_channel, block, offset);
2143     if (res > 0) {
2144         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2145          * page would be stale
2146          */
2147         if (rs->xbzrle_started) {
2148             XBZRLE_cache_lock();
2149             xbzrle_cache_zero_page(rs, block->offset + offset);
2150             XBZRLE_cache_unlock();
2151         }
2152         return res;
2153     }
2154
2155     /*
2156      * Do not use multifd in postcopy as one whole host page should be
2157      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2158      * if host page size == guest page size the dest guest during run may
2159      * still see partially copied pages which is data corruption.
2160      */
2161     if (migrate_multifd() && !migration_in_postcopy()) {
2162         return ram_save_multifd_page(pss->pss_channel, block, offset);
2163     }
2164
2165     return ram_save_page(rs, pss);
2166 }
2167
2168 /* Should be called before sending a host page */
2169 static void pss_host_page_prepare(PageSearchStatus *pss)
2170 {
2171     /* How many guest pages are there in one host page? */
2172     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2173
2174     pss->host_page_sending = true;
2175     if (guest_pfns <= 1) {
2176         /*
2177          * This covers both when guest psize == host psize, or when guest
2178          * has larger psize than the host (guest_pfns==0).
2179          *
2180          * For the latter, we always send one whole guest page per
2181          * iteration of the host page (example: an Alpha VM on x86 host
2182          * will have guest psize 8K while host psize 4K).
2183          */
2184         pss->host_page_start = pss->page;
2185         pss->host_page_end = pss->page + 1;
2186     } else {
2187         /*
2188          * The host page spans over multiple guest pages, we send them
2189          * within the same host page iteration.
2190          */
2191         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2192         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2193     }
2194 }
2195
2196 /*
2197  * Whether the page pointed by PSS is within the host page being sent.
2198  * Must be called after a previous pss_host_page_prepare().
2199  */
2200 static bool pss_within_range(PageSearchStatus *pss)
2201 {
2202     ram_addr_t ram_addr;
2203
2204     assert(pss->host_page_sending);
2205
2206     /* Over host-page boundary? */
2207     if (pss->page >= pss->host_page_end) {
2208         return false;
2209     }
2210
2211     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2212
2213     return offset_in_ramblock(pss->block, ram_addr);
2214 }
2215
2216 static void pss_host_page_finish(PageSearchStatus *pss)
2217 {
2218     pss->host_page_sending = false;
2219     /* This is not needed, but just to reset it */
2220     pss->host_page_start = pss->host_page_end = 0;
2221 }
2222
2223 /*
2224  * Send an urgent host page specified by `pss'.  Need to be called with
2225  * bitmap_mutex held.
2226  *
2227  * Returns 0 if save host page succeeded, false otherwise.
2228  */
2229 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2230 {
2231     bool page_dirty, sent = false;
2232     RAMState *rs = ram_state;
2233     int ret = 0;
2234
2235     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2236     pss_host_page_prepare(pss);
2237
2238     /*
2239      * If precopy is sending the same page, let it be done in precopy, or
2240      * we could send the same page in two channels and none of them will
2241      * receive the whole page.
2242      */
2243     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2244         trace_postcopy_preempt_hit(pss->block->idstr,
2245                                    pss->page << TARGET_PAGE_BITS);
2246         return 0;
2247     }
2248
2249     do {
2250         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2251
2252         if (page_dirty) {
2253             /* Be strict to return code; it must be 1, or what else? */
2254             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2255                 error_report_once("%s: ram_save_target_page failed", __func__);
2256                 ret = -1;
2257                 goto out;
2258             }
2259             sent = true;
2260         }
2261         pss_find_next_dirty(pss);
2262     } while (pss_within_range(pss));
2263 out:
2264     pss_host_page_finish(pss);
2265     /* For urgent requests, flush immediately if sent */
2266     if (sent) {
2267         qemu_fflush(pss->pss_channel);
2268     }
2269     return ret;
2270 }
2271
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at pss->page send pages up to the end of the current host
 * page. It's valid for the initial page to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 *
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * The caller must be with ram_state.bitmap_mutex held to call this
 * function.  Note that this function can temporarily release the lock, but
 * when the function is returned it'll make sure the lock is still held.
 *
 * Returns the number of pages written or negative on error.  A block that
 * should have been ignored is reported and yields 0.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
{
    bool page_dirty, preempt_active = postcopy_preempt_active();
    int tmppages, pages = 0;
    /* Number of target pages per host page of this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    /* Remember where we started; passed to ram_save_release_protection() */
    unsigned long start_page = pss->page;
    int res;

    if (migrate_ram_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    /* Update host page boundary information */
    pss_host_page_prepare(pss);

    do {
        /* Clearing the dirty bit also tells us whether it was set */
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        /* Check whether the page is dirty and, if it is, send it */
        if (page_dirty) {
            /*
             * Properly yield the lock only in postcopy preempt mode
             * because both migration thread and rp-return thread can
             * operate on the bitmaps.
             */
            if (preempt_active) {
                qemu_mutex_unlock(&rs->bitmap_mutex);
            }
            tmppages = migration_ops->ram_save_target_page(rs, pss);
            if (tmppages >= 0) {
                pages += tmppages;
                /*
                 * Allow rate limiting to happen in the middle of huge pages if
                 * something is sent in the current iteration.
                 */
                if (pagesize_bits > 1 && tmppages > 0) {
                    migration_rate_limit();
                }
            }
            /* Re-take the lock before the error check so it is held on return */
            if (preempt_active) {
                qemu_mutex_lock(&rs->bitmap_mutex);
            }
        } else {
            tmppages = 0;
        }

        /* A failed send aborts the host page and propagates the error */
        if (tmppages < 0) {
            pss_host_page_finish(pss);
            return tmppages;
        }

        pss_find_next_dirty(pss);
    } while (pss_within_range(pss));

    pss_host_page_finish(pss);

    /*
     * NOTE(review): presumably drops write protection for the pages handled
     * since start_page; the helper's real implementation is outside this
     * view (the non-Linux stub simply returns 0).
     */
    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
2354
2355 /**
2356  * ram_find_and_save_block: finds a dirty page and sends it to f
2357  *
2358  * Called within an RCU critical section.
2359  *
2360  * Returns the number of pages written where zero means no dirty pages,
2361  * or negative on error
2362  *
2363  * @rs: current RAM state
2364  *
2365  * On systems where host-page-size > target-page-size it will send all the
2366  * pages in a host page that are dirty.
2367  */
2368 static int ram_find_and_save_block(RAMState *rs)
2369 {
2370     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2371     int pages = 0;
2372
2373     /* No dirty page as there is zero RAM */
2374     if (!rs->ram_bytes_total) {
2375         return pages;
2376     }
2377
2378     /*
2379      * Always keep last_seen_block/last_page valid during this procedure,
2380      * because find_dirty_block() relies on these values (e.g., we compare
2381      * last_seen_block with pss.block to see whether we searched all the
2382      * ramblocks) to detect the completion of migration.  Having NULL value
2383      * of last_seen_block can conditionally cause below loop to run forever.
2384      */
2385     if (!rs->last_seen_block) {
2386         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2387         rs->last_page = 0;
2388     }
2389
2390     pss_init(pss, rs->last_seen_block, rs->last_page);
2391
2392     while (true){
2393         if (!get_queued_page(rs, pss)) {
2394             /* priority queue empty, so just search for something dirty */
2395             int res = find_dirty_block(rs, pss);
2396             if (res != PAGE_DIRTY_FOUND) {
2397                 if (res == PAGE_ALL_CLEAN) {
2398                     break;
2399                 } else if (res == PAGE_TRY_AGAIN) {
2400                     continue;
2401                 } else if (res < 0) {
2402                     pages = res;
2403                     break;
2404                 }
2405             }
2406         }
2407         pages = ram_save_host_page(rs, pss);
2408         if (pages) {
2409             break;
2410         }
2411     }
2412
2413     rs->last_seen_block = pss->block;
2414     rs->last_page = pss->page;
2415
2416     return pages;
2417 }
2418
2419 static uint64_t ram_bytes_total_with_ignored(void)
2420 {
2421     RAMBlock *block;
2422     uint64_t total = 0;
2423
2424     RCU_READ_LOCK_GUARD();
2425
2426     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2427         total += block->used_length;
2428     }
2429     return total;
2430 }
2431
2432 uint64_t ram_bytes_total(void)
2433 {
2434     RAMBlock *block;
2435     uint64_t total = 0;
2436
2437     RCU_READ_LOCK_GUARD();
2438
2439     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2440         total += block->used_length;
2441     }
2442     return total;
2443 }
2444
/*
 * Allocate XBZRLE.decoded_buf, one target page in size.  It is released
 * again by xbzrle_load_cleanup().
 */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2449
/* Free XBZRLE.decoded_buf and reset the pointer to NULL */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2455
2456 static void ram_state_cleanup(RAMState **rsp)
2457 {
2458     if (*rsp) {
2459         migration_page_queue_free(*rsp);
2460         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2461         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2462         g_free(*rsp);
2463         *rsp = NULL;
2464     }
2465 }
2466
2467 static void xbzrle_cleanup(void)
2468 {
2469     XBZRLE_cache_lock();
2470     if (XBZRLE.cache) {
2471         cache_fini(XBZRLE.cache);
2472         g_free(XBZRLE.encoded_buf);
2473         g_free(XBZRLE.current_buf);
2474         g_free(XBZRLE.zero_target_page);
2475         XBZRLE.cache = NULL;
2476         XBZRLE.encoded_buf = NULL;
2477         XBZRLE.current_buf = NULL;
2478         XBZRLE.zero_target_page = NULL;
2479     }
2480     XBZRLE_cache_unlock();
2481 }
2482
2483 static void ram_save_cleanup(void *opaque)
2484 {
2485     RAMState **rsp = opaque;
2486     RAMBlock *block;
2487
2488     /* We don't use dirty log with background snapshots */
2489     if (!migrate_background_snapshot()) {
2490         /* caller have hold iothread lock or is in a bh, so there is
2491          * no writing race against the migration bitmap
2492          */
2493         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2494             /*
2495              * do not stop dirty log without starting it, since
2496              * memory_global_dirty_log_stop will assert that
2497              * memory_global_dirty_log_start/stop used in pairs
2498              */
2499             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2500         }
2501     }
2502
2503     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2504         g_free(block->clear_bmap);
2505         block->clear_bmap = NULL;
2506         g_free(block->bmap);
2507         block->bmap = NULL;
2508     }
2509
2510     xbzrle_cleanup();
2511     compress_threads_save_cleanup();
2512     ram_state_cleanup(rsp);
2513     g_free(migration_ops);
2514     migration_ops = NULL;
2515 }
2516
2517 static void ram_state_reset(RAMState *rs)
2518 {
2519     int i;
2520
2521     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2522         rs->pss[i].last_sent_block = NULL;
2523     }
2524
2525     rs->last_seen_block = NULL;
2526     rs->last_page = 0;
2527     rs->last_version = ram_list.version;
2528     rs->xbzrle_started = false;
2529 }
2530
2531 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2532
2533 /* **** functions for postcopy ***** */
2534
2535 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2536 {
2537     struct RAMBlock *block;
2538
2539     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2540         unsigned long *bitmap = block->bmap;
2541         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2542         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2543
2544         while (run_start < range) {
2545             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2546             ram_discard_range(block->idstr,
2547                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2548                               ((ram_addr_t)(run_end - run_start))
2549                                 << TARGET_PAGE_BITS);
2550             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2551         }
2552     }
2553 }
2554
2555 /**
2556  * postcopy_send_discard_bm_ram: discard a RAMBlock
2557  *
2558  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2559  *
2560  * @ms: current migration state
2561  * @block: RAMBlock to discard
2562  */
2563 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2564 {
2565     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2566     unsigned long current;
2567     unsigned long *bitmap = block->bmap;
2568
2569     for (current = 0; current < end; ) {
2570         unsigned long one = find_next_bit(bitmap, end, current);
2571         unsigned long zero, discard_length;
2572
2573         if (one >= end) {
2574             break;
2575         }
2576
2577         zero = find_next_zero_bit(bitmap, end, one + 1);
2578
2579         if (zero >= end) {
2580             discard_length = end - one;
2581         } else {
2582             discard_length = zero - one;
2583         }
2584         postcopy_discard_send_range(ms, one, discard_length);
2585         current = one + discard_length;
2586     }
2587 }
2588
2589 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2590
2591 /**
2592  * postcopy_each_ram_send_discard: discard all RAMBlocks
2593  *
2594  * Utility for the outgoing postcopy code.
2595  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2596  *   passing it bitmap indexes and name.
2597  * (qemu_ram_foreach_block ends up passing unscaled lengths
2598  *  which would mean postcopy code would have to deal with target page)
2599  *
2600  * @ms: current migration state
2601  */
2602 static void postcopy_each_ram_send_discard(MigrationState *ms)
2603 {
2604     struct RAMBlock *block;
2605
2606     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2607         postcopy_discard_send_init(ms, block->idstr);
2608
2609         /*
2610          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2611          * host-page size chunks, mark any partially dirty host-page size
2612          * chunks as all dirty.  In this case the host-page is the host-page
2613          * for the particular RAMBlock, i.e. it might be a huge page.
2614          */
2615         postcopy_chunk_hostpages_pass(ms, block);
2616
2617         /*
2618          * Postcopy sends chunks of bitmap over the wire, but it
2619          * just needs indexes at this point, avoids it having
2620          * target page specific code.
2621          */
2622         postcopy_send_discard_bm_ram(ms, block);
2623         postcopy_discard_send_finish(ms);
2624     }
2625 }
2626
2627 /**
2628  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2629  *
2630  * Helper for postcopy_chunk_hostpages; it's called twice to
2631  * canonicalize the two bitmaps, that are similar, but one is
2632  * inverted.
2633  *
2634  * Postcopy requires that all target pages in a hostpage are dirty or
2635  * clean, not a mix.  This function canonicalizes the bitmaps.
2636  *
2637  * @ms: current migration state
2638  * @block: block that contains the page we want to canonicalize
2639  */
2640 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2641 {
2642     RAMState *rs = ram_state;
2643     unsigned long *bitmap = block->bmap;
2644     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2645     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2646     unsigned long run_start;
2647
2648     if (block->page_size == TARGET_PAGE_SIZE) {
2649         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2650         return;
2651     }
2652
2653     /* Find a dirty page */
2654     run_start = find_next_bit(bitmap, pages, 0);
2655
2656     while (run_start < pages) {
2657
2658         /*
2659          * If the start of this run of pages is in the middle of a host
2660          * page, then we need to fixup this host page.
2661          */
2662         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2663             /* Find the end of this run */
2664             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2665             /*
2666              * If the end isn't at the start of a host page, then the
2667              * run doesn't finish at the end of a host page
2668              * and we need to discard.
2669              */
2670         }
2671
2672         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2673             unsigned long page;
2674             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2675                                                              host_ratio);
2676             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2677
2678             /* Clean up the bitmap */
2679             for (page = fixup_start_addr;
2680                  page < fixup_start_addr + host_ratio; page++) {
2681                 /*
2682                  * Remark them as dirty, updating the count for any pages
2683                  * that weren't previously dirty.
2684                  */
2685                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2686             }
2687         }
2688
2689         /* Find the next dirty page for the next iteration */
2690         run_start = find_next_bit(bitmap, pages, run_start);
2691     }
2692 }
2693
/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Transmit the set of pages to be discarded after precopy to the target;
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Have never been transmitted; this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse.
 *
 * The function performs a dirty bitmap sync, resets the page-search position
 * cached in the RAMState, and then calls postcopy_each_ram_send_discard().
 * The whole body runs inside an RCU read-side critical section.
 *
 * @ms: current migration state
 */
void ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs, false);

    /*
     * Easiest way to make sure we don't resume in the middle of a host-page:
     * forget where the previous search stopped.
     */
    rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
    rs->last_seen_block = NULL;
    rs->last_page = 0;

    postcopy_each_ram_send_discard(ms);

    trace_ram_postcopy_send_discard_bitmap();
}
2725
2726 /**
2727  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2728  *
2729  * Returns zero on success
2730  *
2731  * @rbname: name of the RAMBlock of the request. NULL means the
2732  *          same that last one.
2733  * @start: RAMBlock starting page
2734  * @length: RAMBlock size
2735  */
2736 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2737 {
2738     trace_ram_discard_range(rbname, start, length);
2739
2740     RCU_READ_LOCK_GUARD();
2741     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2742
2743     if (!rb) {
2744         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2745         return -1;
2746     }
2747
2748     /*
2749      * On source VM, we don't need to update the received bitmap since
2750      * we don't even have one.
2751      */
2752     if (rb->receivedmap) {
2753         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2754                      length >> qemu_target_page_bits());
2755     }
2756
2757     return ram_block_discard_range(rb, start, length);
2758 }
2759
2760 /*
2761  * For every allocation, we will try not to crash the VM if the
2762  * allocation failed.
2763  */
2764 static int xbzrle_init(void)
2765 {
2766     Error *local_err = NULL;
2767
2768     if (!migrate_xbzrle()) {
2769         return 0;
2770     }
2771
2772     XBZRLE_cache_lock();
2773
2774     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2775     if (!XBZRLE.zero_target_page) {
2776         error_report("%s: Error allocating zero page", __func__);
2777         goto err_out;
2778     }
2779
2780     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2781                               TARGET_PAGE_SIZE, &local_err);
2782     if (!XBZRLE.cache) {
2783         error_report_err(local_err);
2784         goto free_zero_page;
2785     }
2786
2787     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2788     if (!XBZRLE.encoded_buf) {
2789         error_report("%s: Error allocating encoded_buf", __func__);
2790         goto free_cache;
2791     }
2792
2793     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2794     if (!XBZRLE.current_buf) {
2795         error_report("%s: Error allocating current_buf", __func__);
2796         goto free_encoded_buf;
2797     }
2798
2799     /* We are all good */
2800     XBZRLE_cache_unlock();
2801     return 0;
2802
2803 free_encoded_buf:
2804     g_free(XBZRLE.encoded_buf);
2805     XBZRLE.encoded_buf = NULL;
2806 free_cache:
2807     cache_fini(XBZRLE.cache);
2808     XBZRLE.cache = NULL;
2809 free_zero_page:
2810     g_free(XBZRLE.zero_target_page);
2811     XBZRLE.zero_target_page = NULL;
2812 err_out:
2813     XBZRLE_cache_unlock();
2814     return -ENOMEM;
2815 }
2816
2817 static int ram_state_init(RAMState **rsp)
2818 {
2819     *rsp = g_try_new0(RAMState, 1);
2820
2821     if (!*rsp) {
2822         error_report("%s: Init ramstate fail", __func__);
2823         return -1;
2824     }
2825
2826     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2827     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2828     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2829     (*rsp)->ram_bytes_total = ram_bytes_total();
2830
2831     /*
2832      * Count the total number of pages used by ram blocks not including any
2833      * gaps due to alignment or unplugs.
2834      * This must match with the initial values of dirty bitmap.
2835      */
2836     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2837     ram_state_reset(*rsp);
2838
2839     return 0;
2840 }
2841
2842 static void ram_list_init_bitmaps(void)
2843 {
2844     MigrationState *ms = migrate_get_current();
2845     RAMBlock *block;
2846     unsigned long pages;
2847     uint8_t shift;
2848
2849     /* Skip setting bitmap if there is no RAM */
2850     if (ram_bytes_total()) {
2851         shift = ms->clear_bitmap_shift;
2852         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2853             error_report("clear_bitmap_shift (%u) too big, using "
2854                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2855             shift = CLEAR_BITMAP_SHIFT_MAX;
2856         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2857             error_report("clear_bitmap_shift (%u) too small, using "
2858                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2859             shift = CLEAR_BITMAP_SHIFT_MIN;
2860         }
2861
2862         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2863             pages = block->max_length >> TARGET_PAGE_BITS;
2864             /*
2865              * The initial dirty bitmap for migration must be set with all
2866              * ones to make sure we'll migrate every guest RAM page to
2867              * destination.
2868              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2869              * new migration after a failed migration, ram_list.
2870              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2871              * guest memory.
2872              */
2873             block->bmap = bitmap_new(pages);
2874             bitmap_set(block->bmap, 0, pages);
2875             block->clear_bmap_shift = shift;
2876             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2877         }
2878     }
2879 }
2880
2881 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2882 {
2883     unsigned long pages;
2884     RAMBlock *rb;
2885
2886     RCU_READ_LOCK_GUARD();
2887
2888     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2889             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2890             rs->migration_dirty_pages -= pages;
2891     }
2892 }
2893
/*
 * Create the migration dirty bitmaps and, unless a background snapshot is
 * being taken, start global dirty logging and perform a first bitmap sync.
 * Finally remove discarded pages from the freshly created bitmaps.
 *
 * @rs: RAM state whose dirty page accounting is updated
 */
static void ram_init_bitmaps(RAMState *rs)
{
    /* The ramlist mutex is held for the whole bitmap setup */
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
            migration_bitmap_sync_precopy(rs, false);
        }
    }
    qemu_mutex_unlock_ramlist();

    /*
     * After an eventual first bitmap sync, fixup the initial bitmap
     * containing all 1s to exclude any discarded pages from migration.
     */
    migration_bitmap_clear_discarded_pages(rs);
}
2914
2915 static int ram_init_all(RAMState **rsp)
2916 {
2917     if (ram_state_init(rsp)) {
2918         return -1;
2919     }
2920
2921     if (xbzrle_init()) {
2922         ram_state_cleanup(rsp);
2923         return -1;
2924     }
2925
2926     ram_init_bitmaps(*rsp);
2927
2928     return 0;
2929 }
2930
2931 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2932 {
2933     RAMBlock *block;
2934     uint64_t pages = 0;
2935
2936     /*
2937      * Postcopy is not using xbzrle/compression, so no need for that.
2938      * Also, since source are already halted, we don't need to care
2939      * about dirty page logging as well.
2940      */
2941
2942     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2943         pages += bitmap_count_one(block->bmap,
2944                                   block->used_length >> TARGET_PAGE_BITS);
2945     }
2946
2947     /* This may not be aligned with current bitmaps. Recalculate. */
2948     rs->migration_dirty_pages = pages;
2949
2950     ram_state_reset(rs);
2951
2952     /* Update RAMState cache of output QEMUFile */
2953     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2954
2955     trace_ram_state_resume_prepare(pages);
2956 }
2957
2958 /*
2959  * This function clears bits of the free pages reported by the caller from the
2960  * migration dirty bitmap. @addr is the host address corresponding to the
2961  * start of the continuous guest free pages, and @len is the total bytes of
2962  * those pages.
2963  */
2964 void qemu_guest_free_page_hint(void *addr, size_t len)
2965 {
2966     RAMBlock *block;
2967     ram_addr_t offset;
2968     size_t used_len, start, npages;
2969     MigrationState *s = migrate_get_current();
2970
2971     /* This function is currently expected to be used during live migration */
2972     if (!migration_is_setup_or_active(s->state)) {
2973         return;
2974     }
2975
2976     for (; len > 0; len -= used_len, addr += used_len) {
2977         block = qemu_ram_block_from_host(addr, false, &offset);
2978         if (unlikely(!block || offset >= block->used_length)) {
2979             /*
2980              * The implementation might not support RAMBlock resize during
2981              * live migration, but it could happen in theory with future
2982              * updates. So we add a check here to capture that case.
2983              */
2984             error_report_once("%s unexpected error", __func__);
2985             return;
2986         }
2987
2988         if (len <= block->used_length - offset) {
2989             used_len = len;
2990         } else {
2991             used_len = block->used_length - offset;
2992         }
2993
2994         start = offset >> TARGET_PAGE_BITS;
2995         npages = used_len >> TARGET_PAGE_BITS;
2996
2997         qemu_mutex_lock(&ram_state->bitmap_mutex);
2998         /*
2999          * The skipped free pages are equavalent to be sent from clear_bmap's
3000          * perspective, so clear the bits from the memory region bitmap which
3001          * are initially set. Otherwise those skipped pages will be sent in
3002          * the next round after syncing from the memory region bitmap.
3003          */
3004         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3005         ram_state->migration_dirty_pages -=
3006                       bitmap_count_one_with_offset(block->bmap, start, npages);
3007         bitmap_clear(block->bmap, start, npages);
3008         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3009     }
3010 }
3011
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3018
/**
 * ram_save_setup: Setup RAM for migration
 *
 * Sets up the compression threads and (outside of COLO state) the RAM state,
 * writes the total RAM size followed by one record per migratable RAMBlock
 * to @f, installs the migration_ops table and ends the section with an EOS
 * marker.
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer (it is used as a RAMState **)
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;
    int ret;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            /* Undo the compression thread setup done above */
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;

    WITH_RCU_READ_LOCK_GUARD() {
        /* Header word: total RAM size combined with the MEM_SIZE flag */
        qemu_put_be64(f, ram_bytes_total_with_ignored()
                         | RAM_SAVE_FLAG_MEM_SIZE);

        /* One record per block: id length, id bytes, used length, ... */
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            /* ... the page size, only for postcopy with a differing size */
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            /* ... and the memory region address when ignoring shared RAM */
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    /* RDMA hook failures are recorded on the file; setup carries on */
    ret = qemu_rdma_registration_start(f, RAM_CONTROL_SETUP);
    if (ret < 0) {
        qemu_file_set_error(f, ret);
    }

    ret = qemu_rdma_registration_stop(f, RAM_CONTROL_SETUP);
    if (ret < 0) {
        qemu_file_set_error(f, ret);
    }

    migration_ops = g_malloc0(sizeof(MigrationOps));
    migration_ops->ram_save_target_page = ram_save_target_page_legacy;

    /* The iothread lock is dropped around the multifd sync */
    qemu_mutex_unlock_iothread();
    ret = multifd_send_sync_main(f);
    qemu_mutex_lock_iothread();
    if (ret < 0) {
        return ret;
    }

    if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) {
        qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
    }

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}
3093
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages until the rate limit is exceeded (and no postcopy
 * request is pending), the file reports an error, no pages are left, or the
 * loop has been running for longer than MAX_WAIT.
 *
 * Returns a negative value on error.  Otherwise returns 1 if
 * ram_find_and_save_block() reported that there was nothing left to send
 * during this call, and 0 if the loop stopped for any other reason.
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer (it is used as a RAMState **)
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /*
         * Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless.
         */
        goto out;
    }

    /*
     * We'll hold this lock for a fairly long time, but that is okay for two
     * reasons.  Firstly, the only other thread that can take it is the one
     * calling qemu_guest_free_page_hint(), which should be rare; secondly,
     * see MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below,
     * which guarantees that we release it on a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM list changed since the last pass: restart the search */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ret = qemu_rdma_registration_start(f, RAM_CONTROL_ROUND);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
        }

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /* Pending postcopy requests are served even past the rate limit */
        while ((ret = migration_rate_exceeded(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                ram_flush_compressed_data(rs);
            }

            /*
             * We want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check once
             * every 64 iterations.
             */
            if ((i & 63) == 0) {
                /* Elapsed time converted from nanoseconds to milliseconds */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ret = qemu_rdma_registration_stop(f, RAM_CONTROL_ROUND);
    if (ret < 0) {
        qemu_file_set_error(f, ret);
    }

out:
    /* The section is only terminated while migration is still running */
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        if (migrate_multifd() && migrate_multifd_flush_after_each_section()) {
            ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
            if (ret < 0) {
                return ret;
            }
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8 bytes of the EOS marker written above */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3221
3222 /**
3223  * ram_save_complete: function called to send the remaining amount of ram
3224  *
3225  * Returns zero to indicate success or negative on error
3226  *
3227  * Called with iothread lock
3228  *
3229  * @f: QEMUFile where to send the data
3230  * @opaque: RAMState pointer
3231  */
3232 static int ram_save_complete(QEMUFile *f, void *opaque)
3233 {
3234     RAMState **temp = opaque;
3235     RAMState *rs = *temp;
3236     int ret = 0;
3237
3238     rs->last_stage = !migration_in_colo_state();
3239
3240     WITH_RCU_READ_LOCK_GUARD() {
3241         if (!migration_in_postcopy()) {
3242             migration_bitmap_sync_precopy(rs, true);
3243         }
3244
3245         ret = qemu_rdma_registration_start(f, RAM_CONTROL_FINISH);
3246         if (ret < 0) {
3247             qemu_file_set_error(f, ret);
3248         }
3249
3250         /* try transferring iterative blocks of memory */
3251
3252         /* flush all remaining blocks regardless of rate limiting */
3253         qemu_mutex_lock(&rs->bitmap_mutex);
3254         while (true) {
3255             int pages;
3256
3257             pages = ram_find_and_save_block(rs);
3258             /* no more blocks to sent */
3259             if (pages == 0) {
3260                 break;
3261             }
3262             if (pages < 0) {
3263                 ret = pages;
3264                 break;
3265             }
3266         }
3267         qemu_mutex_unlock(&rs->bitmap_mutex);
3268
3269         ram_flush_compressed_data(rs);
3270
3271         int ret = qemu_rdma_registration_stop(f, RAM_CONTROL_FINISH);
3272         if (ret < 0) {
3273             qemu_file_set_error(f, ret);
3274         }
3275     }
3276
3277     if (ret < 0) {
3278         return ret;
3279     }
3280
3281     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3282     if (ret < 0) {
3283         return ret;
3284     }
3285
3286     if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) {
3287         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3288     }
3289     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3290     qemu_fflush(f);
3291
3292     return 0;
3293 }
3294
3295 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3296                                        uint64_t *can_postcopy)
3297 {
3298     RAMState **temp = opaque;
3299     RAMState *rs = *temp;
3300
3301     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3302
3303     if (migrate_postcopy_ram()) {
3304         /* We can do postcopy, and all the data is postcopiable */
3305         *can_postcopy += remaining_size;
3306     } else {
3307         *must_precopy += remaining_size;
3308     }
3309 }
3310
3311 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3312                                     uint64_t *can_postcopy)
3313 {
3314     MigrationState *s = migrate_get_current();
3315     RAMState **temp = opaque;
3316     RAMState *rs = *temp;
3317
3318     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3319
3320     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3321         qemu_mutex_lock_iothread();
3322         WITH_RCU_READ_LOCK_GUARD() {
3323             migration_bitmap_sync_precopy(rs, false);
3324         }
3325         qemu_mutex_unlock_iothread();
3326         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3327     }
3328
3329     if (migrate_postcopy_ram()) {
3330         /* We can do postcopy, and all the data is postcopiable */
3331         *can_postcopy += remaining_size;
3332     } else {
3333         *must_precopy += remaining_size;
3334     }
3335 }
3336
3337 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3338 {
3339     unsigned int xh_len;
3340     int xh_flags;
3341     uint8_t *loaded_data;
3342
3343     /* extract RLE header */
3344     xh_flags = qemu_get_byte(f);
3345     xh_len = qemu_get_be16(f);
3346
3347     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3348         error_report("Failed to load XBZRLE page - wrong compression!");
3349         return -1;
3350     }
3351
3352     if (xh_len > TARGET_PAGE_SIZE) {
3353         error_report("Failed to load XBZRLE page - len overflow!");
3354         return -1;
3355     }
3356     loaded_data = XBZRLE.decoded_buf;
3357     /* load data and decode */
3358     /* it can change loaded_data to point to an internal buffer */
3359     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3360
3361     /* decode RLE */
3362     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3363                              TARGET_PAGE_SIZE) == -1) {
3364         error_report("Failed to load XBZRLE page - decode error!");
3365         return -1;
3366     }
3367
3368     return 0;
3369 }
3370
3371 /**
3372  * ram_block_from_stream: read a RAMBlock id from the migration stream
3373  *
3374  * Must be called from within a rcu critical section.
3375  *
3376  * Returns a pointer from within the RCU-protected ram_list.
3377  *
3378  * @mis: the migration incoming state pointer
3379  * @f: QEMUFile where to read the data from
3380  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3381  * @channel: the channel we're using
3382  */
3383 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3384                                               QEMUFile *f, int flags,
3385                                               int channel)
3386 {
3387     RAMBlock *block = mis->last_recv_block[channel];
3388     char id[256];
3389     uint8_t len;
3390
3391     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3392         if (!block) {
3393             error_report("Ack, bad migration stream!");
3394             return NULL;
3395         }
3396         return block;
3397     }
3398
3399     len = qemu_get_byte(f);
3400     qemu_get_buffer(f, (uint8_t *)id, len);
3401     id[len] = 0;
3402
3403     block = qemu_ram_block_by_name(id);
3404     if (!block) {
3405         error_report("Can't find block %s", id);
3406         return NULL;
3407     }
3408
3409     if (migrate_ram_is_ignored(block)) {
3410         error_report("block %s should not be migrated !", id);
3411         return NULL;
3412     }
3413
3414     mis->last_recv_block[channel] = block;
3415
3416     return block;
3417 }
3418
3419 static inline void *host_from_ram_block_offset(RAMBlock *block,
3420                                                ram_addr_t offset)
3421 {
3422     if (!offset_in_ramblock(block, offset)) {
3423         return NULL;
3424     }
3425
3426     return block->host + offset;
3427 }
3428
3429 static void *host_page_from_ram_block_offset(RAMBlock *block,
3430                                              ram_addr_t offset)
3431 {
3432     /* Note: Explicitly no check against offset_in_ramblock(). */
3433     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3434                                    block->page_size);
3435 }
3436
3437 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3438                                                          ram_addr_t offset)
3439 {
3440     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3441 }
3442
3443 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3444 {
3445     qemu_mutex_lock(&ram_state->bitmap_mutex);
3446     for (int i = 0; i < pages; i++) {
3447         ram_addr_t offset = normal[i];
3448         ram_state->migration_dirty_pages += !test_and_set_bit(
3449                                                 offset >> TARGET_PAGE_BITS,
3450                                                 block->bmap);
3451     }
3452     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3453 }
3454
3455 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3456                              ram_addr_t offset, bool record_bitmap)
3457 {
3458     if (!offset_in_ramblock(block, offset)) {
3459         return NULL;
3460     }
3461     if (!block->colo_cache) {
3462         error_report("%s: colo_cache is NULL in block :%s",
3463                      __func__, block->idstr);
3464         return NULL;
3465     }
3466
3467     /*
3468     * During colo checkpoint, we need bitmap of these migrated pages.
3469     * It help us to decide which pages in ram cache should be flushed
3470     * into VM's RAM later.
3471     */
3472     if (record_bitmap) {
3473         colo_record_bitmap(block, &offset, 1);
3474     }
3475     return block->colo_cache + offset;
3476 }
3477
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The memory is left untouched when @ch is zero and the buffer already
 * contains only zeroes; in every other case it is filled with @ch.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already all-zero and zero requested: avoid writing to the memory */
    if (ch == 0 && buffer_is_zero(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3494
/*
 * Initialize the global ram_state for COLO.
 *
 * NOTE(review): the return value of ram_state_init() is ignored here, so an
 * allocation failure is not reported to the caller - confirm this is
 * intended.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
}
3499
3500 /*
3501  * colo cache: this is for secondary VM, we cache the whole
3502  * memory of the secondary VM, it is need to hold the global lock
3503  * to call this helper.
3504  */
3505 int colo_init_ram_cache(void)
3506 {
3507     RAMBlock *block;
3508
3509     WITH_RCU_READ_LOCK_GUARD() {
3510         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3511             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3512                                                     NULL, false, false);
3513             if (!block->colo_cache) {
3514                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3515                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3516                              block->used_length);
3517                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3518                     if (block->colo_cache) {
3519                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3520                         block->colo_cache = NULL;
3521                     }
3522                 }
3523                 return -errno;
3524             }
3525             if (!machine_dump_guest_core(current_machine)) {
3526                 qemu_madvise(block->colo_cache, block->used_length,
3527                              QEMU_MADV_DONTDUMP);
3528             }
3529         }
3530     }
3531
3532     /*
3533     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3534     * with to decide which page in cache should be flushed into SVM's RAM. Here
3535     * we use the same name 'ram_bitmap' as for migration.
3536     */
3537     if (ram_bytes_total()) {
3538         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3539             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3540             block->bmap = bitmap_new(pages);
3541         }
3542     }
3543
3544     colo_init_ram_state();
3545     return 0;
3546 }
3547
/*
 * Start dirty logging on the COLO incoming side.
 *
 * Syncs the global dirty log, folds it into each non-ignored block's bitmap
 * and immediately zeroes that bitmap, then starts global dirty logging and
 * resets the dirty page counter to zero.
 *
 * TODO: duplicated with ram_init_bitmaps
 */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync(false);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
    }
    /* All bitmaps were just zeroed, so no page counts as dirty */
    ram_state->migration_dirty_pages = 0;
    /* Release the locks in the reverse order of acquisition */
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
3569
3570 /* It is need to hold the global lock to call this helper */
3571 void colo_release_ram_cache(void)
3572 {
3573     RAMBlock *block;
3574
3575     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3576     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3577         g_free(block->bmap);
3578         block->bmap = NULL;
3579     }
3580
3581     WITH_RCU_READ_LOCK_GUARD() {
3582         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3583             if (block->colo_cache) {
3584                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3585                 block->colo_cache = NULL;
3586             }
3587         }
3588     }
3589     ram_state_cleanup(&ram_state);
3590 }
3591
/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Calls xbzrle_load_setup() and ramblock_recv_map_init().
 *
 * Returns zero to indicate success and negative for error (as written, this
 * function always returns zero)
 *
 * @f: QEMUFile where to receive the data (not used by this function)
 * @opaque: RAMState pointer (not used by this function)
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}
3607
3608 static int ram_load_cleanup(void *opaque)
3609 {
3610     RAMBlock *rb;
3611
3612     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3613         qemu_ram_block_writeback(rb);
3614     }
3615
3616     xbzrle_load_cleanup();
3617
3618     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3619         g_free(rb->receivedmap);
3620         rb->receivedmap = NULL;
3621     }
3622
3623     return 0;
3624 }
3625
3626 /**
3627  * ram_postcopy_incoming_init: allocate postcopy data structures
3628  *
3629  * Returns 0 for success and negative if there was one error
3630  *
3631  * @mis: current migration incoming state
3632  *
3633  * Allocate data structures etc needed by incoming migration with
3634  * postcopy-ram. postcopy-ram's similarly names
3635  * postcopy_ram_incoming_init does the work.
3636  */
3637 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3638 {
3639     return postcopy_ram_incoming_init(mis);
3640 }
3641
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * Reads records from @f until one carries RAM_SAVE_FLAG_EOS or an error
 * occurs.  Each record starts with a 64-bit big-endian word whose bits
 * outside TARGET_PAGE_MASK are the flags and whose remaining bits are the
 * page offset.  Target pages are accumulated in the channel's temporary
 * page and placed once a whole host page has been collected.
 *
 * @f: QEMUFile to read the data from
 * @channel: the channel to use for loading; also selects the entry of
 *           mis->postcopy_tmp_pages used as staging area
 */
int ram_load_postcopy(QEMUFile *f, int channel)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        /* Split the header word into flag bits and page-aligned offset. */
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
        /* Only records that carry page data need a block and a buffer. */
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(mis, f, flags, channel);
            if (!block) {
                ret = -EINVAL;
                break;
            }

            /*
             * Relying on used_length is racy and can result in false positives.
             * We might place pages beyond used_length in case RAM was shrunk
             * while in postcopy, which is fine - trying to place via
             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
             */
            if (!block->host || addr >= block->postcopy_length) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            /* Count the target pages collected for the current host page. */
            tmp_page->target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses,  possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all TP are zero then we can optimise the place */
            if (tmp_page->target_pages == 1) {
                /* First target page: remember which host page we are filling. */
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the 1st TP within the HP */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            /* Default source; the PAGE case below may redirect it. */
            place_source = tmp_page->tmp_huge_page;
        }

        /* RAM_SAVE_FLAG_CONTINUE is masked out before dispatching. */
        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                tmp_page->all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            tmp_page->all_zero = false;
            len = qemu_get_be32(f);
            /* Reject lengths that cannot be one compressed target page. */
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd() &&
                migrate_multifd_flush_after_each_section()) {
                multifd_recv_sync_main();
            }
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* A host page made only of zero target pages uses the zero path. */
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            /* Start over with a clean temporary page for the next host page. */
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}
3829
3830 static bool postcopy_is_running(void)
3831 {
3832     PostcopyState ps = postcopy_state_get();
3833     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3834 }
3835
3836 /*
3837  * Flush content of RAM cache into SVM's memory.
3838  * Only flush the pages that be dirtied by PVM or SVM or both.
3839  */
3840 void colo_flush_ram_cache(void)
3841 {
3842     RAMBlock *block = NULL;
3843     void *dst_host;
3844     void *src_host;
3845     unsigned long offset = 0;
3846
3847     memory_global_dirty_log_sync(false);
3848     qemu_mutex_lock(&ram_state->bitmap_mutex);
3849     WITH_RCU_READ_LOCK_GUARD() {
3850         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3851             ramblock_sync_dirty_bitmap(ram_state, block);
3852         }
3853     }
3854
3855     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3856     WITH_RCU_READ_LOCK_GUARD() {
3857         block = QLIST_FIRST_RCU(&ram_list.blocks);
3858
3859         while (block) {
3860             unsigned long num = 0;
3861
3862             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3863             if (!offset_in_ramblock(block,
3864                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3865                 offset = 0;
3866                 num = 0;
3867                 block = QLIST_NEXT_RCU(block, next);
3868             } else {
3869                 unsigned long i = 0;
3870
3871                 for (i = 0; i < num; i++) {
3872                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3873                 }
3874                 dst_host = block->host
3875                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3876                 src_host = block->colo_cache
3877                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3878                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3879                 offset += num;
3880             }
3881         }
3882     }
3883     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3884     trace_colo_flush_ram_cache_end();
3885 }
3886
3887 /**
3888  * ram_load_precopy: load pages in precopy case
3889  *
3890  * Returns 0 for success or -errno in case of error
3891  *
3892  * Called in precopy mode by ram_load().
3893  * rcu_read_lock is taken prior to this being called.
3894  *
3895  * @f: QEMUFile where to send the data
3896  */
3897 static int ram_load_precopy(QEMUFile *f)
3898 {
3899     MigrationIncomingState *mis = migration_incoming_get_current();
3900     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3901     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3902     bool postcopy_advised = migration_incoming_postcopy_advised();
3903     if (!migrate_compress()) {
3904         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3905     }
3906
3907     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3908         ram_addr_t addr, total_ram_bytes;
3909         void *host = NULL, *host_bak = NULL;
3910         uint8_t ch;
3911
3912         /*
3913          * Yield periodically to let main loop run, but an iteration of
3914          * the main loop is expensive, so do it each some iterations
3915          */
3916         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3917             aio_co_schedule(qemu_get_current_aio_context(),
3918                             qemu_coroutine_self());
3919             qemu_coroutine_yield();
3920         }
3921         i++;
3922
3923         addr = qemu_get_be64(f);
3924         flags = addr & ~TARGET_PAGE_MASK;
3925         addr &= TARGET_PAGE_MASK;
3926
3927         if (flags & invalid_flags) {
3928             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3929                 error_report("Received an unexpected compressed page");
3930             }
3931
3932             ret = -EINVAL;
3933             break;
3934         }
3935
3936         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3937                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3938             RAMBlock *block = ram_block_from_stream(mis, f, flags,
3939                                                     RAM_CHANNEL_PRECOPY);
3940
3941             host = host_from_ram_block_offset(block, addr);
3942             /*
3943              * After going into COLO stage, we should not load the page
3944              * into SVM's memory directly, we put them into colo_cache firstly.
3945              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3946              * Previously, we copied all these memory in preparing stage of COLO
3947              * while we need to stop VM, which is a time-consuming process.
3948              * Here we optimize it by a trick, back-up every page while in
3949              * migration process while COLO is enabled, though it affects the
3950              * speed of the migration, but it obviously reduce the downtime of
3951              * back-up all SVM'S memory in COLO preparing stage.
3952              */
3953             if (migration_incoming_colo_enabled()) {
3954                 if (migration_incoming_in_colo_state()) {
3955                     /* In COLO stage, put all pages into cache temporarily */
3956                     host = colo_cache_from_block_offset(block, addr, true);
3957                 } else {
3958                    /*
3959                     * In migration stage but before COLO stage,
3960                     * Put all pages into both cache and SVM's memory.
3961                     */
3962                     host_bak = colo_cache_from_block_offset(block, addr, false);
3963                 }
3964             }
3965             if (!host) {
3966                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3967                 ret = -EINVAL;
3968                 break;
3969             }
3970             if (!migration_incoming_in_colo_state()) {
3971                 ramblock_recv_bitmap_set(block, host);
3972             }
3973
3974             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3975         }
3976
3977         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3978         case RAM_SAVE_FLAG_MEM_SIZE:
3979             /* Synchronize RAM block list */
3980             total_ram_bytes = addr;
3981             while (!ret && total_ram_bytes) {
3982                 RAMBlock *block;
3983                 char id[256];
3984                 ram_addr_t length;
3985
3986                 len = qemu_get_byte(f);
3987                 qemu_get_buffer(f, (uint8_t *)id, len);
3988                 id[len] = 0;
3989                 length = qemu_get_be64(f);
3990
3991                 block = qemu_ram_block_by_name(id);
3992                 if (block && !qemu_ram_is_migratable(block)) {
3993                     error_report("block %s should not be migrated !", id);
3994                     ret = -EINVAL;
3995                 } else if (block) {
3996                     if (length != block->used_length) {
3997                         Error *local_err = NULL;
3998
3999                         ret = qemu_ram_resize(block, length,
4000                                               &local_err);
4001                         if (local_err) {
4002                             error_report_err(local_err);
4003                         }
4004                     }
4005                     /* For postcopy we need to check hugepage sizes match */
4006                     if (postcopy_advised && migrate_postcopy_ram() &&
4007                         block->page_size != qemu_host_page_size) {
4008                         uint64_t remote_page_size = qemu_get_be64(f);
4009                         if (remote_page_size != block->page_size) {
4010                             error_report("Mismatched RAM page size %s "
4011                                          "(local) %zd != %" PRId64,
4012                                          id, block->page_size,
4013                                          remote_page_size);
4014                             ret = -EINVAL;
4015                         }
4016                     }
4017                     if (migrate_ignore_shared()) {
4018                         hwaddr addr2 = qemu_get_be64(f);
4019                         if (migrate_ram_is_ignored(block) &&
4020                             block->mr->addr != addr2) {
4021                             error_report("Mismatched GPAs for block %s "
4022                                          "%" PRId64 "!= %" PRId64,
4023                                          id, (uint64_t)addr2,
4024                                          (uint64_t)block->mr->addr);
4025                             ret = -EINVAL;
4026                         }
4027                     }
4028                     ret = rdma_block_notification_handle(f, block->idstr);
4029                     if (ret < 0) {
4030                         qemu_file_set_error(f, ret);
4031                     }
4032                 } else {
4033                     error_report("Unknown ramblock \"%s\", cannot "
4034                                  "accept migration", id);
4035                     ret = -EINVAL;
4036                 }
4037
4038                 total_ram_bytes -= length;
4039             }
4040             break;
4041
4042         case RAM_SAVE_FLAG_ZERO:
4043             ch = qemu_get_byte(f);
4044             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4045             break;
4046
4047         case RAM_SAVE_FLAG_PAGE:
4048             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4049             break;
4050
4051         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4052             len = qemu_get_be32(f);
4053             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4054                 error_report("Invalid compressed data length: %d", len);
4055                 ret = -EINVAL;
4056                 break;
4057             }
4058             decompress_data_with_multi_threads(f, host, len);
4059             break;
4060
4061         case RAM_SAVE_FLAG_XBZRLE:
4062             if (load_xbzrle(f, addr, host) < 0) {
4063                 error_report("Failed to decompress XBZRLE page at "
4064                              RAM_ADDR_FMT, addr);
4065                 ret = -EINVAL;
4066                 break;
4067             }
4068             break;
4069         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4070             multifd_recv_sync_main();
4071             break;
4072         case RAM_SAVE_FLAG_EOS:
4073             /* normal exit */
4074             if (migrate_multifd() &&
4075                 migrate_multifd_flush_after_each_section()) {
4076                 multifd_recv_sync_main();
4077             }
4078             break;
4079         case RAM_SAVE_FLAG_HOOK:
4080             ret = qemu_rdma_registration_handle(f);
4081             if (ret < 0) {
4082                 qemu_file_set_error(f, ret);
4083             }
4084             break;
4085         default:
4086             error_report("Unknown combination of migration flags: 0x%x", flags);
4087             ret = -EINVAL;
4088         }
4089         if (!ret) {
4090             ret = qemu_file_get_error(f);
4091         }
4092         if (!ret && host_bak) {
4093             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4094         }
4095     }
4096
4097     ret |= wait_for_decompress_done();
4098     return ret;
4099 }
4100
4101 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4102 {
4103     int ret = 0;
4104     static uint64_t seq_iter;
4105     /*
4106      * If system is running in postcopy mode, page inserts to host memory must
4107      * be atomic
4108      */
4109     bool postcopy_running = postcopy_is_running();
4110
4111     seq_iter++;
4112
4113     if (version_id != 4) {
4114         return -EINVAL;
4115     }
4116
4117     /*
4118      * This RCU critical section can be very long running.
4119      * When RCU reclaims in the code start to become numerous,
4120      * it will be necessary to reduce the granularity of this
4121      * critical section.
4122      */
4123     WITH_RCU_READ_LOCK_GUARD() {
4124         if (postcopy_running) {
4125             /*
4126              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4127              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4128              * service fast page faults.
4129              */
4130             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4131         } else {
4132             ret = ram_load_precopy(f);
4133         }
4134     }
4135     trace_ram_load_complete(ret, seq_iter);
4136
4137     return ret;
4138 }
4139
4140 static bool ram_has_postcopy(void *opaque)
4141 {
4142     RAMBlock *rb;
4143     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4144         if (ramblock_is_pmem(rb)) {
4145             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4146                          "is not supported now!", rb->idstr, rb->host);
4147             return false;
4148         }
4149     }
4150
4151     return migrate_postcopy_ram();
4152 }
4153
4154 /* Sync all the dirty bitmap with destination VM.  */
4155 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4156 {
4157     RAMBlock *block;
4158     QEMUFile *file = s->to_dst_file;
4159
4160     trace_ram_dirty_bitmap_sync_start();
4161
4162     qatomic_set(&rs->postcopy_bmap_sync_requested, 0);
4163     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4164         qemu_savevm_send_recv_bitmap(file, block->idstr);
4165         trace_ram_dirty_bitmap_request(block->idstr);
4166         qatomic_inc(&rs->postcopy_bmap_sync_requested);
4167     }
4168
4169     trace_ram_dirty_bitmap_sync_wait();
4170
4171     /* Wait until all the ramblocks' dirty bitmap synced */
4172     while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
4173         migration_rp_wait(s);
4174     }
4175
4176     trace_ram_dirty_bitmap_sync_complete();
4177
4178     return 0;
4179 }
4180
4181 /*
4182  * Read the received bitmap, revert it as the initial dirty bitmap.
4183  * This is only used when the postcopy migration is paused but wants
4184  * to resume from a middle point.
4185  */
4186 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4187 {
4188     int ret = -EINVAL;
4189     /* from_dst_file is always valid because we're within rp_thread */
4190     QEMUFile *file = s->rp_state.from_dst_file;
4191     g_autofree unsigned long *le_bitmap = NULL;
4192     unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
4193     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4194     uint64_t size, end_mark;
4195     RAMState *rs = ram_state;
4196
4197     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4198
4199     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4200         error_report("%s: incorrect state %s", __func__,
4201                      MigrationStatus_str(s->state));
4202         return -EINVAL;
4203     }
4204
4205     /*
4206      * Note: see comments in ramblock_recv_bitmap_send() on why we
4207      * need the endianness conversion, and the paddings.
4208      */
4209     local_size = ROUND_UP(local_size, 8);
4210
4211     /* Add paddings */
4212     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4213
4214     size = qemu_get_be64(file);
4215
4216     /* The size of the bitmap should match with our ramblock */
4217     if (size != local_size) {
4218         error_report("%s: ramblock '%s' bitmap size mismatch "
4219                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4220                      block->idstr, size, local_size);
4221         return -EINVAL;
4222     }
4223
4224     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4225     end_mark = qemu_get_be64(file);
4226
4227     ret = qemu_file_get_error(file);
4228     if (ret || size != local_size) {
4229         error_report("%s: read bitmap failed for ramblock '%s': %d"
4230                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4231                      __func__, block->idstr, ret, local_size, size);
4232         return -EIO;
4233     }
4234
4235     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4236         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4237                      __func__, block->idstr, end_mark);
4238         return -EINVAL;
4239     }
4240
4241     /*
4242      * Endianness conversion. We are during postcopy (though paused).
4243      * The dirty bitmap won't change. We can directly modify it.
4244      */
4245     bitmap_from_le(block->bmap, le_bitmap, nbits);
4246
4247     /*
4248      * What we received is "received bitmap". Revert it as the initial
4249      * dirty bitmap for this ramblock.
4250      */
4251     bitmap_complement(block->bmap, block->bmap, nbits);
4252
4253     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4254     ramblock_dirty_bitmap_clear_discarded_pages(block);
4255
4256     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4257     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4258
4259     qatomic_dec(&rs->postcopy_bmap_sync_requested);
4260
4261     /*
4262      * We succeeded to sync bitmap for current ramblock. Always kick the
4263      * migration thread to check whether all requested bitmaps are
4264      * reloaded.  NOTE: it's racy to only kick when requested==0, because
4265      * we don't know whether the migration thread may still be increasing
4266      * it.
4267      */
4268     migration_rp_kick(s);
4269
4270     return 0;
4271 }
4272
4273 static int ram_resume_prepare(MigrationState *s, void *opaque)
4274 {
4275     RAMState *rs = *(RAMState **)opaque;
4276     int ret;
4277
4278     ret = ram_dirty_bitmap_sync_all(s, rs);
4279     if (ret) {
4280         return ret;
4281     }
4282
4283     ram_state_resume_prepare(rs, s->to_dst_file);
4284
4285     return 0;
4286 }
4287
4288 void postcopy_preempt_shutdown_file(MigrationState *s)
4289 {
4290     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4291     qemu_fflush(s->postcopy_qemufile_src);
4292 }
4293
4294 static SaveVMHandlers savevm_ram_handlers = {
4295     .save_setup = ram_save_setup,
4296     .save_live_iterate = ram_save_iterate,
4297     .save_live_complete_postcopy = ram_save_complete,
4298     .save_live_complete_precopy = ram_save_complete,
4299     .has_postcopy = ram_has_postcopy,
4300     .state_pending_exact = ram_state_pending_exact,
4301     .state_pending_estimate = ram_state_pending_estimate,
4302     .load_state = ram_load,
4303     .save_cleanup = ram_save_cleanup,
4304     .load_setup = ram_load_setup,
4305     .load_cleanup = ram_load_cleanup,
4306     .resume_prepare = ram_resume_prepare,
4307 };
4308
4309 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4310                                       size_t old_size, size_t new_size)
4311 {
4312     PostcopyState ps = postcopy_state_get();
4313     ram_addr_t offset;
4314     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4315     Error *err = NULL;
4316
4317     if (!rb) {
4318         error_report("RAM block not found");
4319         return;
4320     }
4321
4322     if (migrate_ram_is_ignored(rb)) {
4323         return;
4324     }
4325
4326     if (!migration_is_idle()) {
4327         /*
4328          * Precopy code on the source cannot deal with the size of RAM blocks
4329          * changing at random points in time - especially after sending the
4330          * RAM block sizes in the migration stream, they must no longer change.
4331          * Abort and indicate a proper reason.
4332          */
4333         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4334         migration_cancel(err);
4335         error_free(err);
4336     }
4337
4338     switch (ps) {
4339     case POSTCOPY_INCOMING_ADVISE:
4340         /*
4341          * Update what ram_postcopy_incoming_init()->init_range() does at the
4342          * time postcopy was advised. Syncing RAM blocks with the source will
4343          * result in RAM resizes.
4344          */
4345         if (old_size < new_size) {
4346             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4347                 error_report("RAM block '%s' discard of resized RAM failed",
4348                              rb->idstr);
4349             }
4350         }
4351         rb->postcopy_length = new_size;
4352         break;
4353     case POSTCOPY_INCOMING_NONE:
4354     case POSTCOPY_INCOMING_RUNNING:
4355     case POSTCOPY_INCOMING_END:
4356         /*
4357          * Once our guest is running, postcopy does no longer care about
4358          * resizes. When growing, the new memory was not available on the
4359          * source, no handler needed.
4360          */
4361         break;
4362     default:
4363         error_report("RAM block '%s' resized during postcopy state: %d",
4364                      rb->idstr, ps);
4365         exit(-1);
4366     }
4367 }
4368
4369 static RAMBlockNotifier ram_mig_ram_notifier = {
4370     .ram_block_resized = ram_mig_ram_block_resized,
4371 };
4372
4373 void ram_mig_init(void)
4374 {
4375     qemu_mutex_init(&XBZRLE.lock);
4376     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4377     ram_block_notifier_add(&ram_mig_ram_notifier);
4378 }