migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "xbzrle.h"
  36 #include "ram-compress.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration-stats.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-types-migration.h"
  48 #include "qapi/qapi-events-migration.h"
  49 #include "qapi/qmp/qerror.h"
  50 #include "trace.h"
  51 #include "exec/ram_addr.h"
  52 #include "exec/target_page.h"
  53 #include "qemu/rcu_queue.h"
  54 #include "migration/colo.h"
  55 #include "block.h"
  56 #include "sysemu/cpu-throttle.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59 #include "multifd.h"
  60 #include "sysemu/runstate.h"
  61 #include "options.h"
  62
  63 #include "hw/boards.h" /* for machine_dump_guest_core() */
  64
  65 #if defined(__linux__)
  66 #include "qemu/userfaultfd.h"
  67 #endif /* defined(__linux__) */
  68
  69 /***********************************************************/
  70 /* ram save/restore */
  71
  72 /*
  73  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  74  * worked for pages that were filled with the same char.  We switched
  75  * it to only search for the zero value.  And to avoid confusion with
  76  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  77  */
  78 /*
  79  * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
  80  */
  81 #define RAM_SAVE_FLAG_FULL     0x01
  82 #define RAM_SAVE_FLAG_ZERO     0x02
  83 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  84 #define RAM_SAVE_FLAG_PAGE     0x08
  85 #define RAM_SAVE_FLAG_EOS      0x10
  86 #define RAM_SAVE_FLAG_CONTINUE 0x20
  87 #define RAM_SAVE_FLAG_XBZRLE   0x40
  88 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
  89 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  90 #define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
  91 /* We can't use any flag that is bigger than 0x200 */
  92
  93 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
  94      uint8_t *, int) = xbzrle_encode_buffer;
  95 #if defined(CONFIG_AVX512BW_OPT)
  96 #include "qemu/cpuid.h"
  97 static void __attribute__((constructor)) init_cpu_flag(void)
  98 {
  99     unsigned max = __get_cpuid_max(0, NULL);
 100     int a, b, c, d;
 101     if (max >= 1) {
 102         __cpuid(1, a, b, c, d);
 103          /* We must check that AVX is not just available, but usable.  */
 104         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
 105             int bv;
 106             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
 107             __cpuid_count(7, 0, a, b, c, d);
 108            /* 0xe6:
 109             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
 110             *                    and ZMM16-ZMM31 state are enabled by OS)
 111             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
 112             */
 113             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
 114                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
 115             }
 116         }
 117     }
 118 }
 119 #endif
 120
 121 XBZRLECacheStats xbzrle_counters;
 122
 123 /* used by the search for pages to send */
 124 struct PageSearchStatus {
 125     /* The migration channel used for a specific host page */
 126     QEMUFile    *pss_channel;
 127     /* Last block from where we have sent data */
 128     RAMBlock *last_sent_block;
 129     /* Current block being searched */
 130     RAMBlock    *block;
 131     /* Current page to search from */
 132     unsigned long page;
 133     /* Set once we wrap around */
 134     bool         complete_round;
 135     /* Whether we're sending a host page */
 136     bool          host_page_sending;
 137     /* The start/end of current host page.  Invalid if host_page_sending==false */
 138     unsigned long host_page_start;
 139     unsigned long host_page_end;
 140 };
 141 typedef struct PageSearchStatus PageSearchStatus;
 142
 143 /* struct contains XBZRLE cache and a static page
 144    used by the compression */
 145 static struct {
 146     /* buffer used for XBZRLE encoding */
 147     uint8_t *encoded_buf;
 148     /* buffer for storing page content */
 149     uint8_t *current_buf;
 150     /* Cache for XBZRLE, Protected by lock. */
 151     PageCache *cache;
 152     QemuMutex lock;
 153     /* it will store a page full of zeros */
 154     uint8_t *zero_target_page;
 155     /* buffer used for XBZRLE decoding */
 156     uint8_t *decoded_buf;
 157 } XBZRLE;
 158
 159 static void XBZRLE_cache_lock(void)
 160 {
 161     if (migrate_xbzrle()) {
 162         qemu_mutex_lock(&XBZRLE.lock);
 163     }
 164 }
 165
 166 static void XBZRLE_cache_unlock(void)
 167 {
 168     if (migrate_xbzrle()) {
 169         qemu_mutex_unlock(&XBZRLE.lock);
 170     }
 171 }
 172
 173 /**
 174  * xbzrle_cache_resize: resize the xbzrle cache
 175  *
 176  * This function is called from migrate_params_apply in main
 177  * thread, possibly while a migration is in progress.  A running
 178  * migration may be using the cache and might finish during this call,
 179  * hence changes to the cache are protected by XBZRLE.lock().
 180  *
 181  * Returns 0 for success or -1 for error
 182  *
 183  * @new_size: new cache size
 184  * @errp: set *errp if the check failed, with reason
 185  */
 186 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 187 {
 188     PageCache *new_cache;
 189     int64_t ret = 0;
 190
 191     /* Check for truncation */
 192     if (new_size != (size_t)new_size) {
 193         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 194                    "exceeding address space");
 195         return -1;
 196     }
 197
 198     if (new_size == migrate_xbzrle_cache_size()) {
 199         /* nothing to do */
 200         return 0;
 201     }
 202
 203     XBZRLE_cache_lock();
 204
 205     if (XBZRLE.cache != NULL) {
 206         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 207         if (!new_cache) {
 208             ret = -1;
 209             goto out;
 210         }
 211
 212         cache_fini(XBZRLE.cache);
 213         XBZRLE.cache = new_cache;
 214     }
 215 out:
 216     XBZRLE_cache_unlock();
 217     return ret;
 218 }
 219
 220 static bool postcopy_preempt_active(void)
 221 {
 222     return migrate_postcopy_preempt() && migration_in_postcopy();
 223 }
 224
 225 bool ramblock_is_ignored(RAMBlock *block)
 226 {
 227     return !qemu_ram_is_migratable(block) ||
 228            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 229 }
 230
 231 #undef RAMBLOCK_FOREACH
 232
 233 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 234 {
 235     RAMBlock *block;
 236     int ret = 0;
 237
 238     RCU_READ_LOCK_GUARD();
 239
 240     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 241         ret = func(block, opaque);
 242         if (ret) {
 243             break;
 244         }
 245     }
 246     return ret;
 247 }
 248
 249 static void ramblock_recv_map_init(void)
 250 {
 251     RAMBlock *rb;
 252
 253     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 254         assert(!rb->receivedmap);
 255         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 256     }
 257 }
 258
 259 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 260 {
 261     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 262                     rb->receivedmap);
 263 }
 264
 265 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 266 {
 267     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 268 }
 269
 270 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 271 {
 272     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 273 }
 274
 275 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 276                                     size_t nr)
 277 {
 278     bitmap_set_atomic(rb->receivedmap,
 279                       ramblock_recv_bitmap_offset(host_addr, rb),
 280                       nr);
 281 }
 282
 283 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 284
 285 /*
 286  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 287  *
 288  * Returns >0 if success with sent bytes, or <0 if error.
 289  */
 290 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 291                                   const char *block_name)
 292 {
 293     RAMBlock *block = qemu_ram_block_by_name(block_name);
 294     unsigned long *le_bitmap, nbits;
 295     uint64_t size;
 296
 297     if (!block) {
 298         error_report("%s: invalid block name: %s", __func__, block_name);
 299         return -1;
 300     }
 301
 302     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 303
 304     /*
 305      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 306      * machines we may need 4 more bytes for padding (see below
 307      * comment). So extend it a bit before hand.
 308      */
 309     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 310
 311     /*
 312      * Always use little endian when sending the bitmap. This is
 313      * required that when source and destination VMs are not using the
 314      * same endianness. (Note: big endian won't work.)
 315      */
 316     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 317
 318     /* Size of the bitmap, in bytes */
 319     size = DIV_ROUND_UP(nbits, 8);
 320
 321     /*
 322      * size is always aligned to 8 bytes for 64bit machines, but it
 323      * may not be true for 32bit machines. We need this padding to
 324      * make sure the migration can survive even between 32bit and
 325      * 64bit machines.
 326      */
 327     size = ROUND_UP(size, 8);
 328
 329     qemu_put_be64(file, size);
 330     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 331     /*
 332      * Mark as an end, in case the middle part is screwed up due to
 333      * some "mysterious" reason.
 334      */
 335     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 336     qemu_fflush(file);
 337
 338     g_free(le_bitmap);
 339
 340     if (qemu_file_get_error(file)) {
 341         return qemu_file_get_error(file);
 342     }
 343
 344     return size + sizeof(size);
 345 }
 346
 347 /*
 348  * An outstanding page request, on the source, having been received
 349  * and queued
 350  */
 351 struct RAMSrcPageRequest {
 352     RAMBlock *rb;
 353     hwaddr    offset;
 354     hwaddr    len;
 355
 356     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 357 };
 358
 359 /* State of RAM for migration */
 360 struct RAMState {
 361     /*
 362      * PageSearchStatus structures for the channels when send pages.
 363      * Protected by the bitmap_mutex.
 364      */
 365     PageSearchStatus pss[RAM_CHANNEL_MAX];
 366     /* UFFD file descriptor, used in 'write-tracking' migration */
 367     int uffdio_fd;
 368     /* total ram size in bytes */
 369     uint64_t ram_bytes_total;
 370     /* Last block that we have visited searching for dirty pages */
 371     RAMBlock *last_seen_block;
 372     /* Last dirty target page we have sent */
 373     ram_addr_t last_page;
 374     /* last ram version we have seen */
 375     uint32_t last_version;
 376     /* How many times we have dirty too many pages */
 377     int dirty_rate_high_cnt;
 378     /* these variables are used for bitmap sync */
 379     /* last time we did a full bitmap_sync */
 380     int64_t time_last_bitmap_sync;
 381     /* bytes transferred at start_time */
 382     uint64_t bytes_xfer_prev;
 383     /* number of dirty pages since start_time */
 384     uint64_t num_dirty_pages_period;
 385     /* xbzrle misses since the beginning of the period */
 386     uint64_t xbzrle_cache_miss_prev;
 387     /* Amount of xbzrle pages since the beginning of the period */
 388     uint64_t xbzrle_pages_prev;
 389     /* Amount of xbzrle encoded bytes since the beginning of the period */
 390     uint64_t xbzrle_bytes_prev;
 391     /* Are we really using XBZRLE (e.g., after the first round). */
 392     bool xbzrle_started;
 393     /* Are we on the last stage of migration */
 394     bool last_stage;
 395     /* compression statistics since the beginning of the period */
 396     /* amount of count that no free thread to compress data */
 397     uint64_t compress_thread_busy_prev;
 398     /* amount bytes after compression */
 399     uint64_t compressed_size_prev;
 400     /* amount of compressed pages */
 401     uint64_t compress_pages_prev;
 402
 403     /* total handled target pages at the beginning of period */
 404     uint64_t target_page_count_prev;
 405     /* total handled target pages since start */
 406     uint64_t target_page_count;
 407     /* number of dirty bits in the bitmap */
 408     uint64_t migration_dirty_pages;
 409     /*
 410      * Protects:
 411      * - dirty/clear bitmap
 412      * - migration_dirty_pages
 413      * - pss structures
 414      */
 415     QemuMutex bitmap_mutex;
 416     /* The RAMBlock used in the last src_page_requests */
 417     RAMBlock *last_req_rb;
 418     /* Queue of outstanding page requests from the destination */
 419     QemuMutex src_page_req_mutex;
 420     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 421 };
 422 typedef struct RAMState RAMState;
 423
 424 static RAMState *ram_state;
 425
 426 static NotifierWithReturnList precopy_notifier_list;
 427
 428 /* Whether postcopy has queued requests? */
 429 static bool postcopy_has_request(RAMState *rs)
 430 {
 431     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 432 }
 433
 434 void precopy_infrastructure_init(void)
 435 {
 436     notifier_with_return_list_init(&precopy_notifier_list);
 437 }
 438
 439 void precopy_add_notifier(NotifierWithReturn *n)
 440 {
 441     notifier_with_return_list_add(&precopy_notifier_list, n);
 442 }
 443
 444 void precopy_remove_notifier(NotifierWithReturn *n)
 445 {
 446     notifier_with_return_remove(n);
 447 }
 448
 449 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 450 {
 451     PrecopyNotifyData pnd;
 452     pnd.reason = reason;
 453     pnd.errp = errp;
 454
 455     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 456 }
 457
 458 uint64_t ram_bytes_remaining(void)
 459 {
 460     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 461                        0;
 462 }
 463
 464 void ram_transferred_add(uint64_t bytes)
 465 {
 466     if (runstate_is_running()) {
 467         stat64_add(&mig_stats.precopy_bytes, bytes);
 468     } else if (migration_in_postcopy()) {
 469         stat64_add(&mig_stats.postcopy_bytes, bytes);
 470     } else {
 471         stat64_add(&mig_stats.downtime_bytes, bytes);
 472     }
 473     stat64_add(&mig_stats.transferred, bytes);
 474 }
 475
 476 struct MigrationOps {
 477     int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
 478 };
 479 typedef struct MigrationOps MigrationOps;
 480
 481 MigrationOps *migration_ops;
 482
 483 static int ram_save_host_page_urgent(PageSearchStatus *pss);
 484
 485 /* NOTE: page is the PFN not real ram_addr_t. */
 486 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 487 {
 488     pss->block = rb;
 489     pss->page = page;
 490     pss->complete_round = false;
 491 }
 492
 493 /*
 494  * Check whether two PSSs are actively sending the same page.  Return true
 495  * if it is, false otherwise.
 496  */
 497 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 498 {
 499     return pss1->host_page_sending && pss2->host_page_sending &&
 500         (pss1->host_page_start == pss2->host_page_start);
 501 }
 502
 503 /**
 504  * save_page_header: write page header to wire
 505  *
 506  * If this is the 1st block, it also writes the block identification
 507  *
 508  * Returns the number of bytes written
 509  *
 510  * @pss: current PSS channel status
 511  * @block: block that contains the page we want to send
 512  * @offset: offset inside the block for the page
 513  *          in the lower bits, it contains flags
 514  */
 515 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
 516                                RAMBlock *block, ram_addr_t offset)
 517 {
 518     size_t size, len;
 519     bool same_block = (block == pss->last_sent_block);
 520
 521     if (same_block) {
 522         offset |= RAM_SAVE_FLAG_CONTINUE;
 523     }
 524     qemu_put_be64(f, offset);
 525     size = 8;
 526
 527     if (!same_block) {
 528         len = strlen(block->idstr);
 529         qemu_put_byte(f, len);
 530         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 531         size += 1 + len;
 532         pss->last_sent_block = block;
 533     }
 534     return size;
 535 }
 536
 537 /**
 538  * mig_throttle_guest_down: throttle down the guest
 539  *
 540  * Reduce amount of guest cpu execution to hopefully slow down memory
 541  * writes. If guest dirty memory rate is reduced below the rate at
 542  * which we can transfer pages to the destination then we should be
 543  * able to complete migration. Some workloads dirty memory way too
 544  * fast and will not effectively converge, even with auto-converge.
 545  */
 546 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 547                                     uint64_t bytes_dirty_threshold)
 548 {
 549     uint64_t pct_initial = migrate_cpu_throttle_initial();
 550     uint64_t pct_increment = migrate_cpu_throttle_increment();
 551     bool pct_tailslow = migrate_cpu_throttle_tailslow();
 552     int pct_max = migrate_max_cpu_throttle();
 553
 554     uint64_t throttle_now = cpu_throttle_get_percentage();
 555     uint64_t cpu_now, cpu_ideal, throttle_inc;
 556
 557     /* We have not started throttling yet. Let's start it. */
 558     if (!cpu_throttle_active()) {
 559         cpu_throttle_set(pct_initial);
 560     } else {
 561         /* Throttling already on, just increase the rate */
 562         if (!pct_tailslow) {
 563             throttle_inc = pct_increment;
 564         } else {
 565             /* Compute the ideal CPU percentage used by Guest, which may
 566              * make the dirty rate match the dirty rate threshold. */
 567             cpu_now = 100 - throttle_now;
 568             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 569                         bytes_dirty_period);
 570             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 571         }
 572         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 573     }
 574 }
 575
 576 void mig_throttle_counter_reset(void)
 577 {
 578     RAMState *rs = ram_state;
 579
 580     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 581     rs->num_dirty_pages_period = 0;
 582     rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
 583 }
 584
 585 /**
 586  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 587  *
 588  * @rs: current RAM state
 589  * @current_addr: address for the zero page
 590  *
 591  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 592  * The important thing is that a stale (not-yet-0'd) page be replaced
 593  * by the new data.
 594  * As a bonus, if the page wasn't in the cache it gets added so that
 595  * when a small write is made into the 0'd page it gets XBZRLE sent.
 596  */
 597 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 598 {
 599     /* We don't care if this fails to allocate a new cache page
 600      * as long as it updated an old one */
 601     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 602                  stat64_get(&mig_stats.dirty_sync_count));
 603 }
 604
 605 #define ENCODING_FLAG_XBZRLE 0x1
 606
 607 /**
 608  * save_xbzrle_page: compress and send current page
 609  *
 610  * Returns: 1 means that we wrote the page
 611  *          0 means that page is identical to the one already sent
 612  *          -1 means that xbzrle would be longer than normal
 613  *
 614  * @rs: current RAM state
 615  * @pss: current PSS channel
 616  * @current_data: pointer to the address of the page contents
 617  * @current_addr: addr of the page
 618  * @block: block that contains the page we want to send
 619  * @offset: offset inside the block for the page
 620  */
 621 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 622                             uint8_t **current_data, ram_addr_t current_addr,
 623                             RAMBlock *block, ram_addr_t offset)
 624 {
 625     int encoded_len = 0, bytes_xbzrle;
 626     uint8_t *prev_cached_page;
 627     QEMUFile *file = pss->pss_channel;
 628     uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
 629
 630     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
 631         xbzrle_counters.cache_miss++;
 632         if (!rs->last_stage) {
 633             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 634                              generation) == -1) {
 635                 return -1;
 636             } else {
 637                 /* update *current_data when the page has been
 638                    inserted into cache */
 639                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 640             }
 641         }
 642         return -1;
 643     }
 644
 645     /*
 646      * Reaching here means the page has hit the xbzrle cache, no matter what
 647      * encoding result it is (normal encoding, overflow or skipping the page),
 648      * count the page as encoded. This is used to calculate the encoding rate.
 649      *
 650      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 651      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 652      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 653      * skipped page included. In this way, the encoding rate can tell if the
 654      * guest page is good for xbzrle encoding.
 655      */
 656     xbzrle_counters.pages++;
 657     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 658
 659     /* save current buffer into memory */
 660     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 661
 662     /* XBZRLE encoding (if there is no overflow) */
 663     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
 664                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 665                                             TARGET_PAGE_SIZE);
 666
 667     /*
 668      * Update the cache contents, so that it corresponds to the data
 669      * sent, in all cases except where we skip the page.
 670      */
 671     if (!rs->last_stage && encoded_len != 0) {
 672         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 673         /*
 674          * In the case where we couldn't compress, ensure that the caller
 675          * sends the data from the cache, since the guest might have
 676          * changed the RAM since we copied it.
 677          */
 678         *current_data = prev_cached_page;
 679     }
 680
 681     if (encoded_len == 0) {
 682         trace_save_xbzrle_page_skipping();
 683         return 0;
 684     } else if (encoded_len == -1) {
 685         trace_save_xbzrle_page_overflow();
 686         xbzrle_counters.overflow++;
 687         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 688         return -1;
 689     }
 690
 691     /* Send XBZRLE based compressed page */
 692     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
 693                                     offset | RAM_SAVE_FLAG_XBZRLE);
 694     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 695     qemu_put_be16(file, encoded_len);
 696     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 697     bytes_xbzrle += encoded_len + 1 + 2;
 698     /*
 699      * Like compressed_size (please see update_compress_thread_counts),
 700      * the xbzrle encoded bytes don't count the 8 byte header with
 701      * RAM_SAVE_FLAG_CONTINUE.
 702      */
 703     xbzrle_counters.bytes += bytes_xbzrle - 8;
 704     ram_transferred_add(bytes_xbzrle);
 705
 706     return 1;
 707 }
 708
 709 /**
 710  * pss_find_next_dirty: find the next dirty page of current ramblock
 711  *
 712  * This function updates pss->page to point to the next dirty page index
 713  * within the ramblock to migrate, or the end of ramblock when nothing
 714  * found.  Note that when pss->host_page_sending==true it means we're
 715  * during sending a host page, so we won't look for dirty page that is
 716  * outside the host page boundary.
 717  *
 718  * @pss: the current page search status
 719  */
 720 static void pss_find_next_dirty(PageSearchStatus *pss)
 721 {
 722     RAMBlock *rb = pss->block;
 723     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 724     unsigned long *bitmap = rb->bmap;
 725
 726     if (ramblock_is_ignored(rb)) {
 727         /* Points directly to the end, so we know no dirty page */
 728         pss->page = size;
 729         return;
 730     }
 731
 732     /*
 733      * If during sending a host page, only look for dirty pages within the
 734      * current host page being send.
 735      */
 736     if (pss->host_page_sending) {
 737         assert(pss->host_page_end);
 738         size = MIN(size, pss->host_page_end);
 739     }
 740
 741     pss->page = find_next_bit(bitmap, size, pss->page);
 742 }
 743
 744 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 745                                                        unsigned long page)
 746 {
 747     uint8_t shift;
 748     hwaddr size, start;
 749
 750     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 751         return;
 752     }
 753
 754     shift = rb->clear_bmap_shift;
 755     /*
 756      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 757      * can make things easier sometimes since then start address
 758      * of the small chunk will always be 64 pages aligned so the
 759      * bitmap will always be aligned to unsigned long. We should
 760      * even be able to remove this restriction but I'm simply
 761      * keeping it.
 762      */
 763     assert(shift >= 6);
 764
 765     size = 1ULL << (TARGET_PAGE_BITS + shift);
 766     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 767     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 768     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 769 }
 770
 771 static void
 772 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 773                                                  unsigned long start,
 774                                                  unsigned long npages)
 775 {
 776     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 777     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 778     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 779
 780     /*
 781      * Clear pages from start to start + npages - 1, so the end boundary is
 782      * exclusive.
 783      */
 784     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 785         migration_clear_memory_region_dirty_bitmap(rb, i);
 786     }
 787 }
 788
 789 /*
 790  * colo_bitmap_find_diry:find contiguous dirty pages from start
 791  *
 792  * Returns the page offset within memory region of the start of the contiguout
 793  * dirty page
 794  *
 795  * @rs: current RAM state
 796  * @rb: RAMBlock where to search for dirty pages
 797  * @start: page where we start the search
 798  * @num: the number of contiguous dirty pages
 799  */
 800 static inline
 801 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 802                                      unsigned long start, unsigned long *num)
 803 {
 804     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 805     unsigned long *bitmap = rb->bmap;
 806     unsigned long first, next;
 807
 808     *num = 0;
 809
 810     if (ramblock_is_ignored(rb)) {
 811         return size;
 812     }
 813
 814     first = find_next_bit(bitmap, size, start);
 815     if (first >= size) {
 816         return first;
 817     }
 818     next = find_next_zero_bit(bitmap, size, first + 1);
 819     assert(next >= first);
 820     *num = next - first;
 821     return first;
 822 }
 823
 824 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 825                                                 RAMBlock *rb,
 826                                                 unsigned long page)
 827 {
 828     bool ret;
 829
 830     /*
 831      * Clear dirty bitmap if needed.  This _must_ be called before we
 832      * send any of the page in the chunk because we need to make sure
 833      * we can capture further page content changes when we sync dirty
 834      * log the next time.  So as long as we are going to send any of
 835      * the page in the chunk we clear the remote dirty bitmap for all.
 836      * Clearing it earlier won't be a problem, but too late will.
 837      */
 838     migration_clear_memory_region_dirty_bitmap(rb, page);
 839
 840     ret = test_and_clear_bit(page, rb->bmap);
 841     if (ret) {
 842         rs->migration_dirty_pages--;
 843     }
 844
 845     return ret;
 846 }
 847
 848 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 849                                        void *opaque)
 850 {
 851     const hwaddr offset = section->offset_within_region;
 852     const hwaddr size = int128_get64(section->size);
 853     const unsigned long start = offset >> TARGET_PAGE_BITS;
 854     const unsigned long npages = size >> TARGET_PAGE_BITS;
 855     RAMBlock *rb = section->mr->ram_block;
 856     uint64_t *cleared_bits = opaque;
 857
 858     /*
 859      * We don't grab ram_state->bitmap_mutex because we expect to run
 860      * only when starting migration or during postcopy recovery where
 861      * we don't have concurrent access.
 862      */
 863     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 864         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 865     }
 866     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 867     bitmap_clear(rb->bmap, start, npages);
 868 }
 869
 870 /*
 871  * Exclude all dirty pages from migration that fall into a discarded range as
 872  * managed by a RamDiscardManager responsible for the mapped memory region of
 873  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 874  *
 875  * Discarded pages ("logically unplugged") have undefined content and must
 876  * not get migrated, because even reading these pages for migration might
 877  * result in undesired behavior.
 878  *
 879  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 880  *
 881  * Note: The result is only stable while migrating (precopy/postcopy).
 882  */
 883 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 884 {
 885     uint64_t cleared_bits = 0;
 886
 887     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 888         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 889         MemoryRegionSection section = {
 890             .mr = rb->mr,
 891             .offset_within_region = 0,
 892             .size = int128_make64(qemu_ram_get_used_length(rb)),
 893         };
 894
 895         ram_discard_manager_replay_discarded(rdm, &section,
 896                                              dirty_bitmap_clear_section,
 897                                              &cleared_bits);
 898     }
 899     return cleared_bits;
 900 }
 901
 902 /*
 903  * Check if a host-page aligned page falls into a discarded range as managed by
 904  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 905  *
 906  * Note: The result is only stable while migrating (precopy/postcopy).
 907  */
 908 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 909 {
 910     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 911         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 912         MemoryRegionSection section = {
 913             .mr = rb->mr,
 914             .offset_within_region = start,
 915             .size = int128_make64(qemu_ram_pagesize(rb)),
 916         };
 917
 918         return !ram_discard_manager_is_populated(rdm, &section);
 919     }
 920     return false;
 921 }
 922
 923 /* Called with RCU critical section */
 924 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 925 {
 926     uint64_t new_dirty_pages =
 927         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 928
 929     rs->migration_dirty_pages += new_dirty_pages;
 930     rs->num_dirty_pages_period += new_dirty_pages;
 931 }
 932
 933 /**
 934  * ram_pagesize_summary: calculate all the pagesizes of a VM
 935  *
 936  * Returns a summary bitmap of the page sizes of all RAMBlocks
 937  *
 938  * For VMs with just normal pages this is equivalent to the host page
 939  * size. If it's got some huge pages then it's the OR of all the
 940  * different page sizes.
 941  */
 942 uint64_t ram_pagesize_summary(void)
 943 {
 944     RAMBlock *block;
 945     uint64_t summary = 0;
 946
 947     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 948         summary |= block->page_size;
 949     }
 950
 951     return summary;
 952 }
 953
 954 uint64_t ram_get_total_transferred_pages(void)
 955 {
 956     return stat64_get(&mig_stats.normal_pages) +
 957         stat64_get(&mig_stats.zero_pages) +
 958         compression_counters.pages + xbzrle_counters.pages;
 959 }
 960
 961 static void migration_update_rates(RAMState *rs, int64_t end_time)
 962 {
 963     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 964     double compressed_size;
 965
 966     /* calculate period counters */
 967     stat64_set(&mig_stats.dirty_pages_rate,
 968                rs->num_dirty_pages_period * 1000 /
 969                (end_time - rs->time_last_bitmap_sync));
 970
 971     if (!page_count) {
 972         return;
 973     }
 974
 975     if (migrate_xbzrle()) {
 976         double encoded_size, unencoded_size;
 977
 978         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 979             rs->xbzrle_cache_miss_prev) / page_count;
 980         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 981         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 982                          TARGET_PAGE_SIZE;
 983         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 984         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 985             xbzrle_counters.encoding_rate = 0;
 986         } else {
 987             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 988         }
 989         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 990         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 991     }
 992
 993     if (migrate_compress()) {
 994         compression_counters.busy_rate = (double)(compression_counters.busy -
 995             rs->compress_thread_busy_prev) / page_count;
 996         rs->compress_thread_busy_prev = compression_counters.busy;
 997
 998         compressed_size = compression_counters.compressed_size -
 999                           rs->compressed_size_prev;
1000         if (compressed_size) {
1001             double uncompressed_size = (compression_counters.pages -
1002                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1003
1004             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1005             compression_counters.compression_rate =
1006                                         uncompressed_size / compressed_size;
1007
1008             rs->compress_pages_prev = compression_counters.pages;
1009             rs->compressed_size_prev = compression_counters.compressed_size;
1010         }
1011     }
1012 }
1013
1014 static void migration_trigger_throttle(RAMState *rs)
1015 {
1016     uint64_t threshold = migrate_throttle_trigger_threshold();
1017     uint64_t bytes_xfer_period =
1018         stat64_get(&mig_stats.transferred) - rs->bytes_xfer_prev;
1019     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1020     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1021
1022     /* During block migration the auto-converge logic incorrectly detects
1023      * that ram migration makes no progress. Avoid this by disabling the
1024      * throttling logic during the bulk phase of block migration. */
1025     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1026         /* The following detection logic can be refined later. For now:
1027            Check to see if the ratio between dirtied bytes and the approx.
1028            amount of bytes that just got transferred since the last time
1029            we were in this routine reaches the threshold. If that happens
1030            twice, start or increase throttling. */
1031
1032         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1033             (++rs->dirty_rate_high_cnt >= 2)) {
1034             trace_migration_throttle();
1035             rs->dirty_rate_high_cnt = 0;
1036             mig_throttle_guest_down(bytes_dirty_period,
1037                                     bytes_dirty_threshold);
1038         }
1039     }
1040 }
1041
1042 static void migration_bitmap_sync(RAMState *rs)
1043 {
1044     RAMBlock *block;
1045     int64_t end_time;
1046
1047     stat64_add(&mig_stats.dirty_sync_count, 1);
1048
1049     if (!rs->time_last_bitmap_sync) {
1050         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1051     }
1052
1053     trace_migration_bitmap_sync_start();
1054     memory_global_dirty_log_sync();
1055
1056     qemu_mutex_lock(&rs->bitmap_mutex);
1057     WITH_RCU_READ_LOCK_GUARD() {
1058         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1059             ramblock_sync_dirty_bitmap(rs, block);
1060         }
1061         stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
1062     }
1063     qemu_mutex_unlock(&rs->bitmap_mutex);
1064
1065     memory_global_after_dirty_log_sync();
1066     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1067
1068     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1069
1070     /* more than 1 second = 1000 millisecons */
1071     if (end_time > rs->time_last_bitmap_sync + 1000) {
1072         migration_trigger_throttle(rs);
1073
1074         migration_update_rates(rs, end_time);
1075
1076         rs->target_page_count_prev = rs->target_page_count;
1077
1078         /* reset period counters */
1079         rs->time_last_bitmap_sync = end_time;
1080         rs->num_dirty_pages_period = 0;
1081         rs->bytes_xfer_prev = stat64_get(&mig_stats.transferred);
1082     }
1083     if (migrate_events()) {
1084         uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
1085         qapi_event_send_migration_pass(generation);
1086     }
1087 }
1088
1089 static void migration_bitmap_sync_precopy(RAMState *rs)
1090 {
1091     Error *local_err = NULL;
1092
1093     /*
1094      * The current notifier usage is just an optimization to migration, so we
1095      * don't stop the normal migration process in the error case.
1096      */
1097     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1098         error_report_err(local_err);
1099         local_err = NULL;
1100     }
1101
1102     migration_bitmap_sync(rs);
1103
1104     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1105         error_report_err(local_err);
1106     }
1107 }
1108
1109 void ram_release_page(const char *rbname, uint64_t offset)
1110 {
1111     if (!migrate_release_ram() || !migration_in_postcopy()) {
1112         return;
1113     }
1114
1115     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1116 }
1117
1118 /**
1119  * save_zero_page_to_file: send the zero page to the file
1120  *
1121  * Returns the size of data written to the file, 0 means the page is not
1122  * a zero page
1123  *
1124  * @pss: current PSS channel
1125  * @block: block that contains the page we want to send
1126  * @offset: offset inside the block for the page
1127  */
1128 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1129                                   RAMBlock *block, ram_addr_t offset)
1130 {
1131     uint8_t *p = block->host + offset;
1132     int len = 0;
1133
1134     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1135         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1136         qemu_put_byte(file, 0);
1137         len += 1;
1138         ram_release_page(block->idstr, offset);
1139     }
1140     return len;
1141 }
1142
1143 /**
1144  * save_zero_page: send the zero page to the stream
1145  *
1146  * Returns the number of pages written.
1147  *
1148  * @pss: current PSS channel
1149  * @block: block that contains the page we want to send
1150  * @offset: offset inside the block for the page
1151  */
1152 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1153                           ram_addr_t offset)
1154 {
1155     int len = save_zero_page_to_file(pss, f, block, offset);
1156
1157     if (len) {
1158         stat64_add(&mig_stats.zero_pages, 1);
1159         ram_transferred_add(len);
1160         return 1;
1161     }
1162     return -1;
1163 }
1164
1165 /*
1166  * @pages: the number of pages written by the control path,
1167  *        < 0 - error
1168  *        > 0 - number of pages written
1169  *
1170  * Return true if the pages has been saved, otherwise false is returned.
1171  */
1172 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1173                               ram_addr_t offset, int *pages)
1174 {
1175     uint64_t bytes_xmit = 0;
1176     int ret;
1177
1178     *pages = -1;
1179     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1180                                 TARGET_PAGE_SIZE, &bytes_xmit);
1181     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1182         return false;
1183     }
1184
1185     if (bytes_xmit) {
1186         ram_transferred_add(bytes_xmit);
1187         *pages = 1;
1188     }
1189
1190     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1191         return true;
1192     }
1193
1194     if (bytes_xmit > 0) {
1195         stat64_add(&mig_stats.normal_pages, 1);
1196     } else if (bytes_xmit == 0) {
1197         stat64_add(&mig_stats.zero_pages, 1);
1198     }
1199
1200     return true;
1201 }
1202
1203 /*
1204  * directly send the page to the stream
1205  *
1206  * Returns the number of pages written.
1207  *
1208  * @pss: current PSS channel
1209  * @block: block that contains the page we want to send
1210  * @offset: offset inside the block for the page
1211  * @buf: the page to be sent
1212  * @async: send to page asyncly
1213  */
1214 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1215                             ram_addr_t offset, uint8_t *buf, bool async)
1216 {
1217     QEMUFile *file = pss->pss_channel;
1218
1219     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1220                                          offset | RAM_SAVE_FLAG_PAGE));
1221     if (async) {
1222         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1223                               migrate_release_ram() &&
1224                               migration_in_postcopy());
1225     } else {
1226         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1227     }
1228     ram_transferred_add(TARGET_PAGE_SIZE);
1229     stat64_add(&mig_stats.normal_pages, 1);
1230     return 1;
1231 }
1232
1233 /**
1234  * ram_save_page: send the given page to the stream
1235  *
1236  * Returns the number of pages written.
1237  *          < 0 - error
1238  *          >=0 - Number of pages written - this might legally be 0
1239  *                if xbzrle noticed the page was the same.
1240  *
1241  * @rs: current RAM state
1242  * @block: block that contains the page we want to send
1243  * @offset: offset inside the block for the page
1244  */
1245 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1246 {
1247     int pages = -1;
1248     uint8_t *p;
1249     bool send_async = true;
1250     RAMBlock *block = pss->block;
1251     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1252     ram_addr_t current_addr = block->offset + offset;
1253
1254     p = block->host + offset;
1255     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1256
1257     XBZRLE_cache_lock();
1258     if (rs->xbzrle_started && !migration_in_postcopy()) {
1259         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1260                                  block, offset);
1261         if (!rs->last_stage) {
1262             /* Can't send this cached data async, since the cache page
1263              * might get updated before it gets to the wire
1264              */
1265             send_async = false;
1266         }
1267     }
1268
1269     /* XBZRLE overflow or normal page */
1270     if (pages == -1) {
1271         pages = save_normal_page(pss, block, offset, p, send_async);
1272     }
1273
1274     XBZRLE_cache_unlock();
1275
1276     return pages;
1277 }
1278
1279 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1280                                  ram_addr_t offset)
1281 {
1282     if (multifd_queue_page(file, block, offset) < 0) {
1283         return -1;
1284     }
1285     stat64_add(&mig_stats.normal_pages, 1);
1286
1287     return 1;
1288 }
1289
1290 static void
1291 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1292 {
1293     ram_transferred_add(bytes_xmit);
1294
1295     if (param->result == RES_ZEROPAGE) {
1296         stat64_add(&mig_stats.zero_pages, 1);
1297         return;
1298     }
1299
1300     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1301     compression_counters.compressed_size += bytes_xmit - 8;
1302     compression_counters.pages++;
1303 }
1304
1305 static bool save_page_use_compression(RAMState *rs);
1306
1307 static int send_queued_data(CompressParam *param)
1308 {
1309     PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_PRECOPY];
1310     MigrationState *ms = migrate_get_current();
1311     QEMUFile *file = ms->to_dst_file;
1312     int len = 0;
1313
1314     RAMBlock *block = param->block;
1315     ram_addr_t offset = param->offset;
1316
1317     if (param->result == RES_NONE) {
1318         return 0;
1319     }
1320
1321     assert(block == pss->last_sent_block);
1322
1323     if (param->result == RES_ZEROPAGE) {
1324         assert(qemu_file_buffer_empty(param->file));
1325         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1326         qemu_put_byte(file, 0);
1327         len += 1;
1328         ram_release_page(block->idstr, offset);
1329     } else if (param->result == RES_COMPRESS) {
1330         assert(!qemu_file_buffer_empty(param->file));
1331         len += save_page_header(pss, file, block,
1332                                 offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1333         len += qemu_put_qemu_file(file, param->file);
1334     } else {
1335         abort();
1336     }
1337
1338     update_compress_thread_counts(param, len);
1339
1340     return len;
1341 }
1342
1343 static void ram_flush_compressed_data(RAMState *rs)
1344 {
1345     if (!save_page_use_compression(rs)) {
1346         return;
1347     }
1348
1349     flush_compressed_data(send_queued_data);
1350 }
1351
1352 #define PAGE_ALL_CLEAN 0
1353 #define PAGE_TRY_AGAIN 1
1354 #define PAGE_DIRTY_FOUND 2
1355 /**
1356  * find_dirty_block: find the next dirty page and update any state
1357  * associated with the search process.
1358  *
1359  * Returns:
1360  *         <0: An error happened
1361  *         PAGE_ALL_CLEAN: no dirty page found, give up
1362  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1363  *         PAGE_DIRTY_FOUND: dirty page found
1364  *
1365  * @rs: current RAM state
1366  * @pss: data about the state of the current dirty page scan
1367  * @again: set to false if the search has scanned the whole of RAM
1368  */
1369 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1370 {
1371     /* Update pss->page for the next dirty bit in ramblock */
1372     pss_find_next_dirty(pss);
1373
1374     if (pss->complete_round && pss->block == rs->last_seen_block &&
1375         pss->page >= rs->last_page) {
1376         /*
1377          * We've been once around the RAM and haven't found anything.
1378          * Give up.
1379          */
1380         return PAGE_ALL_CLEAN;
1381     }
1382     if (!offset_in_ramblock(pss->block,
1383                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1384         /* Didn't find anything in this RAM Block */
1385         pss->page = 0;
1386         pss->block = QLIST_NEXT_RCU(pss->block, next);
1387         if (!pss->block) {
1388             if (!migrate_multifd_flush_after_each_section()) {
1389                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1390                 int ret = multifd_send_sync_main(f);
1391                 if (ret < 0) {
1392                     return ret;
1393                 }
1394                 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1395                 qemu_fflush(f);
1396             }
1397             /*
1398              * If memory migration starts over, we will meet a dirtied page
1399              * which may still exists in compression threads's ring, so we
1400              * should flush the compressed data to make sure the new page
1401              * is not overwritten by the old one in the destination.
1402              *
1403              * Also If xbzrle is on, stop using the data compression at this
1404              * point. In theory, xbzrle can do better than compression.
1405              */
1406             ram_flush_compressed_data(rs);
1407
1408             /* Hit the end of the list */
1409             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1410             /* Flag that we've looped */
1411             pss->complete_round = true;
1412             /* After the first round, enable XBZRLE. */
1413             if (migrate_xbzrle()) {
1414                 rs->xbzrle_started = true;
1415             }
1416         }
1417         /* Didn't find anything this time, but try again on the new block */
1418         return PAGE_TRY_AGAIN;
1419     } else {
1420         /* We've found something */
1421         return PAGE_DIRTY_FOUND;
1422     }
1423 }
1424
1425 /**
1426  * unqueue_page: gets a page of the queue
1427  *
1428  * Helper for 'get_queued_page' - gets a page off the queue
1429  *
1430  * Returns the block of the page (or NULL if none available)
1431  *
1432  * @rs: current RAM state
1433  * @offset: used to return the offset within the RAMBlock
1434  */
1435 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1436 {
1437     struct RAMSrcPageRequest *entry;
1438     RAMBlock *block = NULL;
1439
1440     if (!postcopy_has_request(rs)) {
1441         return NULL;
1442     }
1443
1444     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1445
1446     /*
1447      * This should _never_ change even after we take the lock, because no one
1448      * should be taking anything off the request list other than us.
1449      */
1450     assert(postcopy_has_request(rs));
1451
1452     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1453     block = entry->rb;
1454     *offset = entry->offset;
1455
1456     if (entry->len > TARGET_PAGE_SIZE) {
1457         entry->len -= TARGET_PAGE_SIZE;
1458         entry->offset += TARGET_PAGE_SIZE;
1459     } else {
1460         memory_region_unref(block->mr);
1461         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1462         g_free(entry);
1463         migration_consume_urgent_request();
1464     }
1465
1466     return block;
1467 }
1468
1469 #if defined(__linux__)
1470 /**
1471  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1472  *   is found, return RAM block pointer and page offset
1473  *
1474  * Returns pointer to the RAMBlock containing faulting page,
1475  *   NULL if no write faults are pending
1476  *
1477  * @rs: current RAM state
1478  * @offset: page offset from the beginning of the block
1479  */
1480 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1481 {
1482     struct uffd_msg uffd_msg;
1483     void *page_address;
1484     RAMBlock *block;
1485     int res;
1486
1487     if (!migrate_background_snapshot()) {
1488         return NULL;
1489     }
1490
1491     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1492     if (res <= 0) {
1493         return NULL;
1494     }
1495
1496     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1497     block = qemu_ram_block_from_host(page_address, false, offset);
1498     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1499     return block;
1500 }
1501
1502 /**
1503  * ram_save_release_protection: release UFFD write protection after
1504  *   a range of pages has been saved
1505  *
1506  * @rs: current RAM state
1507  * @pss: page-search-status structure
1508  * @start_page: index of the first page in the range relative to pss->block
1509  *
1510  * Returns 0 on success, negative value in case of an error
1511 */
1512 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1513         unsigned long start_page)
1514 {
1515     int res = 0;
1516
1517     /* Check if page is from UFFD-managed region. */
1518     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1519         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1520         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1521
1522         /* Flush async buffers before un-protect. */
1523         qemu_fflush(pss->pss_channel);
1524         /* Un-protect memory range. */
1525         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1526                 false, false);
1527     }
1528
1529     return res;
1530 }
1531
1532 /* ram_write_tracking_available: check if kernel supports required UFFD features
1533  *
1534  * Returns true if supports, false otherwise
1535  */
1536 bool ram_write_tracking_available(void)
1537 {
1538     uint64_t uffd_features;
1539     int res;
1540
1541     res = uffd_query_features(&uffd_features);
1542     return (res == 0 &&
1543             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1544 }
1545
1546 /* ram_write_tracking_compatible: check if guest configuration is
1547  *   compatible with 'write-tracking'
1548  *
1549  * Returns true if compatible, false otherwise
1550  */
1551 bool ram_write_tracking_compatible(void)
1552 {
1553     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1554     int uffd_fd;
1555     RAMBlock *block;
1556     bool ret = false;
1557
1558     /* Open UFFD file descriptor */
1559     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1560     if (uffd_fd < 0) {
1561         return false;
1562     }
1563
1564     RCU_READ_LOCK_GUARD();
1565
1566     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1567         uint64_t uffd_ioctls;
1568
1569         /* Nothing to do with read-only and MMIO-writable regions */
1570         if (block->mr->readonly || block->mr->rom_device) {
1571             continue;
1572         }
1573         /* Try to register block memory via UFFD-IO to track writes */
1574         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1575                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1576             goto out;
1577         }
1578         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1579             goto out;
1580         }
1581     }
1582     ret = true;
1583
1584 out:
1585     uffd_close_fd(uffd_fd);
1586     return ret;
1587 }
1588
1589 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1590                                        ram_addr_t size)
1591 {
1592     const ram_addr_t end = offset + size;
1593
1594     /*
1595      * We read one byte of each page; this will preallocate page tables if
1596      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1597      * where no page was populated yet. This might require adaption when
1598      * supporting other mappings, like shmem.
1599      */
1600     for (; offset < end; offset += block->page_size) {
1601         char tmp = *((char *)block->host + offset);
1602
1603         /* Don't optimize the read out */
1604         asm volatile("" : "+r" (tmp));
1605     }
1606 }
1607
1608 static inline int populate_read_section(MemoryRegionSection *section,
1609                                         void *opaque)
1610 {
1611     const hwaddr size = int128_get64(section->size);
1612     hwaddr offset = section->offset_within_region;
1613     RAMBlock *block = section->mr->ram_block;
1614
1615     populate_read_range(block, offset, size);
1616     return 0;
1617 }
1618
1619 /*
1620  * ram_block_populate_read: preallocate page tables and populate pages in the
1621  *   RAM block by reading a byte of each page.
1622  *
1623  * Since it's solely used for userfault_fd WP feature, here we just
1624  *   hardcode page size to qemu_real_host_page_size.
1625  *
1626  * @block: RAM block to populate
1627  */
1628 static void ram_block_populate_read(RAMBlock *rb)
1629 {
1630     /*
1631      * Skip populating all pages that fall into a discarded range as managed by
1632      * a RamDiscardManager responsible for the mapped memory region of the
1633      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1634      * must not get populated automatically. We don't have to track
1635      * modifications via userfaultfd WP reliably, because these pages will
1636      * not be part of the migration stream either way -- see
1637      * ramblock_dirty_bitmap_exclude_discarded_pages().
1638      *
1639      * Note: The result is only stable while migrating (precopy/postcopy).
1640      */
1641     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1642         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1643         MemoryRegionSection section = {
1644             .mr = rb->mr,
1645             .offset_within_region = 0,
1646             .size = rb->mr->size,
1647         };
1648
1649         ram_discard_manager_replay_populated(rdm, &section,
1650                                              populate_read_section, NULL);
1651     } else {
1652         populate_read_range(rb, 0, rb->used_length);
1653     }
1654 }
1655
1656 /*
1657  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1658  */
1659 void ram_write_tracking_prepare(void)
1660 {
1661     RAMBlock *block;
1662
1663     RCU_READ_LOCK_GUARD();
1664
1665     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1666         /* Nothing to do with read-only and MMIO-writable regions */
1667         if (block->mr->readonly || block->mr->rom_device) {
1668             continue;
1669         }
1670
1671         /*
1672          * Populate pages of the RAM block before enabling userfault_fd
1673          * write protection.
1674          *
1675          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1676          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1677          * pages with pte_none() entries in page table.
1678          */
1679         ram_block_populate_read(block);
1680     }
1681 }
1682
1683 static inline int uffd_protect_section(MemoryRegionSection *section,
1684                                        void *opaque)
1685 {
1686     const hwaddr size = int128_get64(section->size);
1687     const hwaddr offset = section->offset_within_region;
1688     RAMBlock *rb = section->mr->ram_block;
1689     int uffd_fd = (uintptr_t)opaque;
1690
1691     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1692                                   false);
1693 }
1694
1695 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1696 {
1697     assert(rb->flags & RAM_UF_WRITEPROTECT);
1698
1699     /* See ram_block_populate_read() */
1700     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1701         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1702         MemoryRegionSection section = {
1703             .mr = rb->mr,
1704             .offset_within_region = 0,
1705             .size = rb->mr->size,
1706         };
1707
1708         return ram_discard_manager_replay_populated(rdm, &section,
1709                                                     uffd_protect_section,
1710                                                     (void *)(uintptr_t)uffd_fd);
1711     }
1712     return uffd_change_protection(uffd_fd, rb->host,
1713                                   rb->used_length, true, false);
1714 }
1715
1716 /*
1717  * ram_write_tracking_start: start UFFD-WP memory tracking
1718  *
1719  * Returns 0 for success or negative value in case of error
1720  */
1721 int ram_write_tracking_start(void)
1722 {
1723     int uffd_fd;
1724     RAMState *rs = ram_state;
1725     RAMBlock *block;
1726
1727     /* Open UFFD file descriptor */
1728     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1729     if (uffd_fd < 0) {
1730         return uffd_fd;
1731     }
1732     rs->uffdio_fd = uffd_fd;
1733
1734     RCU_READ_LOCK_GUARD();
1735
1736     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1737         /* Nothing to do with read-only and MMIO-writable regions */
1738         if (block->mr->readonly || block->mr->rom_device) {
1739             continue;
1740         }
1741
1742         /* Register block memory with UFFD to track writes */
1743         if (uffd_register_memory(rs->uffdio_fd, block->host,
1744                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1745             goto fail;
1746         }
1747         block->flags |= RAM_UF_WRITEPROTECT;
1748         memory_region_ref(block->mr);
1749
1750         /* Apply UFFD write protection to the block memory range */
1751         if (ram_block_uffd_protect(block, uffd_fd)) {
1752             goto fail;
1753         }
1754
1755         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1756                 block->host, block->max_length);
1757     }
1758
1759     return 0;
1760
1761 fail:
1762     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1763
1764     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1765         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1766             continue;
1767         }
1768         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1769         /* Cleanup flags and remove reference */
1770         block->flags &= ~RAM_UF_WRITEPROTECT;
1771         memory_region_unref(block->mr);
1772     }
1773
1774     uffd_close_fd(uffd_fd);
1775     rs->uffdio_fd = -1;
1776     return -1;
1777 }
1778
1779 /**
1780  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1781  */
1782 void ram_write_tracking_stop(void)
1783 {
1784     RAMState *rs = ram_state;
1785     RAMBlock *block;
1786
1787     RCU_READ_LOCK_GUARD();
1788
1789     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1790         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1791             continue;
1792         }
1793         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1794
1795         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1796                 block->host, block->max_length);
1797
1798         /* Cleanup flags and remove reference */
1799         block->flags &= ~RAM_UF_WRITEPROTECT;
1800         memory_region_unref(block->mr);
1801     }
1802
1803     /* Finally close UFFD file descriptor */
1804     uffd_close_fd(rs->uffdio_fd);
1805     rs->uffdio_fd = -1;
1806 }
1807
1808 #else
1809 /* No target OS support, stubs just fail or ignore */
1810
1811 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1812 {
1813     (void) rs;
1814     (void) offset;
1815
1816     return NULL;
1817 }
1818
1819 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1820         unsigned long start_page)
1821 {
1822     (void) rs;
1823     (void) pss;
1824     (void) start_page;
1825
1826     return 0;
1827 }
1828
1829 bool ram_write_tracking_available(void)
1830 {
1831     return false;
1832 }
1833
1834 bool ram_write_tracking_compatible(void)
1835 {
1836     assert(0);
1837     return false;
1838 }
1839
1840 int ram_write_tracking_start(void)
1841 {
1842     assert(0);
1843     return -1;
1844 }
1845
1846 void ram_write_tracking_stop(void)
1847 {
1848     assert(0);
1849 }
1850 #endif /* defined(__linux__) */
1851
1852 /**
1853  * get_queued_page: unqueue a page from the postcopy requests
1854  *
1855  * Skips pages that are already sent (!dirty)
1856  *
1857  * Returns true if a queued page is found
1858  *
1859  * @rs: current RAM state
1860  * @pss: data about the state of the current dirty page scan
1861  */
1862 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1863 {
1864     RAMBlock  *block;
1865     ram_addr_t offset;
1866     bool dirty;
1867
1868     do {
1869         block = unqueue_page(rs, &offset);
1870         /*
1871          * We're sending this page, and since it's postcopy nothing else
1872          * will dirty it, and we must make sure it doesn't get sent again
1873          * even if this queue request was received after the background
1874          * search already sent it.
1875          */
1876         if (block) {
1877             unsigned long page;
1878
1879             page = offset >> TARGET_PAGE_BITS;
1880             dirty = test_bit(page, block->bmap);
1881             if (!dirty) {
1882                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1883                                                 page);
1884             } else {
1885                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1886             }
1887         }
1888
1889     } while (block && !dirty);
1890
1891     if (!block) {
1892         /*
1893          * Poll write faults too if background snapshot is enabled; that's
1894          * when we have vcpus got blocked by the write protected pages.
1895          */
1896         block = poll_fault_page(rs, &offset);
1897     }
1898
1899     if (block) {
1900         /*
1901          * We want the background search to continue from the queued page
1902          * since the guest is likely to want other pages near to the page
1903          * it just requested.
1904          */
1905         pss->block = block;
1906         pss->page = offset >> TARGET_PAGE_BITS;
1907
1908         /*
1909          * This unqueued page would break the "one round" check, even is
1910          * really rare.
1911          */
1912         pss->complete_round = false;
1913     }
1914
1915     return !!block;
1916 }
1917
1918 /**
1919  * migration_page_queue_free: drop any remaining pages in the ram
1920  * request queue
1921  *
1922  * It should be empty at the end anyway, but in error cases there may
1923  * be some left.  in case that there is any page left, we drop it.
1924  *
1925  */
1926 static void migration_page_queue_free(RAMState *rs)
1927 {
1928     struct RAMSrcPageRequest *mspr, *next_mspr;
1929     /* This queue generally should be empty - but in the case of a failed
1930      * migration might have some droppings in.
1931      */
1932     RCU_READ_LOCK_GUARD();
1933     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1934         memory_region_unref(mspr->rb->mr);
1935         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1936         g_free(mspr);
1937     }
1938 }
1939
1940 /**
1941  * ram_save_queue_pages: queue the page for transmission
1942  *
1943  * A request from postcopy destination for example.
1944  *
1945  * Returns zero on success or negative on error
1946  *
1947  * @rbname: Name of the RAMBLock of the request. NULL means the
1948  *          same that last one.
1949  * @start: starting address from the start of the RAMBlock
1950  * @len: length (in bytes) to send
1951  */
1952 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1953 {
1954     RAMBlock *ramblock;
1955     RAMState *rs = ram_state;
1956
1957     stat64_add(&mig_stats.postcopy_requests, 1);
1958     RCU_READ_LOCK_GUARD();
1959
1960     if (!rbname) {
1961         /* Reuse last RAMBlock */
1962         ramblock = rs->last_req_rb;
1963
1964         if (!ramblock) {
1965             /*
1966              * Shouldn't happen, we can't reuse the last RAMBlock if
1967              * it's the 1st request.
1968              */
1969             error_report("ram_save_queue_pages no previous block");
1970             return -1;
1971         }
1972     } else {
1973         ramblock = qemu_ram_block_by_name(rbname);
1974
1975         if (!ramblock) {
1976             /* We shouldn't be asked for a non-existent RAMBlock */
1977             error_report("ram_save_queue_pages no block '%s'", rbname);
1978             return -1;
1979         }
1980         rs->last_req_rb = ramblock;
1981     }
1982     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1983     if (!offset_in_ramblock(ramblock, start + len - 1)) {
1984         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1985                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1986                      __func__, start, len, ramblock->used_length);
1987         return -1;
1988     }
1989
1990     /*
1991      * When with postcopy preempt, we send back the page directly in the
1992      * rp-return thread.
1993      */
1994     if (postcopy_preempt_active()) {
1995         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
1996         size_t page_size = qemu_ram_pagesize(ramblock);
1997         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
1998         int ret = 0;
1999
2000         qemu_mutex_lock(&rs->bitmap_mutex);
2001
2002         pss_init(pss, ramblock, page_start);
2003         /*
2004          * Always use the preempt channel, and make sure it's there.  It's
2005          * safe to access without lock, because when rp-thread is running
2006          * we should be the only one who operates on the qemufile
2007          */
2008         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2009         assert(pss->pss_channel);
2010
2011         /*
2012          * It must be either one or multiple of host page size.  Just
2013          * assert; if something wrong we're mostly split brain anyway.
2014          */
2015         assert(len % page_size == 0);
2016         while (len) {
2017             if (ram_save_host_page_urgent(pss)) {
2018                 error_report("%s: ram_save_host_page_urgent() failed: "
2019                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2020                              __func__, ramblock->idstr, start);
2021                 ret = -1;
2022                 break;
2023             }
2024             /*
2025              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2026              * will automatically be moved and point to the next host page
2027              * we're going to send, so no need to update here.
2028              *
2029              * Normally QEMU never sends >1 host page in requests, so
2030              * logically we don't even need that as the loop should only
2031              * run once, but just to be consistent.
2032              */
2033             len -= page_size;
2034         };
2035         qemu_mutex_unlock(&rs->bitmap_mutex);
2036
2037         return ret;
2038     }
2039
2040     struct RAMSrcPageRequest *new_entry =
2041         g_new0(struct RAMSrcPageRequest, 1);
2042     new_entry->rb = ramblock;
2043     new_entry->offset = start;
2044     new_entry->len = len;
2045
2046     memory_region_ref(ramblock->mr);
2047     qemu_mutex_lock(&rs->src_page_req_mutex);
2048     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2049     migration_make_urgent_request();
2050     qemu_mutex_unlock(&rs->src_page_req_mutex);
2051
2052     return 0;
2053 }
2054
2055 static bool save_page_use_compression(RAMState *rs)
2056 {
2057     if (!migrate_compress()) {
2058         return false;
2059     }
2060
2061     /*
2062      * If xbzrle is enabled (e.g., after first round of migration), stop
2063      * using the data compression. In theory, xbzrle can do better than
2064      * compression.
2065      */
2066     if (rs->xbzrle_started) {
2067         return false;
2068     }
2069
2070     return true;
2071 }
2072
2073 /*
2074  * try to compress the page before posting it out, return true if the page
2075  * has been properly handled by compression, otherwise needs other
2076  * paths to handle it
2077  */
2078 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2079                                RAMBlock *block, ram_addr_t offset)
2080 {
2081     if (!save_page_use_compression(rs)) {
2082         return false;
2083     }
2084
2085     /*
2086      * When starting the process of a new block, the first page of
2087      * the block should be sent out before other pages in the same
2088      * block, and all the pages in last block should have been sent
2089      * out, keeping this order is important, because the 'cont' flag
2090      * is used to avoid resending the block name.
2091      *
2092      * We post the fist page as normal page as compression will take
2093      * much CPU resource.
2094      */
2095     if (block != pss->last_sent_block) {
2096         ram_flush_compressed_data(rs);
2097         return false;
2098     }
2099
2100     if (compress_page_with_multi_thread(block, offset, send_queued_data) > 0) {
2101         return true;
2102     }
2103
2104     compression_counters.busy++;
2105     return false;
2106 }
2107
2108 /**
2109  * ram_save_target_page_legacy: save one target page
2110  *
2111  * Returns the number of pages written
2112  *
2113  * @rs: current RAM state
2114  * @pss: data about the page we want to send
2115  */
2116 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2117 {
2118     RAMBlock *block = pss->block;
2119     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2120     int res;
2121
2122     if (control_save_page(pss, block, offset, &res)) {
2123         return res;
2124     }
2125
2126     if (save_compress_page(rs, pss, block, offset)) {
2127         return 1;
2128     }
2129
2130     res = save_zero_page(pss, pss->pss_channel, block, offset);
2131     if (res > 0) {
2132         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2133          * page would be stale
2134          */
2135         if (rs->xbzrle_started) {
2136             XBZRLE_cache_lock();
2137             xbzrle_cache_zero_page(rs, block->offset + offset);
2138             XBZRLE_cache_unlock();
2139         }
2140         return res;
2141     }
2142
2143     /*
2144      * Do not use multifd in postcopy as one whole host page should be
2145      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2146      * if host page size == guest page size the dest guest during run may
2147      * still see partially copied pages which is data corruption.
2148      */
2149     if (migrate_multifd() && !migration_in_postcopy()) {
2150         return ram_save_multifd_page(pss->pss_channel, block, offset);
2151     }
2152
2153     return ram_save_page(rs, pss);
2154 }
2155
2156 /* Should be called before sending a host page */
2157 static void pss_host_page_prepare(PageSearchStatus *pss)
2158 {
2159     /* How many guest pages are there in one host page? */
2160     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2161
2162     pss->host_page_sending = true;
2163     if (guest_pfns <= 1) {
2164         /*
2165          * This covers both when guest psize == host psize, or when guest
2166          * has larger psize than the host (guest_pfns==0).
2167          *
2168          * For the latter, we always send one whole guest page per
2169          * iteration of the host page (example: an Alpha VM on x86 host
2170          * will have guest psize 8K while host psize 4K).
2171          */
2172         pss->host_page_start = pss->page;
2173         pss->host_page_end = pss->page + 1;
2174     } else {
2175         /*
2176          * The host page spans over multiple guest pages, we send them
2177          * within the same host page iteration.
2178          */
2179         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2180         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2181     }
2182 }
2183
2184 /*
2185  * Whether the page pointed by PSS is within the host page being sent.
2186  * Must be called after a previous pss_host_page_prepare().
2187  */
2188 static bool pss_within_range(PageSearchStatus *pss)
2189 {
2190     ram_addr_t ram_addr;
2191
2192     assert(pss->host_page_sending);
2193
2194     /* Over host-page boundary? */
2195     if (pss->page >= pss->host_page_end) {
2196         return false;
2197     }
2198
2199     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2200
2201     return offset_in_ramblock(pss->block, ram_addr);
2202 }
2203
2204 static void pss_host_page_finish(PageSearchStatus *pss)
2205 {
2206     pss->host_page_sending = false;
2207     /* This is not needed, but just to reset it */
2208     pss->host_page_start = pss->host_page_end = 0;
2209 }
2210
2211 /*
2212  * Send an urgent host page specified by `pss'.  Need to be called with
2213  * bitmap_mutex held.
2214  *
2215  * Returns 0 if save host page succeeded, false otherwise.
2216  */
2217 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2218 {
2219     bool page_dirty, sent = false;
2220     RAMState *rs = ram_state;
2221     int ret = 0;
2222
2223     trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2224     pss_host_page_prepare(pss);
2225
2226     /*
2227      * If precopy is sending the same page, let it be done in precopy, or
2228      * we could send the same page in two channels and none of them will
2229      * receive the whole page.
2230      */
2231     if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2232         trace_postcopy_preempt_hit(pss->block->idstr,
2233                                    pss->page << TARGET_PAGE_BITS);
2234         return 0;
2235     }
2236
2237     do {
2238         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2239
2240         if (page_dirty) {
2241             /* Be strict to return code; it must be 1, or what else? */
2242             if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2243                 error_report_once("%s: ram_save_target_page failed", __func__);
2244                 ret = -1;
2245                 goto out;
2246             }
2247             sent = true;
2248         }
2249         pss_find_next_dirty(pss);
2250     } while (pss_within_range(pss));
2251 out:
2252     pss_host_page_finish(pss);
2253     /* For urgent requests, flush immediately if sent */
2254     if (sent) {
2255         qemu_fflush(pss->pss_channel);
2256     }
2257     return ret;
2258 }
2259
2260 /**
2261  * ram_save_host_page: save a whole host page
2262  *
2263  * Starting at *offset send pages up to the end of the current host
2264  * page. It's valid for the initial offset to point into the middle of
2265  * a host page in which case the remainder of the hostpage is sent.
2266  * Only dirty target pages are sent. Note that the host page size may
2267  * be a huge page for this block.
2268  *
2269  * The saving stops at the boundary of the used_length of the block
2270  * if the RAMBlock isn't a multiple of the host page size.
2271  *
2272  * The caller must be with ram_state.bitmap_mutex held to call this
2273  * function.  Note that this function can temporarily release the lock, but
2274  * when the function is returned it'll make sure the lock is still held.
2275  *
2276  * Returns the number of pages written or negative on error
2277  *
2278  * @rs: current RAM state
2279  * @pss: data about the page we want to send
2280  */
2281 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2282 {
2283     bool page_dirty, preempt_active = postcopy_preempt_active();
2284     int tmppages, pages = 0;
2285     size_t pagesize_bits =
2286         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2287     unsigned long start_page = pss->page;
2288     int res;
2289
2290     if (ramblock_is_ignored(pss->block)) {
2291         error_report("block %s should not be migrated !", pss->block->idstr);
2292         return 0;
2293     }
2294
2295     /* Update host page boundary information */
2296     pss_host_page_prepare(pss);
2297
2298     do {
2299         page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2300
2301         /* Check the pages is dirty and if it is send it */
2302         if (page_dirty) {
2303             /*
2304              * Properly yield the lock only in postcopy preempt mode
2305              * because both migration thread and rp-return thread can
2306              * operate on the bitmaps.
2307              */
2308             if (preempt_active) {
2309                 qemu_mutex_unlock(&rs->bitmap_mutex);
2310             }
2311             tmppages = migration_ops->ram_save_target_page(rs, pss);
2312             if (tmppages >= 0) {
2313                 pages += tmppages;
2314                 /*
2315                  * Allow rate limiting to happen in the middle of huge pages if
2316                  * something is sent in the current iteration.
2317                  */
2318                 if (pagesize_bits > 1 && tmppages > 0) {
2319                     migration_rate_limit();
2320                 }
2321             }
2322             if (preempt_active) {
2323                 qemu_mutex_lock(&rs->bitmap_mutex);
2324             }
2325         } else {
2326             tmppages = 0;
2327         }
2328
2329         if (tmppages < 0) {
2330             pss_host_page_finish(pss);
2331             return tmppages;
2332         }
2333
2334         pss_find_next_dirty(pss);
2335     } while (pss_within_range(pss));
2336
2337     pss_host_page_finish(pss);
2338
2339     res = ram_save_release_protection(rs, pss, start_page);
2340     return (res < 0 ? res : pages);
2341 }
2342
2343 /**
2344  * ram_find_and_save_block: finds a dirty page and sends it to f
2345  *
2346  * Called within an RCU critical section.
2347  *
2348  * Returns the number of pages written where zero means no dirty pages,
2349  * or negative on error
2350  *
2351  * @rs: current RAM state
2352  *
2353  * On systems where host-page-size > target-page-size it will send all the
2354  * pages in a host page that are dirty.
2355  */
2356 static int ram_find_and_save_block(RAMState *rs)
2357 {
2358     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2359     int pages = 0;
2360
2361     /* No dirty page as there is zero RAM */
2362     if (!rs->ram_bytes_total) {
2363         return pages;
2364     }
2365
2366     /*
2367      * Always keep last_seen_block/last_page valid during this procedure,
2368      * because find_dirty_block() relies on these values (e.g., we compare
2369      * last_seen_block with pss.block to see whether we searched all the
2370      * ramblocks) to detect the completion of migration.  Having NULL value
2371      * of last_seen_block can conditionally cause below loop to run forever.
2372      */
2373     if (!rs->last_seen_block) {
2374         rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2375         rs->last_page = 0;
2376     }
2377
2378     pss_init(pss, rs->last_seen_block, rs->last_page);
2379
2380     while (true){
2381         if (!get_queued_page(rs, pss)) {
2382             /* priority queue empty, so just search for something dirty */
2383             int res = find_dirty_block(rs, pss);
2384             if (res != PAGE_DIRTY_FOUND) {
2385                 if (res == PAGE_ALL_CLEAN) {
2386                     break;
2387                 } else if (res == PAGE_TRY_AGAIN) {
2388                     continue;
2389                 } else if (res < 0) {
2390                     pages = res;
2391                     break;
2392                 }
2393             }
2394         }
2395         pages = ram_save_host_page(rs, pss);
2396         if (pages) {
2397             break;
2398         }
2399     }
2400
2401     rs->last_seen_block = pss->block;
2402     rs->last_page = pss->page;
2403
2404     return pages;
2405 }
2406
2407 static uint64_t ram_bytes_total_with_ignored(void)
2408 {
2409     RAMBlock *block;
2410     uint64_t total = 0;
2411
2412     RCU_READ_LOCK_GUARD();
2413
2414     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2415         total += block->used_length;
2416     }
2417     return total;
2418 }
2419
2420 uint64_t ram_bytes_total(void)
2421 {
2422     RAMBlock *block;
2423     uint64_t total = 0;
2424
2425     RCU_READ_LOCK_GUARD();
2426
2427     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2428         total += block->used_length;
2429     }
2430     return total;
2431 }
2432
2433 static void xbzrle_load_setup(void)
2434 {
2435     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2436 }
2437
2438 static void xbzrle_load_cleanup(void)
2439 {
2440     g_free(XBZRLE.decoded_buf);
2441     XBZRLE.decoded_buf = NULL;
2442 }
2443
2444 static void ram_state_cleanup(RAMState **rsp)
2445 {
2446     if (*rsp) {
2447         migration_page_queue_free(*rsp);
2448         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2449         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2450         g_free(*rsp);
2451         *rsp = NULL;
2452     }
2453 }
2454
2455 static void xbzrle_cleanup(void)
2456 {
2457     XBZRLE_cache_lock();
2458     if (XBZRLE.cache) {
2459         cache_fini(XBZRLE.cache);
2460         g_free(XBZRLE.encoded_buf);
2461         g_free(XBZRLE.current_buf);
2462         g_free(XBZRLE.zero_target_page);
2463         XBZRLE.cache = NULL;
2464         XBZRLE.encoded_buf = NULL;
2465         XBZRLE.current_buf = NULL;
2466         XBZRLE.zero_target_page = NULL;
2467     }
2468     XBZRLE_cache_unlock();
2469 }
2470
2471 static void ram_save_cleanup(void *opaque)
2472 {
2473     RAMState **rsp = opaque;
2474     RAMBlock *block;
2475
2476     /* We don't use dirty log with background snapshots */
2477     if (!migrate_background_snapshot()) {
2478         /* caller have hold iothread lock or is in a bh, so there is
2479          * no writing race against the migration bitmap
2480          */
2481         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2482             /*
2483              * do not stop dirty log without starting it, since
2484              * memory_global_dirty_log_stop will assert that
2485              * memory_global_dirty_log_start/stop used in pairs
2486              */
2487             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2488         }
2489     }
2490
2491     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2492         g_free(block->clear_bmap);
2493         block->clear_bmap = NULL;
2494         g_free(block->bmap);
2495         block->bmap = NULL;
2496     }
2497
2498     xbzrle_cleanup();
2499     compress_threads_save_cleanup();
2500     ram_state_cleanup(rsp);
2501     g_free(migration_ops);
2502     migration_ops = NULL;
2503 }
2504
2505 static void ram_state_reset(RAMState *rs)
2506 {
2507     int i;
2508
2509     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2510         rs->pss[i].last_sent_block = NULL;
2511     }
2512
2513     rs->last_seen_block = NULL;
2514     rs->last_page = 0;
2515     rs->last_version = ram_list.version;
2516     rs->xbzrle_started = false;
2517 }
2518
2519 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2520
2521 /* **** functions for postcopy ***** */
2522
2523 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2524 {
2525     struct RAMBlock *block;
2526
2527     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2528         unsigned long *bitmap = block->bmap;
2529         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2530         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2531
2532         while (run_start < range) {
2533             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2534             ram_discard_range(block->idstr,
2535                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2536                               ((ram_addr_t)(run_end - run_start))
2537                                 << TARGET_PAGE_BITS);
2538             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2539         }
2540     }
2541 }
2542
2543 /**
2544  * postcopy_send_discard_bm_ram: discard a RAMBlock
2545  *
2546  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2547  *
2548  * @ms: current migration state
2549  * @block: RAMBlock to discard
2550  */
2551 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2552 {
2553     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2554     unsigned long current;
2555     unsigned long *bitmap = block->bmap;
2556
2557     for (current = 0; current < end; ) {
2558         unsigned long one = find_next_bit(bitmap, end, current);
2559         unsigned long zero, discard_length;
2560
2561         if (one >= end) {
2562             break;
2563         }
2564
2565         zero = find_next_zero_bit(bitmap, end, one + 1);
2566
2567         if (zero >= end) {
2568             discard_length = end - one;
2569         } else {
2570             discard_length = zero - one;
2571         }
2572         postcopy_discard_send_range(ms, one, discard_length);
2573         current = one + discard_length;
2574     }
2575 }
2576
2577 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2578
2579 /**
2580  * postcopy_each_ram_send_discard: discard all RAMBlocks
2581  *
2582  * Utility for the outgoing postcopy code.
2583  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2584  *   passing it bitmap indexes and name.
2585  * (qemu_ram_foreach_block ends up passing unscaled lengths
2586  *  which would mean postcopy code would have to deal with target page)
2587  *
2588  * @ms: current migration state
2589  */
2590 static void postcopy_each_ram_send_discard(MigrationState *ms)
2591 {
2592     struct RAMBlock *block;
2593
2594     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2595         postcopy_discard_send_init(ms, block->idstr);
2596
2597         /*
2598          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2599          * host-page size chunks, mark any partially dirty host-page size
2600          * chunks as all dirty.  In this case the host-page is the host-page
2601          * for the particular RAMBlock, i.e. it might be a huge page.
2602          */
2603         postcopy_chunk_hostpages_pass(ms, block);
2604
2605         /*
2606          * Postcopy sends chunks of bitmap over the wire, but it
2607          * just needs indexes at this point, avoids it having
2608          * target page specific code.
2609          */
2610         postcopy_send_discard_bm_ram(ms, block);
2611         postcopy_discard_send_finish(ms);
2612     }
2613 }
2614
2615 /**
2616  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2617  *
2618  * Helper for postcopy_chunk_hostpages; it's called twice to
2619  * canonicalize the two bitmaps, that are similar, but one is
2620  * inverted.
2621  *
2622  * Postcopy requires that all target pages in a hostpage are dirty or
2623  * clean, not a mix.  This function canonicalizes the bitmaps.
2624  *
2625  * @ms: current migration state
2626  * @block: block that contains the page we want to canonicalize
2627  */
2628 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2629 {
2630     RAMState *rs = ram_state;
2631     unsigned long *bitmap = block->bmap;
2632     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2633     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2634     unsigned long run_start;
2635
2636     if (block->page_size == TARGET_PAGE_SIZE) {
2637         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2638         return;
2639     }
2640
2641     /* Find a dirty page */
2642     run_start = find_next_bit(bitmap, pages, 0);
2643
2644     while (run_start < pages) {
2645
2646         /*
2647          * If the start of this run of pages is in the middle of a host
2648          * page, then we need to fixup this host page.
2649          */
2650         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2651             /* Find the end of this run */
2652             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2653             /*
2654              * If the end isn't at the start of a host page, then the
2655              * run doesn't finish at the end of a host page
2656              * and we need to discard.
2657              */
2658         }
2659
2660         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2661             unsigned long page;
2662             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2663                                                              host_ratio);
2664             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2665
2666             /* Clean up the bitmap */
2667             for (page = fixup_start_addr;
2668                  page < fixup_start_addr + host_ratio; page++) {
2669                 /*
2670                  * Remark them as dirty, updating the count for any pages
2671                  * that weren't previously dirty.
2672                  */
2673                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2674             }
2675         }
2676
2677         /* Find the next dirty page for the next iteration */
2678         run_start = find_next_bit(bitmap, pages, run_start);
2679     }
2680 }
2681
2682 /**
2683  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2684  *
2685  * Transmit the set of pages to be discarded after precopy to the target
2686  * these are pages that:
2687  *     a) Have been previously transmitted but are now dirty again
2688  *     b) Pages that have never been transmitted, this ensures that
2689  *        any pages on the destination that have been mapped by background
2690  *        tasks get discarded (transparent huge pages is the specific concern)
2691  * Hopefully this is pretty sparse
2692  *
2693  * @ms: current migration state
2694  */
2695 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2696 {
2697     RAMState *rs = ram_state;
2698
2699     RCU_READ_LOCK_GUARD();
2700
2701     /* This should be our last sync, the src is now paused */
2702     migration_bitmap_sync(rs);
2703
2704     /* Easiest way to make sure we don't resume in the middle of a host-page */
2705     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2706     rs->last_seen_block = NULL;
2707     rs->last_page = 0;
2708
2709     postcopy_each_ram_send_discard(ms);
2710
2711     trace_ram_postcopy_send_discard_bitmap();
2712 }
2713
2714 /**
2715  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2716  *
2717  * Returns zero on success
2718  *
2719  * @rbname: name of the RAMBlock of the request. NULL means the
2720  *          same that last one.
2721  * @start: RAMBlock starting page
2722  * @length: RAMBlock size
2723  */
2724 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2725 {
2726     trace_ram_discard_range(rbname, start, length);
2727
2728     RCU_READ_LOCK_GUARD();
2729     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2730
2731     if (!rb) {
2732         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2733         return -1;
2734     }
2735
2736     /*
2737      * On source VM, we don't need to update the received bitmap since
2738      * we don't even have one.
2739      */
2740     if (rb->receivedmap) {
2741         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2742                      length >> qemu_target_page_bits());
2743     }
2744
2745     return ram_block_discard_range(rb, start, length);
2746 }
2747
2748 /*
2749  * For every allocation, we will try not to crash the VM if the
2750  * allocation failed.
2751  */
2752 static int xbzrle_init(void)
2753 {
2754     Error *local_err = NULL;
2755
2756     if (!migrate_xbzrle()) {
2757         return 0;
2758     }
2759
2760     XBZRLE_cache_lock();
2761
2762     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2763     if (!XBZRLE.zero_target_page) {
2764         error_report("%s: Error allocating zero page", __func__);
2765         goto err_out;
2766     }
2767
2768     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2769                               TARGET_PAGE_SIZE, &local_err);
2770     if (!XBZRLE.cache) {
2771         error_report_err(local_err);
2772         goto free_zero_page;
2773     }
2774
2775     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2776     if (!XBZRLE.encoded_buf) {
2777         error_report("%s: Error allocating encoded_buf", __func__);
2778         goto free_cache;
2779     }
2780
2781     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2782     if (!XBZRLE.current_buf) {
2783         error_report("%s: Error allocating current_buf", __func__);
2784         goto free_encoded_buf;
2785     }
2786
2787     /* We are all good */
2788     XBZRLE_cache_unlock();
2789     return 0;
2790
2791 free_encoded_buf:
2792     g_free(XBZRLE.encoded_buf);
2793     XBZRLE.encoded_buf = NULL;
2794 free_cache:
2795     cache_fini(XBZRLE.cache);
2796     XBZRLE.cache = NULL;
2797 free_zero_page:
2798     g_free(XBZRLE.zero_target_page);
2799     XBZRLE.zero_target_page = NULL;
2800 err_out:
2801     XBZRLE_cache_unlock();
2802     return -ENOMEM;
2803 }
2804
2805 static int ram_state_init(RAMState **rsp)
2806 {
2807     *rsp = g_try_new0(RAMState, 1);
2808
2809     if (!*rsp) {
2810         error_report("%s: Init ramstate fail", __func__);
2811         return -1;
2812     }
2813
2814     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2815     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2816     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2817     (*rsp)->ram_bytes_total = ram_bytes_total();
2818
2819     /*
2820      * Count the total number of pages used by ram blocks not including any
2821      * gaps due to alignment or unplugs.
2822      * This must match with the initial values of dirty bitmap.
2823      */
2824     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
2825     ram_state_reset(*rsp);
2826
2827     return 0;
2828 }
2829
2830 static void ram_list_init_bitmaps(void)
2831 {
2832     MigrationState *ms = migrate_get_current();
2833     RAMBlock *block;
2834     unsigned long pages;
2835     uint8_t shift;
2836
2837     /* Skip setting bitmap if there is no RAM */
2838     if (ram_bytes_total()) {
2839         shift = ms->clear_bitmap_shift;
2840         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2841             error_report("clear_bitmap_shift (%u) too big, using "
2842                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2843             shift = CLEAR_BITMAP_SHIFT_MAX;
2844         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2845             error_report("clear_bitmap_shift (%u) too small, using "
2846                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2847             shift = CLEAR_BITMAP_SHIFT_MIN;
2848         }
2849
2850         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2851             pages = block->max_length >> TARGET_PAGE_BITS;
2852             /*
2853              * The initial dirty bitmap for migration must be set with all
2854              * ones to make sure we'll migrate every guest RAM page to
2855              * destination.
2856              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2857              * new migration after a failed migration, ram_list.
2858              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2859              * guest memory.
2860              */
2861             block->bmap = bitmap_new(pages);
2862             bitmap_set(block->bmap, 0, pages);
2863             block->clear_bmap_shift = shift;
2864             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2865         }
2866     }
2867 }
2868
2869 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2870 {
2871     unsigned long pages;
2872     RAMBlock *rb;
2873
2874     RCU_READ_LOCK_GUARD();
2875
2876     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2877             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2878             rs->migration_dirty_pages -= pages;
2879     }
2880 }
2881
2882 static void ram_init_bitmaps(RAMState *rs)
2883 {
2884     /* For memory_global_dirty_log_start below.  */
2885     qemu_mutex_lock_iothread();
2886     qemu_mutex_lock_ramlist();
2887
2888     WITH_RCU_READ_LOCK_GUARD() {
2889         ram_list_init_bitmaps();
2890         /* We don't use dirty log with background snapshots */
2891         if (!migrate_background_snapshot()) {
2892             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2893             migration_bitmap_sync_precopy(rs);
2894         }
2895     }
2896     qemu_mutex_unlock_ramlist();
2897     qemu_mutex_unlock_iothread();
2898
2899     /*
2900      * After an eventual first bitmap sync, fixup the initial bitmap
2901      * containing all 1s to exclude any discarded pages from migration.
2902      */
2903     migration_bitmap_clear_discarded_pages(rs);
2904 }
2905
2906 static int ram_init_all(RAMState **rsp)
2907 {
2908     if (ram_state_init(rsp)) {
2909         return -1;
2910     }
2911
2912     if (xbzrle_init()) {
2913         ram_state_cleanup(rsp);
2914         return -1;
2915     }
2916
2917     ram_init_bitmaps(*rsp);
2918
2919     return 0;
2920 }
2921
2922 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2923 {
2924     RAMBlock *block;
2925     uint64_t pages = 0;
2926
2927     /*
2928      * Postcopy is not using xbzrle/compression, so no need for that.
2929      * Also, since source are already halted, we don't need to care
2930      * about dirty page logging as well.
2931      */
2932
2933     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2934         pages += bitmap_count_one(block->bmap,
2935                                   block->used_length >> TARGET_PAGE_BITS);
2936     }
2937
2938     /* This may not be aligned with current bitmaps. Recalculate. */
2939     rs->migration_dirty_pages = pages;
2940
2941     ram_state_reset(rs);
2942
2943     /* Update RAMState cache of output QEMUFile */
2944     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
2945
2946     trace_ram_state_resume_prepare(pages);
2947 }
2948
2949 /*
2950  * This function clears bits of the free pages reported by the caller from the
2951  * migration dirty bitmap. @addr is the host address corresponding to the
2952  * start of the continuous guest free pages, and @len is the total bytes of
2953  * those pages.
2954  */
2955 void qemu_guest_free_page_hint(void *addr, size_t len)
2956 {
2957     RAMBlock *block;
2958     ram_addr_t offset;
2959     size_t used_len, start, npages;
2960     MigrationState *s = migrate_get_current();
2961
2962     /* This function is currently expected to be used during live migration */
2963     if (!migration_is_setup_or_active(s->state)) {
2964         return;
2965     }
2966
2967     for (; len > 0; len -= used_len, addr += used_len) {
2968         block = qemu_ram_block_from_host(addr, false, &offset);
2969         if (unlikely(!block || offset >= block->used_length)) {
2970             /*
2971              * The implementation might not support RAMBlock resize during
2972              * live migration, but it could happen in theory with future
2973              * updates. So we add a check here to capture that case.
2974              */
2975             error_report_once("%s unexpected error", __func__);
2976             return;
2977         }
2978
2979         if (len <= block->used_length - offset) {
2980             used_len = len;
2981         } else {
2982             used_len = block->used_length - offset;
2983         }
2984
2985         start = offset >> TARGET_PAGE_BITS;
2986         npages = used_len >> TARGET_PAGE_BITS;
2987
2988         qemu_mutex_lock(&ram_state->bitmap_mutex);
2989         /*
2990          * The skipped free pages are equavalent to be sent from clear_bmap's
2991          * perspective, so clear the bits from the memory region bitmap which
2992          * are initially set. Otherwise those skipped pages will be sent in
2993          * the next round after syncing from the memory region bitmap.
2994          */
2995         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2996         ram_state->migration_dirty_pages -=
2997                       bitmap_count_one_with_offset(block->bmap, start, npages);
2998         bitmap_clear(block->bmap, start, npages);
2999         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3000     }
3001 }
3002
3003 /*
3004  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3005  * long-running RCU critical section.  When rcu-reclaims in the code
3006  * start to become numerous it will be necessary to reduce the
3007  * granularity of these critical sections.
3008  */
3009
3010 /**
3011  * ram_save_setup: Setup RAM for migration
3012  *
3013  * Returns zero to indicate success and negative for error
3014  *
3015  * @f: QEMUFile where to send the data
3016  * @opaque: RAMState pointer
3017  */
3018 static int ram_save_setup(QEMUFile *f, void *opaque)
3019 {
3020     RAMState **rsp = opaque;
3021     RAMBlock *block;
3022     int ret;
3023
3024     if (compress_threads_save_setup()) {
3025         return -1;
3026     }
3027
3028     /* migration has already setup the bitmap, reuse it. */
3029     if (!migration_in_colo_state()) {
3030         if (ram_init_all(rsp) != 0) {
3031             compress_threads_save_cleanup();
3032             return -1;
3033         }
3034     }
3035     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3036
3037     WITH_RCU_READ_LOCK_GUARD() {
3038         qemu_put_be64(f, ram_bytes_total_with_ignored()
3039                          | RAM_SAVE_FLAG_MEM_SIZE);
3040
3041         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3042             qemu_put_byte(f, strlen(block->idstr));
3043             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3044             qemu_put_be64(f, block->used_length);
3045             if (migrate_postcopy_ram() && block->page_size !=
3046                                           qemu_host_page_size) {
3047                 qemu_put_be64(f, block->page_size);
3048             }
3049             if (migrate_ignore_shared()) {
3050                 qemu_put_be64(f, block->mr->addr);
3051             }
3052         }
3053     }
3054
3055     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3056     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3057
3058     migration_ops = g_malloc0(sizeof(MigrationOps));
3059     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3060     ret = multifd_send_sync_main(f);
3061     if (ret < 0) {
3062         return ret;
3063     }
3064
3065     if (!migrate_multifd_flush_after_each_section()) {
3066         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3067     }
3068
3069     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3070     qemu_fflush(f);
3071
3072     return 0;
3073 }
3074
3075 /**
3076  * ram_save_iterate: iterative stage for migration
3077  *
3078  * Returns zero to indicate success and negative for error
3079  *
3080  * @f: QEMUFile where to send the data
3081  * @opaque: RAMState pointer
3082  */
3083 static int ram_save_iterate(QEMUFile *f, void *opaque)
3084 {
3085     RAMState **temp = opaque;
3086     RAMState *rs = *temp;
3087     int ret = 0;
3088     int i;
3089     int64_t t0;
3090     int done = 0;
3091
3092     if (blk_mig_bulk_active()) {
3093         /* Avoid transferring ram during bulk phase of block migration as
3094          * the bulk phase will usually take a long time and transferring
3095          * ram updates during that time is pointless. */
3096         goto out;
3097     }
3098
3099     /*
3100      * We'll take this lock a little bit long, but it's okay for two reasons.
3101      * Firstly, the only possible other thread to take it is who calls
3102      * qemu_guest_free_page_hint(), which should be rare; secondly, see
3103      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3104      * guarantees that we'll at least released it in a regular basis.
3105      */
3106     qemu_mutex_lock(&rs->bitmap_mutex);
3107     WITH_RCU_READ_LOCK_GUARD() {
3108         if (ram_list.version != rs->last_version) {
3109             ram_state_reset(rs);
3110         }
3111
3112         /* Read version before ram_list.blocks */
3113         smp_rmb();
3114
3115         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3116
3117         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3118         i = 0;
3119         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3120                postcopy_has_request(rs)) {
3121             int pages;
3122
3123             if (qemu_file_get_error(f)) {
3124                 break;
3125             }
3126
3127             pages = ram_find_and_save_block(rs);
3128             /* no more pages to sent */
3129             if (pages == 0) {
3130                 done = 1;
3131                 break;
3132             }
3133
3134             if (pages < 0) {
3135                 qemu_file_set_error(f, pages);
3136                 break;
3137             }
3138
3139             rs->target_page_count += pages;
3140
3141             /*
3142              * During postcopy, it is necessary to make sure one whole host
3143              * page is sent in one chunk.
3144              */
3145             if (migrate_postcopy_ram()) {
3146                 ram_flush_compressed_data(rs);
3147             }
3148
3149             /*
3150              * we want to check in the 1st loop, just in case it was the 1st
3151              * time and we had to sync the dirty bitmap.
3152              * qemu_clock_get_ns() is a bit expensive, so we only check each
3153              * some iterations
3154              */
3155             if ((i & 63) == 0) {
3156                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3157                               1000000;
3158                 if (t1 > MAX_WAIT) {
3159                     trace_ram_save_iterate_big_wait(t1, i);
3160                     break;
3161                 }
3162             }
3163             i++;
3164         }
3165     }
3166     qemu_mutex_unlock(&rs->bitmap_mutex);
3167
3168     /*
3169      * Must occur before EOS (or any QEMUFile operation)
3170      * because of RDMA protocol.
3171      */
3172     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3173
3174 out:
3175     if (ret >= 0
3176         && migration_is_setup_or_active(migrate_get_current()->state)) {
3177         if (migrate_multifd_flush_after_each_section()) {
3178             ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3179             if (ret < 0) {
3180                 return ret;
3181             }
3182         }
3183
3184         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3185         qemu_fflush(f);
3186         ram_transferred_add(8);
3187
3188         ret = qemu_file_get_error(f);
3189     }
3190     if (ret < 0) {
3191         return ret;
3192     }
3193
3194     return done;
3195 }
3196
3197 /**
3198  * ram_save_complete: function called to send the remaining amount of ram
3199  *
3200  * Returns zero to indicate success or negative on error
3201  *
3202  * Called with iothread lock
3203  *
3204  * @f: QEMUFile where to send the data
3205  * @opaque: RAMState pointer
3206  */
3207 static int ram_save_complete(QEMUFile *f, void *opaque)
3208 {
3209     RAMState **temp = opaque;
3210     RAMState *rs = *temp;
3211     int ret = 0;
3212
3213     rs->last_stage = !migration_in_colo_state();
3214
3215     WITH_RCU_READ_LOCK_GUARD() {
3216         if (!migration_in_postcopy()) {
3217             migration_bitmap_sync_precopy(rs);
3218         }
3219
3220         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3221
3222         /* try transferring iterative blocks of memory */
3223
3224         /* flush all remaining blocks regardless of rate limiting */
3225         qemu_mutex_lock(&rs->bitmap_mutex);
3226         while (true) {
3227             int pages;
3228
3229             pages = ram_find_and_save_block(rs);
3230             /* no more blocks to sent */
3231             if (pages == 0) {
3232                 break;
3233             }
3234             if (pages < 0) {
3235                 ret = pages;
3236                 break;
3237             }
3238         }
3239         qemu_mutex_unlock(&rs->bitmap_mutex);
3240
3241         ram_flush_compressed_data(rs);
3242         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3243     }
3244
3245     if (ret < 0) {
3246         return ret;
3247     }
3248
3249     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3250     if (ret < 0) {
3251         return ret;
3252     }
3253
3254     if (!migrate_multifd_flush_after_each_section()) {
3255         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3256     }
3257     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3258     qemu_fflush(f);
3259
3260     return 0;
3261 }
3262
3263 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3264                                        uint64_t *can_postcopy)
3265 {
3266     RAMState **temp = opaque;
3267     RAMState *rs = *temp;
3268
3269     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3270
3271     if (migrate_postcopy_ram()) {
3272         /* We can do postcopy, and all the data is postcopiable */
3273         *can_postcopy += remaining_size;
3274     } else {
3275         *must_precopy += remaining_size;
3276     }
3277 }
3278
3279 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3280                                     uint64_t *can_postcopy)
3281 {
3282     MigrationState *s = migrate_get_current();
3283     RAMState **temp = opaque;
3284     RAMState *rs = *temp;
3285
3286     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3287
3288     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3289         qemu_mutex_lock_iothread();
3290         WITH_RCU_READ_LOCK_GUARD() {
3291             migration_bitmap_sync_precopy(rs);
3292         }
3293         qemu_mutex_unlock_iothread();
3294         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3295     }
3296
3297     if (migrate_postcopy_ram()) {
3298         /* We can do postcopy, and all the data is postcopiable */
3299         *can_postcopy += remaining_size;
3300     } else {
3301         *must_precopy += remaining_size;
3302     }
3303 }
3304
3305 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3306 {
3307     unsigned int xh_len;
3308     int xh_flags;
3309     uint8_t *loaded_data;
3310
3311     /* extract RLE header */
3312     xh_flags = qemu_get_byte(f);
3313     xh_len = qemu_get_be16(f);
3314
3315     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3316         error_report("Failed to load XBZRLE page - wrong compression!");
3317         return -1;
3318     }
3319
3320     if (xh_len > TARGET_PAGE_SIZE) {
3321         error_report("Failed to load XBZRLE page - len overflow!");
3322         return -1;
3323     }
3324     loaded_data = XBZRLE.decoded_buf;
3325     /* load data and decode */
3326     /* it can change loaded_data to point to an internal buffer */
3327     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3328
3329     /* decode RLE */
3330     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3331                              TARGET_PAGE_SIZE) == -1) {
3332         error_report("Failed to load XBZRLE page - decode error!");
3333         return -1;
3334     }
3335
3336     return 0;
3337 }
3338
3339 /**
3340  * ram_block_from_stream: read a RAMBlock id from the migration stream
3341  *
3342  * Must be called from within a rcu critical section.
3343  *
3344  * Returns a pointer from within the RCU-protected ram_list.
3345  *
3346  * @mis: the migration incoming state pointer
3347  * @f: QEMUFile where to read the data from
3348  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3349  * @channel: the channel we're using
3350  */
3351 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3352                                               QEMUFile *f, int flags,
3353                                               int channel)
3354 {
3355     RAMBlock *block = mis->last_recv_block[channel];
3356     char id[256];
3357     uint8_t len;
3358
3359     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3360         if (!block) {
3361             error_report("Ack, bad migration stream!");
3362             return NULL;
3363         }
3364         return block;
3365     }
3366
3367     len = qemu_get_byte(f);
3368     qemu_get_buffer(f, (uint8_t *)id, len);
3369     id[len] = 0;
3370
3371     block = qemu_ram_block_by_name(id);
3372     if (!block) {
3373         error_report("Can't find block %s", id);
3374         return NULL;
3375     }
3376
3377     if (ramblock_is_ignored(block)) {
3378         error_report("block %s should not be migrated !", id);
3379         return NULL;
3380     }
3381
3382     mis->last_recv_block[channel] = block;
3383
3384     return block;
3385 }
3386
3387 static inline void *host_from_ram_block_offset(RAMBlock *block,
3388                                                ram_addr_t offset)
3389 {
3390     if (!offset_in_ramblock(block, offset)) {
3391         return NULL;
3392     }
3393
3394     return block->host + offset;
3395 }
3396
3397 static void *host_page_from_ram_block_offset(RAMBlock *block,
3398                                              ram_addr_t offset)
3399 {
3400     /* Note: Explicitly no check against offset_in_ramblock(). */
3401     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3402                                    block->page_size);
3403 }
3404
3405 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3406                                                          ram_addr_t offset)
3407 {
3408     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3409 }
3410
3411 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages)
3412 {
3413     qemu_mutex_lock(&ram_state->bitmap_mutex);
3414     for (int i = 0; i < pages; i++) {
3415         ram_addr_t offset = normal[i];
3416         ram_state->migration_dirty_pages += !test_and_set_bit(
3417                                                 offset >> TARGET_PAGE_BITS,
3418                                                 block->bmap);
3419     }
3420     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3421 }
3422
3423 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3424                              ram_addr_t offset, bool record_bitmap)
3425 {
3426     if (!offset_in_ramblock(block, offset)) {
3427         return NULL;
3428     }
3429     if (!block->colo_cache) {
3430         error_report("%s: colo_cache is NULL in block :%s",
3431                      __func__, block->idstr);
3432         return NULL;
3433     }
3434
3435     /*
3436     * During colo checkpoint, we need bitmap of these migrated pages.
3437     * It help us to decide which pages in ram cache should be flushed
3438     * into VM's RAM later.
3439     */
3440     if (record_bitmap) {
3441         colo_record_bitmap(block, &offset, 1);
3442     }
3443     return block->colo_cache + offset;
3444 }
3445
3446 /**
3447  * ram_handle_compressed: handle the zero page case
3448  *
3449  * If a page (or a whole RDMA chunk) has been
3450  * determined to be zero, then zap it.
3451  *
3452  * @host: host address for the zero page
3453  * @ch: what the page is filled from.  We only support zero
3454  * @size: size of the zero page
3455  */
3456 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3457 {
3458     if (ch != 0 || !buffer_is_zero(host, size)) {
3459         memset(host, ch, size);
3460     }
3461 }
3462
3463 static void colo_init_ram_state(void)
3464 {
3465     ram_state_init(&ram_state);
3466 }
3467
3468 /*
3469  * colo cache: this is for secondary VM, we cache the whole
3470  * memory of the secondary VM, it is need to hold the global lock
3471  * to call this helper.
3472  */
3473 int colo_init_ram_cache(void)
3474 {
3475     RAMBlock *block;
3476
3477     WITH_RCU_READ_LOCK_GUARD() {
3478         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3479             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3480                                                     NULL, false, false);
3481             if (!block->colo_cache) {
3482                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3483                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3484                              block->used_length);
3485                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3486                     if (block->colo_cache) {
3487                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3488                         block->colo_cache = NULL;
3489                     }
3490                 }
3491                 return -errno;
3492             }
3493             if (!machine_dump_guest_core(current_machine)) {
3494                 qemu_madvise(block->colo_cache, block->used_length,
3495                              QEMU_MADV_DONTDUMP);
3496             }
3497         }
3498     }
3499
3500     /*
3501     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3502     * with to decide which page in cache should be flushed into SVM's RAM. Here
3503     * we use the same name 'ram_bitmap' as for migration.
3504     */
3505     if (ram_bytes_total()) {
3506         RAMBlock *block;
3507
3508         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3509             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3510             block->bmap = bitmap_new(pages);
3511         }
3512     }
3513
3514     colo_init_ram_state();
3515     return 0;
3516 }
3517
3518 /* TODO: duplicated with ram_init_bitmaps */
3519 void colo_incoming_start_dirty_log(void)
3520 {
3521     RAMBlock *block = NULL;
3522     /* For memory_global_dirty_log_start below. */
3523     qemu_mutex_lock_iothread();
3524     qemu_mutex_lock_ramlist();
3525
3526     memory_global_dirty_log_sync();
3527     WITH_RCU_READ_LOCK_GUARD() {
3528         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3529             ramblock_sync_dirty_bitmap(ram_state, block);
3530             /* Discard this dirty bitmap record */
3531             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3532         }
3533         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3534     }
3535     ram_state->migration_dirty_pages = 0;
3536     qemu_mutex_unlock_ramlist();
3537     qemu_mutex_unlock_iothread();
3538 }
3539
3540 /* It is need to hold the global lock to call this helper */
3541 void colo_release_ram_cache(void)
3542 {
3543     RAMBlock *block;
3544
3545     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3546     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3547         g_free(block->bmap);
3548         block->bmap = NULL;
3549     }
3550
3551     WITH_RCU_READ_LOCK_GUARD() {
3552         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3553             if (block->colo_cache) {
3554                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3555                 block->colo_cache = NULL;
3556             }
3557         }
3558     }
3559     ram_state_cleanup(&ram_state);
3560 }
3561
3562 /**
3563  * ram_load_setup: Setup RAM for migration incoming side
3564  *
3565  * Returns zero to indicate success and negative for error
3566  *
3567  * @f: QEMUFile where to receive the data
3568  * @opaque: RAMState pointer
3569  */
3570 static int ram_load_setup(QEMUFile *f, void *opaque)
3571 {
3572     xbzrle_load_setup();
3573     ramblock_recv_map_init();
3574
3575     return 0;
3576 }
3577
3578 static int ram_load_cleanup(void *opaque)
3579 {
3580     RAMBlock *rb;
3581
3582     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3583         qemu_ram_block_writeback(rb);
3584     }
3585
3586     xbzrle_load_cleanup();
3587
3588     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3589         g_free(rb->receivedmap);
3590         rb->receivedmap = NULL;
3591     }
3592
3593     return 0;
3594 }
3595
3596 /**
3597  * ram_postcopy_incoming_init: allocate postcopy data structures
3598  *
3599  * Returns 0 for success and negative if there was one error
3600  *
3601  * @mis: current migration incoming state
3602  *
3603  * Allocate data structures etc needed by incoming migration with
3604  * postcopy-ram. postcopy-ram's similarly names
3605  * postcopy_ram_incoming_init does the work.
3606  */
3607 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3608 {
3609     return postcopy_ram_incoming_init(mis);
3610 }
3611
3612 /**
3613  * ram_load_postcopy: load a page in postcopy case
3614  *
3615  * Returns 0 for success or -errno in case of error
3616  *
3617  * Called in postcopy mode by ram_load().
3618  * rcu_read_lock is taken prior to this being called.
3619  *
3620  * @f: QEMUFile where to send the data
3621  * @channel: the channel to use for loading
3622  */
3623 int ram_load_postcopy(QEMUFile *f, int channel)
3624 {
3625     int flags = 0, ret = 0;
3626     bool place_needed = false;
3627     bool matches_target_page_size = false;
3628     MigrationIncomingState *mis = migration_incoming_get_current();
3629     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3630
3631     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3632         ram_addr_t addr;
3633         void *page_buffer = NULL;
3634         void *place_source = NULL;
3635         RAMBlock *block = NULL;
3636         uint8_t ch;
3637         int len;
3638
3639         addr = qemu_get_be64(f);
3640
3641         /*
3642          * If qemu file error, we should stop here, and then "addr"
3643          * may be invalid
3644          */
3645         ret = qemu_file_get_error(f);
3646         if (ret) {
3647             break;
3648         }
3649
3650         flags = addr & ~TARGET_PAGE_MASK;
3651         addr &= TARGET_PAGE_MASK;
3652
3653         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3654         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3655                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3656             block = ram_block_from_stream(mis, f, flags, channel);
3657             if (!block) {
3658                 ret = -EINVAL;
3659                 break;
3660             }
3661
3662             /*
3663              * Relying on used_length is racy and can result in false positives.
3664              * We might place pages beyond used_length in case RAM was shrunk
3665              * while in postcopy, which is fine - trying to place via
3666              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3667              */
3668             if (!block->host || addr >= block->postcopy_length) {
3669                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3670                 ret = -EINVAL;
3671                 break;
3672             }
3673             tmp_page->target_pages++;
3674             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3675             /*
3676              * Postcopy requires that we place whole host pages atomically;
3677              * these may be huge pages for RAMBlocks that are backed by
3678              * hugetlbfs.
3679              * To make it atomic, the data is read into a temporary page
3680              * that's moved into place later.
3681              * The migration protocol uses,  possibly smaller, target-pages
3682              * however the source ensures it always sends all the components
3683              * of a host page in one chunk.
3684              */
3685             page_buffer = tmp_page->tmp_huge_page +
3686                           host_page_offset_from_ram_block_offset(block, addr);
3687             /* If all TP are zero then we can optimise the place */
3688             if (tmp_page->target_pages == 1) {
3689                 tmp_page->host_addr =
3690                     host_page_from_ram_block_offset(block, addr);
3691             } else if (tmp_page->host_addr !=
3692                        host_page_from_ram_block_offset(block, addr)) {
3693                 /* not the 1st TP within the HP */
3694                 error_report("Non-same host page detected on channel %d: "
3695                              "Target host page %p, received host page %p "
3696                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
3697                              channel, tmp_page->host_addr,
3698                              host_page_from_ram_block_offset(block, addr),
3699                              block->idstr, addr, tmp_page->target_pages);
3700                 ret = -EINVAL;
3701                 break;
3702             }
3703
3704             /*
3705              * If it's the last part of a host page then we place the host
3706              * page
3707              */
3708             if (tmp_page->target_pages ==
3709                 (block->page_size / TARGET_PAGE_SIZE)) {
3710                 place_needed = true;
3711             }
3712             place_source = tmp_page->tmp_huge_page;
3713         }
3714
3715         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3716         case RAM_SAVE_FLAG_ZERO:
3717             ch = qemu_get_byte(f);
3718             /*
3719              * Can skip to set page_buffer when
3720              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3721              */
3722             if (ch || !matches_target_page_size) {
3723                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3724             }
3725             if (ch) {
3726                 tmp_page->all_zero = false;
3727             }
3728             break;
3729
3730         case RAM_SAVE_FLAG_PAGE:
3731             tmp_page->all_zero = false;
3732             if (!matches_target_page_size) {
3733                 /* For huge pages, we always use temporary buffer */
3734                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3735             } else {
3736                 /*
3737                  * For small pages that matches target page size, we
3738                  * avoid the qemu_file copy.  Instead we directly use
3739                  * the buffer of QEMUFile to place the page.  Note: we
3740                  * cannot do any QEMUFile operation before using that
3741                  * buffer to make sure the buffer is valid when
3742                  * placing the page.
3743                  */
3744                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3745                                          TARGET_PAGE_SIZE);
3746             }
3747             break;
3748         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3749             tmp_page->all_zero = false;
3750             len = qemu_get_be32(f);
3751             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3752                 error_report("Invalid compressed data length: %d", len);
3753                 ret = -EINVAL;
3754                 break;
3755             }
3756             decompress_data_with_multi_threads(f, page_buffer, len);
3757             break;
3758         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
3759             multifd_recv_sync_main();
3760             break;
3761         case RAM_SAVE_FLAG_EOS:
3762             /* normal exit */
3763             if (migrate_multifd_flush_after_each_section()) {
3764                 multifd_recv_sync_main();
3765             }
3766             break;
3767         default:
3768             error_report("Unknown combination of migration flags: 0x%x"
3769                          " (postcopy mode)", flags);
3770             ret = -EINVAL;
3771             break;
3772         }
3773
3774         /* Got the whole host page, wait for decompress before placing. */
3775         if (place_needed) {
3776             ret |= wait_for_decompress_done();
3777         }
3778
3779         /* Detect for any possible file errors */
3780         if (!ret && qemu_file_get_error(f)) {
3781             ret = qemu_file_get_error(f);
3782         }
3783
3784         if (!ret && place_needed) {
3785             if (tmp_page->all_zero) {
3786                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
3787             } else {
3788                 ret = postcopy_place_page(mis, tmp_page->host_addr,
3789                                           place_source, block);
3790             }
3791             place_needed = false;
3792             postcopy_temp_page_reset(tmp_page);
3793         }
3794     }
3795
3796     return ret;
3797 }
3798
3799 static bool postcopy_is_running(void)
3800 {
3801     PostcopyState ps = postcopy_state_get();
3802     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3803 }
3804
3805 /*
3806  * Flush content of RAM cache into SVM's memory.
3807  * Only flush the pages that be dirtied by PVM or SVM or both.
3808  */
3809 void colo_flush_ram_cache(void)
3810 {
3811     RAMBlock *block = NULL;
3812     void *dst_host;
3813     void *src_host;
3814     unsigned long offset = 0;
3815
3816     memory_global_dirty_log_sync();
3817     qemu_mutex_lock(&ram_state->bitmap_mutex);
3818     WITH_RCU_READ_LOCK_GUARD() {
3819         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3820             ramblock_sync_dirty_bitmap(ram_state, block);
3821         }
3822     }
3823
3824     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3825     WITH_RCU_READ_LOCK_GUARD() {
3826         block = QLIST_FIRST_RCU(&ram_list.blocks);
3827
3828         while (block) {
3829             unsigned long num = 0;
3830
3831             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3832             if (!offset_in_ramblock(block,
3833                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3834                 offset = 0;
3835                 num = 0;
3836                 block = QLIST_NEXT_RCU(block, next);
3837             } else {
3838                 unsigned long i = 0;
3839
3840                 for (i = 0; i < num; i++) {
3841                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3842                 }
3843                 dst_host = block->host
3844                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3845                 src_host = block->colo_cache
3846                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3847                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3848                 offset += num;
3849             }
3850         }
3851     }
3852     qemu_mutex_unlock(&ram_state->bitmap_mutex);
3853     trace_colo_flush_ram_cache_end();
3854 }
3855
3856 /**
3857  * ram_load_precopy: load pages in precopy case
3858  *
3859  * Returns 0 for success or -errno in case of error
3860  *
3861  * Called in precopy mode by ram_load().
3862  * rcu_read_lock is taken prior to this being called.
3863  *
3864  * @f: QEMUFile where to send the data
3865  */
3866 static int ram_load_precopy(QEMUFile *f)
3867 {
3868     MigrationIncomingState *mis = migration_incoming_get_current();
3869     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3870     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3871     bool postcopy_advised = migration_incoming_postcopy_advised();
3872     if (!migrate_compress()) {
3873         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3874     }
3875
3876     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3877         ram_addr_t addr, total_ram_bytes;
3878         void *host = NULL, *host_bak = NULL;
3879         uint8_t ch;
3880
3881         /*
3882          * Yield periodically to let main loop run, but an iteration of
3883          * the main loop is expensive, so do it each some iterations
3884          */
3885         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3886             aio_co_schedule(qemu_get_current_aio_context(),
3887                             qemu_coroutine_self());
3888             qemu_coroutine_yield();
3889         }
3890         i++;
3891
3892         addr = qemu_get_be64(f);
3893         flags = addr & ~TARGET_PAGE_MASK;
3894         addr &= TARGET_PAGE_MASK;
3895
3896         if (flags & invalid_flags) {
3897             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3898                 error_report("Received an unexpected compressed page");
3899             }
3900
3901             ret = -EINVAL;
3902             break;
3903         }
3904
3905         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3906                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3907             RAMBlock *block = ram_block_from_stream(mis, f, flags,
3908                                                     RAM_CHANNEL_PRECOPY);
3909
3910             host = host_from_ram_block_offset(block, addr);
3911             /*
3912              * After going into COLO stage, we should not load the page
3913              * into SVM's memory directly, we put them into colo_cache firstly.
3914              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3915              * Previously, we copied all these memory in preparing stage of COLO
3916              * while we need to stop VM, which is a time-consuming process.
3917              * Here we optimize it by a trick, back-up every page while in
3918              * migration process while COLO is enabled, though it affects the
3919              * speed of the migration, but it obviously reduce the downtime of
3920              * back-up all SVM'S memory in COLO preparing stage.
3921              */
3922             if (migration_incoming_colo_enabled()) {
3923                 if (migration_incoming_in_colo_state()) {
3924                     /* In COLO stage, put all pages into cache temporarily */
3925                     host = colo_cache_from_block_offset(block, addr, true);
3926                 } else {
3927                    /*
3928                     * In migration stage but before COLO stage,
3929                     * Put all pages into both cache and SVM's memory.
3930                     */
3931                     host_bak = colo_cache_from_block_offset(block, addr, false);
3932                 }
3933             }
3934             if (!host) {
3935                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3936                 ret = -EINVAL;
3937                 break;
3938             }
3939             if (!migration_incoming_in_colo_state()) {
3940                 ramblock_recv_bitmap_set(block, host);
3941             }
3942
3943             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3944         }
3945
3946         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3947         case RAM_SAVE_FLAG_MEM_SIZE:
3948             /* Synchronize RAM block list */
3949             total_ram_bytes = addr;
3950             while (!ret && total_ram_bytes) {
3951                 RAMBlock *block;
3952                 char id[256];
3953                 ram_addr_t length;
3954
3955                 len = qemu_get_byte(f);
3956                 qemu_get_buffer(f, (uint8_t *)id, len);
3957                 id[len] = 0;
3958                 length = qemu_get_be64(f);
3959
3960                 block = qemu_ram_block_by_name(id);
3961                 if (block && !qemu_ram_is_migratable(block)) {
3962                     error_report("block %s should not be migrated !", id);
3963                     ret = -EINVAL;
3964                 } else if (block) {
3965                     if (length != block->used_length) {
3966                         Error *local_err = NULL;
3967
3968                         ret = qemu_ram_resize(block, length,
3969                                               &local_err);
3970                         if (local_err) {
3971                             error_report_err(local_err);
3972                         }
3973                     }
3974                     /* For postcopy we need to check hugepage sizes match */
3975                     if (postcopy_advised && migrate_postcopy_ram() &&
3976                         block->page_size != qemu_host_page_size) {
3977                         uint64_t remote_page_size = qemu_get_be64(f);
3978                         if (remote_page_size != block->page_size) {
3979                             error_report("Mismatched RAM page size %s "
3980                                          "(local) %zd != %" PRId64,
3981                                          id, block->page_size,
3982                                          remote_page_size);
3983                             ret = -EINVAL;
3984                         }
3985                     }
3986                     if (migrate_ignore_shared()) {
3987                         hwaddr addr = qemu_get_be64(f);
3988                         if (ramblock_is_ignored(block) &&
3989                             block->mr->addr != addr) {
3990                             error_report("Mismatched GPAs for block %s "
3991                                          "%" PRId64 "!= %" PRId64,
3992                                          id, (uint64_t)addr,
3993                                          (uint64_t)block->mr->addr);
3994                             ret = -EINVAL;
3995                         }
3996                     }
3997                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3998                                           block->idstr);
3999                 } else {
4000                     error_report("Unknown ramblock \"%s\", cannot "
4001                                  "accept migration", id);
4002                     ret = -EINVAL;
4003                 }
4004
4005                 total_ram_bytes -= length;
4006             }
4007             break;
4008
4009         case RAM_SAVE_FLAG_ZERO:
4010             ch = qemu_get_byte(f);
4011             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4012             break;
4013
4014         case RAM_SAVE_FLAG_PAGE:
4015             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4016             break;
4017
4018         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4019             len = qemu_get_be32(f);
4020             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4021                 error_report("Invalid compressed data length: %d", len);
4022                 ret = -EINVAL;
4023                 break;
4024             }
4025             decompress_data_with_multi_threads(f, host, len);
4026             break;
4027
4028         case RAM_SAVE_FLAG_XBZRLE:
4029             if (load_xbzrle(f, addr, host) < 0) {
4030                 error_report("Failed to decompress XBZRLE page at "
4031                              RAM_ADDR_FMT, addr);
4032                 ret = -EINVAL;
4033                 break;
4034             }
4035             break;
4036         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4037             multifd_recv_sync_main();
4038             break;
4039         case RAM_SAVE_FLAG_EOS:
4040             /* normal exit */
4041             if (migrate_multifd_flush_after_each_section()) {
4042                 multifd_recv_sync_main();
4043             }
4044             break;
4045         case RAM_SAVE_FLAG_HOOK:
4046             ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4047             break;
4048         default:
4049             error_report("Unknown combination of migration flags: 0x%x", flags);
4050             ret = -EINVAL;
4051         }
4052         if (!ret) {
4053             ret = qemu_file_get_error(f);
4054         }
4055         if (!ret && host_bak) {
4056             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4057         }
4058     }
4059
4060     ret |= wait_for_decompress_done();
4061     return ret;
4062 }
4063
4064 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4065 {
4066     int ret = 0;
4067     static uint64_t seq_iter;
4068     /*
4069      * If system is running in postcopy mode, page inserts to host memory must
4070      * be atomic
4071      */
4072     bool postcopy_running = postcopy_is_running();
4073
4074     seq_iter++;
4075
4076     if (version_id != 4) {
4077         return -EINVAL;
4078     }
4079
4080     /*
4081      * This RCU critical section can be very long running.
4082      * When RCU reclaims in the code start to become numerous,
4083      * it will be necessary to reduce the granularity of this
4084      * critical section.
4085      */
4086     WITH_RCU_READ_LOCK_GUARD() {
4087         if (postcopy_running) {
4088             /*
4089              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4090              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4091              * service fast page faults.
4092              */
4093             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4094         } else {
4095             ret = ram_load_precopy(f);
4096         }
4097     }
4098     trace_ram_load_complete(ret, seq_iter);
4099
4100     return ret;
4101 }
4102
4103 static bool ram_has_postcopy(void *opaque)
4104 {
4105     RAMBlock *rb;
4106     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4107         if (ramblock_is_pmem(rb)) {
4108             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4109                          "is not supported now!", rb->idstr, rb->host);
4110             return false;
4111         }
4112     }
4113
4114     return migrate_postcopy_ram();
4115 }
4116
4117 /* Sync all the dirty bitmap with destination VM.  */
4118 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4119 {
4120     RAMBlock *block;
4121     QEMUFile *file = s->to_dst_file;
4122     int ramblock_count = 0;
4123
4124     trace_ram_dirty_bitmap_sync_start();
4125
4126     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4127         qemu_savevm_send_recv_bitmap(file, block->idstr);
4128         trace_ram_dirty_bitmap_request(block->idstr);
4129         ramblock_count++;
4130     }
4131
4132     trace_ram_dirty_bitmap_sync_wait();
4133
4134     /* Wait until all the ramblocks' dirty bitmap synced */
4135     while (ramblock_count--) {
4136         qemu_sem_wait(&s->rp_state.rp_sem);
4137     }
4138
4139     trace_ram_dirty_bitmap_sync_complete();
4140
4141     return 0;
4142 }
4143
4144 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4145 {
4146     qemu_sem_post(&s->rp_state.rp_sem);
4147 }
4148
4149 /*
4150  * Read the received bitmap, revert it as the initial dirty bitmap.
4151  * This is only used when the postcopy migration is paused but wants
4152  * to resume from a middle point.
4153  */
4154 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4155 {
4156     int ret = -EINVAL;
4157     /* from_dst_file is always valid because we're within rp_thread */
4158     QEMUFile *file = s->rp_state.from_dst_file;
4159     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4160     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4161     uint64_t size, end_mark;
4162
4163     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4164
4165     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4166         error_report("%s: incorrect state %s", __func__,
4167                      MigrationStatus_str(s->state));
4168         return -EINVAL;
4169     }
4170
4171     /*
4172      * Note: see comments in ramblock_recv_bitmap_send() on why we
4173      * need the endianness conversion, and the paddings.
4174      */
4175     local_size = ROUND_UP(local_size, 8);
4176
4177     /* Add paddings */
4178     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4179
4180     size = qemu_get_be64(file);
4181
4182     /* The size of the bitmap should match with our ramblock */
4183     if (size != local_size) {
4184         error_report("%s: ramblock '%s' bitmap size mismatch "
4185                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4186                      block->idstr, size, local_size);
4187         ret = -EINVAL;
4188         goto out;
4189     }
4190
4191     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4192     end_mark = qemu_get_be64(file);
4193
4194     ret = qemu_file_get_error(file);
4195     if (ret || size != local_size) {
4196         error_report("%s: read bitmap failed for ramblock '%s': %d"
4197                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4198                      __func__, block->idstr, ret, local_size, size);
4199         ret = -EIO;
4200         goto out;
4201     }
4202
4203     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4204         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4205                      __func__, block->idstr, end_mark);
4206         ret = -EINVAL;
4207         goto out;
4208     }
4209
4210     /*
4211      * Endianness conversion. We are during postcopy (though paused).
4212      * The dirty bitmap won't change. We can directly modify it.
4213      */
4214     bitmap_from_le(block->bmap, le_bitmap, nbits);
4215
4216     /*
4217      * What we received is "received bitmap". Revert it as the initial
4218      * dirty bitmap for this ramblock.
4219      */
4220     bitmap_complement(block->bmap, block->bmap, nbits);
4221
4222     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4223     ramblock_dirty_bitmap_clear_discarded_pages(block);
4224
4225     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4226     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4227
4228     /*
4229      * We succeeded to sync bitmap for current ramblock. If this is
4230      * the last one to sync, we need to notify the main send thread.
4231      */
4232     ram_dirty_bitmap_reload_notify(s);
4233
4234     ret = 0;
4235 out:
4236     g_free(le_bitmap);
4237     return ret;
4238 }
4239
4240 static int ram_resume_prepare(MigrationState *s, void *opaque)
4241 {
4242     RAMState *rs = *(RAMState **)opaque;
4243     int ret;
4244
4245     ret = ram_dirty_bitmap_sync_all(s, rs);
4246     if (ret) {
4247         return ret;
4248     }
4249
4250     ram_state_resume_prepare(rs, s->to_dst_file);
4251
4252     return 0;
4253 }
4254
4255 void postcopy_preempt_shutdown_file(MigrationState *s)
4256 {
4257     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4258     qemu_fflush(s->postcopy_qemufile_src);
4259 }
4260
4261 static SaveVMHandlers savevm_ram_handlers = {
4262     .save_setup = ram_save_setup,
4263     .save_live_iterate = ram_save_iterate,
4264     .save_live_complete_postcopy = ram_save_complete,
4265     .save_live_complete_precopy = ram_save_complete,
4266     .has_postcopy = ram_has_postcopy,
4267     .state_pending_exact = ram_state_pending_exact,
4268     .state_pending_estimate = ram_state_pending_estimate,
4269     .load_state = ram_load,
4270     .save_cleanup = ram_save_cleanup,
4271     .load_setup = ram_load_setup,
4272     .load_cleanup = ram_load_cleanup,
4273     .resume_prepare = ram_resume_prepare,
4274 };
4275
4276 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4277                                       size_t old_size, size_t new_size)
4278 {
4279     PostcopyState ps = postcopy_state_get();
4280     ram_addr_t offset;
4281     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4282     Error *err = NULL;
4283
4284     if (ramblock_is_ignored(rb)) {
4285         return;
4286     }
4287
4288     if (!migration_is_idle()) {
4289         /*
4290          * Precopy code on the source cannot deal with the size of RAM blocks
4291          * changing at random points in time - especially after sending the
4292          * RAM block sizes in the migration stream, they must no longer change.
4293          * Abort and indicate a proper reason.
4294          */
4295         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4296         migration_cancel(err);
4297         error_free(err);
4298     }
4299
4300     switch (ps) {
4301     case POSTCOPY_INCOMING_ADVISE:
4302         /*
4303          * Update what ram_postcopy_incoming_init()->init_range() does at the
4304          * time postcopy was advised. Syncing RAM blocks with the source will
4305          * result in RAM resizes.
4306          */
4307         if (old_size < new_size) {
4308             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4309                 error_report("RAM block '%s' discard of resized RAM failed",
4310                              rb->idstr);
4311             }
4312         }
4313         rb->postcopy_length = new_size;
4314         break;
4315     case POSTCOPY_INCOMING_NONE:
4316     case POSTCOPY_INCOMING_RUNNING:
4317     case POSTCOPY_INCOMING_END:
4318         /*
4319          * Once our guest is running, postcopy does no longer care about
4320          * resizes. When growing, the new memory was not available on the
4321          * source, no handler needed.
4322          */
4323         break;
4324     default:
4325         error_report("RAM block '%s' resized during postcopy state: %d",
4326                      rb->idstr, ps);
4327         exit(-1);
4328     }
4329 }
4330
4331 static RAMBlockNotifier ram_mig_ram_notifier = {
4332     .ram_block_resized = ram_mig_ram_block_resized,
4333 };
4334
4335 void ram_mig_init(void)
4336 {
4337     qemu_mutex_init(&XBZRLE.lock);
4338     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4339     ram_block_notifier_add(&ram_mig_ram_notifier);
4340 }