migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/main-loop.h"
  34 #include "xbzrle.h"
  35 #include "ram.h"
  36 #include "migration.h"
  37 #include "migration/register.h"
  38 #include "migration/misc.h"
  39 #include "qemu-file.h"
  40 #include "postcopy-ram.h"
  41 #include "page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "qapi/error.h"
  44 #include "qapi/qapi-types-migration.h"
  45 #include "qapi/qapi-events-migration.h"
  46 #include "qapi/qmp/qerror.h"
  47 #include "trace.h"
  48 #include "exec/ram_addr.h"
  49 #include "exec/target_page.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52 #include "block.h"
  53 #include "sysemu/cpu-throttle.h"
  54 #include "savevm.h"
  55 #include "qemu/iov.h"
  56 #include "multifd.h"
  57 #include "sysemu/runstate.h"
  58
  59 #include "hw/boards.h" /* for machine_dump_guest_core() */
  60
  61 #if defined(__linux__)
  62 #include "qemu/userfaultfd.h"
  63 #endif /* defined(__linux__) */
  64
  65 /***********************************************************/
  66 /* ram save/restore */
  67
  68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  69  * worked for pages that where filled with the same char.  We switched
  70  * it to only search for the zero value.  And to avoid confusion with
  71  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  72  */
  73
  74 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  75 #define RAM_SAVE_FLAG_ZERO     0x02
  76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  77 #define RAM_SAVE_FLAG_PAGE     0x08
  78 #define RAM_SAVE_FLAG_EOS      0x10
  79 #define RAM_SAVE_FLAG_CONTINUE 0x20
  80 #define RAM_SAVE_FLAG_XBZRLE   0x40
  81 /* 0x80 is reserved in migration.h start with 0x100 next */
  82 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  83
  84 XBZRLECacheStats xbzrle_counters;
  85
  86 /* struct contains XBZRLE cache and a static page
  87    used by the compression */
  88 static struct {
  89     /* buffer used for XBZRLE encoding */
  90     uint8_t *encoded_buf;
  91     /* buffer for storing page content */
  92     uint8_t *current_buf;
  93     /* Cache for XBZRLE, Protected by lock. */
  94     PageCache *cache;
  95     QemuMutex lock;
  96     /* it will store a page full of zeros */
  97     uint8_t *zero_target_page;
  98     /* buffer used for XBZRLE decoding */
  99     uint8_t *decoded_buf;
 100 } XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle()) {
 105         qemu_mutex_lock(&XBZRLE.lock);
 106     }
 107 }
 108
 109 static void XBZRLE_cache_unlock(void)
 110 {
 111     if (migrate_use_xbzrle()) {
 112         qemu_mutex_unlock(&XBZRLE.lock);
 113     }
 114 }
 115
 116 /**
 117  * xbzrle_cache_resize: resize the xbzrle cache
 118  *
 119  * This function is called from migrate_params_apply in main
 120  * thread, possibly while a migration is in progress.  A running
 121  * migration may be using the cache and might finish during this call,
 122  * hence changes to the cache are protected by XBZRLE.lock().
 123  *
 124  * Returns 0 for success or -1 for error
 125  *
 126  * @new_size: new cache size
 127  * @errp: set *errp if the check failed, with reason
 128  */
 129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 130 {
 131     PageCache *new_cache;
 132     int64_t ret = 0;
 133
 134     /* Check for truncation */
 135     if (new_size != (size_t)new_size) {
 136         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 137                    "exceeding address space");
 138         return -1;
 139     }
 140
 141     if (new_size == migrate_xbzrle_cache_size()) {
 142         /* nothing to do */
 143         return 0;
 144     }
 145
 146     XBZRLE_cache_lock();
 147
 148     if (XBZRLE.cache != NULL) {
 149         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 150         if (!new_cache) {
 151             ret = -1;
 152             goto out;
 153         }
 154
 155         cache_fini(XBZRLE.cache);
 156         XBZRLE.cache = new_cache;
 157     }
 158 out:
 159     XBZRLE_cache_unlock();
 160     return ret;
 161 }
 162
 163 bool ramblock_is_ignored(RAMBlock *block)
 164 {
 165     return !qemu_ram_is_migratable(block) ||
 166            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 167 }
 168
 169 #undef RAMBLOCK_FOREACH
 170
 171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 172 {
 173     RAMBlock *block;
 174     int ret = 0;
 175
 176     RCU_READ_LOCK_GUARD();
 177
 178     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 179         ret = func(block, opaque);
 180         if (ret) {
 181             break;
 182         }
 183     }
 184     return ret;
 185 }
 186
 187 static void ramblock_recv_map_init(void)
 188 {
 189     RAMBlock *rb;
 190
 191     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 192         assert(!rb->receivedmap);
 193         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 194     }
 195 }
 196
 197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 198 {
 199     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 200                     rb->receivedmap);
 201 }
 202
 203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 204 {
 205     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 206 }
 207
 208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 209 {
 210     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 211 }
 212
 213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 214                                     size_t nr)
 215 {
 216     bitmap_set_atomic(rb->receivedmap,
 217                       ramblock_recv_bitmap_offset(host_addr, rb),
 218                       nr);
 219 }
 220
 221 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 222
 223 /*
 224  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 225  *
 226  * Returns >0 if success with sent bytes, or <0 if error.
 227  */
 228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 229                                   const char *block_name)
 230 {
 231     RAMBlock *block = qemu_ram_block_by_name(block_name);
 232     unsigned long *le_bitmap, nbits;
 233     uint64_t size;
 234
 235     if (!block) {
 236         error_report("%s: invalid block name: %s", __func__, block_name);
 237         return -1;
 238     }
 239
 240     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 241
 242     /*
 243      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 244      * machines we may need 4 more bytes for padding (see below
 245      * comment). So extend it a bit before hand.
 246      */
 247     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 248
 249     /*
 250      * Always use little endian when sending the bitmap. This is
 251      * required that when source and destination VMs are not using the
 252      * same endianness. (Note: big endian won't work.)
 253      */
 254     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 255
 256     /* Size of the bitmap, in bytes */
 257     size = DIV_ROUND_UP(nbits, 8);
 258
 259     /*
 260      * size is always aligned to 8 bytes for 64bit machines, but it
 261      * may not be true for 32bit machines. We need this padding to
 262      * make sure the migration can survive even between 32bit and
 263      * 64bit machines.
 264      */
 265     size = ROUND_UP(size, 8);
 266
 267     qemu_put_be64(file, size);
 268     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 269     /*
 270      * Mark as an end, in case the middle part is screwed up due to
 271      * some "mysterious" reason.
 272      */
 273     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 274     qemu_fflush(file);
 275
 276     g_free(le_bitmap);
 277
 278     if (qemu_file_get_error(file)) {
 279         return qemu_file_get_error(file);
 280     }
 281
 282     return size + sizeof(size);
 283 }
 284
 285 /*
 286  * An outstanding page request, on the source, having been received
 287  * and queued
 288  */
 289 struct RAMSrcPageRequest {
 290     RAMBlock *rb;
 291     hwaddr    offset;
 292     hwaddr    len;
 293
 294     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 295 };
 296
 297 /* State of RAM for migration */
 298 struct RAMState {
 299     /* QEMUFile used for this migration */
 300     QEMUFile *f;
 301     /* UFFD file descriptor, used in 'write-tracking' migration */
 302     int uffdio_fd;
 303     /* Last block that we have visited searching for dirty pages */
 304     RAMBlock *last_seen_block;
 305     /* Last block from where we have sent data */
 306     RAMBlock *last_sent_block;
 307     /* Last dirty target page we have sent */
 308     ram_addr_t last_page;
 309     /* last ram version we have seen */
 310     uint32_t last_version;
 311     /* How many times we have dirty too many pages */
 312     int dirty_rate_high_cnt;
 313     /* these variables are used for bitmap sync */
 314     /* last time we did a full bitmap_sync */
 315     int64_t time_last_bitmap_sync;
 316     /* bytes transferred at start_time */
 317     uint64_t bytes_xfer_prev;
 318     /* number of dirty pages since start_time */
 319     uint64_t num_dirty_pages_period;
 320     /* xbzrle misses since the beginning of the period */
 321     uint64_t xbzrle_cache_miss_prev;
 322     /* Amount of xbzrle pages since the beginning of the period */
 323     uint64_t xbzrle_pages_prev;
 324     /* Amount of xbzrle encoded bytes since the beginning of the period */
 325     uint64_t xbzrle_bytes_prev;
 326     /* Start using XBZRLE (e.g., after the first round). */
 327     bool xbzrle_enabled;
 328     /* Are we on the last stage of migration */
 329     bool last_stage;
 330     /* compression statistics since the beginning of the period */
 331     /* amount of count that no free thread to compress data */
 332     uint64_t compress_thread_busy_prev;
 333     /* amount bytes after compression */
 334     uint64_t compressed_size_prev;
 335     /* amount of compressed pages */
 336     uint64_t compress_pages_prev;
 337
 338     /* total handled target pages at the beginning of period */
 339     uint64_t target_page_count_prev;
 340     /* total handled target pages since start */
 341     uint64_t target_page_count;
 342     /* number of dirty bits in the bitmap */
 343     uint64_t migration_dirty_pages;
 344     /* Protects modification of the bitmap and migration dirty pages */
 345     QemuMutex bitmap_mutex;
 346     /* The RAMBlock used in the last src_page_requests */
 347     RAMBlock *last_req_rb;
 348     /* Queue of outstanding page requests from the destination */
 349     QemuMutex src_page_req_mutex;
 350     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 351 };
 352 typedef struct RAMState RAMState;
 353
 354 static RAMState *ram_state;
 355
 356 static NotifierWithReturnList precopy_notifier_list;
 357
 358 /* Whether postcopy has queued requests? */
 359 static bool postcopy_has_request(RAMState *rs)
 360 {
 361     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 362 }
 363
 364 void precopy_infrastructure_init(void)
 365 {
 366     notifier_with_return_list_init(&precopy_notifier_list);
 367 }
 368
 369 void precopy_add_notifier(NotifierWithReturn *n)
 370 {
 371     notifier_with_return_list_add(&precopy_notifier_list, n);
 372 }
 373
 374 void precopy_remove_notifier(NotifierWithReturn *n)
 375 {
 376     notifier_with_return_remove(n);
 377 }
 378
 379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 380 {
 381     PrecopyNotifyData pnd;
 382     pnd.reason = reason;
 383     pnd.errp = errp;
 384
 385     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 386 }
 387
 388 uint64_t ram_bytes_remaining(void)
 389 {
 390     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 391                        0;
 392 }
 393
 394 MigrationStats ram_counters;
 395
 396 static void ram_transferred_add(uint64_t bytes)
 397 {
 398     if (runstate_is_running()) {
 399         ram_counters.precopy_bytes += bytes;
 400     } else if (migration_in_postcopy()) {
 401         ram_counters.postcopy_bytes += bytes;
 402     } else {
 403         ram_counters.downtime_bytes += bytes;
 404     }
 405     ram_counters.transferred += bytes;
 406 }
 407
 408 /* used by the search for pages to send */
 409 struct PageSearchStatus {
 410     /* Current block being searched */
 411     RAMBlock    *block;
 412     /* Current page to search from */
 413     unsigned long page;
 414     /* Set once we wrap around */
 415     bool         complete_round;
 416 };
 417 typedef struct PageSearchStatus PageSearchStatus;
 418
 419 CompressionStats compression_counters;
 420
 421 struct CompressParam {
 422     bool done;
 423     bool quit;
 424     bool zero_page;
 425     QEMUFile *file;
 426     QemuMutex mutex;
 427     QemuCond cond;
 428     RAMBlock *block;
 429     ram_addr_t offset;
 430
 431     /* internally used fields */
 432     z_stream stream;
 433     uint8_t *originbuf;
 434 };
 435 typedef struct CompressParam CompressParam;
 436
 437 struct DecompressParam {
 438     bool done;
 439     bool quit;
 440     QemuMutex mutex;
 441     QemuCond cond;
 442     void *des;
 443     uint8_t *compbuf;
 444     int len;
 445     z_stream stream;
 446 };
 447 typedef struct DecompressParam DecompressParam;
 448
 449 static CompressParam *comp_param;
 450 static QemuThread *compress_threads;
 451 /* comp_done_cond is used to wake up the migration thread when
 452  * one of the compression threads has finished the compression.
 453  * comp_done_lock is used to co-work with comp_done_cond.
 454  */
 455 static QemuMutex comp_done_lock;
 456 static QemuCond comp_done_cond;
 457 /* The empty QEMUFileOps will be used by file in CompressParam */
 458 static const QEMUFileOps empty_ops = { };
 459
 460 static QEMUFile *decomp_file;
 461 static DecompressParam *decomp_param;
 462 static QemuThread *decompress_threads;
 463 static QemuMutex decomp_done_lock;
 464 static QemuCond decomp_done_cond;
 465
 466 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 467                                  ram_addr_t offset, uint8_t *source_buf);
 468
 469 static void *do_data_compress(void *opaque)
 470 {
 471     CompressParam *param = opaque;
 472     RAMBlock *block;
 473     ram_addr_t offset;
 474     bool zero_page;
 475
 476     qemu_mutex_lock(&param->mutex);
 477     while (!param->quit) {
 478         if (param->block) {
 479             block = param->block;
 480             offset = param->offset;
 481             param->block = NULL;
 482             qemu_mutex_unlock(&param->mutex);
 483
 484             zero_page = do_compress_ram_page(param->file, &param->stream,
 485                                              block, offset, param->originbuf);
 486
 487             qemu_mutex_lock(&comp_done_lock);
 488             param->done = true;
 489             param->zero_page = zero_page;
 490             qemu_cond_signal(&comp_done_cond);
 491             qemu_mutex_unlock(&comp_done_lock);
 492
 493             qemu_mutex_lock(&param->mutex);
 494         } else {
 495             qemu_cond_wait(&param->cond, &param->mutex);
 496         }
 497     }
 498     qemu_mutex_unlock(&param->mutex);
 499
 500     return NULL;
 501 }
 502
 503 static void compress_threads_save_cleanup(void)
 504 {
 505     int i, thread_count;
 506
 507     if (!migrate_use_compression() || !comp_param) {
 508         return;
 509     }
 510
 511     thread_count = migrate_compress_threads();
 512     for (i = 0; i < thread_count; i++) {
 513         /*
 514          * we use it as a indicator which shows if the thread is
 515          * properly init'd or not
 516          */
 517         if (!comp_param[i].file) {
 518             break;
 519         }
 520
 521         qemu_mutex_lock(&comp_param[i].mutex);
 522         comp_param[i].quit = true;
 523         qemu_cond_signal(&comp_param[i].cond);
 524         qemu_mutex_unlock(&comp_param[i].mutex);
 525
 526         qemu_thread_join(compress_threads + i);
 527         qemu_mutex_destroy(&comp_param[i].mutex);
 528         qemu_cond_destroy(&comp_param[i].cond);
 529         deflateEnd(&comp_param[i].stream);
 530         g_free(comp_param[i].originbuf);
 531         qemu_fclose(comp_param[i].file);
 532         comp_param[i].file = NULL;
 533     }
 534     qemu_mutex_destroy(&comp_done_lock);
 535     qemu_cond_destroy(&comp_done_cond);
 536     g_free(compress_threads);
 537     g_free(comp_param);
 538     compress_threads = NULL;
 539     comp_param = NULL;
 540 }
 541
 542 static int compress_threads_save_setup(void)
 543 {
 544     int i, thread_count;
 545
 546     if (!migrate_use_compression()) {
 547         return 0;
 548     }
 549     thread_count = migrate_compress_threads();
 550     compress_threads = g_new0(QemuThread, thread_count);
 551     comp_param = g_new0(CompressParam, thread_count);
 552     qemu_cond_init(&comp_done_cond);
 553     qemu_mutex_init(&comp_done_lock);
 554     for (i = 0; i < thread_count; i++) {
 555         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 556         if (!comp_param[i].originbuf) {
 557             goto exit;
 558         }
 559
 560         if (deflateInit(&comp_param[i].stream,
 561                         migrate_compress_level()) != Z_OK) {
 562             g_free(comp_param[i].originbuf);
 563             goto exit;
 564         }
 565
 566         /* comp_param[i].file is just used as a dummy buffer to save data,
 567          * set its ops to empty.
 568          */
 569         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 570         comp_param[i].done = true;
 571         comp_param[i].quit = false;
 572         qemu_mutex_init(&comp_param[i].mutex);
 573         qemu_cond_init(&comp_param[i].cond);
 574         qemu_thread_create(compress_threads + i, "compress",
 575                            do_data_compress, comp_param + i,
 576                            QEMU_THREAD_JOINABLE);
 577     }
 578     return 0;
 579
 580 exit:
 581     compress_threads_save_cleanup();
 582     return -1;
 583 }
 584
 585 /**
 586  * save_page_header: write page header to wire
 587  *
 588  * If this is the 1st block, it also writes the block identification
 589  *
 590  * Returns the number of bytes written
 591  *
 592  * @f: QEMUFile where to send the data
 593  * @block: block that contains the page we want to send
 594  * @offset: offset inside the block for the page
 595  *          in the lower bits, it contains flags
 596  */
 597 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 598                                ram_addr_t offset)
 599 {
 600     size_t size, len;
 601
 602     if (block == rs->last_sent_block) {
 603         offset |= RAM_SAVE_FLAG_CONTINUE;
 604     }
 605     qemu_put_be64(f, offset);
 606     size = 8;
 607
 608     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 609         len = strlen(block->idstr);
 610         qemu_put_byte(f, len);
 611         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 612         size += 1 + len;
 613         rs->last_sent_block = block;
 614     }
 615     return size;
 616 }
 617
 618 /**
 619  * mig_throttle_guest_down: throttle down the guest
 620  *
 621  * Reduce amount of guest cpu execution to hopefully slow down memory
 622  * writes. If guest dirty memory rate is reduced below the rate at
 623  * which we can transfer pages to the destination then we should be
 624  * able to complete migration. Some workloads dirty memory way too
 625  * fast and will not effectively converge, even with auto-converge.
 626  */
 627 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 628                                     uint64_t bytes_dirty_threshold)
 629 {
 630     MigrationState *s = migrate_get_current();
 631     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 632     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 633     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 634     int pct_max = s->parameters.max_cpu_throttle;
 635
 636     uint64_t throttle_now = cpu_throttle_get_percentage();
 637     uint64_t cpu_now, cpu_ideal, throttle_inc;
 638
 639     /* We have not started throttling yet. Let's start it. */
 640     if (!cpu_throttle_active()) {
 641         cpu_throttle_set(pct_initial);
 642     } else {
 643         /* Throttling already on, just increase the rate */
 644         if (!pct_tailslow) {
 645             throttle_inc = pct_increment;
 646         } else {
 647             /* Compute the ideal CPU percentage used by Guest, which may
 648              * make the dirty rate match the dirty rate threshold. */
 649             cpu_now = 100 - throttle_now;
 650             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 651                         bytes_dirty_period);
 652             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 653         }
 654         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 655     }
 656 }
 657
 658 void mig_throttle_counter_reset(void)
 659 {
 660     RAMState *rs = ram_state;
 661
 662     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 663     rs->num_dirty_pages_period = 0;
 664     rs->bytes_xfer_prev = ram_counters.transferred;
 665 }
 666
 667 /**
 668  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 669  *
 670  * @rs: current RAM state
 671  * @current_addr: address for the zero page
 672  *
 673  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 674  * The important thing is that a stale (not-yet-0'd) page be replaced
 675  * by the new data.
 676  * As a bonus, if the page wasn't in the cache it gets added so that
 677  * when a small write is made into the 0'd page it gets XBZRLE sent.
 678  */
 679 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 680 {
 681     if (!rs->xbzrle_enabled) {
 682         return;
 683     }
 684
 685     /* We don't care if this fails to allocate a new cache page
 686      * as long as it updated an old one */
 687     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 688                  ram_counters.dirty_sync_count);
 689 }
 690
 691 #define ENCODING_FLAG_XBZRLE 0x1
 692
 693 /**
 694  * save_xbzrle_page: compress and send current page
 695  *
 696  * Returns: 1 means that we wrote the page
 697  *          0 means that page is identical to the one already sent
 698  *          -1 means that xbzrle would be longer than normal
 699  *
 700  * @rs: current RAM state
 701  * @current_data: pointer to the address of the page contents
 702  * @current_addr: addr of the page
 703  * @block: block that contains the page we want to send
 704  * @offset: offset inside the block for the page
 705  */
 706 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 707                             ram_addr_t current_addr, RAMBlock *block,
 708                             ram_addr_t offset)
 709 {
 710     int encoded_len = 0, bytes_xbzrle;
 711     uint8_t *prev_cached_page;
 712
 713     if (!cache_is_cached(XBZRLE.cache, current_addr,
 714                          ram_counters.dirty_sync_count)) {
 715         xbzrle_counters.cache_miss++;
 716         if (!rs->last_stage) {
 717             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 718                              ram_counters.dirty_sync_count) == -1) {
 719                 return -1;
 720             } else {
 721                 /* update *current_data when the page has been
 722                    inserted into cache */
 723                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 724             }
 725         }
 726         return -1;
 727     }
 728
 729     /*
 730      * Reaching here means the page has hit the xbzrle cache, no matter what
 731      * encoding result it is (normal encoding, overflow or skipping the page),
 732      * count the page as encoded. This is used to calculate the encoding rate.
 733      *
 734      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 735      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 736      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 737      * skipped page included. In this way, the encoding rate can tell if the
 738      * guest page is good for xbzrle encoding.
 739      */
 740     xbzrle_counters.pages++;
 741     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 742
 743     /* save current buffer into memory */
 744     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 745
 746     /* XBZRLE encoding (if there is no overflow) */
 747     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 748                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 749                                        TARGET_PAGE_SIZE);
 750
 751     /*
 752      * Update the cache contents, so that it corresponds to the data
 753      * sent, in all cases except where we skip the page.
 754      */
 755     if (!rs->last_stage && encoded_len != 0) {
 756         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 757         /*
 758          * In the case where we couldn't compress, ensure that the caller
 759          * sends the data from the cache, since the guest might have
 760          * changed the RAM since we copied it.
 761          */
 762         *current_data = prev_cached_page;
 763     }
 764
 765     if (encoded_len == 0) {
 766         trace_save_xbzrle_page_skipping();
 767         return 0;
 768     } else if (encoded_len == -1) {
 769         trace_save_xbzrle_page_overflow();
 770         xbzrle_counters.overflow++;
 771         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 772         return -1;
 773     }
 774
 775     /* Send XBZRLE based compressed page */
 776     bytes_xbzrle = save_page_header(rs, rs->f, block,
 777                                     offset | RAM_SAVE_FLAG_XBZRLE);
 778     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 779     qemu_put_be16(rs->f, encoded_len);
 780     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 781     bytes_xbzrle += encoded_len + 1 + 2;
 782     /*
 783      * Like compressed_size (please see update_compress_thread_counts),
 784      * the xbzrle encoded bytes don't count the 8 byte header with
 785      * RAM_SAVE_FLAG_CONTINUE.
 786      */
 787     xbzrle_counters.bytes += bytes_xbzrle - 8;
 788     ram_transferred_add(bytes_xbzrle);
 789
 790     return 1;
 791 }
 792
 793 /**
 794  * migration_bitmap_find_dirty: find the next dirty page from start
 795  *
 796  * Returns the page offset within memory region of the start of a dirty page
 797  *
 798  * @rs: current RAM state
 799  * @rb: RAMBlock where to search for dirty pages
 800  * @start: page where we start the search
 801  */
 802 static inline
 803 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 804                                           unsigned long start)
 805 {
 806     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 807     unsigned long *bitmap = rb->bmap;
 808
 809     if (ramblock_is_ignored(rb)) {
 810         return size;
 811     }
 812
 813     return find_next_bit(bitmap, size, start);
 814 }
 815
 816 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 817                                                        unsigned long page)
 818 {
 819     uint8_t shift;
 820     hwaddr size, start;
 821
 822     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 823         return;
 824     }
 825
 826     shift = rb->clear_bmap_shift;
 827     /*
 828      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 829      * can make things easier sometimes since then start address
 830      * of the small chunk will always be 64 pages aligned so the
 831      * bitmap will always be aligned to unsigned long. We should
 832      * even be able to remove this restriction but I'm simply
 833      * keeping it.
 834      */
 835     assert(shift >= 6);
 836
 837     size = 1ULL << (TARGET_PAGE_BITS + shift);
 838     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 839     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 840     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 841 }
 842
 843 static void
 844 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 845                                                  unsigned long start,
 846                                                  unsigned long npages)
 847 {
 848     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 849     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 850     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 851
 852     /*
 853      * Clear pages from start to start + npages - 1, so the end boundary is
 854      * exclusive.
 855      */
 856     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 857         migration_clear_memory_region_dirty_bitmap(rb, i);
 858     }
 859 }
 860
 861 /*
 862  * colo_bitmap_find_diry:find contiguous dirty pages from start
 863  *
 864  * Returns the page offset within memory region of the start of the contiguout
 865  * dirty page
 866  *
 867  * @rs: current RAM state
 868  * @rb: RAMBlock where to search for dirty pages
 869  * @start: page where we start the search
 870  * @num: the number of contiguous dirty pages
 871  */
 872 static inline
 873 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 874                                      unsigned long start, unsigned long *num)
 875 {
 876     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 877     unsigned long *bitmap = rb->bmap;
 878     unsigned long first, next;
 879
 880     *num = 0;
 881
 882     if (ramblock_is_ignored(rb)) {
 883         return size;
 884     }
 885
 886     first = find_next_bit(bitmap, size, start);
 887     if (first >= size) {
 888         return first;
 889     }
 890     next = find_next_zero_bit(bitmap, size, first + 1);
 891     assert(next >= first);
 892     *num = next - first;
 893     return first;
 894 }
 895
 896 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 897                                                 RAMBlock *rb,
 898                                                 unsigned long page)
 899 {
 900     bool ret;
 901
 902     /*
 903      * Clear dirty bitmap if needed.  This _must_ be called before we
 904      * send any of the page in the chunk because we need to make sure
 905      * we can capture further page content changes when we sync dirty
 906      * log the next time.  So as long as we are going to send any of
 907      * the page in the chunk we clear the remote dirty bitmap for all.
 908      * Clearing it earlier won't be a problem, but too late will.
 909      */
 910     migration_clear_memory_region_dirty_bitmap(rb, page);
 911
 912     ret = test_and_clear_bit(page, rb->bmap);
 913     if (ret) {
 914         rs->migration_dirty_pages--;
 915     }
 916
 917     return ret;
 918 }
 919
 920 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 921                                        void *opaque)
 922 {
 923     const hwaddr offset = section->offset_within_region;
 924     const hwaddr size = int128_get64(section->size);
 925     const unsigned long start = offset >> TARGET_PAGE_BITS;
 926     const unsigned long npages = size >> TARGET_PAGE_BITS;
 927     RAMBlock *rb = section->mr->ram_block;
 928     uint64_t *cleared_bits = opaque;
 929
 930     /*
 931      * We don't grab ram_state->bitmap_mutex because we expect to run
 932      * only when starting migration or during postcopy recovery where
 933      * we don't have concurrent access.
 934      */
 935     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 936         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 937     }
 938     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 939     bitmap_clear(rb->bmap, start, npages);
 940 }
 941
 942 /*
 943  * Exclude all dirty pages from migration that fall into a discarded range as
 944  * managed by a RamDiscardManager responsible for the mapped memory region of
 945  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 946  *
 947  * Discarded pages ("logically unplugged") have undefined content and must
 948  * not get migrated, because even reading these pages for migration might
 949  * result in undesired behavior.
 950  *
 951  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 952  *
 953  * Note: The result is only stable while migrating (precopy/postcopy).
 954  */
 955 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 956 {
 957     uint64_t cleared_bits = 0;
 958
 959     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 960         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 961         MemoryRegionSection section = {
 962             .mr = rb->mr,
 963             .offset_within_region = 0,
 964             .size = int128_make64(qemu_ram_get_used_length(rb)),
 965         };
 966
 967         ram_discard_manager_replay_discarded(rdm, &section,
 968                                              dirty_bitmap_clear_section,
 969                                              &cleared_bits);
 970     }
 971     return cleared_bits;
 972 }
 973
 974 /*
 975  * Check if a host-page aligned page falls into a discarded range as managed by
 976  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 977  *
 978  * Note: The result is only stable while migrating (precopy/postcopy).
 979  */
 980 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 981 {
 982     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 983         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 984         MemoryRegionSection section = {
 985             .mr = rb->mr,
 986             .offset_within_region = start,
 987             .size = int128_make64(qemu_ram_pagesize(rb)),
 988         };
 989
 990         return !ram_discard_manager_is_populated(rdm, &section);
 991     }
 992     return false;
 993 }
 994
 995 /* Called with RCU critical section */
 996 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 997 {
 998     uint64_t new_dirty_pages =
 999         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1000
1001     rs->migration_dirty_pages += new_dirty_pages;
1002     rs->num_dirty_pages_period += new_dirty_pages;
1003 }
1004
1005 /**
1006  * ram_pagesize_summary: calculate all the pagesizes of a VM
1007  *
1008  * Returns a summary bitmap of the page sizes of all RAMBlocks
1009  *
1010  * For VMs with just normal pages this is equivalent to the host page
1011  * size. If it's got some huge pages then it's the OR of all the
1012  * different page sizes.
1013  */
1014 uint64_t ram_pagesize_summary(void)
1015 {
1016     RAMBlock *block;
1017     uint64_t summary = 0;
1018
1019     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1020         summary |= block->page_size;
1021     }
1022
1023     return summary;
1024 }
1025
1026 uint64_t ram_get_total_transferred_pages(void)
1027 {
1028     return  ram_counters.normal + ram_counters.duplicate +
1029                 compression_counters.pages + xbzrle_counters.pages;
1030 }
1031
1032 static void migration_update_rates(RAMState *rs, int64_t end_time)
1033 {
1034     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1035     double compressed_size;
1036
1037     /* calculate period counters */
1038     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1039                 / (end_time - rs->time_last_bitmap_sync);
1040
1041     if (!page_count) {
1042         return;
1043     }
1044
1045     if (migrate_use_xbzrle()) {
1046         double encoded_size, unencoded_size;
1047
1048         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1049             rs->xbzrle_cache_miss_prev) / page_count;
1050         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1051         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1052                          TARGET_PAGE_SIZE;
1053         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1054         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1055             xbzrle_counters.encoding_rate = 0;
1056         } else {
1057             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1058         }
1059         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1060         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1061     }
1062
1063     if (migrate_use_compression()) {
1064         compression_counters.busy_rate = (double)(compression_counters.busy -
1065             rs->compress_thread_busy_prev) / page_count;
1066         rs->compress_thread_busy_prev = compression_counters.busy;
1067
1068         compressed_size = compression_counters.compressed_size -
1069                           rs->compressed_size_prev;
1070         if (compressed_size) {
1071             double uncompressed_size = (compression_counters.pages -
1072                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1073
1074             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1075             compression_counters.compression_rate =
1076                                         uncompressed_size / compressed_size;
1077
1078             rs->compress_pages_prev = compression_counters.pages;
1079             rs->compressed_size_prev = compression_counters.compressed_size;
1080         }
1081     }
1082 }
1083
1084 static void migration_trigger_throttle(RAMState *rs)
1085 {
1086     MigrationState *s = migrate_get_current();
1087     uint64_t threshold = s->parameters.throttle_trigger_threshold;
1088
1089     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1090     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1091     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1092
1093     /* During block migration the auto-converge logic incorrectly detects
1094      * that ram migration makes no progress. Avoid this by disabling the
1095      * throttling logic during the bulk phase of block migration. */
1096     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1097         /* The following detection logic can be refined later. For now:
1098            Check to see if the ratio between dirtied bytes and the approx.
1099            amount of bytes that just got transferred since the last time
1100            we were in this routine reaches the threshold. If that happens
1101            twice, start or increase throttling. */
1102
1103         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1104             (++rs->dirty_rate_high_cnt >= 2)) {
1105             trace_migration_throttle();
1106             rs->dirty_rate_high_cnt = 0;
1107             mig_throttle_guest_down(bytes_dirty_period,
1108                                     bytes_dirty_threshold);
1109         }
1110     }
1111 }
1112
1113 static void migration_bitmap_sync(RAMState *rs)
1114 {
1115     RAMBlock *block;
1116     int64_t end_time;
1117
1118     ram_counters.dirty_sync_count++;
1119
1120     if (!rs->time_last_bitmap_sync) {
1121         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1122     }
1123
1124     trace_migration_bitmap_sync_start();
1125     memory_global_dirty_log_sync();
1126
1127     qemu_mutex_lock(&rs->bitmap_mutex);
1128     WITH_RCU_READ_LOCK_GUARD() {
1129         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1130             ramblock_sync_dirty_bitmap(rs, block);
1131         }
1132         ram_counters.remaining = ram_bytes_remaining();
1133     }
1134     qemu_mutex_unlock(&rs->bitmap_mutex);
1135
1136     memory_global_after_dirty_log_sync();
1137     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1138
1139     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1140
1141     /* more than 1 second = 1000 millisecons */
1142     if (end_time > rs->time_last_bitmap_sync + 1000) {
1143         migration_trigger_throttle(rs);
1144
1145         migration_update_rates(rs, end_time);
1146
1147         rs->target_page_count_prev = rs->target_page_count;
1148
1149         /* reset period counters */
1150         rs->time_last_bitmap_sync = end_time;
1151         rs->num_dirty_pages_period = 0;
1152         rs->bytes_xfer_prev = ram_counters.transferred;
1153     }
1154     if (migrate_use_events()) {
1155         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1156     }
1157 }
1158
1159 static void migration_bitmap_sync_precopy(RAMState *rs)
1160 {
1161     Error *local_err = NULL;
1162
1163     /*
1164      * The current notifier usage is just an optimization to migration, so we
1165      * don't stop the normal migration process in the error case.
1166      */
1167     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1168         error_report_err(local_err);
1169         local_err = NULL;
1170     }
1171
1172     migration_bitmap_sync(rs);
1173
1174     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1175         error_report_err(local_err);
1176     }
1177 }
1178
1179 static void ram_release_page(const char *rbname, uint64_t offset)
1180 {
1181     if (!migrate_release_ram() || !migration_in_postcopy()) {
1182         return;
1183     }
1184
1185     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1186 }
1187
1188 /**
1189  * save_zero_page_to_file: send the zero page to the file
1190  *
1191  * Returns the size of data written to the file, 0 means the page is not
1192  * a zero page
1193  *
1194  * @rs: current RAM state
1195  * @file: the file where the data is saved
1196  * @block: block that contains the page we want to send
1197  * @offset: offset inside the block for the page
1198  */
1199 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1200                                   RAMBlock *block, ram_addr_t offset)
1201 {
1202     uint8_t *p = block->host + offset;
1203     int len = 0;
1204
1205     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1206         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1207         qemu_put_byte(file, 0);
1208         len += 1;
1209         ram_release_page(block->idstr, offset);
1210     }
1211     return len;
1212 }
1213
1214 /**
1215  * save_zero_page: send the zero page to the stream
1216  *
1217  * Returns the number of pages written.
1218  *
1219  * @rs: current RAM state
1220  * @block: block that contains the page we want to send
1221  * @offset: offset inside the block for the page
1222  */
1223 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1224 {
1225     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1226
1227     if (len) {
1228         ram_counters.duplicate++;
1229         ram_transferred_add(len);
1230         return 1;
1231     }
1232     return -1;
1233 }
1234
1235 /*
1236  * @pages: the number of pages written by the control path,
1237  *        < 0 - error
1238  *        > 0 - number of pages written
1239  *
1240  * Return true if the pages has been saved, otherwise false is returned.
1241  */
1242 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1243                               int *pages)
1244 {
1245     uint64_t bytes_xmit = 0;
1246     int ret;
1247
1248     *pages = -1;
1249     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1250                                 &bytes_xmit);
1251     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1252         return false;
1253     }
1254
1255     if (bytes_xmit) {
1256         ram_transferred_add(bytes_xmit);
1257         *pages = 1;
1258     }
1259
1260     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1261         return true;
1262     }
1263
1264     if (bytes_xmit > 0) {
1265         ram_counters.normal++;
1266     } else if (bytes_xmit == 0) {
1267         ram_counters.duplicate++;
1268     }
1269
1270     return true;
1271 }
1272
1273 /*
1274  * directly send the page to the stream
1275  *
1276  * Returns the number of pages written.
1277  *
1278  * @rs: current RAM state
1279  * @block: block that contains the page we want to send
1280  * @offset: offset inside the block for the page
1281  * @buf: the page to be sent
1282  * @async: send to page asyncly
1283  */
1284 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1285                             uint8_t *buf, bool async)
1286 {
1287     ram_transferred_add(save_page_header(rs, rs->f, block,
1288                                          offset | RAM_SAVE_FLAG_PAGE));
1289     if (async) {
1290         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1291                               migrate_release_ram() &
1292                               migration_in_postcopy());
1293     } else {
1294         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1295     }
1296     ram_transferred_add(TARGET_PAGE_SIZE);
1297     ram_counters.normal++;
1298     return 1;
1299 }
1300
1301 /**
1302  * ram_save_page: send the given page to the stream
1303  *
1304  * Returns the number of pages written.
1305  *          < 0 - error
1306  *          >=0 - Number of pages written - this might legally be 0
1307  *                if xbzrle noticed the page was the same.
1308  *
1309  * @rs: current RAM state
1310  * @block: block that contains the page we want to send
1311  * @offset: offset inside the block for the page
1312  */
1313 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1314 {
1315     int pages = -1;
1316     uint8_t *p;
1317     bool send_async = true;
1318     RAMBlock *block = pss->block;
1319     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1320     ram_addr_t current_addr = block->offset + offset;
1321
1322     p = block->host + offset;
1323     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1324
1325     XBZRLE_cache_lock();
1326     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1327         pages = save_xbzrle_page(rs, &p, current_addr, block,
1328                                  offset);
1329         if (!rs->last_stage) {
1330             /* Can't send this cached data async, since the cache page
1331              * might get updated before it gets to the wire
1332              */
1333             send_async = false;
1334         }
1335     }
1336
1337     /* XBZRLE overflow or normal page */
1338     if (pages == -1) {
1339         pages = save_normal_page(rs, block, offset, p, send_async);
1340     }
1341
1342     XBZRLE_cache_unlock();
1343
1344     return pages;
1345 }
1346
1347 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1348                                  ram_addr_t offset)
1349 {
1350     if (multifd_queue_page(rs->f, block, offset) < 0) {
1351         return -1;
1352     }
1353     ram_counters.normal++;
1354
1355     return 1;
1356 }
1357
1358 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1359                                  ram_addr_t offset, uint8_t *source_buf)
1360 {
1361     RAMState *rs = ram_state;
1362     uint8_t *p = block->host + offset;
1363     int ret;
1364
1365     if (save_zero_page_to_file(rs, f, block, offset)) {
1366         return true;
1367     }
1368
1369     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1370
1371     /*
1372      * copy it to a internal buffer to avoid it being modified by VM
1373      * so that we can catch up the error during compression and
1374      * decompression
1375      */
1376     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1377     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1378     if (ret < 0) {
1379         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1380         error_report("compressed data failed!");
1381     }
1382     return false;
1383 }
1384
1385 static void
1386 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1387 {
1388     ram_transferred_add(bytes_xmit);
1389
1390     if (param->zero_page) {
1391         ram_counters.duplicate++;
1392         return;
1393     }
1394
1395     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1396     compression_counters.compressed_size += bytes_xmit - 8;
1397     compression_counters.pages++;
1398 }
1399
1400 static bool save_page_use_compression(RAMState *rs);
1401
1402 static void flush_compressed_data(RAMState *rs)
1403 {
1404     int idx, len, thread_count;
1405
1406     if (!save_page_use_compression(rs)) {
1407         return;
1408     }
1409     thread_count = migrate_compress_threads();
1410
1411     qemu_mutex_lock(&comp_done_lock);
1412     for (idx = 0; idx < thread_count; idx++) {
1413         while (!comp_param[idx].done) {
1414             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1415         }
1416     }
1417     qemu_mutex_unlock(&comp_done_lock);
1418
1419     for (idx = 0; idx < thread_count; idx++) {
1420         qemu_mutex_lock(&comp_param[idx].mutex);
1421         if (!comp_param[idx].quit) {
1422             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1423             /*
1424              * it's safe to fetch zero_page without holding comp_done_lock
1425              * as there is no further request submitted to the thread,
1426              * i.e, the thread should be waiting for a request at this point.
1427              */
1428             update_compress_thread_counts(&comp_param[idx], len);
1429         }
1430         qemu_mutex_unlock(&comp_param[idx].mutex);
1431     }
1432 }
1433
1434 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1435                                        ram_addr_t offset)
1436 {
1437     param->block = block;
1438     param->offset = offset;
1439 }
1440
1441 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1442                                            ram_addr_t offset)
1443 {
1444     int idx, thread_count, bytes_xmit = -1, pages = -1;
1445     bool wait = migrate_compress_wait_thread();
1446
1447     thread_count = migrate_compress_threads();
1448     qemu_mutex_lock(&comp_done_lock);
1449 retry:
1450     for (idx = 0; idx < thread_count; idx++) {
1451         if (comp_param[idx].done) {
1452             comp_param[idx].done = false;
1453             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1454             qemu_mutex_lock(&comp_param[idx].mutex);
1455             set_compress_params(&comp_param[idx], block, offset);
1456             qemu_cond_signal(&comp_param[idx].cond);
1457             qemu_mutex_unlock(&comp_param[idx].mutex);
1458             pages = 1;
1459             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1460             break;
1461         }
1462     }
1463
1464     /*
1465      * wait for the free thread if the user specifies 'compress-wait-thread',
1466      * otherwise we will post the page out in the main thread as normal page.
1467      */
1468     if (pages < 0 && wait) {
1469         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1470         goto retry;
1471     }
1472     qemu_mutex_unlock(&comp_done_lock);
1473
1474     return pages;
1475 }
1476
1477 /**
1478  * find_dirty_block: find the next dirty page and update any state
1479  * associated with the search process.
1480  *
1481  * Returns true if a page is found
1482  *
1483  * @rs: current RAM state
1484  * @pss: data about the state of the current dirty page scan
1485  * @again: set to false if the search has scanned the whole of RAM
1486  */
1487 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1488 {
1489     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1490     if (pss->complete_round && pss->block == rs->last_seen_block &&
1491         pss->page >= rs->last_page) {
1492         /*
1493          * We've been once around the RAM and haven't found anything.
1494          * Give up.
1495          */
1496         *again = false;
1497         return false;
1498     }
1499     if (!offset_in_ramblock(pss->block,
1500                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1501         /* Didn't find anything in this RAM Block */
1502         pss->page = 0;
1503         pss->block = QLIST_NEXT_RCU(pss->block, next);
1504         if (!pss->block) {
1505             /*
1506              * If memory migration starts over, we will meet a dirtied page
1507              * which may still exists in compression threads's ring, so we
1508              * should flush the compressed data to make sure the new page
1509              * is not overwritten by the old one in the destination.
1510              *
1511              * Also If xbzrle is on, stop using the data compression at this
1512              * point. In theory, xbzrle can do better than compression.
1513              */
1514             flush_compressed_data(rs);
1515
1516             /* Hit the end of the list */
1517             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1518             /* Flag that we've looped */
1519             pss->complete_round = true;
1520             /* After the first round, enable XBZRLE. */
1521             if (migrate_use_xbzrle()) {
1522                 rs->xbzrle_enabled = true;
1523             }
1524         }
1525         /* Didn't find anything this time, but try again on the new block */
1526         *again = true;
1527         return false;
1528     } else {
1529         /* Can go around again, but... */
1530         *again = true;
1531         /* We've found something so probably don't need to */
1532         return true;
1533     }
1534 }
1535
1536 /**
1537  * unqueue_page: gets a page of the queue
1538  *
1539  * Helper for 'get_queued_page' - gets a page off the queue
1540  *
1541  * Returns the block of the page (or NULL if none available)
1542  *
1543  * @rs: current RAM state
1544  * @offset: used to return the offset within the RAMBlock
1545  */
1546 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1547 {
1548     struct RAMSrcPageRequest *entry;
1549     RAMBlock *block = NULL;
1550     size_t page_size;
1551
1552     if (!postcopy_has_request(rs)) {
1553         return NULL;
1554     }
1555
1556     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1557
1558     /*
1559      * This should _never_ change even after we take the lock, because no one
1560      * should be taking anything off the request list other than us.
1561      */
1562     assert(postcopy_has_request(rs));
1563
1564     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1565     block = entry->rb;
1566     *offset = entry->offset;
1567     page_size = qemu_ram_pagesize(block);
1568     /* Each page request should only be multiple page size of the ramblock */
1569     assert((entry->len % page_size) == 0);
1570
1571     if (entry->len > page_size) {
1572         entry->len -= page_size;
1573         entry->offset += page_size;
1574     } else {
1575         memory_region_unref(block->mr);
1576         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1577         g_free(entry);
1578         migration_consume_urgent_request();
1579     }
1580
1581     trace_unqueue_page(block->idstr, *offset,
1582                        test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1583
1584     return block;
1585 }
1586
1587 #if defined(__linux__)
1588 /**
1589  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1590  *   is found, return RAM block pointer and page offset
1591  *
1592  * Returns pointer to the RAMBlock containing faulting page,
1593  *   NULL if no write faults are pending
1594  *
1595  * @rs: current RAM state
1596  * @offset: page offset from the beginning of the block
1597  */
1598 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1599 {
1600     struct uffd_msg uffd_msg;
1601     void *page_address;
1602     RAMBlock *block;
1603     int res;
1604
1605     if (!migrate_background_snapshot()) {
1606         return NULL;
1607     }
1608
1609     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1610     if (res <= 0) {
1611         return NULL;
1612     }
1613
1614     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1615     block = qemu_ram_block_from_host(page_address, false, offset);
1616     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1617     return block;
1618 }
1619
1620 /**
1621  * ram_save_release_protection: release UFFD write protection after
1622  *   a range of pages has been saved
1623  *
1624  * @rs: current RAM state
1625  * @pss: page-search-status structure
1626  * @start_page: index of the first page in the range relative to pss->block
1627  *
1628  * Returns 0 on success, negative value in case of an error
1629 */
1630 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1631         unsigned long start_page)
1632 {
1633     int res = 0;
1634
1635     /* Check if page is from UFFD-managed region. */
1636     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1637         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1638         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1639
1640         /* Flush async buffers before un-protect. */
1641         qemu_fflush(rs->f);
1642         /* Un-protect memory range. */
1643         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1644                 false, false);
1645     }
1646
1647     return res;
1648 }
1649
1650 /* ram_write_tracking_available: check if kernel supports required UFFD features
1651  *
1652  * Returns true if supports, false otherwise
1653  */
1654 bool ram_write_tracking_available(void)
1655 {
1656     uint64_t uffd_features;
1657     int res;
1658
1659     res = uffd_query_features(&uffd_features);
1660     return (res == 0 &&
1661             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1662 }
1663
1664 /* ram_write_tracking_compatible: check if guest configuration is
1665  *   compatible with 'write-tracking'
1666  *
1667  * Returns true if compatible, false otherwise
1668  */
1669 bool ram_write_tracking_compatible(void)
1670 {
1671     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1672     int uffd_fd;
1673     RAMBlock *block;
1674     bool ret = false;
1675
1676     /* Open UFFD file descriptor */
1677     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1678     if (uffd_fd < 0) {
1679         return false;
1680     }
1681
1682     RCU_READ_LOCK_GUARD();
1683
1684     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1685         uint64_t uffd_ioctls;
1686
1687         /* Nothing to do with read-only and MMIO-writable regions */
1688         if (block->mr->readonly || block->mr->rom_device) {
1689             continue;
1690         }
1691         /* Try to register block memory via UFFD-IO to track writes */
1692         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1693                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1694             goto out;
1695         }
1696         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1697             goto out;
1698         }
1699     }
1700     ret = true;
1701
1702 out:
1703     uffd_close_fd(uffd_fd);
1704     return ret;
1705 }
1706
1707 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1708                                        ram_addr_t size)
1709 {
1710     /*
1711      * We read one byte of each page; this will preallocate page tables if
1712      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1713      * where no page was populated yet. This might require adaption when
1714      * supporting other mappings, like shmem.
1715      */
1716     for (; offset < size; offset += block->page_size) {
1717         char tmp = *((char *)block->host + offset);
1718
1719         /* Don't optimize the read out */
1720         asm volatile("" : "+r" (tmp));
1721     }
1722 }
1723
1724 static inline int populate_read_section(MemoryRegionSection *section,
1725                                         void *opaque)
1726 {
1727     const hwaddr size = int128_get64(section->size);
1728     hwaddr offset = section->offset_within_region;
1729     RAMBlock *block = section->mr->ram_block;
1730
1731     populate_read_range(block, offset, size);
1732     return 0;
1733 }
1734
1735 /*
1736  * ram_block_populate_read: preallocate page tables and populate pages in the
1737  *   RAM block by reading a byte of each page.
1738  *
1739  * Since it's solely used for userfault_fd WP feature, here we just
1740  *   hardcode page size to qemu_real_host_page_size.
1741  *
1742  * @block: RAM block to populate
1743  */
1744 static void ram_block_populate_read(RAMBlock *rb)
1745 {
1746     /*
1747      * Skip populating all pages that fall into a discarded range as managed by
1748      * a RamDiscardManager responsible for the mapped memory region of the
1749      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1750      * must not get populated automatically. We don't have to track
1751      * modifications via userfaultfd WP reliably, because these pages will
1752      * not be part of the migration stream either way -- see
1753      * ramblock_dirty_bitmap_exclude_discarded_pages().
1754      *
1755      * Note: The result is only stable while migrating (precopy/postcopy).
1756      */
1757     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1758         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1759         MemoryRegionSection section = {
1760             .mr = rb->mr,
1761             .offset_within_region = 0,
1762             .size = rb->mr->size,
1763         };
1764
1765         ram_discard_manager_replay_populated(rdm, &section,
1766                                              populate_read_section, NULL);
1767     } else {
1768         populate_read_range(rb, 0, rb->used_length);
1769     }
1770 }
1771
1772 /*
1773  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1774  */
1775 void ram_write_tracking_prepare(void)
1776 {
1777     RAMBlock *block;
1778
1779     RCU_READ_LOCK_GUARD();
1780
1781     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1782         /* Nothing to do with read-only and MMIO-writable regions */
1783         if (block->mr->readonly || block->mr->rom_device) {
1784             continue;
1785         }
1786
1787         /*
1788          * Populate pages of the RAM block before enabling userfault_fd
1789          * write protection.
1790          *
1791          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1792          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1793          * pages with pte_none() entries in page table.
1794          */
1795         ram_block_populate_read(block);
1796     }
1797 }
1798
1799 /*
1800  * ram_write_tracking_start: start UFFD-WP memory tracking
1801  *
1802  * Returns 0 for success or negative value in case of error
1803  */
1804 int ram_write_tracking_start(void)
1805 {
1806     int uffd_fd;
1807     RAMState *rs = ram_state;
1808     RAMBlock *block;
1809
1810     /* Open UFFD file descriptor */
1811     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1812     if (uffd_fd < 0) {
1813         return uffd_fd;
1814     }
1815     rs->uffdio_fd = uffd_fd;
1816
1817     RCU_READ_LOCK_GUARD();
1818
1819     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1820         /* Nothing to do with read-only and MMIO-writable regions */
1821         if (block->mr->readonly || block->mr->rom_device) {
1822             continue;
1823         }
1824
1825         /* Register block memory with UFFD to track writes */
1826         if (uffd_register_memory(rs->uffdio_fd, block->host,
1827                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1828             goto fail;
1829         }
1830         /* Apply UFFD write protection to the block memory range */
1831         if (uffd_change_protection(rs->uffdio_fd, block->host,
1832                 block->max_length, true, false)) {
1833             goto fail;
1834         }
1835         block->flags |= RAM_UF_WRITEPROTECT;
1836         memory_region_ref(block->mr);
1837
1838         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1839                 block->host, block->max_length);
1840     }
1841
1842     return 0;
1843
1844 fail:
1845     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1846
1847     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1848         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1849             continue;
1850         }
1851         /*
1852          * In case some memory block failed to be write-protected
1853          * remove protection and unregister all succeeded RAM blocks
1854          */
1855         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1856                 false, false);
1857         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1858         /* Cleanup flags and remove reference */
1859         block->flags &= ~RAM_UF_WRITEPROTECT;
1860         memory_region_unref(block->mr);
1861     }
1862
1863     uffd_close_fd(uffd_fd);
1864     rs->uffdio_fd = -1;
1865     return -1;
1866 }
1867
1868 /**
1869  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1870  */
1871 void ram_write_tracking_stop(void)
1872 {
1873     RAMState *rs = ram_state;
1874     RAMBlock *block;
1875
1876     RCU_READ_LOCK_GUARD();
1877
1878     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1879         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1880             continue;
1881         }
1882         /* Remove protection and unregister all affected RAM blocks */
1883         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1884                 false, false);
1885         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1886
1887         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1888                 block->host, block->max_length);
1889
1890         /* Cleanup flags and remove reference */
1891         block->flags &= ~RAM_UF_WRITEPROTECT;
1892         memory_region_unref(block->mr);
1893     }
1894
1895     /* Finally close UFFD file descriptor */
1896     uffd_close_fd(rs->uffdio_fd);
1897     rs->uffdio_fd = -1;
1898 }
1899
1900 #else
1901 /* No target OS support, stubs just fail or ignore */
1902
1903 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1904 {
1905     (void) rs;
1906     (void) offset;
1907
1908     return NULL;
1909 }
1910
1911 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1912         unsigned long start_page)
1913 {
1914     (void) rs;
1915     (void) pss;
1916     (void) start_page;
1917
1918     return 0;
1919 }
1920
1921 bool ram_write_tracking_available(void)
1922 {
1923     return false;
1924 }
1925
1926 bool ram_write_tracking_compatible(void)
1927 {
1928     assert(0);
1929     return false;
1930 }
1931
1932 int ram_write_tracking_start(void)
1933 {
1934     assert(0);
1935     return -1;
1936 }
1937
1938 void ram_write_tracking_stop(void)
1939 {
1940     assert(0);
1941 }
1942 #endif /* defined(__linux__) */
1943
1944 /**
1945  * get_queued_page: unqueue a page from the postcopy requests
1946  *
1947  * Skips pages that are already sent (!dirty)
1948  *
1949  * Returns true if a queued page is found
1950  *
1951  * @rs: current RAM state
1952  * @pss: data about the state of the current dirty page scan
1953  */
1954 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1955 {
1956     RAMBlock  *block;
1957     ram_addr_t offset;
1958
1959     block = unqueue_page(rs, &offset);
1960
1961     if (!block) {
1962         /*
1963          * Poll write faults too if background snapshot is enabled; that's
1964          * when we have vcpus got blocked by the write protected pages.
1965          */
1966         block = poll_fault_page(rs, &offset);
1967     }
1968
1969     if (block) {
1970         /*
1971          * We want the background search to continue from the queued page
1972          * since the guest is likely to want other pages near to the page
1973          * it just requested.
1974          */
1975         pss->block = block;
1976         pss->page = offset >> TARGET_PAGE_BITS;
1977
1978         /*
1979          * This unqueued page would break the "one round" check, even is
1980          * really rare.
1981          */
1982         pss->complete_round = false;
1983     }
1984
1985     return !!block;
1986 }
1987
1988 /**
1989  * migration_page_queue_free: drop any remaining pages in the ram
1990  * request queue
1991  *
1992  * It should be empty at the end anyway, but in error cases there may
1993  * be some left.  in case that there is any page left, we drop it.
1994  *
1995  */
1996 static void migration_page_queue_free(RAMState *rs)
1997 {
1998     struct RAMSrcPageRequest *mspr, *next_mspr;
1999     /* This queue generally should be empty - but in the case of a failed
2000      * migration might have some droppings in.
2001      */
2002     RCU_READ_LOCK_GUARD();
2003     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2004         memory_region_unref(mspr->rb->mr);
2005         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2006         g_free(mspr);
2007     }
2008 }
2009
2010 /**
2011  * ram_save_queue_pages: queue the page for transmission
2012  *
2013  * A request from postcopy destination for example.
2014  *
2015  * Returns zero on success or negative on error
2016  *
2017  * @rbname: Name of the RAMBLock of the request. NULL means the
2018  *          same that last one.
2019  * @start: starting address from the start of the RAMBlock
2020  * @len: length (in bytes) to send
2021  */
2022 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2023 {
2024     RAMBlock *ramblock;
2025     RAMState *rs = ram_state;
2026
2027     ram_counters.postcopy_requests++;
2028     RCU_READ_LOCK_GUARD();
2029
2030     if (!rbname) {
2031         /* Reuse last RAMBlock */
2032         ramblock = rs->last_req_rb;
2033
2034         if (!ramblock) {
2035             /*
2036              * Shouldn't happen, we can't reuse the last RAMBlock if
2037              * it's the 1st request.
2038              */
2039             error_report("ram_save_queue_pages no previous block");
2040             return -1;
2041         }
2042     } else {
2043         ramblock = qemu_ram_block_by_name(rbname);
2044
2045         if (!ramblock) {
2046             /* We shouldn't be asked for a non-existent RAMBlock */
2047             error_report("ram_save_queue_pages no block '%s'", rbname);
2048             return -1;
2049         }
2050         rs->last_req_rb = ramblock;
2051     }
2052     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2053     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2054         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2055                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2056                      __func__, start, len, ramblock->used_length);
2057         return -1;
2058     }
2059
2060     struct RAMSrcPageRequest *new_entry =
2061         g_malloc0(sizeof(struct RAMSrcPageRequest));
2062     new_entry->rb = ramblock;
2063     new_entry->offset = start;
2064     new_entry->len = len;
2065
2066     memory_region_ref(ramblock->mr);
2067     qemu_mutex_lock(&rs->src_page_req_mutex);
2068     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2069     migration_make_urgent_request();
2070     qemu_mutex_unlock(&rs->src_page_req_mutex);
2071
2072     return 0;
2073 }
2074
2075 static bool save_page_use_compression(RAMState *rs)
2076 {
2077     if (!migrate_use_compression()) {
2078         return false;
2079     }
2080
2081     /*
2082      * If xbzrle is enabled (e.g., after first round of migration), stop
2083      * using the data compression. In theory, xbzrle can do better than
2084      * compression.
2085      */
2086     if (rs->xbzrle_enabled) {
2087         return false;
2088     }
2089
2090     return true;
2091 }
2092
2093 /*
2094  * try to compress the page before posting it out, return true if the page
2095  * has been properly handled by compression, otherwise needs other
2096  * paths to handle it
2097  */
2098 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2099 {
2100     if (!save_page_use_compression(rs)) {
2101         return false;
2102     }
2103
2104     /*
2105      * When starting the process of a new block, the first page of
2106      * the block should be sent out before other pages in the same
2107      * block, and all the pages in last block should have been sent
2108      * out, keeping this order is important, because the 'cont' flag
2109      * is used to avoid resending the block name.
2110      *
2111      * We post the fist page as normal page as compression will take
2112      * much CPU resource.
2113      */
2114     if (block != rs->last_sent_block) {
2115         flush_compressed_data(rs);
2116         return false;
2117     }
2118
2119     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2120         return true;
2121     }
2122
2123     compression_counters.busy++;
2124     return false;
2125 }
2126
2127 /**
2128  * ram_save_target_page: save one target page
2129  *
2130  * Returns the number of pages written
2131  *
2132  * @rs: current RAM state
2133  * @pss: data about the page we want to send
2134  */
2135 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2136 {
2137     RAMBlock *block = pss->block;
2138     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2139     int res;
2140
2141     if (control_save_page(rs, block, offset, &res)) {
2142         return res;
2143     }
2144
2145     if (save_compress_page(rs, block, offset)) {
2146         return 1;
2147     }
2148
2149     res = save_zero_page(rs, block, offset);
2150     if (res > 0) {
2151         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2152          * page would be stale
2153          */
2154         if (!save_page_use_compression(rs)) {
2155             XBZRLE_cache_lock();
2156             xbzrle_cache_zero_page(rs, block->offset + offset);
2157             XBZRLE_cache_unlock();
2158         }
2159         return res;
2160     }
2161
2162     /*
2163      * Do not use multifd for:
2164      * 1. Compression as the first page in the new block should be posted out
2165      *    before sending the compressed page
2166      * 2. In postcopy as one whole host page should be placed
2167      */
2168     if (!save_page_use_compression(rs) && migrate_use_multifd()
2169         && !migration_in_postcopy()) {
2170         return ram_save_multifd_page(rs, block, offset);
2171     }
2172
2173     return ram_save_page(rs, pss);
2174 }
2175
2176 /**
2177  * ram_save_host_page: save a whole host page
2178  *
2179  * Starting at *offset send pages up to the end of the current host
2180  * page. It's valid for the initial offset to point into the middle of
2181  * a host page in which case the remainder of the hostpage is sent.
2182  * Only dirty target pages are sent. Note that the host page size may
2183  * be a huge page for this block.
2184  * The saving stops at the boundary of the used_length of the block
2185  * if the RAMBlock isn't a multiple of the host page size.
2186  *
2187  * Returns the number of pages written or negative on error
2188  *
2189  * @rs: current RAM state
2190  * @pss: data about the page we want to send
2191  */
2192 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2193 {
2194     int tmppages, pages = 0;
2195     size_t pagesize_bits =
2196         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2197     unsigned long hostpage_boundary =
2198         QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2199     unsigned long start_page = pss->page;
2200     int res;
2201
2202     if (ramblock_is_ignored(pss->block)) {
2203         error_report("block %s should not be migrated !", pss->block->idstr);
2204         return 0;
2205     }
2206
2207     do {
2208         /* Check the pages is dirty and if it is send it */
2209         if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2210             tmppages = ram_save_target_page(rs, pss);
2211             if (tmppages < 0) {
2212                 return tmppages;
2213             }
2214
2215             pages += tmppages;
2216             /*
2217              * Allow rate limiting to happen in the middle of huge pages if
2218              * something is sent in the current iteration.
2219              */
2220             if (pagesize_bits > 1 && tmppages > 0) {
2221                 migration_rate_limit();
2222             }
2223         }
2224         pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2225     } while ((pss->page < hostpage_boundary) &&
2226              offset_in_ramblock(pss->block,
2227                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2228     /* The offset we leave with is the min boundary of host page and block */
2229     pss->page = MIN(pss->page, hostpage_boundary);
2230
2231     res = ram_save_release_protection(rs, pss, start_page);
2232     return (res < 0 ? res : pages);
2233 }
2234
2235 /**
2236  * ram_find_and_save_block: finds a dirty page and sends it to f
2237  *
2238  * Called within an RCU critical section.
2239  *
2240  * Returns the number of pages written where zero means no dirty pages,
2241  * or negative on error
2242  *
2243  * @rs: current RAM state
2244  *
2245  * On systems where host-page-size > target-page-size it will send all the
2246  * pages in a host page that are dirty.
2247  */
2248 static int ram_find_and_save_block(RAMState *rs)
2249 {
2250     PageSearchStatus pss;
2251     int pages = 0;
2252     bool again, found;
2253
2254     /* No dirty page as there is zero RAM */
2255     if (!ram_bytes_total()) {
2256         return pages;
2257     }
2258
2259     pss.block = rs->last_seen_block;
2260     pss.page = rs->last_page;
2261     pss.complete_round = false;
2262
2263     if (!pss.block) {
2264         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2265     }
2266
2267     do {
2268         again = true;
2269         found = get_queued_page(rs, &pss);
2270
2271         if (!found) {
2272             /* priority queue empty, so just search for something dirty */
2273             found = find_dirty_block(rs, &pss, &again);
2274         }
2275
2276         if (found) {
2277             pages = ram_save_host_page(rs, &pss);
2278         }
2279     } while (!pages && again);
2280
2281     rs->last_seen_block = pss.block;
2282     rs->last_page = pss.page;
2283
2284     return pages;
2285 }
2286
2287 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2288 {
2289     uint64_t pages = size / TARGET_PAGE_SIZE;
2290
2291     if (zero) {
2292         ram_counters.duplicate += pages;
2293     } else {
2294         ram_counters.normal += pages;
2295         ram_transferred_add(size);
2296         qemu_update_position(f, size);
2297     }
2298 }
2299
2300 static uint64_t ram_bytes_total_common(bool count_ignored)
2301 {
2302     RAMBlock *block;
2303     uint64_t total = 0;
2304
2305     RCU_READ_LOCK_GUARD();
2306
2307     if (count_ignored) {
2308         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2309             total += block->used_length;
2310         }
2311     } else {
2312         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2313             total += block->used_length;
2314         }
2315     }
2316     return total;
2317 }
2318
2319 uint64_t ram_bytes_total(void)
2320 {
2321     return ram_bytes_total_common(false);
2322 }
2323
2324 static void xbzrle_load_setup(void)
2325 {
2326     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2327 }
2328
2329 static void xbzrle_load_cleanup(void)
2330 {
2331     g_free(XBZRLE.decoded_buf);
2332     XBZRLE.decoded_buf = NULL;
2333 }
2334
2335 static void ram_state_cleanup(RAMState **rsp)
2336 {
2337     if (*rsp) {
2338         migration_page_queue_free(*rsp);
2339         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2340         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2341         g_free(*rsp);
2342         *rsp = NULL;
2343     }
2344 }
2345
2346 static void xbzrle_cleanup(void)
2347 {
2348     XBZRLE_cache_lock();
2349     if (XBZRLE.cache) {
2350         cache_fini(XBZRLE.cache);
2351         g_free(XBZRLE.encoded_buf);
2352         g_free(XBZRLE.current_buf);
2353         g_free(XBZRLE.zero_target_page);
2354         XBZRLE.cache = NULL;
2355         XBZRLE.encoded_buf = NULL;
2356         XBZRLE.current_buf = NULL;
2357         XBZRLE.zero_target_page = NULL;
2358     }
2359     XBZRLE_cache_unlock();
2360 }
2361
2362 static void ram_save_cleanup(void *opaque)
2363 {
2364     RAMState **rsp = opaque;
2365     RAMBlock *block;
2366
2367     /* We don't use dirty log with background snapshots */
2368     if (!migrate_background_snapshot()) {
2369         /* caller have hold iothread lock or is in a bh, so there is
2370          * no writing race against the migration bitmap
2371          */
2372         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2373             /*
2374              * do not stop dirty log without starting it, since
2375              * memory_global_dirty_log_stop will assert that
2376              * memory_global_dirty_log_start/stop used in pairs
2377              */
2378             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2379         }
2380     }
2381
2382     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2383         g_free(block->clear_bmap);
2384         block->clear_bmap = NULL;
2385         g_free(block->bmap);
2386         block->bmap = NULL;
2387     }
2388
2389     xbzrle_cleanup();
2390     compress_threads_save_cleanup();
2391     ram_state_cleanup(rsp);
2392 }
2393
2394 static void ram_state_reset(RAMState *rs)
2395 {
2396     rs->last_seen_block = NULL;
2397     rs->last_sent_block = NULL;
2398     rs->last_page = 0;
2399     rs->last_version = ram_list.version;
2400     rs->xbzrle_enabled = false;
2401 }
2402
2403 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2404
2405 /* **** functions for postcopy ***** */
2406
2407 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2408 {
2409     struct RAMBlock *block;
2410
2411     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2412         unsigned long *bitmap = block->bmap;
2413         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2414         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2415
2416         while (run_start < range) {
2417             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2418             ram_discard_range(block->idstr,
2419                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2420                               ((ram_addr_t)(run_end - run_start))
2421                                 << TARGET_PAGE_BITS);
2422             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2423         }
2424     }
2425 }
2426
2427 /**
2428  * postcopy_send_discard_bm_ram: discard a RAMBlock
2429  *
2430  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2431  *
2432  * @ms: current migration state
2433  * @block: RAMBlock to discard
2434  */
2435 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2436 {
2437     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2438     unsigned long current;
2439     unsigned long *bitmap = block->bmap;
2440
2441     for (current = 0; current < end; ) {
2442         unsigned long one = find_next_bit(bitmap, end, current);
2443         unsigned long zero, discard_length;
2444
2445         if (one >= end) {
2446             break;
2447         }
2448
2449         zero = find_next_zero_bit(bitmap, end, one + 1);
2450
2451         if (zero >= end) {
2452             discard_length = end - one;
2453         } else {
2454             discard_length = zero - one;
2455         }
2456         postcopy_discard_send_range(ms, one, discard_length);
2457         current = one + discard_length;
2458     }
2459 }
2460
2461 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2462
2463 /**
2464  * postcopy_each_ram_send_discard: discard all RAMBlocks
2465  *
2466  * Utility for the outgoing postcopy code.
2467  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2468  *   passing it bitmap indexes and name.
2469  * (qemu_ram_foreach_block ends up passing unscaled lengths
2470  *  which would mean postcopy code would have to deal with target page)
2471  *
2472  * @ms: current migration state
2473  */
2474 static void postcopy_each_ram_send_discard(MigrationState *ms)
2475 {
2476     struct RAMBlock *block;
2477
2478     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2479         postcopy_discard_send_init(ms, block->idstr);
2480
2481         /*
2482          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2483          * host-page size chunks, mark any partially dirty host-page size
2484          * chunks as all dirty.  In this case the host-page is the host-page
2485          * for the particular RAMBlock, i.e. it might be a huge page.
2486          */
2487         postcopy_chunk_hostpages_pass(ms, block);
2488
2489         /*
2490          * Postcopy sends chunks of bitmap over the wire, but it
2491          * just needs indexes at this point, avoids it having
2492          * target page specific code.
2493          */
2494         postcopy_send_discard_bm_ram(ms, block);
2495         postcopy_discard_send_finish(ms);
2496     }
2497 }
2498
2499 /**
2500  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2501  *
2502  * Helper for postcopy_chunk_hostpages; it's called twice to
2503  * canonicalize the two bitmaps, that are similar, but one is
2504  * inverted.
2505  *
2506  * Postcopy requires that all target pages in a hostpage are dirty or
2507  * clean, not a mix.  This function canonicalizes the bitmaps.
2508  *
2509  * @ms: current migration state
2510  * @block: block that contains the page we want to canonicalize
2511  */
2512 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2513 {
2514     RAMState *rs = ram_state;
2515     unsigned long *bitmap = block->bmap;
2516     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2517     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2518     unsigned long run_start;
2519
2520     if (block->page_size == TARGET_PAGE_SIZE) {
2521         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2522         return;
2523     }
2524
2525     /* Find a dirty page */
2526     run_start = find_next_bit(bitmap, pages, 0);
2527
2528     while (run_start < pages) {
2529
2530         /*
2531          * If the start of this run of pages is in the middle of a host
2532          * page, then we need to fixup this host page.
2533          */
2534         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2535             /* Find the end of this run */
2536             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2537             /*
2538              * If the end isn't at the start of a host page, then the
2539              * run doesn't finish at the end of a host page
2540              * and we need to discard.
2541              */
2542         }
2543
2544         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2545             unsigned long page;
2546             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2547                                                              host_ratio);
2548             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2549
2550             /* Clean up the bitmap */
2551             for (page = fixup_start_addr;
2552                  page < fixup_start_addr + host_ratio; page++) {
2553                 /*
2554                  * Remark them as dirty, updating the count for any pages
2555                  * that weren't previously dirty.
2556                  */
2557                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2558             }
2559         }
2560
2561         /* Find the next dirty page for the next iteration */
2562         run_start = find_next_bit(bitmap, pages, run_start);
2563     }
2564 }
2565
2566 /**
2567  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2568  *
2569  * Transmit the set of pages to be discarded after precopy to the target
2570  * these are pages that:
2571  *     a) Have been previously transmitted but are now dirty again
2572  *     b) Pages that have never been transmitted, this ensures that
2573  *        any pages on the destination that have been mapped by background
2574  *        tasks get discarded (transparent huge pages is the specific concern)
2575  * Hopefully this is pretty sparse
2576  *
2577  * @ms: current migration state
2578  */
2579 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2580 {
2581     RAMState *rs = ram_state;
2582
2583     RCU_READ_LOCK_GUARD();
2584
2585     /* This should be our last sync, the src is now paused */
2586     migration_bitmap_sync(rs);
2587
2588     /* Easiest way to make sure we don't resume in the middle of a host-page */
2589     rs->last_seen_block = NULL;
2590     rs->last_sent_block = NULL;
2591     rs->last_page = 0;
2592
2593     postcopy_each_ram_send_discard(ms);
2594
2595     trace_ram_postcopy_send_discard_bitmap();
2596 }
2597
2598 /**
2599  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2600  *
2601  * Returns zero on success
2602  *
2603  * @rbname: name of the RAMBlock of the request. NULL means the
2604  *          same that last one.
2605  * @start: RAMBlock starting page
2606  * @length: RAMBlock size
2607  */
2608 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2609 {
2610     trace_ram_discard_range(rbname, start, length);
2611
2612     RCU_READ_LOCK_GUARD();
2613     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2614
2615     if (!rb) {
2616         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2617         return -1;
2618     }
2619
2620     /*
2621      * On source VM, we don't need to update the received bitmap since
2622      * we don't even have one.
2623      */
2624     if (rb->receivedmap) {
2625         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2626                      length >> qemu_target_page_bits());
2627     }
2628
2629     return ram_block_discard_range(rb, start, length);
2630 }
2631
2632 /*
2633  * For every allocation, we will try not to crash the VM if the
2634  * allocation failed.
2635  */
2636 static int xbzrle_init(void)
2637 {
2638     Error *local_err = NULL;
2639
2640     if (!migrate_use_xbzrle()) {
2641         return 0;
2642     }
2643
2644     XBZRLE_cache_lock();
2645
2646     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2647     if (!XBZRLE.zero_target_page) {
2648         error_report("%s: Error allocating zero page", __func__);
2649         goto err_out;
2650     }
2651
2652     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2653                               TARGET_PAGE_SIZE, &local_err);
2654     if (!XBZRLE.cache) {
2655         error_report_err(local_err);
2656         goto free_zero_page;
2657     }
2658
2659     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2660     if (!XBZRLE.encoded_buf) {
2661         error_report("%s: Error allocating encoded_buf", __func__);
2662         goto free_cache;
2663     }
2664
2665     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2666     if (!XBZRLE.current_buf) {
2667         error_report("%s: Error allocating current_buf", __func__);
2668         goto free_encoded_buf;
2669     }
2670
2671     /* We are all good */
2672     XBZRLE_cache_unlock();
2673     return 0;
2674
2675 free_encoded_buf:
2676     g_free(XBZRLE.encoded_buf);
2677     XBZRLE.encoded_buf = NULL;
2678 free_cache:
2679     cache_fini(XBZRLE.cache);
2680     XBZRLE.cache = NULL;
2681 free_zero_page:
2682     g_free(XBZRLE.zero_target_page);
2683     XBZRLE.zero_target_page = NULL;
2684 err_out:
2685     XBZRLE_cache_unlock();
2686     return -ENOMEM;
2687 }
2688
2689 static int ram_state_init(RAMState **rsp)
2690 {
2691     *rsp = g_try_new0(RAMState, 1);
2692
2693     if (!*rsp) {
2694         error_report("%s: Init ramstate fail", __func__);
2695         return -1;
2696     }
2697
2698     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2699     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2700     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2701
2702     /*
2703      * Count the total number of pages used by ram blocks not including any
2704      * gaps due to alignment or unplugs.
2705      * This must match with the initial values of dirty bitmap.
2706      */
2707     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2708     ram_state_reset(*rsp);
2709
2710     return 0;
2711 }
2712
2713 static void ram_list_init_bitmaps(void)
2714 {
2715     MigrationState *ms = migrate_get_current();
2716     RAMBlock *block;
2717     unsigned long pages;
2718     uint8_t shift;
2719
2720     /* Skip setting bitmap if there is no RAM */
2721     if (ram_bytes_total()) {
2722         shift = ms->clear_bitmap_shift;
2723         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2724             error_report("clear_bitmap_shift (%u) too big, using "
2725                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2726             shift = CLEAR_BITMAP_SHIFT_MAX;
2727         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2728             error_report("clear_bitmap_shift (%u) too small, using "
2729                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2730             shift = CLEAR_BITMAP_SHIFT_MIN;
2731         }
2732
2733         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2734             pages = block->max_length >> TARGET_PAGE_BITS;
2735             /*
2736              * The initial dirty bitmap for migration must be set with all
2737              * ones to make sure we'll migrate every guest RAM page to
2738              * destination.
2739              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2740              * new migration after a failed migration, ram_list.
2741              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2742              * guest memory.
2743              */
2744             block->bmap = bitmap_new(pages);
2745             bitmap_set(block->bmap, 0, pages);
2746             block->clear_bmap_shift = shift;
2747             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2748         }
2749     }
2750 }
2751
2752 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2753 {
2754     unsigned long pages;
2755     RAMBlock *rb;
2756
2757     RCU_READ_LOCK_GUARD();
2758
2759     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2760             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2761             rs->migration_dirty_pages -= pages;
2762     }
2763 }
2764
2765 static void ram_init_bitmaps(RAMState *rs)
2766 {
2767     /* For memory_global_dirty_log_start below.  */
2768     qemu_mutex_lock_iothread();
2769     qemu_mutex_lock_ramlist();
2770
2771     WITH_RCU_READ_LOCK_GUARD() {
2772         ram_list_init_bitmaps();
2773         /* We don't use dirty log with background snapshots */
2774         if (!migrate_background_snapshot()) {
2775             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2776             migration_bitmap_sync_precopy(rs);
2777         }
2778     }
2779     qemu_mutex_unlock_ramlist();
2780     qemu_mutex_unlock_iothread();
2781
2782     /*
2783      * After an eventual first bitmap sync, fixup the initial bitmap
2784      * containing all 1s to exclude any discarded pages from migration.
2785      */
2786     migration_bitmap_clear_discarded_pages(rs);
2787 }
2788
2789 static int ram_init_all(RAMState **rsp)
2790 {
2791     if (ram_state_init(rsp)) {
2792         return -1;
2793     }
2794
2795     if (xbzrle_init()) {
2796         ram_state_cleanup(rsp);
2797         return -1;
2798     }
2799
2800     ram_init_bitmaps(*rsp);
2801
2802     return 0;
2803 }
2804
2805 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2806 {
2807     RAMBlock *block;
2808     uint64_t pages = 0;
2809
2810     /*
2811      * Postcopy is not using xbzrle/compression, so no need for that.
2812      * Also, since source are already halted, we don't need to care
2813      * about dirty page logging as well.
2814      */
2815
2816     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2817         pages += bitmap_count_one(block->bmap,
2818                                   block->used_length >> TARGET_PAGE_BITS);
2819     }
2820
2821     /* This may not be aligned with current bitmaps. Recalculate. */
2822     rs->migration_dirty_pages = pages;
2823
2824     ram_state_reset(rs);
2825
2826     /* Update RAMState cache of output QEMUFile */
2827     rs->f = out;
2828
2829     trace_ram_state_resume_prepare(pages);
2830 }
2831
2832 /*
2833  * This function clears bits of the free pages reported by the caller from the
2834  * migration dirty bitmap. @addr is the host address corresponding to the
2835  * start of the continuous guest free pages, and @len is the total bytes of
2836  * those pages.
2837  */
2838 void qemu_guest_free_page_hint(void *addr, size_t len)
2839 {
2840     RAMBlock *block;
2841     ram_addr_t offset;
2842     size_t used_len, start, npages;
2843     MigrationState *s = migrate_get_current();
2844
2845     /* This function is currently expected to be used during live migration */
2846     if (!migration_is_setup_or_active(s->state)) {
2847         return;
2848     }
2849
2850     for (; len > 0; len -= used_len, addr += used_len) {
2851         block = qemu_ram_block_from_host(addr, false, &offset);
2852         if (unlikely(!block || offset >= block->used_length)) {
2853             /*
2854              * The implementation might not support RAMBlock resize during
2855              * live migration, but it could happen in theory with future
2856              * updates. So we add a check here to capture that case.
2857              */
2858             error_report_once("%s unexpected error", __func__);
2859             return;
2860         }
2861
2862         if (len <= block->used_length - offset) {
2863             used_len = len;
2864         } else {
2865             used_len = block->used_length - offset;
2866         }
2867
2868         start = offset >> TARGET_PAGE_BITS;
2869         npages = used_len >> TARGET_PAGE_BITS;
2870
2871         qemu_mutex_lock(&ram_state->bitmap_mutex);
2872         /*
2873          * The skipped free pages are equavalent to be sent from clear_bmap's
2874          * perspective, so clear the bits from the memory region bitmap which
2875          * are initially set. Otherwise those skipped pages will be sent in
2876          * the next round after syncing from the memory region bitmap.
2877          */
2878         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2879         ram_state->migration_dirty_pages -=
2880                       bitmap_count_one_with_offset(block->bmap, start, npages);
2881         bitmap_clear(block->bmap, start, npages);
2882         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2883     }
2884 }
2885
2886 /*
2887  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2888  * long-running RCU critical section.  When rcu-reclaims in the code
2889  * start to become numerous it will be necessary to reduce the
2890  * granularity of these critical sections.
2891  */
2892
2893 /**
2894  * ram_save_setup: Setup RAM for migration
2895  *
2896  * Returns zero to indicate success and negative for error
2897  *
2898  * @f: QEMUFile where to send the data
2899  * @opaque: RAMState pointer
2900  */
2901 static int ram_save_setup(QEMUFile *f, void *opaque)
2902 {
2903     RAMState **rsp = opaque;
2904     RAMBlock *block;
2905
2906     if (compress_threads_save_setup()) {
2907         return -1;
2908     }
2909
2910     /* migration has already setup the bitmap, reuse it. */
2911     if (!migration_in_colo_state()) {
2912         if (ram_init_all(rsp) != 0) {
2913             compress_threads_save_cleanup();
2914             return -1;
2915         }
2916     }
2917     (*rsp)->f = f;
2918
2919     WITH_RCU_READ_LOCK_GUARD() {
2920         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2921
2922         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2923             qemu_put_byte(f, strlen(block->idstr));
2924             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2925             qemu_put_be64(f, block->used_length);
2926             if (migrate_postcopy_ram() && block->page_size !=
2927                                           qemu_host_page_size) {
2928                 qemu_put_be64(f, block->page_size);
2929             }
2930             if (migrate_ignore_shared()) {
2931                 qemu_put_be64(f, block->mr->addr);
2932             }
2933         }
2934     }
2935
2936     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2937     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2938
2939     multifd_send_sync_main(f);
2940     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2941     qemu_fflush(f);
2942
2943     return 0;
2944 }
2945
2946 /**
2947  * ram_save_iterate: iterative stage for migration
2948  *
2949  * Returns zero to indicate success and negative for error
2950  *
2951  * @f: QEMUFile where to send the data
2952  * @opaque: RAMState pointer
2953  */
2954 static int ram_save_iterate(QEMUFile *f, void *opaque)
2955 {
2956     RAMState **temp = opaque;
2957     RAMState *rs = *temp;
2958     int ret = 0;
2959     int i;
2960     int64_t t0;
2961     int done = 0;
2962
2963     if (blk_mig_bulk_active()) {
2964         /* Avoid transferring ram during bulk phase of block migration as
2965          * the bulk phase will usually take a long time and transferring
2966          * ram updates during that time is pointless. */
2967         goto out;
2968     }
2969
2970     /*
2971      * We'll take this lock a little bit long, but it's okay for two reasons.
2972      * Firstly, the only possible other thread to take it is who calls
2973      * qemu_guest_free_page_hint(), which should be rare; secondly, see
2974      * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2975      * guarantees that we'll at least released it in a regular basis.
2976      */
2977     qemu_mutex_lock(&rs->bitmap_mutex);
2978     WITH_RCU_READ_LOCK_GUARD() {
2979         if (ram_list.version != rs->last_version) {
2980             ram_state_reset(rs);
2981         }
2982
2983         /* Read version before ram_list.blocks */
2984         smp_rmb();
2985
2986         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2987
2988         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2989         i = 0;
2990         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2991                postcopy_has_request(rs)) {
2992             int pages;
2993
2994             if (qemu_file_get_error(f)) {
2995                 break;
2996             }
2997
2998             pages = ram_find_and_save_block(rs);
2999             /* no more pages to sent */
3000             if (pages == 0) {
3001                 done = 1;
3002                 break;
3003             }
3004
3005             if (pages < 0) {
3006                 qemu_file_set_error(f, pages);
3007                 break;
3008             }
3009
3010             rs->target_page_count += pages;
3011
3012             /*
3013              * During postcopy, it is necessary to make sure one whole host
3014              * page is sent in one chunk.
3015              */
3016             if (migrate_postcopy_ram()) {
3017                 flush_compressed_data(rs);
3018             }
3019
3020             /*
3021              * we want to check in the 1st loop, just in case it was the 1st
3022              * time and we had to sync the dirty bitmap.
3023              * qemu_clock_get_ns() is a bit expensive, so we only check each
3024              * some iterations
3025              */
3026             if ((i & 63) == 0) {
3027                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3028                               1000000;
3029                 if (t1 > MAX_WAIT) {
3030                     trace_ram_save_iterate_big_wait(t1, i);
3031                     break;
3032                 }
3033             }
3034             i++;
3035         }
3036     }
3037     qemu_mutex_unlock(&rs->bitmap_mutex);
3038
3039     /*
3040      * Must occur before EOS (or any QEMUFile operation)
3041      * because of RDMA protocol.
3042      */
3043     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3044
3045 out:
3046     if (ret >= 0
3047         && migration_is_setup_or_active(migrate_get_current()->state)) {
3048         multifd_send_sync_main(rs->f);
3049         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3050         qemu_fflush(f);
3051         ram_transferred_add(8);
3052
3053         ret = qemu_file_get_error(f);
3054     }
3055     if (ret < 0) {
3056         return ret;
3057     }
3058
3059     return done;
3060 }
3061
3062 /**
3063  * ram_save_complete: function called to send the remaining amount of ram
3064  *
3065  * Returns zero to indicate success or negative on error
3066  *
3067  * Called with iothread lock
3068  *
3069  * @f: QEMUFile where to send the data
3070  * @opaque: RAMState pointer
3071  */
3072 static int ram_save_complete(QEMUFile *f, void *opaque)
3073 {
3074     RAMState **temp = opaque;
3075     RAMState *rs = *temp;
3076     int ret = 0;
3077
3078     rs->last_stage = !migration_in_colo_state();
3079
3080     WITH_RCU_READ_LOCK_GUARD() {
3081         if (!migration_in_postcopy()) {
3082             migration_bitmap_sync_precopy(rs);
3083         }
3084
3085         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3086
3087         /* try transferring iterative blocks of memory */
3088
3089         /* flush all remaining blocks regardless of rate limiting */
3090         while (true) {
3091             int pages;
3092
3093             pages = ram_find_and_save_block(rs);
3094             /* no more blocks to sent */
3095             if (pages == 0) {
3096                 break;
3097             }
3098             if (pages < 0) {
3099                 ret = pages;
3100                 break;
3101             }
3102         }
3103
3104         flush_compressed_data(rs);
3105         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3106     }
3107
3108     if (ret >= 0) {
3109         multifd_send_sync_main(rs->f);
3110         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3111         qemu_fflush(f);
3112     }
3113
3114     return ret;
3115 }
3116
3117 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3118                              uint64_t *res_precopy_only,
3119                              uint64_t *res_compatible,
3120                              uint64_t *res_postcopy_only)
3121 {
3122     RAMState **temp = opaque;
3123     RAMState *rs = *temp;
3124     uint64_t remaining_size;
3125
3126     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3127
3128     if (!migration_in_postcopy() &&
3129         remaining_size < max_size) {
3130         qemu_mutex_lock_iothread();
3131         WITH_RCU_READ_LOCK_GUARD() {
3132             migration_bitmap_sync_precopy(rs);
3133         }
3134         qemu_mutex_unlock_iothread();
3135         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3136     }
3137
3138     if (migrate_postcopy_ram()) {
3139         /* We can do postcopy, and all the data is postcopiable */
3140         *res_compatible += remaining_size;
3141     } else {
3142         *res_precopy_only += remaining_size;
3143     }
3144 }
3145
3146 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3147 {
3148     unsigned int xh_len;
3149     int xh_flags;
3150     uint8_t *loaded_data;
3151
3152     /* extract RLE header */
3153     xh_flags = qemu_get_byte(f);
3154     xh_len = qemu_get_be16(f);
3155
3156     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3157         error_report("Failed to load XBZRLE page - wrong compression!");
3158         return -1;
3159     }
3160
3161     if (xh_len > TARGET_PAGE_SIZE) {
3162         error_report("Failed to load XBZRLE page - len overflow!");
3163         return -1;
3164     }
3165     loaded_data = XBZRLE.decoded_buf;
3166     /* load data and decode */
3167     /* it can change loaded_data to point to an internal buffer */
3168     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3169
3170     /* decode RLE */
3171     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3172                              TARGET_PAGE_SIZE) == -1) {
3173         error_report("Failed to load XBZRLE page - decode error!");
3174         return -1;
3175     }
3176
3177     return 0;
3178 }
3179
3180 /**
3181  * ram_block_from_stream: read a RAMBlock id from the migration stream
3182  *
3183  * Must be called from within a rcu critical section.
3184  *
3185  * Returns a pointer from within the RCU-protected ram_list.
3186  *
3187  * @f: QEMUFile where to read the data from
3188  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3189  */
3190 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3191 {
3192     static RAMBlock *block;
3193     char id[256];
3194     uint8_t len;
3195
3196     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3197         if (!block) {
3198             error_report("Ack, bad migration stream!");
3199             return NULL;
3200         }
3201         return block;
3202     }
3203
3204     len = qemu_get_byte(f);
3205     qemu_get_buffer(f, (uint8_t *)id, len);
3206     id[len] = 0;
3207
3208     block = qemu_ram_block_by_name(id);
3209     if (!block) {
3210         error_report("Can't find block %s", id);
3211         return NULL;
3212     }
3213
3214     if (ramblock_is_ignored(block)) {
3215         error_report("block %s should not be migrated !", id);
3216         return NULL;
3217     }
3218
3219     return block;
3220 }
3221
3222 static inline void *host_from_ram_block_offset(RAMBlock *block,
3223                                                ram_addr_t offset)
3224 {
3225     if (!offset_in_ramblock(block, offset)) {
3226         return NULL;
3227     }
3228
3229     return block->host + offset;
3230 }
3231
3232 static void *host_page_from_ram_block_offset(RAMBlock *block,
3233                                              ram_addr_t offset)
3234 {
3235     /* Note: Explicitly no check against offset_in_ramblock(). */
3236     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3237                                    block->page_size);
3238 }
3239
3240 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3241                                                          ram_addr_t offset)
3242 {
3243     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3244 }
3245
3246 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3247                              ram_addr_t offset, bool record_bitmap)
3248 {
3249     if (!offset_in_ramblock(block, offset)) {
3250         return NULL;
3251     }
3252     if (!block->colo_cache) {
3253         error_report("%s: colo_cache is NULL in block :%s",
3254                      __func__, block->idstr);
3255         return NULL;
3256     }
3257
3258     /*
3259     * During colo checkpoint, we need bitmap of these migrated pages.
3260     * It help us to decide which pages in ram cache should be flushed
3261     * into VM's RAM later.
3262     */
3263     if (record_bitmap &&
3264         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3265         ram_state->migration_dirty_pages++;
3266     }
3267     return block->colo_cache + offset;
3268 }
3269
3270 /**
3271  * ram_handle_compressed: handle the zero page case
3272  *
3273  * If a page (or a whole RDMA chunk) has been
3274  * determined to be zero, then zap it.
3275  *
3276  * @host: host address for the zero page
3277  * @ch: what the page is filled from.  We only support zero
3278  * @size: size of the zero page
3279  */
3280 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3281 {
3282     if (ch != 0 || !buffer_is_zero(host, size)) {
3283         memset(host, ch, size);
3284     }
3285 }
3286
3287 /* return the size after decompression, or negative value on error */
3288 static int
3289 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3290                      const uint8_t *source, size_t source_len)
3291 {
3292     int err;
3293
3294     err = inflateReset(stream);
3295     if (err != Z_OK) {
3296         return -1;
3297     }
3298
3299     stream->avail_in = source_len;
3300     stream->next_in = (uint8_t *)source;
3301     stream->avail_out = dest_len;
3302     stream->next_out = dest;
3303
3304     err = inflate(stream, Z_NO_FLUSH);
3305     if (err != Z_STREAM_END) {
3306         return -1;
3307     }
3308
3309     return stream->total_out;
3310 }
3311
3312 static void *do_data_decompress(void *opaque)
3313 {
3314     DecompressParam *param = opaque;
3315     unsigned long pagesize;
3316     uint8_t *des;
3317     int len, ret;
3318
3319     qemu_mutex_lock(&param->mutex);
3320     while (!param->quit) {
3321         if (param->des) {
3322             des = param->des;
3323             len = param->len;
3324             param->des = 0;
3325             qemu_mutex_unlock(&param->mutex);
3326
3327             pagesize = TARGET_PAGE_SIZE;
3328
3329             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3330                                        param->compbuf, len);
3331             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3332                 error_report("decompress data failed");
3333                 qemu_file_set_error(decomp_file, ret);
3334             }
3335
3336             qemu_mutex_lock(&decomp_done_lock);
3337             param->done = true;
3338             qemu_cond_signal(&decomp_done_cond);
3339             qemu_mutex_unlock(&decomp_done_lock);
3340
3341             qemu_mutex_lock(&param->mutex);
3342         } else {
3343             qemu_cond_wait(&param->cond, &param->mutex);
3344         }
3345     }
3346     qemu_mutex_unlock(&param->mutex);
3347
3348     return NULL;
3349 }
3350
3351 static int wait_for_decompress_done(void)
3352 {
3353     int idx, thread_count;
3354
3355     if (!migrate_use_compression()) {
3356         return 0;
3357     }
3358
3359     thread_count = migrate_decompress_threads();
3360     qemu_mutex_lock(&decomp_done_lock);
3361     for (idx = 0; idx < thread_count; idx++) {
3362         while (!decomp_param[idx].done) {
3363             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3364         }
3365     }
3366     qemu_mutex_unlock(&decomp_done_lock);
3367     return qemu_file_get_error(decomp_file);
3368 }
3369
3370 static void compress_threads_load_cleanup(void)
3371 {
3372     int i, thread_count;
3373
3374     if (!migrate_use_compression()) {
3375         return;
3376     }
3377     thread_count = migrate_decompress_threads();
3378     for (i = 0; i < thread_count; i++) {
3379         /*
3380          * we use it as a indicator which shows if the thread is
3381          * properly init'd or not
3382          */
3383         if (!decomp_param[i].compbuf) {
3384             break;
3385         }
3386
3387         qemu_mutex_lock(&decomp_param[i].mutex);
3388         decomp_param[i].quit = true;
3389         qemu_cond_signal(&decomp_param[i].cond);
3390         qemu_mutex_unlock(&decomp_param[i].mutex);
3391     }
3392     for (i = 0; i < thread_count; i++) {
3393         if (!decomp_param[i].compbuf) {
3394             break;
3395         }
3396
3397         qemu_thread_join(decompress_threads + i);
3398         qemu_mutex_destroy(&decomp_param[i].mutex);
3399         qemu_cond_destroy(&decomp_param[i].cond);
3400         inflateEnd(&decomp_param[i].stream);
3401         g_free(decomp_param[i].compbuf);
3402         decomp_param[i].compbuf = NULL;
3403     }
3404     g_free(decompress_threads);
3405     g_free(decomp_param);
3406     decompress_threads = NULL;
3407     decomp_param = NULL;
3408     decomp_file = NULL;
3409 }
3410
3411 static int compress_threads_load_setup(QEMUFile *f)
3412 {
3413     int i, thread_count;
3414
3415     if (!migrate_use_compression()) {
3416         return 0;
3417     }
3418
3419     thread_count = migrate_decompress_threads();
3420     decompress_threads = g_new0(QemuThread, thread_count);
3421     decomp_param = g_new0(DecompressParam, thread_count);
3422     qemu_mutex_init(&decomp_done_lock);
3423     qemu_cond_init(&decomp_done_cond);
3424     decomp_file = f;
3425     for (i = 0; i < thread_count; i++) {
3426         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3427             goto exit;
3428         }
3429
3430         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3431         qemu_mutex_init(&decomp_param[i].mutex);
3432         qemu_cond_init(&decomp_param[i].cond);
3433         decomp_param[i].done = true;
3434         decomp_param[i].quit = false;
3435         qemu_thread_create(decompress_threads + i, "decompress",
3436                            do_data_decompress, decomp_param + i,
3437                            QEMU_THREAD_JOINABLE);
3438     }
3439     return 0;
3440 exit:
3441     compress_threads_load_cleanup();
3442     return -1;
3443 }
3444
3445 static void decompress_data_with_multi_threads(QEMUFile *f,
3446                                                void *host, int len)
3447 {
3448     int idx, thread_count;
3449
3450     thread_count = migrate_decompress_threads();
3451     QEMU_LOCK_GUARD(&decomp_done_lock);
3452     while (true) {
3453         for (idx = 0; idx < thread_count; idx++) {
3454             if (decomp_param[idx].done) {
3455                 decomp_param[idx].done = false;
3456                 qemu_mutex_lock(&decomp_param[idx].mutex);
3457                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3458                 decomp_param[idx].des = host;
3459                 decomp_param[idx].len = len;
3460                 qemu_cond_signal(&decomp_param[idx].cond);
3461                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3462                 break;
3463             }
3464         }
3465         if (idx < thread_count) {
3466             break;
3467         } else {
3468             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3469         }
3470     }
3471 }
3472
3473 static void colo_init_ram_state(void)
3474 {
3475     ram_state_init(&ram_state);
3476 }
3477
3478 /*
3479  * colo cache: this is for secondary VM, we cache the whole
3480  * memory of the secondary VM, it is need to hold the global lock
3481  * to call this helper.
3482  */
3483 int colo_init_ram_cache(void)
3484 {
3485     RAMBlock *block;
3486
3487     WITH_RCU_READ_LOCK_GUARD() {
3488         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3489             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3490                                                     NULL, false, false);
3491             if (!block->colo_cache) {
3492                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3493                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3494                              block->used_length);
3495                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3496                     if (block->colo_cache) {
3497                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3498                         block->colo_cache = NULL;
3499                     }
3500                 }
3501                 return -errno;
3502             }
3503             if (!machine_dump_guest_core(current_machine)) {
3504                 qemu_madvise(block->colo_cache, block->used_length,
3505                              QEMU_MADV_DONTDUMP);
3506             }
3507         }
3508     }
3509
3510     /*
3511     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3512     * with to decide which page in cache should be flushed into SVM's RAM. Here
3513     * we use the same name 'ram_bitmap' as for migration.
3514     */
3515     if (ram_bytes_total()) {
3516         RAMBlock *block;
3517
3518         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3519             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3520             block->bmap = bitmap_new(pages);
3521         }
3522     }
3523
3524     colo_init_ram_state();
3525     return 0;
3526 }
3527
3528 /* TODO: duplicated with ram_init_bitmaps */
3529 void colo_incoming_start_dirty_log(void)
3530 {
3531     RAMBlock *block = NULL;
3532     /* For memory_global_dirty_log_start below. */
3533     qemu_mutex_lock_iothread();
3534     qemu_mutex_lock_ramlist();
3535
3536     memory_global_dirty_log_sync();
3537     WITH_RCU_READ_LOCK_GUARD() {
3538         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3539             ramblock_sync_dirty_bitmap(ram_state, block);
3540             /* Discard this dirty bitmap record */
3541             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3542         }
3543         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3544     }
3545     ram_state->migration_dirty_pages = 0;
3546     qemu_mutex_unlock_ramlist();
3547     qemu_mutex_unlock_iothread();
3548 }
3549
3550 /* It is need to hold the global lock to call this helper */
3551 void colo_release_ram_cache(void)
3552 {
3553     RAMBlock *block;
3554
3555     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3556     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3557         g_free(block->bmap);
3558         block->bmap = NULL;
3559     }
3560
3561     WITH_RCU_READ_LOCK_GUARD() {
3562         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3563             if (block->colo_cache) {
3564                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3565                 block->colo_cache = NULL;
3566             }
3567         }
3568     }
3569     ram_state_cleanup(&ram_state);
3570 }
3571
3572 /**
3573  * ram_load_setup: Setup RAM for migration incoming side
3574  *
3575  * Returns zero to indicate success and negative for error
3576  *
3577  * @f: QEMUFile where to receive the data
3578  * @opaque: RAMState pointer
3579  */
3580 static int ram_load_setup(QEMUFile *f, void *opaque)
3581 {
3582     if (compress_threads_load_setup(f)) {
3583         return -1;
3584     }
3585
3586     xbzrle_load_setup();
3587     ramblock_recv_map_init();
3588
3589     return 0;
3590 }
3591
3592 static int ram_load_cleanup(void *opaque)
3593 {
3594     RAMBlock *rb;
3595
3596     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3597         qemu_ram_block_writeback(rb);
3598     }
3599
3600     xbzrle_load_cleanup();
3601     compress_threads_load_cleanup();
3602
3603     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3604         g_free(rb->receivedmap);
3605         rb->receivedmap = NULL;
3606     }
3607
3608     return 0;
3609 }
3610
3611 /**
3612  * ram_postcopy_incoming_init: allocate postcopy data structures
3613  *
3614  * Returns 0 for success and negative if there was one error
3615  *
3616  * @mis: current migration incoming state
3617  *
3618  * Allocate data structures etc needed by incoming migration with
3619  * postcopy-ram. postcopy-ram's similarly names
3620  * postcopy_ram_incoming_init does the work.
3621  */
3622 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3623 {
3624     return postcopy_ram_incoming_init(mis);
3625 }
3626
3627 /**
3628  * ram_load_postcopy: load a page in postcopy case
3629  *
3630  * Returns 0 for success or -errno in case of error
3631  *
3632  * Called in postcopy mode by ram_load().
3633  * rcu_read_lock is taken prior to this being called.
3634  *
3635  * @f: QEMUFile where to send the data
3636  */
3637 static int ram_load_postcopy(QEMUFile *f)
3638 {
3639     int flags = 0, ret = 0;
3640     bool place_needed = false;
3641     bool matches_target_page_size = false;
3642     MigrationIncomingState *mis = migration_incoming_get_current();
3643     /* Temporary page that is later 'placed' */
3644     void *postcopy_host_page = mis->postcopy_tmp_page;
3645     void *host_page = NULL;
3646     bool all_zero = true;
3647     int target_pages = 0;
3648
3649     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3650         ram_addr_t addr;
3651         void *page_buffer = NULL;
3652         void *place_source = NULL;
3653         RAMBlock *block = NULL;
3654         uint8_t ch;
3655         int len;
3656
3657         addr = qemu_get_be64(f);
3658
3659         /*
3660          * If qemu file error, we should stop here, and then "addr"
3661          * may be invalid
3662          */
3663         ret = qemu_file_get_error(f);
3664         if (ret) {
3665             break;
3666         }
3667
3668         flags = addr & ~TARGET_PAGE_MASK;
3669         addr &= TARGET_PAGE_MASK;
3670
3671         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3672         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3673                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3674             block = ram_block_from_stream(f, flags);
3675             if (!block) {
3676                 ret = -EINVAL;
3677                 break;
3678             }
3679
3680             /*
3681              * Relying on used_length is racy and can result in false positives.
3682              * We might place pages beyond used_length in case RAM was shrunk
3683              * while in postcopy, which is fine - trying to place via
3684              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3685              */
3686             if (!block->host || addr >= block->postcopy_length) {
3687                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3688                 ret = -EINVAL;
3689                 break;
3690             }
3691             target_pages++;
3692             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3693             /*
3694              * Postcopy requires that we place whole host pages atomically;
3695              * these may be huge pages for RAMBlocks that are backed by
3696              * hugetlbfs.
3697              * To make it atomic, the data is read into a temporary page
3698              * that's moved into place later.
3699              * The migration protocol uses,  possibly smaller, target-pages
3700              * however the source ensures it always sends all the components
3701              * of a host page in one chunk.
3702              */
3703             page_buffer = postcopy_host_page +
3704                           host_page_offset_from_ram_block_offset(block, addr);
3705             /* If all TP are zero then we can optimise the place */
3706             if (target_pages == 1) {
3707                 host_page = host_page_from_ram_block_offset(block, addr);
3708             } else if (host_page != host_page_from_ram_block_offset(block,
3709                                                                     addr)) {
3710                 /* not the 1st TP within the HP */
3711                 error_report("Non-same host page %p/%p", host_page,
3712                              host_page_from_ram_block_offset(block, addr));
3713                 ret = -EINVAL;
3714                 break;
3715             }
3716
3717             /*
3718              * If it's the last part of a host page then we place the host
3719              * page
3720              */
3721             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3722                 place_needed = true;
3723             }
3724             place_source = postcopy_host_page;
3725         }
3726
3727         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3728         case RAM_SAVE_FLAG_ZERO:
3729             ch = qemu_get_byte(f);
3730             /*
3731              * Can skip to set page_buffer when
3732              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3733              */
3734             if (ch || !matches_target_page_size) {
3735                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3736             }
3737             if (ch) {
3738                 all_zero = false;
3739             }
3740             break;
3741
3742         case RAM_SAVE_FLAG_PAGE:
3743             all_zero = false;
3744             if (!matches_target_page_size) {
3745                 /* For huge pages, we always use temporary buffer */
3746                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3747             } else {
3748                 /*
3749                  * For small pages that matches target page size, we
3750                  * avoid the qemu_file copy.  Instead we directly use
3751                  * the buffer of QEMUFile to place the page.  Note: we
3752                  * cannot do any QEMUFile operation before using that
3753                  * buffer to make sure the buffer is valid when
3754                  * placing the page.
3755                  */
3756                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3757                                          TARGET_PAGE_SIZE);
3758             }
3759             break;
3760         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3761             all_zero = false;
3762             len = qemu_get_be32(f);
3763             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3764                 error_report("Invalid compressed data length: %d", len);
3765                 ret = -EINVAL;
3766                 break;
3767             }
3768             decompress_data_with_multi_threads(f, page_buffer, len);
3769             break;
3770
3771         case RAM_SAVE_FLAG_EOS:
3772             /* normal exit */
3773             multifd_recv_sync_main();
3774             break;
3775         default:
3776             error_report("Unknown combination of migration flags: 0x%x"
3777                          " (postcopy mode)", flags);
3778             ret = -EINVAL;
3779             break;
3780         }
3781
3782         /* Got the whole host page, wait for decompress before placing. */
3783         if (place_needed) {
3784             ret |= wait_for_decompress_done();
3785         }
3786
3787         /* Detect for any possible file errors */
3788         if (!ret && qemu_file_get_error(f)) {
3789             ret = qemu_file_get_error(f);
3790         }
3791
3792         if (!ret && place_needed) {
3793             if (all_zero) {
3794                 ret = postcopy_place_page_zero(mis, host_page, block);
3795             } else {
3796                 ret = postcopy_place_page(mis, host_page, place_source,
3797                                           block);
3798             }
3799             place_needed = false;
3800             target_pages = 0;
3801             /* Assume we have a zero page until we detect something different */
3802             all_zero = true;
3803         }
3804     }
3805
3806     return ret;
3807 }
3808
3809 static bool postcopy_is_advised(void)
3810 {
3811     PostcopyState ps = postcopy_state_get();
3812     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3813 }
3814
3815 static bool postcopy_is_running(void)
3816 {
3817     PostcopyState ps = postcopy_state_get();
3818     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3819 }
3820
3821 /*
3822  * Flush content of RAM cache into SVM's memory.
3823  * Only flush the pages that be dirtied by PVM or SVM or both.
3824  */
3825 void colo_flush_ram_cache(void)
3826 {
3827     RAMBlock *block = NULL;
3828     void *dst_host;
3829     void *src_host;
3830     unsigned long offset = 0;
3831
3832     memory_global_dirty_log_sync();
3833     WITH_RCU_READ_LOCK_GUARD() {
3834         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3835             ramblock_sync_dirty_bitmap(ram_state, block);
3836         }
3837     }
3838
3839     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3840     WITH_RCU_READ_LOCK_GUARD() {
3841         block = QLIST_FIRST_RCU(&ram_list.blocks);
3842
3843         while (block) {
3844             unsigned long num = 0;
3845
3846             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3847             if (!offset_in_ramblock(block,
3848                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3849                 offset = 0;
3850                 num = 0;
3851                 block = QLIST_NEXT_RCU(block, next);
3852             } else {
3853                 unsigned long i = 0;
3854
3855                 for (i = 0; i < num; i++) {
3856                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
3857                 }
3858                 dst_host = block->host
3859                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3860                 src_host = block->colo_cache
3861                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3862                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3863                 offset += num;
3864             }
3865         }
3866     }
3867     trace_colo_flush_ram_cache_end();
3868 }
3869
3870 /**
3871  * ram_load_precopy: load pages in precopy case
3872  *
3873  * Returns 0 for success or -errno in case of error
3874  *
3875  * Called in precopy mode by ram_load().
3876  * rcu_read_lock is taken prior to this being called.
3877  *
3878  * @f: QEMUFile where to send the data
3879  */
3880 static int ram_load_precopy(QEMUFile *f)
3881 {
3882     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3883     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3884     bool postcopy_advised = postcopy_is_advised();
3885     if (!migrate_use_compression()) {
3886         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3887     }
3888
3889     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3890         ram_addr_t addr, total_ram_bytes;
3891         void *host = NULL, *host_bak = NULL;
3892         uint8_t ch;
3893
3894         /*
3895          * Yield periodically to let main loop run, but an iteration of
3896          * the main loop is expensive, so do it each some iterations
3897          */
3898         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3899             aio_co_schedule(qemu_get_current_aio_context(),
3900                             qemu_coroutine_self());
3901             qemu_coroutine_yield();
3902         }
3903         i++;
3904
3905         addr = qemu_get_be64(f);
3906         flags = addr & ~TARGET_PAGE_MASK;
3907         addr &= TARGET_PAGE_MASK;
3908
3909         if (flags & invalid_flags) {
3910             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3911                 error_report("Received an unexpected compressed page");
3912             }
3913
3914             ret = -EINVAL;
3915             break;
3916         }
3917
3918         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3919                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3920             RAMBlock *block = ram_block_from_stream(f, flags);
3921
3922             host = host_from_ram_block_offset(block, addr);
3923             /*
3924              * After going into COLO stage, we should not load the page
3925              * into SVM's memory directly, we put them into colo_cache firstly.
3926              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3927              * Previously, we copied all these memory in preparing stage of COLO
3928              * while we need to stop VM, which is a time-consuming process.
3929              * Here we optimize it by a trick, back-up every page while in
3930              * migration process while COLO is enabled, though it affects the
3931              * speed of the migration, but it obviously reduce the downtime of
3932              * back-up all SVM'S memory in COLO preparing stage.
3933              */
3934             if (migration_incoming_colo_enabled()) {
3935                 if (migration_incoming_in_colo_state()) {
3936                     /* In COLO stage, put all pages into cache temporarily */
3937                     host = colo_cache_from_block_offset(block, addr, true);
3938                 } else {
3939                    /*
3940                     * In migration stage but before COLO stage,
3941                     * Put all pages into both cache and SVM's memory.
3942                     */
3943                     host_bak = colo_cache_from_block_offset(block, addr, false);
3944                 }
3945             }
3946             if (!host) {
3947                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3948                 ret = -EINVAL;
3949                 break;
3950             }
3951             if (!migration_incoming_in_colo_state()) {
3952                 ramblock_recv_bitmap_set(block, host);
3953             }
3954
3955             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3956         }
3957
3958         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3959         case RAM_SAVE_FLAG_MEM_SIZE:
3960             /* Synchronize RAM block list */
3961             total_ram_bytes = addr;
3962             while (!ret && total_ram_bytes) {
3963                 RAMBlock *block;
3964                 char id[256];
3965                 ram_addr_t length;
3966
3967                 len = qemu_get_byte(f);
3968                 qemu_get_buffer(f, (uint8_t *)id, len);
3969                 id[len] = 0;
3970                 length = qemu_get_be64(f);
3971
3972                 block = qemu_ram_block_by_name(id);
3973                 if (block && !qemu_ram_is_migratable(block)) {
3974                     error_report("block %s should not be migrated !", id);
3975                     ret = -EINVAL;
3976                 } else if (block) {
3977                     if (length != block->used_length) {
3978                         Error *local_err = NULL;
3979
3980                         ret = qemu_ram_resize(block, length,
3981                                               &local_err);
3982                         if (local_err) {
3983                             error_report_err(local_err);
3984                         }
3985                     }
3986                     /* For postcopy we need to check hugepage sizes match */
3987                     if (postcopy_advised && migrate_postcopy_ram() &&
3988                         block->page_size != qemu_host_page_size) {
3989                         uint64_t remote_page_size = qemu_get_be64(f);
3990                         if (remote_page_size != block->page_size) {
3991                             error_report("Mismatched RAM page size %s "
3992                                          "(local) %zd != %" PRId64,
3993                                          id, block->page_size,
3994                                          remote_page_size);
3995                             ret = -EINVAL;
3996                         }
3997                     }
3998                     if (migrate_ignore_shared()) {
3999                         hwaddr addr = qemu_get_be64(f);
4000                         if (ramblock_is_ignored(block) &&
4001                             block->mr->addr != addr) {
4002                             error_report("Mismatched GPAs for block %s "
4003                                          "%" PRId64 "!= %" PRId64,
4004                                          id, (uint64_t)addr,
4005                                          (uint64_t)block->mr->addr);
4006                             ret = -EINVAL;
4007                         }
4008                     }
4009                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4010                                           block->idstr);
4011                 } else {
4012                     error_report("Unknown ramblock \"%s\", cannot "
4013                                  "accept migration", id);
4014                     ret = -EINVAL;
4015                 }
4016
4017                 total_ram_bytes -= length;
4018             }
4019             break;
4020
4021         case RAM_SAVE_FLAG_ZERO:
4022             ch = qemu_get_byte(f);
4023             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4024             break;
4025
4026         case RAM_SAVE_FLAG_PAGE:
4027             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4028             break;
4029
4030         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4031             len = qemu_get_be32(f);
4032             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4033                 error_report("Invalid compressed data length: %d", len);
4034                 ret = -EINVAL;
4035                 break;
4036             }
4037             decompress_data_with_multi_threads(f, host, len);
4038             break;
4039
4040         case RAM_SAVE_FLAG_XBZRLE:
4041             if (load_xbzrle(f, addr, host) < 0) {
4042                 error_report("Failed to decompress XBZRLE page at "
4043                              RAM_ADDR_FMT, addr);
4044                 ret = -EINVAL;
4045                 break;
4046             }
4047             break;
4048         case RAM_SAVE_FLAG_EOS:
4049             /* normal exit */
4050             multifd_recv_sync_main();
4051             break;
4052         default:
4053             if (flags & RAM_SAVE_FLAG_HOOK) {
4054                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4055             } else {
4056                 error_report("Unknown combination of migration flags: 0x%x",
4057                              flags);
4058                 ret = -EINVAL;
4059             }
4060         }
4061         if (!ret) {
4062             ret = qemu_file_get_error(f);
4063         }
4064         if (!ret && host_bak) {
4065             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4066         }
4067     }
4068
4069     ret |= wait_for_decompress_done();
4070     return ret;
4071 }
4072
4073 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4074 {
4075     int ret = 0;
4076     static uint64_t seq_iter;
4077     /*
4078      * If system is running in postcopy mode, page inserts to host memory must
4079      * be atomic
4080      */
4081     bool postcopy_running = postcopy_is_running();
4082
4083     seq_iter++;
4084
4085     if (version_id != 4) {
4086         return -EINVAL;
4087     }
4088
4089     /*
4090      * This RCU critical section can be very long running.
4091      * When RCU reclaims in the code start to become numerous,
4092      * it will be necessary to reduce the granularity of this
4093      * critical section.
4094      */
4095     WITH_RCU_READ_LOCK_GUARD() {
4096         if (postcopy_running) {
4097             ret = ram_load_postcopy(f);
4098         } else {
4099             ret = ram_load_precopy(f);
4100         }
4101     }
4102     trace_ram_load_complete(ret, seq_iter);
4103
4104     return ret;
4105 }
4106
4107 static bool ram_has_postcopy(void *opaque)
4108 {
4109     RAMBlock *rb;
4110     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4111         if (ramblock_is_pmem(rb)) {
4112             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4113                          "is not supported now!", rb->idstr, rb->host);
4114             return false;
4115         }
4116     }
4117
4118     return migrate_postcopy_ram();
4119 }
4120
4121 /* Sync all the dirty bitmap with destination VM.  */
4122 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4123 {
4124     RAMBlock *block;
4125     QEMUFile *file = s->to_dst_file;
4126     int ramblock_count = 0;
4127
4128     trace_ram_dirty_bitmap_sync_start();
4129
4130     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4131         qemu_savevm_send_recv_bitmap(file, block->idstr);
4132         trace_ram_dirty_bitmap_request(block->idstr);
4133         ramblock_count++;
4134     }
4135
4136     trace_ram_dirty_bitmap_sync_wait();
4137
4138     /* Wait until all the ramblocks' dirty bitmap synced */
4139     while (ramblock_count--) {
4140         qemu_sem_wait(&s->rp_state.rp_sem);
4141     }
4142
4143     trace_ram_dirty_bitmap_sync_complete();
4144
4145     return 0;
4146 }
4147
4148 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4149 {
4150     qemu_sem_post(&s->rp_state.rp_sem);
4151 }
4152
4153 /*
4154  * Read the received bitmap, revert it as the initial dirty bitmap.
4155  * This is only used when the postcopy migration is paused but wants
4156  * to resume from a middle point.
4157  */
4158 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4159 {
4160     int ret = -EINVAL;
4161     /* from_dst_file is always valid because we're within rp_thread */
4162     QEMUFile *file = s->rp_state.from_dst_file;
4163     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4164     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4165     uint64_t size, end_mark;
4166
4167     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4168
4169     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4170         error_report("%s: incorrect state %s", __func__,
4171                      MigrationStatus_str(s->state));
4172         return -EINVAL;
4173     }
4174
4175     /*
4176      * Note: see comments in ramblock_recv_bitmap_send() on why we
4177      * need the endianness conversion, and the paddings.
4178      */
4179     local_size = ROUND_UP(local_size, 8);
4180
4181     /* Add paddings */
4182     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4183
4184     size = qemu_get_be64(file);
4185
4186     /* The size of the bitmap should match with our ramblock */
4187     if (size != local_size) {
4188         error_report("%s: ramblock '%s' bitmap size mismatch "
4189                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4190                      block->idstr, size, local_size);
4191         ret = -EINVAL;
4192         goto out;
4193     }
4194
4195     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4196     end_mark = qemu_get_be64(file);
4197
4198     ret = qemu_file_get_error(file);
4199     if (ret || size != local_size) {
4200         error_report("%s: read bitmap failed for ramblock '%s': %d"
4201                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4202                      __func__, block->idstr, ret, local_size, size);
4203         ret = -EIO;
4204         goto out;
4205     }
4206
4207     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4208         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4209                      __func__, block->idstr, end_mark);
4210         ret = -EINVAL;
4211         goto out;
4212     }
4213
4214     /*
4215      * Endianness conversion. We are during postcopy (though paused).
4216      * The dirty bitmap won't change. We can directly modify it.
4217      */
4218     bitmap_from_le(block->bmap, le_bitmap, nbits);
4219
4220     /*
4221      * What we received is "received bitmap". Revert it as the initial
4222      * dirty bitmap for this ramblock.
4223      */
4224     bitmap_complement(block->bmap, block->bmap, nbits);
4225
4226     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4227     ramblock_dirty_bitmap_clear_discarded_pages(block);
4228
4229     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4230     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4231
4232     /*
4233      * We succeeded to sync bitmap for current ramblock. If this is
4234      * the last one to sync, we need to notify the main send thread.
4235      */
4236     ram_dirty_bitmap_reload_notify(s);
4237
4238     ret = 0;
4239 out:
4240     g_free(le_bitmap);
4241     return ret;
4242 }
4243
4244 static int ram_resume_prepare(MigrationState *s, void *opaque)
4245 {
4246     RAMState *rs = *(RAMState **)opaque;
4247     int ret;
4248
4249     ret = ram_dirty_bitmap_sync_all(s, rs);
4250     if (ret) {
4251         return ret;
4252     }
4253
4254     ram_state_resume_prepare(rs, s->to_dst_file);
4255
4256     return 0;
4257 }
4258
4259 static SaveVMHandlers savevm_ram_handlers = {
4260     .save_setup = ram_save_setup,
4261     .save_live_iterate = ram_save_iterate,
4262     .save_live_complete_postcopy = ram_save_complete,
4263     .save_live_complete_precopy = ram_save_complete,
4264     .has_postcopy = ram_has_postcopy,
4265     .save_live_pending = ram_save_pending,
4266     .load_state = ram_load,
4267     .save_cleanup = ram_save_cleanup,
4268     .load_setup = ram_load_setup,
4269     .load_cleanup = ram_load_cleanup,
4270     .resume_prepare = ram_resume_prepare,
4271 };
4272
4273 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4274                                       size_t old_size, size_t new_size)
4275 {
4276     PostcopyState ps = postcopy_state_get();
4277     ram_addr_t offset;
4278     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4279     Error *err = NULL;
4280
4281     if (ramblock_is_ignored(rb)) {
4282         return;
4283     }
4284
4285     if (!migration_is_idle()) {
4286         /*
4287          * Precopy code on the source cannot deal with the size of RAM blocks
4288          * changing at random points in time - especially after sending the
4289          * RAM block sizes in the migration stream, they must no longer change.
4290          * Abort and indicate a proper reason.
4291          */
4292         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4293         migration_cancel(err);
4294         error_free(err);
4295     }
4296
4297     switch (ps) {
4298     case POSTCOPY_INCOMING_ADVISE:
4299         /*
4300          * Update what ram_postcopy_incoming_init()->init_range() does at the
4301          * time postcopy was advised. Syncing RAM blocks with the source will
4302          * result in RAM resizes.
4303          */
4304         if (old_size < new_size) {
4305             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4306                 error_report("RAM block '%s' discard of resized RAM failed",
4307                              rb->idstr);
4308             }
4309         }
4310         rb->postcopy_length = new_size;
4311         break;
4312     case POSTCOPY_INCOMING_NONE:
4313     case POSTCOPY_INCOMING_RUNNING:
4314     case POSTCOPY_INCOMING_END:
4315         /*
4316          * Once our guest is running, postcopy does no longer care about
4317          * resizes. When growing, the new memory was not available on the
4318          * source, no handler needed.
4319          */
4320         break;
4321     default:
4322         error_report("RAM block '%s' resized during postcopy state: %d",
4323                      rb->idstr, ps);
4324         exit(-1);
4325     }
4326 }
4327
4328 static RAMBlockNotifier ram_mig_ram_notifier = {
4329     .ram_block_resized = ram_mig_ram_block_resized,
4330 };
4331
4332 void ram_mig_init(void)
4333 {
4334     qemu_mutex_init(&XBZRLE.lock);
4335     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4336     ram_block_notifier_add(&ram_mig_ram_notifier);
4337 }