1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
58
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
60
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
64
65 /***********************************************************/
66 /* ram save/restore */
67
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
69  * worked for pages that were filled with the same char. We switched
70  * it to only search for the zero value, and renamed it to avoid
71  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
72 */
73
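/*
 * These flags are OR'ed into the low bits of the page offset that
 * save_page_header() puts on the wire; since offsets are target-page
 * aligned, the low bits are free to carry them.
 */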
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
83
84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
85 {
86 return buffer_is_zero(p, size);
87 }
88
89 XBZRLECacheStats xbzrle_counters;
90
91 /* This struct contains the XBZRLE cache and a static page
92 used by the compression */
93 static struct {
94 /* buffer used for XBZRLE encoding */
95 uint8_t *encoded_buf;
96 /* buffer for storing page content */
97 uint8_t *current_buf;
98 /* Cache for XBZRLE, Protected by lock. */
99 PageCache *cache;
100 QemuMutex lock;
101 /* it will store a page full of zeros */
102 uint8_t *zero_target_page;
103 /* buffer used for XBZRLE decoding */
104 uint8_t *decoded_buf;
105 } XBZRLE;
106
107 static void XBZRLE_cache_lock(void)
108 {
109 if (migrate_use_xbzrle()) {
110 qemu_mutex_lock(&XBZRLE.lock);
111 }
112 }
113
114 static void XBZRLE_cache_unlock(void)
115 {
116 if (migrate_use_xbzrle()) {
117 qemu_mutex_unlock(&XBZRLE.lock);
118 }
119 }
120
121 /**
122 * xbzrle_cache_resize: resize the xbzrle cache
123 *
124 * This function is called from migrate_params_apply in the main
125 * thread, possibly while a migration is in progress. A running
126 * migration may be using the cache and might finish during this call,
127 * hence changes to the cache are protected by XBZRLE.lock.
128 *
129 * Returns 0 for success or -1 for error
130 *
131 * @new_size: new cache size
132 * @errp: set to the failure reason if the check fails
133 */
134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
135 {
136 PageCache *new_cache;
137 int64_t ret = 0;
138
139 /* Check for truncation */
140 if (new_size != (size_t)new_size) {
141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142 "exceeding address space");
143 return -1;
144 }
145
146 if (new_size == migrate_xbzrle_cache_size()) {
147 /* nothing to do */
148 return 0;
149 }
150
151 XBZRLE_cache_lock();
152
153 if (XBZRLE.cache != NULL) {
154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
155 if (!new_cache) {
156 ret = -1;
157 goto out;
158 }
159
160 cache_fini(XBZRLE.cache);
161 XBZRLE.cache = new_cache;
162 }
163 out:
164 XBZRLE_cache_unlock();
165 return ret;
166 }
167
168 bool ramblock_is_ignored(RAMBlock *block)
169 {
170 return !qemu_ram_is_migratable(block) ||
171 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 }
173
174 #undef RAMBLOCK_FOREACH
175
176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
177 {
178 RAMBlock *block;
179 int ret = 0;
180
181 RCU_READ_LOCK_GUARD();
182
183 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184 ret = func(block, opaque);
185 if (ret) {
186 break;
187 }
188 }
189 return ret;
190 }
191
192 static void ramblock_recv_map_init(void)
193 {
194 RAMBlock *rb;
195
196 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
197 assert(!rb->receivedmap);
198 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
199 }
200 }
201
202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
203 {
204 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
205 rb->receivedmap);
206 }
207
208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
209 {
210 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 }
212
213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
214 {
215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 }
217
218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
219 size_t nr)
220 {
221 bitmap_set_atomic(rb->receivedmap,
222 ramblock_recv_bitmap_offset(host_addr, rb),
223 nr);
224 }
225
226 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227
228 /*
229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
230 *
231 * Returns >0 if success with sent bytes, or <0 if error.
232 */
233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
234 const char *block_name)
235 {
236 RAMBlock *block = qemu_ram_block_by_name(block_name);
237 unsigned long *le_bitmap, nbits;
238 uint64_t size;
239
240 if (!block) {
241 error_report("%s: invalid block name: %s", __func__, block_name);
242 return -1;
243 }
244
245 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
246
247 /*
248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249 * machines we may need 4 more bytes for padding (see below
250 * comment). So extend it a bit beforehand.
251 */
252 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253
254 /*
255 * Always use little endian when sending the bitmap. This is
256 * required so that it can be parsed even when source and destination
257 * VMs are not using the same endianness. (Note: big endian won't work.)
258 */
259 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
260
261 /* Size of the bitmap, in bytes */
262 size = DIV_ROUND_UP(nbits, 8);
263
264 /*
265 * size is always aligned to 8 bytes for 64bit machines, but that
266 * may not be true for 32bit machines. We need this padding to
267 * make sure the migration can survive even between 32bit and
268 * 64bit machines.
269 */
270 size = ROUND_UP(size, 8);
271
272 qemu_put_be64(file, size);
273 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
274 /*
275 * Mark as an end, in case the middle part is screwed up due to
276 * some "mysterious" reason.
277 */
278 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
279 qemu_fflush(file);
280
281 g_free(le_bitmap);
282
283 if (qemu_file_get_error(file)) {
284 return qemu_file_get_error(file);
285 }
286
287 return size + sizeof(size);
288 }
289
290 /*
291 * An outstanding page request, on the source, having been received
292 * and queued
293 */
294 struct RAMSrcPageRequest {
295 RAMBlock *rb;
296 hwaddr offset;
297 hwaddr len;
298
299 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 };
301
302 /* State of RAM for migration */
303 struct RAMState {
304 /* QEMUFile used for this migration */
305 QEMUFile *f;
306 /* UFFD file descriptor, used in 'write-tracking' migration */
307 int uffdio_fd;
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
314 /* last ram version we have seen */
315 uint32_t last_version;
317 /* How many times we have dirtied too many pages */
317 int dirty_rate_high_cnt;
318 /* these variables are used for bitmap sync */
319 /* last time we did a full bitmap_sync */
320 int64_t time_last_bitmap_sync;
321 /* bytes transferred at start_time */
322 uint64_t bytes_xfer_prev;
323 /* number of dirty pages since start_time */
324 uint64_t num_dirty_pages_period;
325 /* xbzrle misses since the beginning of the period */
326 uint64_t xbzrle_cache_miss_prev;
327 /* Amount of xbzrle pages since the beginning of the period */
328 uint64_t xbzrle_pages_prev;
329 /* Amount of xbzrle encoded bytes since the beginning of the period */
330 uint64_t xbzrle_bytes_prev;
331 /* Start using XBZRLE (e.g., after the first round). */
332 bool xbzrle_enabled;
333
334 /* compression statistics since the beginning of the period */
335 /* number of times there was no free thread to compress data */
336 uint64_t compress_thread_busy_prev;
337 /* number of bytes after compression */
338 uint64_t compressed_size_prev;
339 /* number of compressed pages */
340 uint64_t compress_pages_prev;
341
342 /* total handled target pages at the beginning of period */
343 uint64_t target_page_count_prev;
344 /* total handled target pages since start */
345 uint64_t target_page_count;
346 /* number of dirty bits in the bitmap */
347 uint64_t migration_dirty_pages;
348 /* Protects modification of the bitmap and migration dirty pages */
349 QemuMutex bitmap_mutex;
350 /* The RAMBlock used in the last src_page_requests */
351 RAMBlock *last_req_rb;
352 /* Queue of outstanding page requests from the destination */
353 QemuMutex src_page_req_mutex;
354 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
355 };
356 typedef struct RAMState RAMState;
357
358 static RAMState *ram_state;
359
360 static NotifierWithReturnList precopy_notifier_list;
361
362 void precopy_infrastructure_init(void)
363 {
364 notifier_with_return_list_init(&precopy_notifier_list);
365 }
366
367 void precopy_add_notifier(NotifierWithReturn *n)
368 {
369 notifier_with_return_list_add(&precopy_notifier_list, n);
370 }
371
372 void precopy_remove_notifier(NotifierWithReturn *n)
373 {
374 notifier_with_return_remove(n);
375 }
376
377 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
378 {
379 PrecopyNotifyData pnd;
380 pnd.reason = reason;
381 pnd.errp = errp;
382
383 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 }
385
386 uint64_t ram_bytes_remaining(void)
387 {
388 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
389 0;
390 }
391
392 MigrationStats ram_counters;
393
394 /* used by the search for pages to send */
395 struct PageSearchStatus {
396 /* Current block being searched */
397 RAMBlock *block;
398 /* Current page to search from */
399 unsigned long page;
400 /* Set once we wrap around */
401 bool complete_round;
402 };
403 typedef struct PageSearchStatus PageSearchStatus;
404
405 CompressionStats compression_counters;
406
407 struct CompressParam {
408 bool done;
409 bool quit;
410 bool zero_page;
411 QEMUFile *file;
412 QemuMutex mutex;
413 QemuCond cond;
414 RAMBlock *block;
415 ram_addr_t offset;
416
417 /* internally used fields */
418 z_stream stream;
419 uint8_t *originbuf;
420 };
421 typedef struct CompressParam CompressParam;
422
423 struct DecompressParam {
424 bool done;
425 bool quit;
426 QemuMutex mutex;
427 QemuCond cond;
428 void *des;
429 uint8_t *compbuf;
430 int len;
431 z_stream stream;
432 };
433 typedef struct DecompressParam DecompressParam;
434
435 static CompressParam *comp_param;
436 static QemuThread *compress_threads;
437 /* comp_done_cond is used to wake up the migration thread when
438 * one of the compression threads has finished the compression.
439 * comp_done_lock is used together with comp_done_cond.
440 */
441 static QemuMutex comp_done_lock;
442 static QemuCond comp_done_cond;
443 /* The empty QEMUFileOps is used by the file member of CompressParam */
444 static const QEMUFileOps empty_ops = { };
445
446 static QEMUFile *decomp_file;
447 static DecompressParam *decomp_param;
448 static QemuThread *decompress_threads;
449 static QemuMutex decomp_done_lock;
450 static QemuCond decomp_done_cond;
451
452 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
453 ram_addr_t offset, uint8_t *source_buf);
454
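/*
 * Compression worker thread: wait for a block/offset pair to be posted
 * in our CompressParam, compress that page into param->file via
 * do_compress_ram_page(), then mark ourselves done and signal
 * comp_done_cond so the migration thread can collect the result.
 */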
455 static void *do_data_compress(void *opaque)
456 {
457 CompressParam *param = opaque;
458 RAMBlock *block;
459 ram_addr_t offset;
460 bool zero_page;
461
462 qemu_mutex_lock(&param->mutex);
463 while (!param->quit) {
464 if (param->block) {
465 block = param->block;
466 offset = param->offset;
467 param->block = NULL;
468 qemu_mutex_unlock(&param->mutex);
469
470 zero_page = do_compress_ram_page(param->file, &param->stream,
471 block, offset, param->originbuf);
472
473 qemu_mutex_lock(&comp_done_lock);
474 param->done = true;
475 param->zero_page = zero_page;
476 qemu_cond_signal(&comp_done_cond);
477 qemu_mutex_unlock(&comp_done_lock);
478
479 qemu_mutex_lock(&param->mutex);
480 } else {
481 qemu_cond_wait(&param->cond, &param->mutex);
482 }
483 }
484 qemu_mutex_unlock(&param->mutex);
485
486 return NULL;
487 }
488
489 static void compress_threads_save_cleanup(void)
490 {
491 int i, thread_count;
492
493 if (!migrate_use_compression() || !comp_param) {
494 return;
495 }
496
497 thread_count = migrate_compress_threads();
498 for (i = 0; i < thread_count; i++) {
499 /*
500 * we use it as an indicator of whether the thread is
501 * properly initialized or not
502 */
503 if (!comp_param[i].file) {
504 break;
505 }
506
507 qemu_mutex_lock(&comp_param[i].mutex);
508 comp_param[i].quit = true;
509 qemu_cond_signal(&comp_param[i].cond);
510 qemu_mutex_unlock(&comp_param[i].mutex);
511
512 qemu_thread_join(compress_threads + i);
513 qemu_mutex_destroy(&comp_param[i].mutex);
514 qemu_cond_destroy(&comp_param[i].cond);
515 deflateEnd(&comp_param[i].stream);
516 g_free(comp_param[i].originbuf);
517 qemu_fclose(comp_param[i].file);
518 comp_param[i].file = NULL;
519 }
520 qemu_mutex_destroy(&comp_done_lock);
521 qemu_cond_destroy(&comp_done_cond);
522 g_free(compress_threads);
523 g_free(comp_param);
524 compress_threads = NULL;
525 comp_param = NULL;
526 }
527
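/*
 * Create one worker per migrate_compress_threads(), each with its own
 * page buffer, zlib stream and dummy QEMUFile. Returns 0 on success,
 * or -1 after cleaning up any partially initialized state.
 */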
528 static int compress_threads_save_setup(void)
529 {
530 int i, thread_count;
531
532 if (!migrate_use_compression()) {
533 return 0;
534 }
535 thread_count = migrate_compress_threads();
536 compress_threads = g_new0(QemuThread, thread_count);
537 comp_param = g_new0(CompressParam, thread_count);
538 qemu_cond_init(&comp_done_cond);
539 qemu_mutex_init(&comp_done_lock);
540 for (i = 0; i < thread_count; i++) {
541 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
542 if (!comp_param[i].originbuf) {
543 goto exit;
544 }
545
546 if (deflateInit(&comp_param[i].stream,
547 migrate_compress_level()) != Z_OK) {
548 g_free(comp_param[i].originbuf);
549 goto exit;
550 }
551
552 /* comp_param[i].file is just used as a dummy buffer to save data,
553 * so set its ops to empty.
554 */
555 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
556 comp_param[i].done = true;
557 comp_param[i].quit = false;
558 qemu_mutex_init(&comp_param[i].mutex);
559 qemu_cond_init(&comp_param[i].cond);
560 qemu_thread_create(compress_threads + i, "compress",
561 do_data_compress, comp_param + i,
562 QEMU_THREAD_JOINABLE);
563 }
564 return 0;
565
566 exit:
567 compress_threads_save_cleanup();
568 return -1;
569 }
570
571 /**
572 * save_page_header: write page header to wire
573 *
574 * If the page is in a different block than the last one sent, it also writes the block identification
575 *
576 * Returns the number of bytes written
577 *
578 * @f: QEMUFile where to send the data
579 * @block: block that contains the page we want to send
580 * @offset: offset inside the block for the page;
581 * the lower bits contain flags
582 */
583 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
584 ram_addr_t offset)
585 {
586 size_t size, len;
587
588 if (block == rs->last_sent_block) {
589 offset |= RAM_SAVE_FLAG_CONTINUE;
590 }
591 qemu_put_be64(f, offset);
592 size = 8;
593
594 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
595 len = strlen(block->idstr);
596 qemu_put_byte(f, len);
597 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
598 size += 1 + len;
599 rs->last_sent_block = block;
600 }
601 return size;
602 }
603
604 /**
605 * mig_throttle_guest_down: throttle down the guest
606 *
607 * Reduce amount of guest cpu execution to hopefully slow down memory
608 * writes. If guest dirty memory rate is reduced below the rate at
609 * which we can transfer pages to the destination then we should be
610 * able to complete migration. Some workloads dirty memory way too
611 * fast and will not effectively converge, even with auto-converge.
612 */
613 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
614 uint64_t bytes_dirty_threshold)
615 {
616 MigrationState *s = migrate_get_current();
617 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
618 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
619 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
620 int pct_max = s->parameters.max_cpu_throttle;
621
622 uint64_t throttle_now = cpu_throttle_get_percentage();
623 uint64_t cpu_now, cpu_ideal, throttle_inc;
624
625 /* We have not started throttling yet. Let's start it. */
626 if (!cpu_throttle_active()) {
627 cpu_throttle_set(pct_initial);
628 } else {
629 /* Throttling already on, just increase the rate */
630 if (!pct_tailslow) {
631 throttle_inc = pct_increment;
632 } else {
633 /* Compute the ideal CPU percentage used by Guest, which may
634 * make the dirty rate match the dirty rate threshold. */
635 cpu_now = 100 - throttle_now;
636 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
637 bytes_dirty_period);
638 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
639 }
640 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
641 }
642 }
643
644 /**
645 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
646 *
647 * @rs: current RAM state
648 * @current_addr: address for the zero page
649 *
650 * Update the xbzrle cache to reflect a page that's been sent as all 0.
651 * The important thing is that a stale (not-yet-0'd) page be replaced
652 * by the new data.
653 * As a bonus, if the page wasn't in the cache it gets added so that
654 * when a small write is made into the 0'd page it gets XBZRLE sent.
655 */
656 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
657 {
658 if (!rs->xbzrle_enabled) {
659 return;
660 }
661
662 /* We don't care if this fails to allocate a new cache page
663 * as long as it updated an old one */
664 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
665 ram_counters.dirty_sync_count);
666 }
667
668 #define ENCODING_FLAG_XBZRLE 0x1
669
670 /**
671 * save_xbzrle_page: compress and send current page
672 *
673 * Returns: 1 means that we wrote the page
674 * 0 means that page is identical to the one already sent
675 * -1 means that xbzrle would be longer than normal
676 *
677 * @rs: current RAM state
678 * @current_data: pointer to the address of the page contents
679 * @current_addr: addr of the page
680 * @block: block that contains the page we want to send
681 * @offset: offset inside the block for the page
682 * @last_stage: if we are at the completion stage
683 */
684 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
685 ram_addr_t current_addr, RAMBlock *block,
686 ram_addr_t offset, bool last_stage)
687 {
688 int encoded_len = 0, bytes_xbzrle;
689 uint8_t *prev_cached_page;
690
691 if (!cache_is_cached(XBZRLE.cache, current_addr,
692 ram_counters.dirty_sync_count)) {
693 xbzrle_counters.cache_miss++;
694 if (!last_stage) {
695 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
696 ram_counters.dirty_sync_count) == -1) {
697 return -1;
698 } else {
699 /* update *current_data when the page has been
700 inserted into cache */
701 *current_data = get_cached_data(XBZRLE.cache, current_addr);
702 }
703 }
704 return -1;
705 }
706
707 /*
708 * Reaching here means the page has hit the xbzrle cache, no matter what
709 * encoding result it is (normal encoding, overflow or skipping the page),
710 * count the page as encoded. This is used to calculate the encoding rate.
711 *
712 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
713 * 2nd page turns out to be skipped (i.e. no new bytes written to the
714 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
715 * skipped page included. In this way, the encoding rate can tell if the
716 * guest page is good for xbzrle encoding.
717 */
718 xbzrle_counters.pages++;
719 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
720
721 /* save current buffer into memory */
722 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
723
724 /* XBZRLE encoding (if there is no overflow) */
725 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
726 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
727 TARGET_PAGE_SIZE);
728
729 /*
730 * Update the cache contents, so that it corresponds to the data
731 * sent, in all cases except where we skip the page.
732 */
733 if (!last_stage && encoded_len != 0) {
734 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
735 /*
736 * In the case where we couldn't compress, ensure that the caller
737 * sends the data from the cache, since the guest might have
738 * changed the RAM since we copied it.
739 */
740 *current_data = prev_cached_page;
741 }
742
743 if (encoded_len == 0) {
744 trace_save_xbzrle_page_skipping();
745 return 0;
746 } else if (encoded_len == -1) {
747 trace_save_xbzrle_page_overflow();
748 xbzrle_counters.overflow++;
749 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
750 return -1;
751 }
752
753 /* Send XBZRLE based compressed page */
754 bytes_xbzrle = save_page_header(rs, rs->f, block,
755 offset | RAM_SAVE_FLAG_XBZRLE);
756 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
757 qemu_put_be16(rs->f, encoded_len);
758 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
759 bytes_xbzrle += encoded_len + 1 + 2;
760 /*
761 * Like compressed_size (please see update_compress_thread_counts),
762 * the xbzrle encoded bytes don't count the 8 byte header with
763 * RAM_SAVE_FLAG_CONTINUE.
764 */
765 xbzrle_counters.bytes += bytes_xbzrle - 8;
766 ram_counters.transferred += bytes_xbzrle;
767
768 return 1;
769 }
770
771 /**
772 * migration_bitmap_find_dirty: find the next dirty page from start
773 *
774 * Returns the page offset within memory region of the start of a dirty page
775 *
776 * @rs: current RAM state
777 * @rb: RAMBlock where to search for dirty pages
778 * @start: page where we start the search
779 */
780 static inline
781 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
782 unsigned long start)
783 {
784 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
785 unsigned long *bitmap = rb->bmap;
786
787 if (ramblock_is_ignored(rb)) {
788 return size;
789 }
790
791 return find_next_bit(bitmap, size, start);
792 }
793
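/*
 * Lazily clear the underlying dirty log (e.g. in KVM) for the
 * clear_bmap chunk containing @page, but only if that chunk has not
 * been cleared yet.
 */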
794 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
795 unsigned long page)
796 {
797 uint8_t shift;
798 hwaddr size, start;
799
800 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
801 return;
802 }
803
804 shift = rb->clear_bmap_shift;
805 /*
806 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. This
807 * can make things easier sometimes since the start address
808 * of the small chunk will always be aligned to 64 pages, so the
809 * bitmap will always be aligned to unsigned long. We should
810 * even be able to remove this restriction but I'm simply
811 * keeping it.
812 */
813 assert(shift >= 6);
814
815 size = 1ULL << (TARGET_PAGE_BITS + shift);
816 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
817 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
818 memory_region_clear_dirty_bitmap(rb->mr, start, size);
819 }
820
821 static void
822 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
823 unsigned long start,
824 unsigned long npages)
825 {
826 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
827 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
828 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
829
830 /*
831 * Clear pages from start to start + npages - 1, so the end boundary is
832 * exclusive.
833 */
834 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
835 migration_clear_memory_region_dirty_bitmap(rb, i);
836 }
837 }
838
839 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
840 RAMBlock *rb,
841 unsigned long page)
842 {
843 bool ret;
844
845 /*
846 * Clear the dirty bitmap if needed. This _must_ be called before we
847 * send any page in the chunk, because we need to make sure
848 * we can capture further page content changes when we sync the dirty
849 * log the next time. So as long as we are going to send any
850 * page in the chunk, we clear the remote dirty bitmap for all of it.
851 * Clearing it earlier won't be a problem, but clearing it too late will.
852 */
853 migration_clear_memory_region_dirty_bitmap(rb, page);
854
855 ret = test_and_clear_bit(page, rb->bmap);
856 if (ret) {
857 rs->migration_dirty_pages--;
858 }
859
860 return ret;
861 }
862
863 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
864 void *opaque)
865 {
866 const hwaddr offset = section->offset_within_region;
867 const hwaddr size = int128_get64(section->size);
868 const unsigned long start = offset >> TARGET_PAGE_BITS;
869 const unsigned long npages = size >> TARGET_PAGE_BITS;
870 RAMBlock *rb = section->mr->ram_block;
871 uint64_t *cleared_bits = opaque;
872
873 /*
874 * We don't grab ram_state->bitmap_mutex because we expect to run
875 * only when starting migration or during postcopy recovery where
876 * we don't have concurrent access.
877 */
878 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
879 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
880 }
881 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
882 bitmap_clear(rb->bmap, start, npages);
883 }
884
885 /*
886 * Exclude all dirty pages from migration that fall into a discarded range as
887 * managed by a RamDiscardManager responsible for the mapped memory region of
888 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
889 *
890 * Discarded pages ("logically unplugged") have undefined content and must
891 * not get migrated, because even reading these pages for migration might
892 * result in undesired behavior.
893 *
894 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
895 *
896 * Note: The result is only stable while migrating (precopy/postcopy).
897 */
898 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
899 {
900 uint64_t cleared_bits = 0;
901
902 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
903 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
904 MemoryRegionSection section = {
905 .mr = rb->mr,
906 .offset_within_region = 0,
907 .size = int128_make64(qemu_ram_get_used_length(rb)),
908 };
909
910 ram_discard_manager_replay_discarded(rdm, &section,
911 dirty_bitmap_clear_section,
912 &cleared_bits);
913 }
914 return cleared_bits;
915 }
916
917 /*
918 * Check if a host-page aligned page falls into a discarded range as managed by
919 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
920 *
921 * Note: The result is only stable while migrating (precopy/postcopy).
922 */
923 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
924 {
925 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
926 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
927 MemoryRegionSection section = {
928 .mr = rb->mr,
929 .offset_within_region = start,
930 .size = int128_make64(qemu_ram_pagesize(rb)),
931 };
932
933 return !ram_discard_manager_is_populated(rdm, &section);
934 }
935 return false;
936 }
937
938 /* Called with RCU critical section */
939 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
940 {
941 uint64_t new_dirty_pages =
942 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
943
944 rs->migration_dirty_pages += new_dirty_pages;
945 rs->num_dirty_pages_period += new_dirty_pages;
946 }
947
948 /**
949 * ram_pagesize_summary: calculate all the pagesizes of a VM
950 *
951 * Returns a summary bitmap of the page sizes of all RAMBlocks
952 *
953 * For VMs with just normal pages this is equivalent to the host page
954 * size. If the VM has some huge pages then it's the OR of all the
955 * different page sizes.
956 */
957 uint64_t ram_pagesize_summary(void)
958 {
959 RAMBlock *block;
960 uint64_t summary = 0;
961
962 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
963 summary |= block->page_size;
964 }
965
966 return summary;
967 }
968
969 uint64_t ram_get_total_transferred_pages(void)
970 {
971 return ram_counters.normal + ram_counters.duplicate +
972 compression_counters.pages + xbzrle_counters.pages;
973 }
974
975 static void migration_update_rates(RAMState *rs, int64_t end_time)
976 {
977 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
978 double compressed_size;
979
980 /* calculate period counters */
981 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
982 / (end_time - rs->time_last_bitmap_sync);
983
984 if (!page_count) {
985 return;
986 }
987
988 if (migrate_use_xbzrle()) {
989 double encoded_size, unencoded_size;
990
991 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
992 rs->xbzrle_cache_miss_prev) / page_count;
993 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
994 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
995 TARGET_PAGE_SIZE;
996 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
997 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
998 xbzrle_counters.encoding_rate = 0;
999 } else {
1000 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1001 }
1002 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1003 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1004 }
1005
1006 if (migrate_use_compression()) {
1007 compression_counters.busy_rate = (double)(compression_counters.busy -
1008 rs->compress_thread_busy_prev) / page_count;
1009 rs->compress_thread_busy_prev = compression_counters.busy;
1010
1011 compressed_size = compression_counters.compressed_size -
1012 rs->compressed_size_prev;
1013 if (compressed_size) {
1014 double uncompressed_size = (compression_counters.pages -
1015 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1016
1017 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1018 compression_counters.compression_rate =
1019 uncompressed_size / compressed_size;
1020
1021 rs->compress_pages_prev = compression_counters.pages;
1022 rs->compressed_size_prev = compression_counters.compressed_size;
1023 }
1024 }
1025 }
1026
1027 static void migration_trigger_throttle(RAMState *rs)
1028 {
1029 MigrationState *s = migrate_get_current();
1030 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1031
1032 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1033 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1034 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1035
1036 /* During block migration the auto-converge logic incorrectly detects
1037 * that ram migration makes no progress. Avoid this by disabling the
1038 * throttling logic during the bulk phase of block migration. */
1039 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1040 /* The following detection logic can be refined later. For now:
1041 Check to see if the ratio between dirtied bytes and the approx.
1042 amount of bytes that just got transferred since the last time
1043 we were in this routine reaches the threshold. If that happens
1044 twice, start or increase throttling. */
1045
1046 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1047 (++rs->dirty_rate_high_cnt >= 2)) {
1048 trace_migration_throttle();
1049 rs->dirty_rate_high_cnt = 0;
1050 mig_throttle_guest_down(bytes_dirty_period,
1051 bytes_dirty_threshold);
1052 }
1053 }
1054 }
1055
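/*
 * Sync the dirty log from the memory core into each RAMBlock's dirty
 * bitmap (under bitmap_mutex), update the period counters, and once
 * per second recompute rates and decide whether to throttle the guest.
 */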
1056 static void migration_bitmap_sync(RAMState *rs)
1057 {
1058 RAMBlock *block;
1059 int64_t end_time;
1060
1061 ram_counters.dirty_sync_count++;
1062
1063 if (!rs->time_last_bitmap_sync) {
1064 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1065 }
1066
1067 trace_migration_bitmap_sync_start();
1068 memory_global_dirty_log_sync();
1069
1070 qemu_mutex_lock(&rs->bitmap_mutex);
1071 WITH_RCU_READ_LOCK_GUARD() {
1072 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1073 ramblock_sync_dirty_bitmap(rs, block);
1074 }
1075 ram_counters.remaining = ram_bytes_remaining();
1076 }
1077 qemu_mutex_unlock(&rs->bitmap_mutex);
1078
1079 memory_global_after_dirty_log_sync();
1080 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1081
1082 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1083
1084 /* more than 1 second = 1000 milliseconds */
1085 if (end_time > rs->time_last_bitmap_sync + 1000) {
1086 migration_trigger_throttle(rs);
1087
1088 migration_update_rates(rs, end_time);
1089
1090 rs->target_page_count_prev = rs->target_page_count;
1091
1092 /* reset period counters */
1093 rs->time_last_bitmap_sync = end_time;
1094 rs->num_dirty_pages_period = 0;
1095 rs->bytes_xfer_prev = ram_counters.transferred;
1096 }
1097 if (migrate_use_events()) {
1098 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1099 }
1100 }
1101
1102 static void migration_bitmap_sync_precopy(RAMState *rs)
1103 {
1104 Error *local_err = NULL;
1105
1106 /*
1107 * The current notifier usage is just an optimization for migration, so we
1108 * don't stop the normal migration process in the error case.
1109 */
1110 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1111 error_report_err(local_err);
1112 local_err = NULL;
1113 }
1114
1115 migration_bitmap_sync(rs);
1116
1117 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1118 error_report_err(local_err);
1119 }
1120 }
1121
1122 /**
1123 * save_zero_page_to_file: send the zero page to the file
1124 *
1125 * Returns the size of data written to the file, 0 means the page is not
1126 * a zero page
1127 *
1128 * @rs: current RAM state
1129 * @file: the file where the data is saved
1130 * @block: block that contains the page we want to send
1131 * @offset: offset inside the block for the page
1132 */
1133 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1134 RAMBlock *block, ram_addr_t offset)
1135 {
1136 uint8_t *p = block->host + offset;
1137 int len = 0;
1138
1139 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1140 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1141 qemu_put_byte(file, 0);
1142 len += 1;
1143 }
1144 return len;
1145 }
1146
1147 /**
1148 * save_zero_page: send the zero page to the stream
1149 *
1150 * Returns the number of pages written.
1151 *
1152 * @rs: current RAM state
1153 * @block: block that contains the page we want to send
1154 * @offset: offset inside the block for the page
1155 */
1156 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1157 {
1158 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1159
1160 if (len) {
1161 ram_counters.duplicate++;
1162 ram_counters.transferred += len;
1163 return 1;
1164 }
1165 return -1;
1166 }
1167
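/*
 * With the release-ram capability enabled during postcopy, discard the
 * source copy of the pages that were just sent, freeing that memory on
 * the source.
 */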
1168 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1169 {
1170 if (!migrate_release_ram() || !migration_in_postcopy()) {
1171 return;
1172 }
1173
1174 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1175 }
1176
1177 /*
1178 * @pages: the number of pages written by the control path,
1179 * < 0 - error
1180 * > 0 - number of pages written
1181 *
1182 * Return true if the page has been saved, otherwise return false.
1183 */
1184 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1185 int *pages)
1186 {
1187 uint64_t bytes_xmit = 0;
1188 int ret;
1189
1190 *pages = -1;
1191 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1192 &bytes_xmit);
1193 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1194 return false;
1195 }
1196
1197 if (bytes_xmit) {
1198 ram_counters.transferred += bytes_xmit;
1199 *pages = 1;
1200 }
1201
1202 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1203 return true;
1204 }
1205
1206 if (bytes_xmit > 0) {
1207 ram_counters.normal++;
1208 } else if (bytes_xmit == 0) {
1209 ram_counters.duplicate++;
1210 }
1211
1212 return true;
1213 }
1214
1215 /*
1216 * directly send the page to the stream
1217 *
1218 * Returns the number of pages written.
1219 *
1220 * @rs: current RAM state
1221 * @block: block that contains the page we want to send
1222 * @offset: offset inside the block for the page
1223 * @buf: the page to be sent
1224 * @async: send the page asynchronously
1225 */
1226 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1227 uint8_t *buf, bool async)
1228 {
1229 ram_counters.transferred += save_page_header(rs, rs->f, block,
1230 offset | RAM_SAVE_FLAG_PAGE);
1231 if (async) {
1232 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1233 migrate_release_ram() &
1234 migration_in_postcopy());
1235 } else {
1236 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1237 }
1238 ram_counters.transferred += TARGET_PAGE_SIZE;
1239 ram_counters.normal++;
1240 return 1;
1241 }
1242
1243 /**
1244 * ram_save_page: send the given page to the stream
1245 *
1246 * Returns the number of pages written.
1247 * < 0 - error
1248 * >=0 - Number of pages written - this might legally be 0
1249 * if xbzrle noticed the page was the same.
1250 *
1251 * @rs: current RAM state
1252 * @block: block that contains the page we want to send
1253 * @offset: offset inside the block for the page
1254 * @last_stage: if we are at the completion stage
1255 */
1256 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1257 {
1258 int pages = -1;
1259 uint8_t *p;
1260 bool send_async = true;
1261 RAMBlock *block = pss->block;
1262 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1263 ram_addr_t current_addr = block->offset + offset;
1264
1265 p = block->host + offset;
1266 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1267
1268 XBZRLE_cache_lock();
1269 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1270 pages = save_xbzrle_page(rs, &p, current_addr, block,
1271 offset, last_stage);
1272 if (!last_stage) {
1273 /* Can't send this cached data async, since the cache page
1274 * might get updated before it gets to the wire
1275 */
1276 send_async = false;
1277 }
1278 }
1279
1280 /* XBZRLE overflow or normal page */
1281 if (pages == -1) {
1282 pages = save_normal_page(rs, block, offset, p, send_async);
1283 }
1284
1285 XBZRLE_cache_unlock();
1286
1287 return pages;
1288 }
1289
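/*
 * Queue the page at @offset in @block onto the multifd channels.
 * Returns 1 on success or -1 if queueing failed.
 */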
1290 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1291 ram_addr_t offset)
1292 {
1293 if (multifd_queue_page(rs->f, block, offset) < 0) {
1294 return -1;
1295 }
1296 ram_counters.normal++;
1297
1298 return 1;
1299 }
1300
1301 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1302 ram_addr_t offset, uint8_t *source_buf)
1303 {
1304 RAMState *rs = ram_state;
1305 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1306 bool zero_page = false;
1307 int ret;
1308
1309 if (save_zero_page_to_file(rs, f, block, offset)) {
1310 zero_page = true;
1311 goto exit;
1312 }
1313
1314 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1315
1316 /*
1317 * copy it to an internal buffer to avoid it being modified by the VM,
1318 * so that we can catch any error during compression and
1319 * decompression
1320 */
1321 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1322 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1323 if (ret < 0) {
1324 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1325 error_report("compressed data failed!");
1326 return false;
1327 }
1328
1329 exit:
1330 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1331 return zero_page;
1332 }
1333
1334 static void
1335 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1336 {
1337 ram_counters.transferred += bytes_xmit;
1338
1339 if (param->zero_page) {
1340 ram_counters.duplicate++;
1341 return;
1342 }
1343
1344 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1345 compression_counters.compressed_size += bytes_xmit - 8;
1346 compression_counters.pages++;
1347 }
1348
1349 static bool save_page_use_compression(RAMState *rs);
1350
1351 static void flush_compressed_data(RAMState *rs)
1352 {
1353 int idx, len, thread_count;
1354
1355 if (!save_page_use_compression(rs)) {
1356 return;
1357 }
1358 thread_count = migrate_compress_threads();
1359
1360 qemu_mutex_lock(&comp_done_lock);
1361 for (idx = 0; idx < thread_count; idx++) {
1362 while (!comp_param[idx].done) {
1363 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1364 }
1365 }
1366 qemu_mutex_unlock(&comp_done_lock);
1367
1368 for (idx = 0; idx < thread_count; idx++) {
1369 qemu_mutex_lock(&comp_param[idx].mutex);
1370 if (!comp_param[idx].quit) {
1371 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1372 /*
1373 * it's safe to fetch zero_page without holding comp_done_lock
1374 * as there is no further request submitted to the thread,
1375 * i.e., the thread should be waiting for a request at this point.
1376 */
1377 update_compress_thread_counts(&comp_param[idx], len);
1378 }
1379 qemu_mutex_unlock(&comp_param[idx].mutex);
1380 }
1381 }
1382
1383 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1384 ram_addr_t offset)
1385 {
1386 param->block = block;
1387 param->offset = offset;
1388 }
1389
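/*
 * Hand the page to an idle compression thread: flush that thread's
 * previous output into the migration stream, post the new block/offset
 * and wake it up. If no thread is idle, either wait for one (when
 * compress-wait-thread is set) or return -1 so the caller sends the
 * page as a normal page.
 */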
1390 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1391 ram_addr_t offset)
1392 {
1393 int idx, thread_count, bytes_xmit = -1, pages = -1;
1394 bool wait = migrate_compress_wait_thread();
1395
1396 thread_count = migrate_compress_threads();
1397 qemu_mutex_lock(&comp_done_lock);
1398 retry:
1399 for (idx = 0; idx < thread_count; idx++) {
1400 if (comp_param[idx].done) {
1401 comp_param[idx].done = false;
1402 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1403 qemu_mutex_lock(&comp_param[idx].mutex);
1404 set_compress_params(&comp_param[idx], block, offset);
1405 qemu_cond_signal(&comp_param[idx].cond);
1406 qemu_mutex_unlock(&comp_param[idx].mutex);
1407 pages = 1;
1408 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1409 break;
1410 }
1411 }
1412
1413 /*
1414 * wait for a free thread if the user specifies 'compress-wait-thread',
1415 * otherwise we will post the page out in the main thread as a normal page.
1416 */
1417 if (pages < 0 && wait) {
1418 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1419 goto retry;
1420 }
1421 qemu_mutex_unlock(&comp_done_lock);
1422
1423 return pages;
1424 }
1425
1426 /**
1427 * find_dirty_block: find the next dirty page and update any state
1428 * associated with the search process.
1429 *
1430 * Returns true if a page is found
1431 *
1432 * @rs: current RAM state
1433 * @pss: data about the state of the current dirty page scan
1434 * @again: set to false if the search has scanned the whole of RAM
1435 */
1436 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1437 {
1438 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1439 if (pss->complete_round && pss->block == rs->last_seen_block &&
1440 pss->page >= rs->last_page) {
1441 /*
1442 * We've been once around the RAM and haven't found anything.
1443 * Give up.
1444 */
1445 *again = false;
1446 return false;
1447 }
1448 if (!offset_in_ramblock(pss->block,
1449 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1450 /* Didn't find anything in this RAM Block */
1451 pss->page = 0;
1452 pss->block = QLIST_NEXT_RCU(pss->block, next);
1453 if (!pss->block) {
1454 /*
1455 * If memory migration starts over, we will meet a dirtied page
1456 * which may still exist in the compression threads' ring, so we
1457 * should flush the compressed data to make sure the new page
1458 * is not overwritten by the old one in the destination.
1459 *
1460 * Also, if xbzrle is on, stop using the data compression at this
1461 * point. In theory, xbzrle can do better than compression.
1462 */
1463 flush_compressed_data(rs);
1464
1465 /* Hit the end of the list */
1466 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1467 /* Flag that we've looped */
1468 pss->complete_round = true;
1469 /* After the first round, enable XBZRLE. */
1470 if (migrate_use_xbzrle()) {
1471 rs->xbzrle_enabled = true;
1472 }
1473 }
1474 /* Didn't find anything this time, but try again on the new block */
1475 *again = true;
1476 return false;
1477 } else {
1478 /* Can go around again, but... */
1479 *again = true;
1480 /* We've found something so probably don't need to */
1481 return true;
1482 }
1483 }
1484
1485 /**
1486 * unqueue_page: gets a page off the queue
1487 *
1488 * Helper for 'get_queued_page' - gets a page off the queue
1489 *
1490 * Returns the block of the page (or NULL if none available)
1491 *
1492 * @rs: current RAM state
1493 * @offset: used to return the offset within the RAMBlock
1494 */
1495 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1496 {
1497 RAMBlock *block = NULL;
1498
1499 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1500 return NULL;
1501 }
1502
1503 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1504 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1505 struct RAMSrcPageRequest *entry =
1506 QSIMPLEQ_FIRST(&rs->src_page_requests);
1507 block = entry->rb;
1508 *offset = entry->offset;
1509
1510 if (entry->len > TARGET_PAGE_SIZE) {
1511 entry->len -= TARGET_PAGE_SIZE;
1512 entry->offset += TARGET_PAGE_SIZE;
1513 } else {
1514 memory_region_unref(block->mr);
1515 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1516 g_free(entry);
1517 migration_consume_urgent_request();
1518 }
1519 }
1520
1521 return block;
1522 }
1523
1524 #if defined(__linux__)
1525 /**
1526 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1527 * is found, return RAM block pointer and page offset
1528 *
1529 * Returns pointer to the RAMBlock containing faulting page,
1530 * NULL if no write faults are pending
1531 *
1532 * @rs: current RAM state
1533 * @offset: page offset from the beginning of the block
1534 */
1535 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1536 {
1537 struct uffd_msg uffd_msg;
1538 void *page_address;
1539 RAMBlock *block;
1540 int res;
1541
1542 if (!migrate_background_snapshot()) {
1543 return NULL;
1544 }
1545
1546 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1547 if (res <= 0) {
1548 return NULL;
1549 }
1550
1551 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1552 block = qemu_ram_block_from_host(page_address, false, offset);
1553 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1554 return block;
1555 }
1556
1557 /**
1558 * ram_save_release_protection: release UFFD write protection after
1559 * a range of pages has been saved
1560 *
1561 * @rs: current RAM state
1562 * @pss: page-search-status structure
1563 * @start_page: index of the first page in the range relative to pss->block
1564 *
1565 * Returns 0 on success, negative value in case of an error
1566 */
1567 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1568 unsigned long start_page)
1569 {
1570 int res = 0;
1571
1572 /* Check if page is from UFFD-managed region. */
1573 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1574 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1575 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1576
1577 /* Flush async buffers before un-protect. */
1578 qemu_fflush(rs->f);
1579 /* Un-protect memory range. */
1580 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1581 false, false);
1582 }
1583
1584 return res;
1585 }
1586
1587 /* ram_write_tracking_available: check if kernel supports required UFFD features
1588 *
1589 * Returns true if supported, false otherwise
1590 */
1591 bool ram_write_tracking_available(void)
1592 {
1593 uint64_t uffd_features;
1594 int res;
1595
1596 res = uffd_query_features(&uffd_features);
1597 return (res == 0 &&
1598 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1599 }
1600
1601 /* ram_write_tracking_compatible: check if guest configuration is
1602 * compatible with 'write-tracking'
1603 *
1604 * Returns true if compatible, false otherwise
1605 */
1606 bool ram_write_tracking_compatible(void)
1607 {
1608 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1609 int uffd_fd;
1610 RAMBlock *block;
1611 bool ret = false;
1612
1613 /* Open UFFD file descriptor */
1614 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1615 if (uffd_fd < 0) {
1616 return false;
1617 }
1618
1619 RCU_READ_LOCK_GUARD();
1620
1621 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1622 uint64_t uffd_ioctls;
1623
1624 /* Nothing to do with read-only and MMIO-writable regions */
1625 if (block->mr->readonly || block->mr->rom_device) {
1626 continue;
1627 }
1628 /* Try to register block memory via UFFD-IO to track writes */
1629 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1630 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1631 goto out;
1632 }
1633 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1634 goto out;
1635 }
1636 }
1637 ret = true;
1638
1639 out:
1640 uffd_close_fd(uffd_fd);
1641 return ret;
1642 }
1643
1644 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1645 ram_addr_t size)
1646 {
1647 /*
1648 * We read one byte of each page; this will preallocate page tables if
1649 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1650 * where no page was populated yet. This might require adaptation when
1651 * supporting other mappings, like shmem.
1652 */
1653 for (; offset < size; offset += block->page_size) {
1654 char tmp = *((char *)block->host + offset);
1655
1656 /* Don't optimize the read out */
1657 asm volatile("" : "+r" (tmp));
1658 }
1659 }
1660
1661 static inline int populate_read_section(MemoryRegionSection *section,
1662 void *opaque)
1663 {
1664 const hwaddr size = int128_get64(section->size);
1665 hwaddr offset = section->offset_within_region;
1666 RAMBlock *block = section->mr->ram_block;
1667
1668 populate_read_range(block, offset, size);
1669 return 0;
1670 }
1671
1672 /*
1673 * ram_block_populate_read: preallocate page tables and populate pages in the
1674 * RAM block by reading a byte of each page.
1675 *
1676 * Since it's solely used for userfault_fd WP feature, here we just
1677 * hardcode page size to qemu_real_host_page_size.
1678 *
1679 * @rb: RAM block to populate
1680 */
1681 static void ram_block_populate_read(RAMBlock *rb)
1682 {
1683 /*
1684 * Skip populating all pages that fall into a discarded range as managed by
1685 * a RamDiscardManager responsible for the mapped memory region of the
1686 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1687 * must not get populated automatically. We don't have to track
1688 * modifications via userfaultfd WP reliably, because these pages will
1689 * not be part of the migration stream either way -- see
1690 * ramblock_dirty_bitmap_clear_discarded_pages().
1691 *
1692 * Note: The result is only stable while migrating (precopy/postcopy).
1693 */
1694 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1695 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1696 MemoryRegionSection section = {
1697 .mr = rb->mr,
1698 .offset_within_region = 0,
1699 .size = rb->mr->size,
1700 };
1701
1702 ram_discard_manager_replay_populated(rdm, &section,
1703 populate_read_section, NULL);
1704 } else {
1705 populate_read_range(rb, 0, rb->used_length);
1706 }
1707 }
1708
1709 /*
1710 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1711 */
1712 void ram_write_tracking_prepare(void)
1713 {
1714 RAMBlock *block;
1715
1716 RCU_READ_LOCK_GUARD();
1717
1718 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1719 /* Nothing to do with read-only and MMIO-writable regions */
1720 if (block->mr->readonly || block->mr->rom_device) {
1721 continue;
1722 }
1723
1724 /*
1725 * Populate pages of the RAM block before enabling userfault_fd
1726 * write protection.
1727 *
1728 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1729 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1730 * pages with pte_none() entries in page table.
1731 */
1732 ram_block_populate_read(block);
1733 }
1734 }
1735
1736 /*
1737 * ram_write_tracking_start: start UFFD-WP memory tracking
1738 *
1739 * Returns 0 for success or negative value in case of error
1740 */
1741 int ram_write_tracking_start(void)
1742 {
1743 int uffd_fd;
1744 RAMState *rs = ram_state;
1745 RAMBlock *block;
1746
1747 /* Open UFFD file descriptor */
1748 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1749 if (uffd_fd < 0) {
1750 return uffd_fd;
1751 }
1752 rs->uffdio_fd = uffd_fd;
1753
1754 RCU_READ_LOCK_GUARD();
1755
1756 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1757 /* Nothing to do with read-only and MMIO-writable regions */
1758 if (block->mr->readonly || block->mr->rom_device) {
1759 continue;
1760 }
1761
1762 /* Register block memory with UFFD to track writes */
1763 if (uffd_register_memory(rs->uffdio_fd, block->host,
1764 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1765 goto fail;
1766 }
1767 /* Apply UFFD write protection to the block memory range */
1768 if (uffd_change_protection(rs->uffdio_fd, block->host,
1769 block->max_length, true, false)) {
1770 goto fail;
1771 }
1772 block->flags |= RAM_UF_WRITEPROTECT;
1773 memory_region_ref(block->mr);
1774
1775 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1776 block->host, block->max_length);
1777 }
1778
1779 return 0;
1780
1781 fail:
1782 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1783
1784 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1785 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1786 continue;
1787 }
1788 /*
1789 * In case some memory block failed to be write-protected
1790 * remove protection and unregister all succeeded RAM blocks
1791 */
1792 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1793 false, false);
1794 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1795 /* Cleanup flags and remove reference */
1796 block->flags &= ~RAM_UF_WRITEPROTECT;
1797 memory_region_unref(block->mr);
1798 }
1799
1800 uffd_close_fd(uffd_fd);
1801 rs->uffdio_fd = -1;
1802 return -1;
1803 }
1804
1805 /**
1806 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1807 */
1808 void ram_write_tracking_stop(void)
1809 {
1810 RAMState *rs = ram_state;
1811 RAMBlock *block;
1812
1813 RCU_READ_LOCK_GUARD();
1814
1815 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1816 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1817 continue;
1818 }
1819 /* Remove protection and unregister all affected RAM blocks */
1820 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1821 false, false);
1822 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1823
1824 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1825 block->host, block->max_length);
1826
1827 /* Cleanup flags and remove reference */
1828 block->flags &= ~RAM_UF_WRITEPROTECT;
1829 memory_region_unref(block->mr);
1830 }
1831
1832 /* Finally close UFFD file descriptor */
1833 uffd_close_fd(rs->uffdio_fd);
1834 rs->uffdio_fd = -1;
1835 }
1836
1837 #else
1838 /* No target OS support, stubs just fail or ignore */
1839
1840 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1841 {
1842 (void) rs;
1843 (void) offset;
1844
1845 return NULL;
1846 }
1847
1848 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1849 unsigned long start_page)
1850 {
1851 (void) rs;
1852 (void) pss;
1853 (void) start_page;
1854
1855 return 0;
1856 }
1857
1858 bool ram_write_tracking_available(void)
1859 {
1860 return false;
1861 }
1862
1863 bool ram_write_tracking_compatible(void)
1864 {
1865 assert(0);
1866 return false;
1867 }
1868
1869 int ram_write_tracking_start(void)
1870 {
1871 assert(0);
1872 return -1;
1873 }
1874
1875 void ram_write_tracking_stop(void)
1876 {
1877 assert(0);
1878 }
1879 #endif /* defined(__linux__) */
1880
1881 /**
1882 * get_queued_page: unqueue a page from the postcopy requests
1883 *
1884 * Skips pages that are already sent (!dirty)
1885 *
1886 * Returns true if a queued page is found
1887 *
1888 * @rs: current RAM state
1889 * @pss: data about the state of the current dirty page scan
1890 */
1891 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1892 {
1893 RAMBlock *block;
1894 ram_addr_t offset;
1895 bool dirty;
1896
1897 do {
1898 block = unqueue_page(rs, &offset);
1899 /*
1900 * We're sending this page, and since it's postcopy nothing else
1901 * will dirty it, and we must make sure it doesn't get sent again
1902 * even if this queue request was received after the background
1903 * search already sent it.
1904 */
1905 if (block) {
1906 unsigned long page;
1907
1908 page = offset >> TARGET_PAGE_BITS;
1909 dirty = test_bit(page, block->bmap);
1910 if (!dirty) {
1911 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1912 page);
1913 } else {
1914 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1915 }
1916 }
1917
1918 } while (block && !dirty);
1919
1920 if (!block) {
1921 /*
1922 * Poll write faults too if background snapshot is enabled; that's
1923 * when vCPUs can get blocked by write-protected pages.
1924 */
1925 block = poll_fault_page(rs, &offset);
1926 }
1927
1928 if (block) {
1929 /*
1930 * We want the background search to continue from the queued page
1931 * since the guest is likely to want other pages near to the page
1932 * it just requested.
1933 */
1934 pss->block = block;
1935 pss->page = offset >> TARGET_PAGE_BITS;
1936
1937 /*
1938 * This unqueued page would break the "one round" check, even if
1939 * it is really rare.
1940 */
1941 pss->complete_round = false;
1942 }
1943
1944 return !!block;
1945 }
1946
1947 /**
1948 * migration_page_queue_free: drop any remaining pages in the ram
1949 * request queue
1950 *
1951 * It should be empty at the end anyway, but in error cases there may
1952 * be some left; if any pages remain, we drop them.
1953 *
1954 */
1955 static void migration_page_queue_free(RAMState *rs)
1956 {
1957 struct RAMSrcPageRequest *mspr, *next_mspr;
1958 /* This queue generally should be empty - but in the case of a failed
1959 * migration it might contain some leftover requests.
1960 */
1961 RCU_READ_LOCK_GUARD();
1962 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1963 memory_region_unref(mspr->rb->mr);
1964 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1965 g_free(mspr);
1966 }
1967 }
1968
1969 /**
1970 * ram_save_queue_pages: queue the page for transmission
1971 *
1972 * A request from postcopy destination for example.
1973 *
1974 * Returns zero on success or negative on error
1975 *
1976 * @rbname: Name of the RAMBlock of the request. NULL means the
1977 * same as the last one.
1978 * @start: starting address from the start of the RAMBlock
1979 * @len: length (in bytes) to send
1980 */
1981 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1982 {
1983 RAMBlock *ramblock;
1984 RAMState *rs = ram_state;
1985
1986 ram_counters.postcopy_requests++;
1987 RCU_READ_LOCK_GUARD();
1988
1989 if (!rbname) {
1990 /* Reuse last RAMBlock */
1991 ramblock = rs->last_req_rb;
1992
1993 if (!ramblock) {
1994 /*
1995 * Shouldn't happen, we can't reuse the last RAMBlock if
1996 * it's the 1st request.
1997 */
1998 error_report("ram_save_queue_pages no previous block");
1999 return -1;
2000 }
2001 } else {
2002 ramblock = qemu_ram_block_by_name(rbname);
2003
2004 if (!ramblock) {
2005 /* We shouldn't be asked for a non-existent RAMBlock */
2006 error_report("ram_save_queue_pages no block '%s'", rbname);
2007 return -1;
2008 }
2009 rs->last_req_rb = ramblock;
2010 }
2011 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2012 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2013 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2014 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2015 __func__, start, len, ramblock->used_length);
2016 return -1;
2017 }
2018
2019 struct RAMSrcPageRequest *new_entry =
2020 g_malloc0(sizeof(struct RAMSrcPageRequest));
2021 new_entry->rb = ramblock;
2022 new_entry->offset = start;
2023 new_entry->len = len;
2024
2025 memory_region_ref(ramblock->mr);
2026 qemu_mutex_lock(&rs->src_page_req_mutex);
2027 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2028 migration_make_urgent_request();
2029 qemu_mutex_unlock(&rs->src_page_req_mutex);
2030
2031 return 0;
2032 }
2033
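/*
 * Editor's sketch (not from the original file): the queuing pattern used by
 * ram_save_queue_pages() reduced to plain pthreads -- allocate a request
 * node and append it to a singly linked list under a mutex so that the
 * sending thread can pop it later.  All names are invented for the example;
 * the reference counting on the memory region is left out.
 */
#include <pthread.h>
#include <stdlib.h>

struct example_req {
    unsigned long start, len;
    struct example_req *next;
};

static struct example_req *req_head, *req_tail;
static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;

static int example_queue_request(unsigned long start, unsigned long len)
{
    struct example_req *r = calloc(1, sizeof(*r));

    if (!r) {
        return -1;
    }
    r->start = start;
    r->len = len;

    pthread_mutex_lock(&req_lock);
    if (req_tail) {
        req_tail->next = r;
    } else {
        req_head = r;
    }
    req_tail = r;
    pthread_mutex_unlock(&req_lock);
    return 0;
}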
2034 static bool save_page_use_compression(RAMState *rs)
2035 {
2036 if (!migrate_use_compression()) {
2037 return false;
2038 }
2039
2040 /*
2041 * If xbzrle is enabled (e.g., after the first round of migration), stop
2042 * using the data compression. In theory, xbzrle can do better than
2043 * compression.
2044 */
2045 if (rs->xbzrle_enabled) {
2046 return false;
2047 }
2048
2049 return true;
2050 }
2051
2052 /*
2053 * Try to compress the page before posting it out; return true if the page
2054 * has been properly handled by compression, otherwise it needs other
2055 * paths to handle it.
2056 */
2057 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2058 {
2059 if (!save_page_use_compression(rs)) {
2060 return false;
2061 }
2062
2063 /*
2064 * When starting the process of a new block, the first page of
2065 * the block should be sent out before other pages in the same
2066 * block, and all the pages in the last block should have been sent
2067 * out. Keeping this order is important, because the 'cont' flag
2068 * is used to avoid resending the block name.
2069 *
2070 * We post the first page as a normal page as compression will take
2071 * much CPU resource.
2072 */
2073 if (block != rs->last_sent_block) {
2074 flush_compressed_data(rs);
2075 return false;
2076 }
2077
2078 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2079 return true;
2080 }
2081
2082 compression_counters.busy++;
2083 return false;
2084 }
2085
2086 /**
2087 * ram_save_target_page: save one target page
2088 *
2089 * Returns the number of pages written
2090 *
2091 * @rs: current RAM state
2092 * @pss: data about the page we want to send
2093 * @last_stage: if we are at the completion stage
2094 */
2095 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2096 bool last_stage)
2097 {
2098 RAMBlock *block = pss->block;
2099 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2100 int res;
2101
2102 if (control_save_page(rs, block, offset, &res)) {
2103 return res;
2104 }
2105
2106 if (save_compress_page(rs, block, offset)) {
2107 return 1;
2108 }
2109
2110 res = save_zero_page(rs, block, offset);
2111 if (res > 0) {
2112 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2113 * page would be stale
2114 */
2115 if (!save_page_use_compression(rs)) {
2116 XBZRLE_cache_lock();
2117 xbzrle_cache_zero_page(rs, block->offset + offset);
2118 XBZRLE_cache_unlock();
2119 }
2120 ram_release_pages(block->idstr, offset, res);
2121 return res;
2122 }
2123
2124 /*
2125 * Do not use multifd for:
2126 * 1. Compression, as the first page in the new block should be posted out
2127 * before sending the compressed page
2128 * 2. Postcopy, as one whole host page should be placed
2129 */
2130 if (!save_page_use_compression(rs) && migrate_use_multifd()
2131 && !migration_in_postcopy()) {
2132 return ram_save_multifd_page(rs, block, offset);
2133 }
2134
2135 return ram_save_page(rs, pss, last_stage);
2136 }
2137
2138 /**
2139 * ram_save_host_page: save a whole host page
2140 *
2141 * Starting at *offset send pages up to the end of the current host
2142 * page. It's valid for the initial offset to point into the middle of
2143 * a host page in which case the remainder of the hostpage is sent.
2144 * Only dirty target pages are sent. Note that the host page size may
2145 * be a huge page for this block.
2146 * The saving stops at the boundary of the used_length of the block
2147 * if the RAMBlock isn't a multiple of the host page size.
2148 *
2149 * Returns the number of pages written or negative on error
2150 *
2151 * @rs: current RAM state
2152 * @ms: current migration state
2153 * @pss: data about the page we want to send
2154 * @last_stage: if we are at the completion stage
2155 */
2156 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2157 bool last_stage)
2158 {
2159 int tmppages, pages = 0;
2160 size_t pagesize_bits =
2161 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2162 unsigned long hostpage_boundary =
2163 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2164 unsigned long start_page = pss->page;
2165 int res;
2166
2167 if (ramblock_is_ignored(pss->block)) {
2168 error_report("block %s should not be migrated !", pss->block->idstr);
2169 return 0;
2170 }
2171
2172 do {
2173 /* Check whether the page is dirty and if so, send it */
2174 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2175 tmppages = ram_save_target_page(rs, pss, last_stage);
2176 if (tmppages < 0) {
2177 return tmppages;
2178 }
2179
2180 pages += tmppages;
2181 /*
2182 * Allow rate limiting to happen in the middle of huge pages if
2183 * something is sent in the current iteration.
2184 */
2185 if (pagesize_bits > 1 && tmppages > 0) {
2186 migration_rate_limit();
2187 }
2188 }
2189 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2190 } while ((pss->page < hostpage_boundary) &&
2191 offset_in_ramblock(pss->block,
2192 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2193 /* The offset we leave with is the min boundary of host page and block */
2194 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2195
2196 res = ram_save_release_protection(rs, pss, start_page);
2197 return (res < 0 ? res : pages);
2198 }
2199
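/*
 * Editor's sketch (not from the original file): the boundary arithmetic
 * used by ram_save_host_page().  With 4 KiB target pages and a 2 MiB
 * hugepage RAMBlock, pagesize_bits is 512, so the loop keeps sending
 * target pages until the page index reaches the next multiple of 512.
 * The macro mirrors QEMU_ALIGN_UP only for this illustration.
 */
#define EXAMPLE_ALIGN_UP(n, m)  (((n) + (m) - 1) / (m) * (m))

static unsigned long example_hostpage_boundary(unsigned long target_page,
                                               unsigned long host_pagesize,
                                               unsigned long target_pagesize)
{
    unsigned long pagesize_bits = host_pagesize / target_pagesize;

    /* e.g. target_page = 1000, pagesize_bits = 512 -> boundary = 1024 */
    return EXAMPLE_ALIGN_UP(target_page + 1, pagesize_bits);
}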
2200 /**
2201 * ram_find_and_save_block: finds a dirty page and sends it to f
2202 *
2203 * Called within an RCU critical section.
2204 *
2205 * Returns the number of pages written where zero means no dirty pages,
2206 * or negative on error
2207 *
2208 * @rs: current RAM state
2209 * @last_stage: if we are at the completion stage
2210 *
2211 * On systems where host-page-size > target-page-size it will send all the
2212 * pages in a host page that are dirty.
2213 */
2214
2215 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2216 {
2217 PageSearchStatus pss;
2218 int pages = 0;
2219 bool again, found;
2220
2221 /* No dirty page as there is zero RAM */
2222 if (!ram_bytes_total()) {
2223 return pages;
2224 }
2225
2226 pss.block = rs->last_seen_block;
2227 pss.page = rs->last_page;
2228 pss.complete_round = false;
2229
2230 if (!pss.block) {
2231 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2232 }
2233
2234 do {
2235 again = true;
2236 found = get_queued_page(rs, &pss);
2237
2238 if (!found) {
2239 /* priority queue empty, so just search for something dirty */
2240 found = find_dirty_block(rs, &pss, &again);
2241 }
2242
2243 if (found) {
2244 pages = ram_save_host_page(rs, &pss, last_stage);
2245 }
2246 } while (!pages && again);
2247
2248 rs->last_seen_block = pss.block;
2249 rs->last_page = pss.page;
2250
2251 return pages;
2252 }
2253
2254 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2255 {
2256 uint64_t pages = size / TARGET_PAGE_SIZE;
2257
2258 if (zero) {
2259 ram_counters.duplicate += pages;
2260 } else {
2261 ram_counters.normal += pages;
2262 ram_counters.transferred += size;
2263 qemu_update_position(f, size);
2264 }
2265 }
2266
2267 static uint64_t ram_bytes_total_common(bool count_ignored)
2268 {
2269 RAMBlock *block;
2270 uint64_t total = 0;
2271
2272 RCU_READ_LOCK_GUARD();
2273
2274 if (count_ignored) {
2275 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2276 total += block->used_length;
2277 }
2278 } else {
2279 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2280 total += block->used_length;
2281 }
2282 }
2283 return total;
2284 }
2285
2286 uint64_t ram_bytes_total(void)
2287 {
2288 return ram_bytes_total_common(false);
2289 }
2290
2291 static void xbzrle_load_setup(void)
2292 {
2293 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2294 }
2295
2296 static void xbzrle_load_cleanup(void)
2297 {
2298 g_free(XBZRLE.decoded_buf);
2299 XBZRLE.decoded_buf = NULL;
2300 }
2301
2302 static void ram_state_cleanup(RAMState **rsp)
2303 {
2304 if (*rsp) {
2305 migration_page_queue_free(*rsp);
2306 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2307 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2308 g_free(*rsp);
2309 *rsp = NULL;
2310 }
2311 }
2312
2313 static void xbzrle_cleanup(void)
2314 {
2315 XBZRLE_cache_lock();
2316 if (XBZRLE.cache) {
2317 cache_fini(XBZRLE.cache);
2318 g_free(XBZRLE.encoded_buf);
2319 g_free(XBZRLE.current_buf);
2320 g_free(XBZRLE.zero_target_page);
2321 XBZRLE.cache = NULL;
2322 XBZRLE.encoded_buf = NULL;
2323 XBZRLE.current_buf = NULL;
2324 XBZRLE.zero_target_page = NULL;
2325 }
2326 XBZRLE_cache_unlock();
2327 }
2328
2329 static void ram_save_cleanup(void *opaque)
2330 {
2331 RAMState **rsp = opaque;
2332 RAMBlock *block;
2333
2334 /* We don't use dirty log with background snapshots */
2335 if (!migrate_background_snapshot()) {
2336 /* The caller holds the iothread lock or is in a BH, so there is
2337 * no write race against the migration bitmap
2338 */
2339 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2340 /*
2341 * do not stop the dirty log without having started it, since
2342 * memory_global_dirty_log_stop will assert that
2343 * memory_global_dirty_log_start/stop are used in pairs
2344 */
2345 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2346 }
2347 }
2348
2349 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2350 g_free(block->clear_bmap);
2351 block->clear_bmap = NULL;
2352 g_free(block->bmap);
2353 block->bmap = NULL;
2354 }
2355
2356 xbzrle_cleanup();
2357 compress_threads_save_cleanup();
2358 ram_state_cleanup(rsp);
2359 }
2360
2361 static void ram_state_reset(RAMState *rs)
2362 {
2363 rs->last_seen_block = NULL;
2364 rs->last_sent_block = NULL;
2365 rs->last_page = 0;
2366 rs->last_version = ram_list.version;
2367 rs->xbzrle_enabled = false;
2368 }
2369
2370 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2371
2372 /*
2373 * 'expected' is the value you expect the bitmap mostly to be full
2374 * of; it won't bother printing lines that are all this value.
2375 * If 'todump' is null the migration bitmap is dumped.
2376 */
2377 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2378 unsigned long pages)
2379 {
2380 int64_t cur;
2381 int64_t linelen = 128;
2382 char linebuf[129];
2383
2384 for (cur = 0; cur < pages; cur += linelen) {
2385 int64_t curb;
2386 bool found = false;
2387 /*
2388 * Last line; catch the case where the line length
2389 * is longer than remaining ram
2390 */
2391 if (cur + linelen > pages) {
2392 linelen = pages - cur;
2393 }
2394 for (curb = 0; curb < linelen; curb++) {
2395 bool thisbit = test_bit(cur + curb, todump);
2396 linebuf[curb] = thisbit ? '1' : '.';
2397 found = found || (thisbit != expected);
2398 }
2399 if (found) {
2400 linebuf[curb] = '\0';
2401 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2402 }
2403 }
2404 }
2405
2406 /* **** functions for postcopy ***** */
2407
2408 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2409 {
2410 struct RAMBlock *block;
2411
2412 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2413 unsigned long *bitmap = block->bmap;
2414 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2415 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2416
2417 while (run_start < range) {
2418 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2419 ram_discard_range(block->idstr,
2420 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2421 ((ram_addr_t)(run_end - run_start))
2422 << TARGET_PAGE_BITS);
2423 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2424 }
2425 }
2426 }
2427
2428 /**
2429 * postcopy_send_discard_bm_ram: discard a RAMBlock
2430 *
2431 * Returns zero on success
2432 *
2433 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2434 *
2435 * @ms: current migration state
2436 * @block: RAMBlock to discard
2437 */
2438 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2439 {
2440 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2441 unsigned long current;
2442 unsigned long *bitmap = block->bmap;
2443
2444 for (current = 0; current < end; ) {
2445 unsigned long one = find_next_bit(bitmap, end, current);
2446 unsigned long zero, discard_length;
2447
2448 if (one >= end) {
2449 break;
2450 }
2451
2452 zero = find_next_zero_bit(bitmap, end, one + 1);
2453
2454 if (zero >= end) {
2455 discard_length = end - one;
2456 } else {
2457 discard_length = zero - one;
2458 }
2459 postcopy_discard_send_range(ms, one, discard_length);
2460 current = one + discard_length;
2461 }
2462
2463 return 0;
2464 }
2465
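/*
 * Editor's sketch (not from the original file): the run-length walk that
 * postcopy_send_discard_bm_ram() performs, written against a plain byte
 * array instead of QEMU's bitmap helpers.  Each (start, length) pair of
 * consecutive dirty pages would become one discard message on the wire.
 */
#include <stdbool.h>
#include <stdio.h>

static void example_send_dirty_runs(const bool *dirty, unsigned long npages)
{
    unsigned long cur = 0;

    while (cur < npages) {
        unsigned long one, zero;

        for (one = cur; one < npages && !dirty[one]; one++) {
            /* skip clean pages */
        }
        if (one >= npages) {
            break;
        }
        for (zero = one + 1; zero < npages && dirty[zero]; zero++) {
            /* extend the dirty run */
        }
        printf("discard run: start=%lu len=%lu\n", one, zero - one);
        cur = zero;
    }
}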
2466 /**
2467 * postcopy_each_ram_send_discard: discard all RAMBlocks
2468 *
2469 * Returns 0 for success or negative for error
2470 *
2471 * Utility for the outgoing postcopy code.
2472 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2473 * passing it bitmap indexes and name.
2474 * (qemu_ram_foreach_block ends up passing unscaled lengths
2475 * which would mean postcopy code would have to deal with target page)
2476 *
2477 * @ms: current migration state
2478 */
2479 static int postcopy_each_ram_send_discard(MigrationState *ms)
2480 {
2481 struct RAMBlock *block;
2482 int ret;
2483
2484 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2485 postcopy_discard_send_init(ms, block->idstr);
2486
2487 /*
2488 * Postcopy sends chunks of bitmap over the wire, but it
2489 * just needs indexes at this point, avoids it having
2490 * target page specific code.
2491 */
2492 ret = postcopy_send_discard_bm_ram(ms, block);
2493 postcopy_discard_send_finish(ms);
2494 if (ret) {
2495 return ret;
2496 }
2497 }
2498
2499 return 0;
2500 }
2501
2502 /**
2503 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2504 *
2505 * Helper for postcopy_chunk_hostpages; it's called twice to
2506 * canonicalize the two bitmaps, that are similar, but one is
2507 * inverted.
2508 *
2509 * Postcopy requires that all target pages in a hostpage are dirty or
2510 * clean, not a mix. This function canonicalizes the bitmaps.
2511 *
2512 * @ms: current migration state
2513 * @block: block that contains the page we want to canonicalize
2514 */
2515 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2516 {
2517 RAMState *rs = ram_state;
2518 unsigned long *bitmap = block->bmap;
2519 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2520 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2521 unsigned long run_start;
2522
2523 if (block->page_size == TARGET_PAGE_SIZE) {
2524 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2525 return;
2526 }
2527
2528 /* Find a dirty page */
2529 run_start = find_next_bit(bitmap, pages, 0);
2530
2531 while (run_start < pages) {
2532
2533 /*
2534 * If the start of this run of pages is in the middle of a host
2535 * page, then we need to fixup this host page.
2536 */
2537 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2538 /* Find the end of this run */
2539 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2540 /*
2541 * If the end isn't at the start of a host page, then the
2542 * run doesn't finish at the end of a host page
2543 * and we need to discard.
2544 */
2545 }
2546
2547 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2548 unsigned long page;
2549 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2550 host_ratio);
2551 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2552
2553 /* Clean up the bitmap */
2554 for (page = fixup_start_addr;
2555 page < fixup_start_addr + host_ratio; page++) {
2556 /*
2557 * Remark them as dirty, updating the count for any pages
2558 * that weren't previously dirty.
2559 */
2560 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2561 }
2562 }
2563
2564 /* Find the next dirty page for the next iteration */
2565 run_start = find_next_bit(bitmap, pages, run_start);
2566 }
2567 }
2568
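/*
 * Editor's sketch (not from the original file): the effect of
 * postcopy_chunk_hostpages_pass() expressed over a plain byte array.  If
 * any target page inside a host page is dirty, the whole host page is
 * marked dirty, so postcopy never has to send or discard half a hugepage.
 */
#include <stdbool.h>

static void example_canonicalize(bool *dirty, unsigned long npages,
                                 unsigned long host_ratio)
{
    unsigned long hp, tp;

    for (hp = 0; hp < npages; hp += host_ratio) {
        bool any_dirty = false;

        for (tp = hp; tp < hp + host_ratio && tp < npages; tp++) {
            any_dirty |= dirty[tp];
        }
        if (any_dirty) {
            for (tp = hp; tp < hp + host_ratio && tp < npages; tp++) {
                dirty[tp] = true;
            }
        }
    }
}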
2569 /**
2570 * postcopy_chunk_hostpages: discard any partially sent host page
2571 *
2572 * Utility for the outgoing postcopy code.
2573 *
2574 * Discard any partially sent host-page size chunks, mark any partially
2575 * dirty host-page size chunks as all dirty. In this case the host-page
2576 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2577 *
2578 * Returns zero on success
2579 *
2580 * @ms: current migration state
2581 * @block: block we want to work with
2582 */
2583 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2584 {
2585 postcopy_discard_send_init(ms, block->idstr);
2586
2587 /*
2588 * Ensure that all partially dirty host pages are made fully dirty.
2589 */
2590 postcopy_chunk_hostpages_pass(ms, block);
2591
2592 postcopy_discard_send_finish(ms);
2593 return 0;
2594 }
2595
2596 /**
2597 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2598 *
2599 * Returns zero on success
2600 *
2601 * Transmit the set of pages to be discarded after precopy to the target;
2602 * these are pages that:
2603 * a) have been previously transmitted but are now dirty again
2604 * b) have never been transmitted; this ensures that
2605 * any pages on the destination that have been mapped by background
2606 * tasks get discarded (transparent huge pages are the specific concern)
2607 * Hopefully this is pretty sparse.
2608 *
2609 * @ms: current migration state
2610 */
2611 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2612 {
2613 RAMState *rs = ram_state;
2614 RAMBlock *block;
2615 int ret;
2616
2617 RCU_READ_LOCK_GUARD();
2618
2619 /* This should be our last sync, the src is now paused */
2620 migration_bitmap_sync(rs);
2621
2622 /* Easiest way to make sure we don't resume in the middle of a host-page */
2623 rs->last_seen_block = NULL;
2624 rs->last_sent_block = NULL;
2625 rs->last_page = 0;
2626
2627 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2628 /* Deal with TPS != HPS and huge pages */
2629 ret = postcopy_chunk_hostpages(ms, block);
2630 if (ret) {
2631 return ret;
2632 }
2633
2634 #ifdef DEBUG_POSTCOPY
2635 ram_debug_dump_bitmap(block->bmap, true,
2636 block->used_length >> TARGET_PAGE_BITS);
2637 #endif
2638 }
2639 trace_ram_postcopy_send_discard_bitmap();
2640
2641 return postcopy_each_ram_send_discard(ms);
2642 }
2643
2644 /**
2645 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2646 *
2647 * Returns zero on success
2648 *
2649 * @rbname: name of the RAMBlock of the request. NULL means the
2650 * same as the last one.
2651 * @start: RAMBlock starting page
2652 * @length: RAMBlock size
2653 */
2654 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2655 {
2656 trace_ram_discard_range(rbname, start, length);
2657
2658 RCU_READ_LOCK_GUARD();
2659 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2660
2661 if (!rb) {
2662 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2663 return -1;
2664 }
2665
2666 /*
2667 * On source VM, we don't need to update the received bitmap since
2668 * we don't even have one.
2669 */
2670 if (rb->receivedmap) {
2671 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2672 length >> qemu_target_page_bits());
2673 }
2674
2675 return ram_block_discard_range(rb, start, length);
2676 }
2677
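/*
 * Editor's sketch (not from the original file): the core of what discarding
 * a range of anonymous guest RAM comes down to on Linux -- madvise() with
 * MADV_DONTNEED, which drops the pages so that later reads return zeroes.
 * File-backed blocks need a different mechanism (hole punching), which this
 * sketch deliberately leaves out; addr and len are assumed page-aligned.
 */
#include <stddef.h>
#include <sys/mman.h>

static int example_discard_anon_range(void *addr, size_t len)
{
    return madvise(addr, len, MADV_DONTNEED);
}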
2678 /*
2679 * For every allocation, we will try not to crash the VM if the
2680 * allocation fails.
2681 */
2682 static int xbzrle_init(void)
2683 {
2684 Error *local_err = NULL;
2685
2686 if (!migrate_use_xbzrle()) {
2687 return 0;
2688 }
2689
2690 XBZRLE_cache_lock();
2691
2692 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2693 if (!XBZRLE.zero_target_page) {
2694 error_report("%s: Error allocating zero page", __func__);
2695 goto err_out;
2696 }
2697
2698 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2699 TARGET_PAGE_SIZE, &local_err);
2700 if (!XBZRLE.cache) {
2701 error_report_err(local_err);
2702 goto free_zero_page;
2703 }
2704
2705 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2706 if (!XBZRLE.encoded_buf) {
2707 error_report("%s: Error allocating encoded_buf", __func__);
2708 goto free_cache;
2709 }
2710
2711 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2712 if (!XBZRLE.current_buf) {
2713 error_report("%s: Error allocating current_buf", __func__);
2714 goto free_encoded_buf;
2715 }
2716
2717 /* We are all good */
2718 XBZRLE_cache_unlock();
2719 return 0;
2720
2721 free_encoded_buf:
2722 g_free(XBZRLE.encoded_buf);
2723 XBZRLE.encoded_buf = NULL;
2724 free_cache:
2725 cache_fini(XBZRLE.cache);
2726 XBZRLE.cache = NULL;
2727 free_zero_page:
2728 g_free(XBZRLE.zero_target_page);
2729 XBZRLE.zero_target_page = NULL;
2730 err_out:
2731 XBZRLE_cache_unlock();
2732 return -ENOMEM;
2733 }
2734
2735 static int ram_state_init(RAMState **rsp)
2736 {
2737 *rsp = g_try_new0(RAMState, 1);
2738
2739 if (!*rsp) {
2740 error_report("%s: Init ramstate fail", __func__);
2741 return -1;
2742 }
2743
2744 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2745 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2746 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2747
2748 /*
2749 * Count the total number of pages used by ram blocks not including any
2750 * gaps due to alignment or unplugs.
2751 * This must match the initial values of the dirty bitmap.
2752 */
2753 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2754 ram_state_reset(*rsp);
2755
2756 return 0;
2757 }
2758
2759 static void ram_list_init_bitmaps(void)
2760 {
2761 MigrationState *ms = migrate_get_current();
2762 RAMBlock *block;
2763 unsigned long pages;
2764 uint8_t shift;
2765
2766 /* Skip setting bitmap if there is no RAM */
2767 if (ram_bytes_total()) {
2768 shift = ms->clear_bitmap_shift;
2769 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2770 error_report("clear_bitmap_shift (%u) too big, using "
2771 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2772 shift = CLEAR_BITMAP_SHIFT_MAX;
2773 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2774 error_report("clear_bitmap_shift (%u) too small, using "
2775 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2776 shift = CLEAR_BITMAP_SHIFT_MIN;
2777 }
2778
2779 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2780 pages = block->max_length >> TARGET_PAGE_BITS;
2781 /*
2782 * The initial dirty bitmap for migration must be set with all
2783 * ones to make sure we'll migrate every guest RAM page to the
2784 * destination.
2785 * Here we set RAMBlock.bmap all to 1 because when restarting
2786 * migration after a failed attempt, ram_list.
2787 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2788 * guest memory.
2789 */
2790 block->bmap = bitmap_new(pages);
2791 bitmap_set(block->bmap, 0, pages);
2792 block->clear_bmap_shift = shift;
2793 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2794 }
2795 }
2796 }
2797
2798 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2799 {
2800 unsigned long pages;
2801 RAMBlock *rb;
2802
2803 RCU_READ_LOCK_GUARD();
2804
2805 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2806 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2807 rs->migration_dirty_pages -= pages;
2808 }
2809 }
2810
2811 static void ram_init_bitmaps(RAMState *rs)
2812 {
2813 /* For memory_global_dirty_log_start below. */
2814 qemu_mutex_lock_iothread();
2815 qemu_mutex_lock_ramlist();
2816
2817 WITH_RCU_READ_LOCK_GUARD() {
2818 ram_list_init_bitmaps();
2819 /* We don't use dirty log with background snapshots */
2820 if (!migrate_background_snapshot()) {
2821 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2822 migration_bitmap_sync_precopy(rs);
2823 }
2824 }
2825 qemu_mutex_unlock_ramlist();
2826 qemu_mutex_unlock_iothread();
2827
2828 /*
2829 * After an eventual first bitmap sync, fixup the initial bitmap
2830 * containing all 1s to exclude any discarded pages from migration.
2831 */
2832 migration_bitmap_clear_discarded_pages(rs);
2833 }
2834
2835 static int ram_init_all(RAMState **rsp)
2836 {
2837 if (ram_state_init(rsp)) {
2838 return -1;
2839 }
2840
2841 if (xbzrle_init()) {
2842 ram_state_cleanup(rsp);
2843 return -1;
2844 }
2845
2846 ram_init_bitmaps(*rsp);
2847
2848 return 0;
2849 }
2850
2851 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2852 {
2853 RAMBlock *block;
2854 uint64_t pages = 0;
2855
2856 /*
2857 * Postcopy is not using xbzrle/compression, so no need for that.
2858 * Also, since the source is already halted, we don't need to care
2859 * about dirty page logging either.
2860 */
2861
2862 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2863 pages += bitmap_count_one(block->bmap,
2864 block->used_length >> TARGET_PAGE_BITS);
2865 }
2866
2867 /* This may not be aligned with current bitmaps. Recalculate. */
2868 rs->migration_dirty_pages = pages;
2869
2870 ram_state_reset(rs);
2871
2872 /* Update RAMState cache of output QEMUFile */
2873 rs->f = out;
2874
2875 trace_ram_state_resume_prepare(pages);
2876 }
2877
2878 /*
2879 * This function clears bits of the free pages reported by the caller from the
2880 * migration dirty bitmap. @addr is the host address corresponding to the
2881 * start of the contiguous guest free pages, and @len is the total bytes of
2882 * those pages.
2883 */
2884 void qemu_guest_free_page_hint(void *addr, size_t len)
2885 {
2886 RAMBlock *block;
2887 ram_addr_t offset;
2888 size_t used_len, start, npages;
2889 MigrationState *s = migrate_get_current();
2890
2891 /* This function is currently expected to be used during live migration */
2892 if (!migration_is_setup_or_active(s->state)) {
2893 return;
2894 }
2895
2896 for (; len > 0; len -= used_len, addr += used_len) {
2897 block = qemu_ram_block_from_host(addr, false, &offset);
2898 if (unlikely(!block || offset >= block->used_length)) {
2899 /*
2900 * The implementation might not support RAMBlock resize during
2901 * live migration, but it could happen in theory with future
2902 * updates. So we add a check here to capture that case.
2903 */
2904 error_report_once("%s unexpected error", __func__);
2905 return;
2906 }
2907
2908 if (len <= block->used_length - offset) {
2909 used_len = len;
2910 } else {
2911 used_len = block->used_length - offset;
2912 }
2913
2914 start = offset >> TARGET_PAGE_BITS;
2915 npages = used_len >> TARGET_PAGE_BITS;
2916
2917 qemu_mutex_lock(&ram_state->bitmap_mutex);
2918 /*
2919 * The skipped free pages are equivalent to having been sent from clear_bmap's
2920 * perspective, so clear the bits from the memory region bitmap which
2921 * are initially set. Otherwise those skipped pages will be sent in
2922 * the next round after syncing from the memory region bitmap.
2923 */
2924 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2925 ram_state->migration_dirty_pages -=
2926 bitmap_count_one_with_offset(block->bmap, start, npages);
2927 bitmap_clear(block->bmap, start, npages);
2928 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2929 }
2930 }
2931
2932 /*
2933 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2934 * a long-running RCU critical section. When rcu-reclaims in the code
2935 * start to become numerous it will be necessary to reduce the
2936 * granularity of these critical sections.
2937 */
2938
2939 /**
2940 * ram_save_setup: Setup RAM for migration
2941 *
2942 * Returns zero to indicate success and negative for error
2943 *
2944 * @f: QEMUFile where to send the data
2945 * @opaque: RAMState pointer
2946 */
2947 static int ram_save_setup(QEMUFile *f, void *opaque)
2948 {
2949 RAMState **rsp = opaque;
2950 RAMBlock *block;
2951
2952 if (compress_threads_save_setup()) {
2953 return -1;
2954 }
2955
2956 /* migration has already set up the bitmap, reuse it. */
2957 if (!migration_in_colo_state()) {
2958 if (ram_init_all(rsp) != 0) {
2959 compress_threads_save_cleanup();
2960 return -1;
2961 }
2962 }
2963 (*rsp)->f = f;
2964
2965 WITH_RCU_READ_LOCK_GUARD() {
2966 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2967
2968 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2969 qemu_put_byte(f, strlen(block->idstr));
2970 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2971 qemu_put_be64(f, block->used_length);
2972 if (migrate_postcopy_ram() && block->page_size !=
2973 qemu_host_page_size) {
2974 qemu_put_be64(f, block->page_size);
2975 }
2976 if (migrate_ignore_shared()) {
2977 qemu_put_be64(f, block->mr->addr);
2978 }
2979 }
2980 }
2981
2982 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2983 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2984
2985 multifd_send_sync_main(f);
2986 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2987 qemu_fflush(f);
2988
2989 return 0;
2990 }
2991
2992 /**
2993 * ram_save_iterate: iterative stage for migration
2994 *
2995 * Returns zero to indicate success and negative for error
2996 *
2997 * @f: QEMUFile where to send the data
2998 * @opaque: RAMState pointer
2999 */
3000 static int ram_save_iterate(QEMUFile *f, void *opaque)
3001 {
3002 RAMState **temp = opaque;
3003 RAMState *rs = *temp;
3004 int ret = 0;
3005 int i;
3006 int64_t t0;
3007 int done = 0;
3008
3009 if (blk_mig_bulk_active()) {
3010 /* Avoid transferring ram during bulk phase of block migration as
3011 * the bulk phase will usually take a long time and transferring
3012 * ram updates during that time is pointless. */
3013 goto out;
3014 }
3015
3016 /*
3017 * We'll hold this lock for a fairly long time, but it's okay for two reasons.
3018 * Firstly, the only possible other thread to take it is the one calling
3019 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3020 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3021 * guarantees that we'll release it on a regular basis at least.
3022 */
3023 qemu_mutex_lock(&rs->bitmap_mutex);
3024 WITH_RCU_READ_LOCK_GUARD() {
3025 if (ram_list.version != rs->last_version) {
3026 ram_state_reset(rs);
3027 }
3028
3029 /* Read version before ram_list.blocks */
3030 smp_rmb();
3031
3032 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3033
3034 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3035 i = 0;
3036 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3037 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3038 int pages;
3039
3040 if (qemu_file_get_error(f)) {
3041 break;
3042 }
3043
3044 pages = ram_find_and_save_block(rs, false);
3045 /* no more pages to send */
3046 if (pages == 0) {
3047 done = 1;
3048 break;
3049 }
3050
3051 if (pages < 0) {
3052 qemu_file_set_error(f, pages);
3053 break;
3054 }
3055
3056 rs->target_page_count += pages;
3057
3058 /*
3059 * During postcopy, it is necessary to make sure one whole host
3060 * page is sent in one chunk.
3061 */
3062 if (migrate_postcopy_ram()) {
3063 flush_compressed_data(rs);
3064 }
3065
3066 /*
3067 * We want to check in the 1st loop, just in case it was the 1st
3068 * time and we had to sync the dirty bitmap.
3069 * qemu_clock_get_ns() is a bit expensive, so we only check every
3070 * few iterations.
3071 */
3072 if ((i & 63) == 0) {
3073 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3074 1000000;
3075 if (t1 > MAX_WAIT) {
3076 trace_ram_save_iterate_big_wait(t1, i);
3077 break;
3078 }
3079 }
3080 i++;
3081 }
3082 }
3083 qemu_mutex_unlock(&rs->bitmap_mutex);
3084
3085 /*
3086 * Must occur before EOS (or any QEMUFile operation)
3087 * because of RDMA protocol.
3088 */
3089 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3090
3091 out:
3092 if (ret >= 0
3093 && migration_is_setup_or_active(migrate_get_current()->state)) {
3094 multifd_send_sync_main(rs->f);
3095 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3096 qemu_fflush(f);
3097 ram_counters.transferred += 8;
3098
3099 ret = qemu_file_get_error(f);
3100 }
3101 if (ret < 0) {
3102 return ret;
3103 }
3104
3105 return done;
3106 }
3107
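/*
 * Editor's sketch (not from the original file): the "check the clock only
 * every 64 iterations" pattern used by ram_save_iterate() above, with
 * clock_gettime() standing in for qemu_clock_get_ns().  EXAMPLE_MAX_WAIT_MS
 * mirrors the MAX_WAIT constant; everything else is invented for the
 * illustration.
 */
#include <stdint.h>
#include <time.h>

#define EXAMPLE_MAX_WAIT_MS 50

static uint64_t example_now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

static void example_bounded_loop(void)
{
    uint64_t t0 = example_now_ms();
    int i = 0;

    for (;;) {
        /* ... send one block of pages here, break when nothing is left ... */

        if ((i & 63) == 0 &&
            example_now_ms() - t0 > EXAMPLE_MAX_WAIT_MS) {
            break;      /* yield back to the main migration loop */
        }
        i++;
    }
}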
3108 /**
3109 * ram_save_complete: function called to send the remaining amount of ram
3110 *
3111 * Returns zero to indicate success or negative on error
3112 *
3113 * Called with iothread lock
3114 *
3115 * @f: QEMUFile where to send the data
3116 * @opaque: RAMState pointer
3117 */
3118 static int ram_save_complete(QEMUFile *f, void *opaque)
3119 {
3120 RAMState **temp = opaque;
3121 RAMState *rs = *temp;
3122 int ret = 0;
3123
3124 WITH_RCU_READ_LOCK_GUARD() {
3125 if (!migration_in_postcopy()) {
3126 migration_bitmap_sync_precopy(rs);
3127 }
3128
3129 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3130
3131 /* try transferring iterative blocks of memory */
3132
3133 /* flush all remaining blocks regardless of rate limiting */
3134 while (true) {
3135 int pages;
3136
3137 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3138 /* no more blocks to send */
3139 if (pages == 0) {
3140 break;
3141 }
3142 if (pages < 0) {
3143 ret = pages;
3144 break;
3145 }
3146 }
3147
3148 flush_compressed_data(rs);
3149 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3150 }
3151
3152 if (ret >= 0) {
3153 multifd_send_sync_main(rs->f);
3154 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3155 qemu_fflush(f);
3156 }
3157
3158 return ret;
3159 }
3160
3161 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3162 uint64_t *res_precopy_only,
3163 uint64_t *res_compatible,
3164 uint64_t *res_postcopy_only)
3165 {
3166 RAMState **temp = opaque;
3167 RAMState *rs = *temp;
3168 uint64_t remaining_size;
3169
3170 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3171
3172 if (!migration_in_postcopy() &&
3173 remaining_size < max_size) {
3174 qemu_mutex_lock_iothread();
3175 WITH_RCU_READ_LOCK_GUARD() {
3176 migration_bitmap_sync_precopy(rs);
3177 }
3178 qemu_mutex_unlock_iothread();
3179 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3180 }
3181
3182 if (migrate_postcopy_ram()) {
3183 /* We can do postcopy, and all the data is postcopiable */
3184 *res_compatible += remaining_size;
3185 } else {
3186 *res_precopy_only += remaining_size;
3187 }
3188 }
3189
3190 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3191 {
3192 unsigned int xh_len;
3193 int xh_flags;
3194 uint8_t *loaded_data;
3195
3196 /* extract RLE header */
3197 xh_flags = qemu_get_byte(f);
3198 xh_len = qemu_get_be16(f);
3199
3200 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3201 error_report("Failed to load XBZRLE page - wrong compression!");
3202 return -1;
3203 }
3204
3205 if (xh_len > TARGET_PAGE_SIZE) {
3206 error_report("Failed to load XBZRLE page - len overflow!");
3207 return -1;
3208 }
3209 loaded_data = XBZRLE.decoded_buf;
3210 /* load data and decode */
3211 /* it can change loaded_data to point to an internal buffer */
3212 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3213
3214 /* decode RLE */
3215 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3216 TARGET_PAGE_SIZE) == -1) {
3217 error_report("Failed to load XBZRLE page - decode error!");
3218 return -1;
3219 }
3220
3221 return 0;
3222 }
3223
3224 /**
3225 * ram_block_from_stream: read a RAMBlock id from the migration stream
3226 *
3227 * Must be called from within a rcu critical section.
3228 *
3229 * Returns a pointer from within the RCU-protected ram_list.
3230 *
3231 * @f: QEMUFile where to read the data from
3232 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3233 */
3234 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3235 {
3236 static RAMBlock *block;
3237 char id[256];
3238 uint8_t len;
3239
3240 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3241 if (!block) {
3242 error_report("Ack, bad migration stream!");
3243 return NULL;
3244 }
3245 return block;
3246 }
3247
3248 len = qemu_get_byte(f);
3249 qemu_get_buffer(f, (uint8_t *)id, len);
3250 id[len] = 0;
3251
3252 block = qemu_ram_block_by_name(id);
3253 if (!block) {
3254 error_report("Can't find block %s", id);
3255 return NULL;
3256 }
3257
3258 if (ramblock_is_ignored(block)) {
3259 error_report("block %s should not be migrated !", id);
3260 return NULL;
3261 }
3262
3263 return block;
3264 }
3265
3266 static inline void *host_from_ram_block_offset(RAMBlock *block,
3267 ram_addr_t offset)
3268 {
3269 if (!offset_in_ramblock(block, offset)) {
3270 return NULL;
3271 }
3272
3273 return block->host + offset;
3274 }
3275
3276 static void *host_page_from_ram_block_offset(RAMBlock *block,
3277 ram_addr_t offset)
3278 {
3279 /* Note: Explicitly no check against offset_in_ramblock(). */
3280 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3281 block->page_size);
3282 }
3283
3284 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3285 ram_addr_t offset)
3286 {
3287 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3288 }
3289
3290 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3291 ram_addr_t offset, bool record_bitmap)
3292 {
3293 if (!offset_in_ramblock(block, offset)) {
3294 return NULL;
3295 }
3296 if (!block->colo_cache) {
3297 error_report("%s: colo_cache is NULL in block :%s",
3298 __func__, block->idstr);
3299 return NULL;
3300 }
3301
3302 /*
3303 * During colo checkpoint, we need a bitmap of these migrated pages.
3304 * It helps us decide which pages in the ram cache should be flushed
3305 * into the VM's RAM later.
3306 */
3307 if (record_bitmap &&
3308 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3309 ram_state->migration_dirty_pages++;
3310 }
3311 return block->colo_cache + offset;
3312 }
3313
3314 /**
3315 * ram_handle_compressed: handle the zero page case
3316 *
3317 * If a page (or a whole RDMA chunk) has been
3318 * determined to be zero, then zap it.
3319 *
3320 * @host: host address for the zero page
3321 * @ch: what the page is filled from. We only support zero
3322 * @size: size of the zero page
3323 */
3324 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3325 {
3326 if (ch != 0 || !is_zero_range(host, size)) {
3327 memset(host, ch, size);
3328 }
3329 }
3330
3331 /* return the size after decompression, or a negative value on error */
3332 static int
3333 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3334 const uint8_t *source, size_t source_len)
3335 {
3336 int err;
3337
3338 err = inflateReset(stream);
3339 if (err != Z_OK) {
3340 return -1;
3341 }
3342
3343 stream->avail_in = source_len;
3344 stream->next_in = (uint8_t *)source;
3345 stream->avail_out = dest_len;
3346 stream->next_out = dest;
3347
3348 err = inflate(stream, Z_NO_FLUSH);
3349 if (err != Z_STREAM_END) {
3350 return -1;
3351 }
3352
3353 return stream->total_out;
3354 }
3355
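/*
 * Editor's sketch (not from the original file): a one-shot zlib round trip
 * using the high-level compress2()/uncompress() helpers, which wrap the
 * same deflate/inflate machinery that the per-thread z_streams here reuse
 * across pages.  The 4 KiB page size is an assumption for the example;
 * it returns 0 when the page survives the round trip.
 */
#include <string.h>
#include <zlib.h>

enum { EXAMPLE_PAGE_SIZE = 4096 };

static int example_zlib_roundtrip(const unsigned char *page)
{
    unsigned char comp[EXAMPLE_PAGE_SIZE + 512]; /* > compressBound(4096) */
    unsigned char out[EXAMPLE_PAGE_SIZE];
    uLongf comp_len = sizeof(comp);
    uLongf out_len = sizeof(out);

    if (compress2(comp, &comp_len, page, EXAMPLE_PAGE_SIZE,
                  Z_BEST_SPEED) != Z_OK ||
        uncompress(out, &out_len, comp, comp_len) != Z_OK ||
        out_len != EXAMPLE_PAGE_SIZE) {
        return -1;
    }
    return memcmp(out, page, EXAMPLE_PAGE_SIZE) ? -1 : 0;
}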
3356 static void *do_data_decompress(void *opaque)
3357 {
3358 DecompressParam *param = opaque;
3359 unsigned long pagesize;
3360 uint8_t *des;
3361 int len, ret;
3362
3363 qemu_mutex_lock(&param->mutex);
3364 while (!param->quit) {
3365 if (param->des) {
3366 des = param->des;
3367 len = param->len;
3368 param->des = 0;
3369 qemu_mutex_unlock(&param->mutex);
3370
3371 pagesize = TARGET_PAGE_SIZE;
3372
3373 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3374 param->compbuf, len);
3375 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3376 error_report("decompress data failed");
3377 qemu_file_set_error(decomp_file, ret);
3378 }
3379
3380 qemu_mutex_lock(&decomp_done_lock);
3381 param->done = true;
3382 qemu_cond_signal(&decomp_done_cond);
3383 qemu_mutex_unlock(&decomp_done_lock);
3384
3385 qemu_mutex_lock(&param->mutex);
3386 } else {
3387 qemu_cond_wait(&param->cond, &param->mutex);
3388 }
3389 }
3390 qemu_mutex_unlock(&param->mutex);
3391
3392 return NULL;
3393 }
3394
3395 static int wait_for_decompress_done(void)
3396 {
3397 int idx, thread_count;
3398
3399 if (!migrate_use_compression()) {
3400 return 0;
3401 }
3402
3403 thread_count = migrate_decompress_threads();
3404 qemu_mutex_lock(&decomp_done_lock);
3405 for (idx = 0; idx < thread_count; idx++) {
3406 while (!decomp_param[idx].done) {
3407 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3408 }
3409 }
3410 qemu_mutex_unlock(&decomp_done_lock);
3411 return qemu_file_get_error(decomp_file);
3412 }
3413
3414 static void compress_threads_load_cleanup(void)
3415 {
3416 int i, thread_count;
3417
3418 if (!migrate_use_compression()) {
3419 return;
3420 }
3421 thread_count = migrate_decompress_threads();
3422 for (i = 0; i < thread_count; i++) {
3423 /*
3424 * we use it as an indicator which shows if the thread is
3425 * properly init'd or not
3426 */
3427 if (!decomp_param[i].compbuf) {
3428 break;
3429 }
3430
3431 qemu_mutex_lock(&decomp_param[i].mutex);
3432 decomp_param[i].quit = true;
3433 qemu_cond_signal(&decomp_param[i].cond);
3434 qemu_mutex_unlock(&decomp_param[i].mutex);
3435 }
3436 for (i = 0; i < thread_count; i++) {
3437 if (!decomp_param[i].compbuf) {
3438 break;
3439 }
3440
3441 qemu_thread_join(decompress_threads + i);
3442 qemu_mutex_destroy(&decomp_param[i].mutex);
3443 qemu_cond_destroy(&decomp_param[i].cond);
3444 inflateEnd(&decomp_param[i].stream);
3445 g_free(decomp_param[i].compbuf);
3446 decomp_param[i].compbuf = NULL;
3447 }
3448 g_free(decompress_threads);
3449 g_free(decomp_param);
3450 decompress_threads = NULL;
3451 decomp_param = NULL;
3452 decomp_file = NULL;
3453 }
3454
3455 static int compress_threads_load_setup(QEMUFile *f)
3456 {
3457 int i, thread_count;
3458
3459 if (!migrate_use_compression()) {
3460 return 0;
3461 }
3462
3463 thread_count = migrate_decompress_threads();
3464 decompress_threads = g_new0(QemuThread, thread_count);
3465 decomp_param = g_new0(DecompressParam, thread_count);
3466 qemu_mutex_init(&decomp_done_lock);
3467 qemu_cond_init(&decomp_done_cond);
3468 decomp_file = f;
3469 for (i = 0; i < thread_count; i++) {
3470 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3471 goto exit;
3472 }
3473
3474 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3475 qemu_mutex_init(&decomp_param[i].mutex);
3476 qemu_cond_init(&decomp_param[i].cond);
3477 decomp_param[i].done = true;
3478 decomp_param[i].quit = false;
3479 qemu_thread_create(decompress_threads + i, "decompress",
3480 do_data_decompress, decomp_param + i,
3481 QEMU_THREAD_JOINABLE);
3482 }
3483 return 0;
3484 exit:
3485 compress_threads_load_cleanup();
3486 return -1;
3487 }
3488
3489 static void decompress_data_with_multi_threads(QEMUFile *f,
3490 void *host, int len)
3491 {
3492 int idx, thread_count;
3493
3494 thread_count = migrate_decompress_threads();
3495 QEMU_LOCK_GUARD(&decomp_done_lock);
3496 while (true) {
3497 for (idx = 0; idx < thread_count; idx++) {
3498 if (decomp_param[idx].done) {
3499 decomp_param[idx].done = false;
3500 qemu_mutex_lock(&decomp_param[idx].mutex);
3501 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3502 decomp_param[idx].des = host;
3503 decomp_param[idx].len = len;
3504 qemu_cond_signal(&decomp_param[idx].cond);
3505 qemu_mutex_unlock(&decomp_param[idx].mutex);
3506 break;
3507 }
3508 }
3509 if (idx < thread_count) {
3510 break;
3511 } else {
3512 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3513 }
3514 }
3515 }
3516
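/*
 * Editor's sketch (not from the original file): the hand-off used between
 * decompress_data_with_multi_threads() and do_data_decompress(), reduced to
 * a single pthreads worker.  The producer marks the slot busy and signals
 * the worker's condvar; the worker goes back to sleep when there is no
 * work.  All names are invented and the actual decompression is elided.
 */
#include <pthread.h>
#include <stdbool.h>

struct example_worker {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool has_work;
    bool quit;
};

static void *example_worker_fn(void *opaque)
{
    struct example_worker *w = opaque;

    pthread_mutex_lock(&w->lock);
    while (!w->quit) {
        if (w->has_work) {
            w->has_work = false;
            pthread_mutex_unlock(&w->lock);
            /* ... decompress one page here ... */
            pthread_mutex_lock(&w->lock);
        } else {
            pthread_cond_wait(&w->cond, &w->lock);
        }
    }
    pthread_mutex_unlock(&w->lock);
    return NULL;
}

static void example_submit(struct example_worker *w)
{
    pthread_mutex_lock(&w->lock);
    w->has_work = true;
    pthread_cond_signal(&w->cond);
    pthread_mutex_unlock(&w->lock);
}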
3517 static void colo_init_ram_state(void)
3518 {
3519 ram_state_init(&ram_state);
3520 }
3521
3522 /*
3523 * colo cache: this is for the secondary VM, we cache the whole
3524 * memory of the secondary VM, and the global lock needs to be held
3525 * when calling this helper.
3526 */
3527 int colo_init_ram_cache(void)
3528 {
3529 RAMBlock *block;
3530
3531 WITH_RCU_READ_LOCK_GUARD() {
3532 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3533 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3534 NULL, false, false);
3535 if (!block->colo_cache) {
3536 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3537 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3538 block->used_length);
3539 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3540 if (block->colo_cache) {
3541 qemu_anon_ram_free(block->colo_cache, block->used_length);
3542 block->colo_cache = NULL;
3543 }
3544 }
3545 return -errno;
3546 }
3547 if (!machine_dump_guest_core(current_machine)) {
3548 qemu_madvise(block->colo_cache, block->used_length,
3549 QEMU_MADV_DONTDUMP);
3550 }
3551 }
3552 }
3553
3554 /*
3555 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3556 * decide which pages in the cache should be flushed into the SVM's RAM. Here
3557 * we use the same name 'ram_bitmap' as for migration.
3558 */
3559 if (ram_bytes_total()) {
3560 RAMBlock *block;
3561
3562 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3563 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3564 block->bmap = bitmap_new(pages);
3565 }
3566 }
3567
3568 colo_init_ram_state();
3569 return 0;
3570 }
3571
3572 /* TODO: duplicated with ram_init_bitmaps */
3573 void colo_incoming_start_dirty_log(void)
3574 {
3575 RAMBlock *block = NULL;
3576 /* For memory_global_dirty_log_start below. */
3577 qemu_mutex_lock_iothread();
3578 qemu_mutex_lock_ramlist();
3579
3580 memory_global_dirty_log_sync();
3581 WITH_RCU_READ_LOCK_GUARD() {
3582 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3583 ramblock_sync_dirty_bitmap(ram_state, block);
3584 /* Discard this dirty bitmap record */
3585 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3586 }
3587 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3588 }
3589 ram_state->migration_dirty_pages = 0;
3590 qemu_mutex_unlock_ramlist();
3591 qemu_mutex_unlock_iothread();
3592 }
3593
3594 /* The global lock needs to be held when calling this helper */
3595 void colo_release_ram_cache(void)
3596 {
3597 RAMBlock *block;
3598
3599 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3600 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3601 g_free(block->bmap);
3602 block->bmap = NULL;
3603 }
3604
3605 WITH_RCU_READ_LOCK_GUARD() {
3606 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3607 if (block->colo_cache) {
3608 qemu_anon_ram_free(block->colo_cache, block->used_length);
3609 block->colo_cache = NULL;
3610 }
3611 }
3612 }
3613 ram_state_cleanup(&ram_state);
3614 }
3615
3616 /**
3617 * ram_load_setup: Setup RAM for migration incoming side
3618 *
3619 * Returns zero to indicate success and negative for error
3620 *
3621 * @f: QEMUFile where to receive the data
3622 * @opaque: RAMState pointer
3623 */
3624 static int ram_load_setup(QEMUFile *f, void *opaque)
3625 {
3626 if (compress_threads_load_setup(f)) {
3627 return -1;
3628 }
3629
3630 xbzrle_load_setup();
3631 ramblock_recv_map_init();
3632
3633 return 0;
3634 }
3635
3636 static int ram_load_cleanup(void *opaque)
3637 {
3638 RAMBlock *rb;
3639
3640 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3641 qemu_ram_block_writeback(rb);
3642 }
3643
3644 xbzrle_load_cleanup();
3645 compress_threads_load_cleanup();
3646
3647 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3648 g_free(rb->receivedmap);
3649 rb->receivedmap = NULL;
3650 }
3651
3652 return 0;
3653 }
3654
3655 /**
3656 * ram_postcopy_incoming_init: allocate postcopy data structures
3657 *
3658 * Returns 0 for success and negative if there was one error
3659 *
3660 * @mis: current migration incoming state
3661 *
3662 * Allocate data structures etc needed by incoming migration with
3663 * postcopy-ram. postcopy-ram's similarly names
3664 * postcopy_ram_incoming_init does the work.
3665 */
3666 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3667 {
3668 return postcopy_ram_incoming_init(mis);
3669 }
3670
3671 /**
3672 * ram_load_postcopy: load a page in postcopy case
3673 *
3674 * Returns 0 for success or -errno in case of error
3675 *
3676 * Called in postcopy mode by ram_load().
3677 * rcu_read_lock is taken prior to this being called.
3678 *
3679 * @f: QEMUFile where to send the data
3680 */
3681 static int ram_load_postcopy(QEMUFile *f)
3682 {
3683 int flags = 0, ret = 0;
3684 bool place_needed = false;
3685 bool matches_target_page_size = false;
3686 MigrationIncomingState *mis = migration_incoming_get_current();
3687 /* Temporary page that is later 'placed' */
3688 void *postcopy_host_page = mis->postcopy_tmp_page;
3689 void *host_page = NULL;
3690 bool all_zero = true;
3691 int target_pages = 0;
3692
3693 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3694 ram_addr_t addr;
3695 void *page_buffer = NULL;
3696 void *place_source = NULL;
3697 RAMBlock *block = NULL;
3698 uint8_t ch;
3699 int len;
3700
3701 addr = qemu_get_be64(f);
3702
3703 /*
3704 * If qemu file error, we should stop here, and then "addr"
3705 * may be invalid
3706 */
3707 ret = qemu_file_get_error(f);
3708 if (ret) {
3709 break;
3710 }
3711
3712 flags = addr & ~TARGET_PAGE_MASK;
3713 addr &= TARGET_PAGE_MASK;
3714
3715 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3716 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3717 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3718 block = ram_block_from_stream(f, flags);
3719 if (!block) {
3720 ret = -EINVAL;
3721 break;
3722 }
3723
3724 /*
3725 * Relying on used_length is racy and can result in false positives.
3726 * We might place pages beyond used_length in case RAM was shrunk
3727 * while in postcopy, which is fine - trying to place via
3728 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3729 */
3730 if (!block->host || addr >= block->postcopy_length) {
3731 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3732 ret = -EINVAL;
3733 break;
3734 }
3735 target_pages++;
3736 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3737 /*
3738 * Postcopy requires that we place whole host pages atomically;
3739 * these may be huge pages for RAMBlocks that are backed by
3740 * hugetlbfs.
3741 * To make it atomic, the data is read into a temporary page
3742 * that's moved into place later.
3743 * The migration protocol uses, possibly smaller, target-pages
3744 * however the source ensures it always sends all the components
3745 * of a host page in one chunk.
3746 */
3747 page_buffer = postcopy_host_page +
3748 host_page_offset_from_ram_block_offset(block, addr);
3749 /* If all TP are zero then we can optimise the place */
3750 if (target_pages == 1) {
3751 host_page = host_page_from_ram_block_offset(block, addr);
3752 } else if (host_page != host_page_from_ram_block_offset(block,
3753 addr)) {
3754 /* not the 1st TP within the HP */
3755 error_report("Non-same host page %p/%p", host_page,
3756 host_page_from_ram_block_offset(block, addr));
3757 ret = -EINVAL;
3758 break;
3759 }
3760
3761 /*
3762 * If it's the last part of a host page then we place the host
3763 * page
3764 */
3765 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3766 place_needed = true;
3767 }
3768 place_source = postcopy_host_page;
3769 }
3770
3771 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3772 case RAM_SAVE_FLAG_ZERO:
3773 ch = qemu_get_byte(f);
3774 /*
3775 * We can skip setting page_buffer when
3776 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3777 */
3778 if (ch || !matches_target_page_size) {
3779 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3780 }
3781 if (ch) {
3782 all_zero = false;
3783 }
3784 break;
3785
3786 case RAM_SAVE_FLAG_PAGE:
3787 all_zero = false;
3788 if (!matches_target_page_size) {
3789 /* For huge pages, we always use temporary buffer */
3790 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3791 } else {
3792 /*
3793 * For small pages that matches target page size, we
3794 * avoid the qemu_file copy. Instead we directly use
3795 * the buffer of QEMUFile to place the page. Note: we
3796 * cannot do any QEMUFile operation before using that
3797 * buffer to make sure the buffer is valid when
3798 * placing the page.
3799 */
3800 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3801 TARGET_PAGE_SIZE);
3802 }
3803 break;
3804 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3805 all_zero = false;
3806 len = qemu_get_be32(f);
3807 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3808 error_report("Invalid compressed data length: %d", len);
3809 ret = -EINVAL;
3810 break;
3811 }
3812 decompress_data_with_multi_threads(f, page_buffer, len);
3813 break;
3814
3815 case RAM_SAVE_FLAG_EOS:
3816 /* normal exit */
3817 multifd_recv_sync_main();
3818 break;
3819 default:
3820 error_report("Unknown combination of migration flags: 0x%x"
3821 " (postcopy mode)", flags);
3822 ret = -EINVAL;
3823 break;
3824 }
3825
3826 /* Got the whole host page, wait for decompress before placing. */
3827 if (place_needed) {
3828 ret |= wait_for_decompress_done();
3829 }
3830
3831 /* Detect any possible file errors */
3832 if (!ret && qemu_file_get_error(f)) {
3833 ret = qemu_file_get_error(f);
3834 }
3835
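/*
 * Place the fully assembled host page atomically.  On Linux this is
 * typically done through userfaultfd (UFFDIO_ZEROPAGE for all-zero pages,
 * UFFDIO_COPY otherwise), as noted above, so the guest never observes a
 * partially populated host page.
 */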
3836 if (!ret && place_needed) {
3837 if (all_zero) {
3838 ret = postcopy_place_page_zero(mis, host_page, block);
3839 } else {
3840 ret = postcopy_place_page(mis, host_page, place_source,
3841 block);
3842 }
3843 place_needed = false;
3844 target_pages = 0;
3845 /* Assume we have a zero page until we detect something different */
3846 all_zero = true;
3847 }
3848 }
3849
3850 return ret;
3851 }
3852
3853 static bool postcopy_is_advised(void)
3854 {
3855 PostcopyState ps = postcopy_state_get();
3856 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3857 }
3858
3859 static bool postcopy_is_running(void)
3860 {
3861 PostcopyState ps = postcopy_state_get();
3862 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3863 }
3864
3865 /*
3866 * Flush the content of the RAM cache into the SVM's memory.
3867 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3868 */
3869 void colo_flush_ram_cache(void)
3870 {
3871 RAMBlock *block = NULL;
3872 void *dst_host;
3873 void *src_host;
3874 unsigned long offset = 0;
3875
3876 memory_global_dirty_log_sync();
3877 qemu_mutex_lock(&ram_state->bitmap_mutex);
3878 WITH_RCU_READ_LOCK_GUARD() {
3879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3880 ramblock_sync_dirty_bitmap(ram_state, block);
3881 }
3882 }
3883
3884 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3885 WITH_RCU_READ_LOCK_GUARD() {
3886 block = QLIST_FIRST_RCU(&ram_list.blocks);
3887
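/*
 * Walk every RAMBlock in turn: for each dirty page found in the bitmap,
 * clear its dirty bit and copy the page from the colo_cache back into the
 * SVM's memory.  Once no further dirty page is found within the current
 * block, move on to the next one.
 */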
3888 while (block) {
3889 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3890
3891 if (!offset_in_ramblock(block,
3892 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3893 offset = 0;
3894 block = QLIST_NEXT_RCU(block, next);
3895 } else {
3896 migration_bitmap_clear_dirty(ram_state, block, offset);
3897 dst_host = block->host
3898 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3899 src_host = block->colo_cache
3900 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3901 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3902 }
3903 }
3904 }
3905 trace_colo_flush_ram_cache_end();
3906 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3907 }
3908
3909 /**
3910 * ram_load_precopy: load pages in precopy case
3911 *
3912 * Returns 0 for success or -errno in case of error
3913 *
3914 * Called in precopy mode by ram_load().
3915 * rcu_read_lock is taken prior to this being called.
3916 *
3917 * @f: QEMUFile to read the data from
3918 */
3919 static int ram_load_precopy(QEMUFile *f)
3920 {
3921 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3922 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3923 bool postcopy_advised = postcopy_is_advised();
3924 if (!migrate_use_compression()) {
3925 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3926 }
3927
3928 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3929 ram_addr_t addr, total_ram_bytes;
3930 void *host = NULL, *host_bak = NULL;
3931 uint8_t ch;
3932
3933 /*
3934 * Yield periodically to let the main loop run, but an iteration of
3935 * the main loop is expensive, so only do it once every 32768 iterations.
3936 */
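/*
 * aio_co_schedule() queues this coroutine to be re-entered on its current
 * AioContext, so the yield below gives other pending main-loop work a
 * chance to run before page loading continues.
 */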
3937 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3938 aio_co_schedule(qemu_get_current_aio_context(),
3939 qemu_coroutine_self());
3940 qemu_coroutine_yield();
3941 }
3942 i++;
3943
3944 addr = qemu_get_be64(f);
3945 flags = addr & ~TARGET_PAGE_MASK;
3946 addr &= TARGET_PAGE_MASK;
3947
3948 if (flags & invalid_flags) {
3949 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3950 error_report("Received an unexpected compressed page");
3951 }
3952
3953 ret = -EINVAL;
3954 break;
3955 }
3956
3957 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3958 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3959 RAMBlock *block = ram_block_from_stream(f, flags);
3960
3961 host = host_from_ram_block_offset(block, addr);
3962 /*
3963 * After entering the COLO stage, we should not load pages into the
3964 * SVM's memory directly; we put them into the colo_cache first.
3965 * NOTE: We need to keep a copy of the SVM's RAM in the colo_cache.
3966 * Previously, all this memory was copied in the COLO preparation
3967 * stage, while the VM had to be stopped, which is time-consuming.
3968 * Here we optimize it with a trick: back up every page during the
3969 * migration process while COLO is enabled. Although this slows the
3970 * migration down a bit, it clearly reduces the downtime caused by
3971 * backing up all of the SVM's memory in the COLO preparation stage.
3972 */
3973 if (migration_incoming_colo_enabled()) {
3974 if (migration_incoming_in_colo_state()) {
3975 /* In COLO stage, put all pages into cache temporarily */
3976 host = colo_cache_from_block_offset(block, addr, true);
3977 } else {
3978 /*
3979 * In the migration stage but before the COLO stage,
3980 * put all pages into both the cache and the SVM's memory.
3981 */
3982 host_bak = colo_cache_from_block_offset(block, addr, false);
3983 }
3984 }
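/*
 * When host_bak is set, the page is loaded into the SVM's memory (host)
 * and then duplicated into the colo_cache by the memcpy() at the end of
 * this loop iteration.
 */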
3985 if (!host) {
3986 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3987 ret = -EINVAL;
3988 break;
3989 }
3990 if (!migration_incoming_in_colo_state()) {
3991 ramblock_recv_bitmap_set(block, host);
3992 }
3993
3994 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3995 }
3996
3997 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3998 case RAM_SAVE_FLAG_MEM_SIZE:
3999 /* Synchronize RAM block list */
4000 total_ram_bytes = addr;
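/*
 * The stream then carries one record per RAMBlock: a one-byte ID length,
 * the ID string itself, and a be64 block length.  When postcopy was
 * advised a be64 page size may follow, and with ignore-shared a be64 GPA
 * is appended as well, mirroring the parsing below.
 */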
4001 while (!ret && total_ram_bytes) {
4002 RAMBlock *block;
4003 char id[256];
4004 ram_addr_t length;
4005
4006 len = qemu_get_byte(f);
4007 qemu_get_buffer(f, (uint8_t *)id, len);
4008 id[len] = 0;
4009 length = qemu_get_be64(f);
4010
4011 block = qemu_ram_block_by_name(id);
4012 if (block && !qemu_ram_is_migratable(block)) {
4013 error_report("block %s should not be migrated!", id);
4014 ret = -EINVAL;
4015 } else if (block) {
4016 if (length != block->used_length) {
4017 Error *local_err = NULL;
4018
4019 ret = qemu_ram_resize(block, length,
4020 &local_err);
4021 if (local_err) {
4022 error_report_err(local_err);
4023 }
4024 }
4025 /* For postcopy we need to check hugepage sizes match */
4026 if (postcopy_advised && migrate_postcopy_ram() &&
4027 block->page_size != qemu_host_page_size) {
4028 uint64_t remote_page_size = qemu_get_be64(f);
4029 if (remote_page_size != block->page_size) {
4030 error_report("Mismatched RAM page size %s "
4031 "(local) %zd != %" PRId64,
4032 id, block->page_size,
4033 remote_page_size);
4034 ret = -EINVAL;
4035 }
4036 }
4037 if (migrate_ignore_shared()) {
4038 hwaddr addr = qemu_get_be64(f);
4039 if (ramblock_is_ignored(block) &&
4040 block->mr->addr != addr) {
4041 error_report("Mismatched GPAs for block %s "
4042 "%" PRId64 " != %" PRId64,
4043 id, (uint64_t)addr,
4044 (uint64_t)block->mr->addr);
4045 ret = -EINVAL;
4046 }
4047 }
4048 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4049 block->idstr);
4050 } else {
4051 error_report("Unknown ramblock \"%s\", cannot "
4052 "accept migration", id);
4053 ret = -EINVAL;
4054 }
4055
4056 total_ram_bytes -= length;
4057 }
4058 break;
4059
4060 case RAM_SAVE_FLAG_ZERO:
4061 ch = qemu_get_byte(f);
4062 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4063 break;
4064
4065 case RAM_SAVE_FLAG_PAGE:
4066 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4067 break;
4068
4069 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4070 len = qemu_get_be32(f);
4071 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4072 error_report("Invalid compressed data length: %d", len);
4073 ret = -EINVAL;
4074 break;
4075 }
4076 decompress_data_with_multi_threads(f, host, len);
4077 break;
4078
4079 case RAM_SAVE_FLAG_XBZRLE:
4080 if (load_xbzrle(f, addr, host) < 0) {
4081 error_report("Failed to decompress XBZRLE page at "
4082 RAM_ADDR_FMT, addr);
4083 ret = -EINVAL;
4084 break;
4085 }
4086 break;
4087 case RAM_SAVE_FLAG_EOS:
4088 /* normal exit */
4089 multifd_recv_sync_main();
4090 break;
4091 default:
4092 if (flags & RAM_SAVE_FLAG_HOOK) {
4093 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4094 } else {
4095 error_report("Unknown combination of migration flags: 0x%x",
4096 flags);
4097 ret = -EINVAL;
4098 }
4099 }
4100 if (!ret) {
4101 ret = qemu_file_get_error(f);
4102 }
4103 if (!ret && host_bak) {
4104 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4105 }
4106 }
4107
4108 ret |= wait_for_decompress_done();
4109 return ret;
4110 }
4111
4112 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4113 {
4114 int ret = 0;
4115 static uint64_t seq_iter;
4116 /*
4117 * If the system is running in postcopy mode, page inserts into host
4118 * memory must be atomic.
4119 */
4120 bool postcopy_running = postcopy_is_running();
4121
4122 seq_iter++;
4123
4124 if (version_id != 4) {
4125 return -EINVAL;
4126 }
4127
4128 /*
4129 * This RCU critical section can be very long running.
4130 * When RCU reclaims in the code start to become numerous,
4131 * it will be necessary to reduce the granularity of this
4132 * critical section.
4133 */
4134 WITH_RCU_READ_LOCK_GUARD() {
4135 if (postcopy_running) {
4136 ret = ram_load_postcopy(f);
4137 } else {
4138 ret = ram_load_precopy(f);
4139 }
4140 }
4141 trace_ram_load_complete(ret, seq_iter);
4142
4143 return ret;
4144 }
4145
4146 static bool ram_has_postcopy(void *opaque)
4147 {
4148 RAMBlock *rb;
4149 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4150 if (ramblock_is_pmem(rb)) {
4151 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
4152 "is not supported now!", rb->idstr, rb->host);
4153 return false;
4154 }
4155 }
4156
4157 return migrate_postcopy_ram();
4158 }
4159
4160 /* Sync all the dirty bitmaps with the destination VM. */
4161 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4162 {
4163 RAMBlock *block;
4164 QEMUFile *file = s->to_dst_file;
4165 int ramblock_count = 0;
4166
4167 trace_ram_dirty_bitmap_sync_start();
4168
4169 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4170 qemu_savevm_send_recv_bitmap(file, block->idstr);
4171 trace_ram_dirty_bitmap_request(block->idstr);
4172 ramblock_count++;
4173 }
4174
4175 trace_ram_dirty_bitmap_sync_wait();
4176
4177 /* Wait until all the ramblocks' dirty bitmaps have been synced */
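/*
 * Each qemu_sem_wait() pairs with a qemu_sem_post() issued by
 * ram_dirty_bitmap_reload_notify() once ram_dirty_bitmap_reload() has
 * finished handling one block's bitmap on the return path.
 */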
4178 while (ramblock_count--) {
4179 qemu_sem_wait(&s->rp_state.rp_sem);
4180 }
4181
4182 trace_ram_dirty_bitmap_sync_complete();
4183
4184 return 0;
4185 }
4186
4187 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4188 {
4189 qemu_sem_post(&s->rp_state.rp_sem);
4190 }
4191
4192 /*
4193 * Read the received bitmap and invert it to form the initial dirty
4194 * bitmap. This is only used when a postcopy migration is paused and
4195 * we want to resume it from a middle point.
4196 */
4197 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4198 {
4199 int ret = -EINVAL;
4200 /* from_dst_file is always valid because we're within rp_thread */
4201 QEMUFile *file = s->rp_state.from_dst_file;
4202 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4203 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4204 uint64_t size, end_mark;
4205
4206 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4207
4208 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4209 error_report("%s: incorrect state %s", __func__,
4210 MigrationStatus_str(s->state));
4211 return -EINVAL;
4212 }
4213
4214 /*
4215 * Note: see comments in ramblock_recv_bitmap_send() on why we
4216 * need the endianness conversion and the padding.
4217 */
4218 local_size = ROUND_UP(local_size, 8);
4219
4220 /* Add padding */
4221 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
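/*
 * For example, a 1 GiB block with 4 KiB target pages has nbits = 262144,
 * giving local_size = 32768 bytes (already a multiple of 8, so unchanged
 * by the ROUND_UP above).
 */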
4222
4223 size = qemu_get_be64(file);
4224
4225 /* The size of the bitmap should match that of our ramblock */
4226 if (size != local_size) {
4227 error_report("%s: ramblock '%s' bitmap size mismatch "
4228 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4229 block->idstr, size, local_size);
4230 ret = -EINVAL;
4231 goto out;
4232 }
4233
4234 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4235 end_mark = qemu_get_be64(file);
4236
4237 ret = qemu_file_get_error(file);
4238 if (ret || size != local_size) {
4239 error_report("%s: read bitmap failed for ramblock '%s': %d"
4240 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4241 __func__, block->idstr, ret, local_size, size);
4242 ret = -EIO;
4243 goto out;
4244 }
4245
4246 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4247 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4248 __func__, block->idstr, end_mark);
4249 ret = -EINVAL;
4250 goto out;
4251 }
4252
4253 /*
4254 * Endianness conversion. We are in postcopy (though paused), so
4255 * the dirty bitmap won't change and we can modify it directly.
4256 */
4257 bitmap_from_le(block->bmap, le_bitmap, nbits);
4258
4259 /*
4260 * What we received is the "received bitmap". Invert it to get the
4261 * initial dirty bitmap for this ramblock.
4262 */
4263 bitmap_complement(block->bmap, block->bmap, nbits);
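/*
 * In other words: a page already received on the destination (bit set in
 * the received bitmap) becomes clean and won't be resent, while a page
 * that was never received becomes dirty and will be sent again once the
 * migration resumes.
 */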
4264
4265 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4266 ramblock_dirty_bitmap_clear_discarded_pages(block);
4267
4268 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4269 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4270
4271 /*
4272 * We have successfully synced the bitmap for the current ramblock. If
4273 * this is the last one to sync, we need to notify the main send thread.
4274 */
4275 ram_dirty_bitmap_reload_notify(s);
4276
4277 ret = 0;
4278 out:
4279 g_free(le_bitmap);
4280 return ret;
4281 }
4282
4283 static int ram_resume_prepare(MigrationState *s, void *opaque)
4284 {
4285 RAMState *rs = *(RAMState **)opaque;
4286 int ret;
4287
4288 ret = ram_dirty_bitmap_sync_all(s, rs);
4289 if (ret) {
4290 return ret;
4291 }
4292
4293 ram_state_resume_prepare(rs, s->to_dst_file);
4294
4295 return 0;
4296 }
4297
4298 static SaveVMHandlers savevm_ram_handlers = {
4299 .save_setup = ram_save_setup,
4300 .save_live_iterate = ram_save_iterate,
4301 .save_live_complete_postcopy = ram_save_complete,
4302 .save_live_complete_precopy = ram_save_complete,
4303 .has_postcopy = ram_has_postcopy,
4304 .save_live_pending = ram_save_pending,
4305 .load_state = ram_load,
4306 .save_cleanup = ram_save_cleanup,
4307 .load_setup = ram_load_setup,
4308 .load_cleanup = ram_load_cleanup,
4309 .resume_prepare = ram_resume_prepare,
4310 };
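/*
 * These handlers are invoked by the generic savevm/migration core.  They
 * are registered by ram_mig_init() below via
 * register_savevm_live("ram", 0, 4, ...), and the section version 4
 * matches the version_id check in ram_load().
 */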
4311
4312 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4313 size_t old_size, size_t new_size)
4314 {
4315 PostcopyState ps = postcopy_state_get();
4316 ram_addr_t offset;
4317 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4318 Error *err = NULL;
4319
4320 if (ramblock_is_ignored(rb)) {
4321 return;
4322 }
4323
4324 if (!migration_is_idle()) {
4325 /*
4326 * Precopy code on the source cannot deal with the size of RAM blocks
4327 * changing at random points in time - especially after sending the
4328 * RAM block sizes in the migration stream, they must no longer change.
4329 * Abort and indicate a proper reason.
4330 */
4331 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4332 migration_cancel(err);
4333 error_free(err);
4334 }
4335
4336 switch (ps) {
4337 case POSTCOPY_INCOMING_ADVISE:
4338 /*
4339 * Update what ram_postcopy_incoming_init()->init_range() does at the
4340 * time postcopy was advised. Syncing RAM blocks with the source will
4341 * result in RAM resizes.
4342 */
4343 if (old_size < new_size) {
4344 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4345 error_report("RAM block '%s' discard of resized RAM failed",
4346 rb->idstr);
4347 }
4348 }
4349 rb->postcopy_length = new_size;
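/*
 * postcopy_length is what ram_load_postcopy() checks incoming page
 * offsets against (the addr >= block->postcopy_length test above), so
 * the bound now reflects the new size.
 */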
4350 break;
4351 case POSTCOPY_INCOMING_NONE:
4352 case POSTCOPY_INCOMING_RUNNING:
4353 case POSTCOPY_INCOMING_END:
4354 /*
4355 * Once our guest is running, postcopy no longer cares about
4356 * resizes. When growing, the new memory was not available on the
4357 * source, so no handling is needed.
4358 */
4359 break;
4360 default:
4361 error_report("RAM block '%s' resized during postcopy state: %d",
4362 rb->idstr, ps);
4363 exit(-1);
4364 }
4365 }
4366
4367 static RAMBlockNotifier ram_mig_ram_notifier = {
4368 .ram_block_resized = ram_mig_ram_block_resized,
4369 };
4370
4371 void ram_mig_init(void)
4372 {
4373 qemu_mutex_init(&XBZRLE.lock);
4374 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4375 ram_block_notifier_add(&ram_mig_ram_notifier);
4376 }