migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28 #include "qemu/osdep.h"
  29 #include "qemu-common.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qapi-event.h"
  33 #include "qemu/cutils.h"
  34 #include "qemu/bitops.h"
  35 #include "qemu/bitmap.h"
  36 #include "qemu/timer.h"
  37 #include "qemu/main-loop.h"
  38 #include "migration/migration.h"
  39 #include "migration/postcopy-ram.h"
  40 #include "exec/address-spaces.h"
  41 #include "migration/page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "trace.h"
  44 #include "exec/ram_addr.h"
  45 #include "qemu/rcu_queue.h"
  46 #include "migration/colo.h"
  47
/* Counter used by the auto-converge check in migration_bitmap_sync(): it is
 * bumped when the pages dirtied in a period outweigh half of the bytes
 * transferred, and reset to 0 when throttling is triggered. */
static int dirty_rate_high_cnt;

/* Incremented on every migration_bitmap_sync() call; also passed to the
 * XBZRLE page cache calls (cache_insert(), cache_is_cached()). */
static uint64_t bitmap_sync_count;
  51
  52 /***********************************************************/
  53 /* ram save/restore */
  54
/* Flag bits OR-ed into the page offset that save_page_header() writes. */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02 /* set by save_zero_page() for an all-zero page */
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08 /* set by ram_save_page() for a raw page */
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20 /* when set, save_page_header() omits the block id */
#define RAM_SAVE_FLAG_XBZRLE   0x40 /* set by save_xbzrle_page() */
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100 /* set by do_compress_ram_page() */

/* Page content inserted into the XBZRLE cache by xbzrle_cache_zero_page();
 * presumably an all-zero target page allocated elsewhere -- TODO confirm */
static uint8_t *ZERO_TARGET_PAGE;
  66
  67 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  68 {
  69     return buffer_is_zero(p, size);
  70 }
  71
/* struct contains XBZRLE cache and a static page
   used by the compression.
   save_xbzrle_page() uses all three buffers/cache members while encoding. */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* taken and released through XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
  86
  87 static void XBZRLE_cache_lock(void)
  88 {
  89     if (migrate_use_xbzrle())
  90         qemu_mutex_lock(&XBZRLE.lock);
  91 }
  92
  93 static void XBZRLE_cache_unlock(void)
  94 {
  95     if (migrate_use_xbzrle())
  96         qemu_mutex_unlock(&XBZRLE.lock);
  97 }
  98
  99 /*
 100  * called from qmp_migrate_set_cache_size in main thread, possibly while
 101  * a migration is in progress.
 102  * A running migration maybe using the cache and might finish during this
 103  * call, hence changes to the cache are protected by XBZRLE.lock().
 104  */
 105 int64_t xbzrle_cache_resize(int64_t new_size)
 106 {
 107     PageCache *new_cache;
 108     int64_t ret;
 109
 110     if (new_size < TARGET_PAGE_SIZE) {
 111         return -1;
 112     }
 113
 114     XBZRLE_cache_lock();
 115
 116     if (XBZRLE.cache != NULL) {
 117         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 118             goto out_new_size;
 119         }
 120         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 121                                         TARGET_PAGE_SIZE);
 122         if (!new_cache) {
 123             error_report("Error creating cache");
 124             ret = -1;
 125             goto out;
 126         }
 127
 128         cache_fini(XBZRLE.cache);
 129         XBZRLE.cache = new_cache;
 130     }
 131
 132 out_new_size:
 133     ret = pow2floor(new_size);
 134 out:
 135     XBZRLE_cache_unlock();
 136     return ret;
 137 }
 138
 139 /* accounting for migration statistics */
 140 typedef struct AccountingInfo {
 141     uint64_t dup_pages;
 142     uint64_t skipped_pages;
 143     uint64_t norm_pages;
 144     uint64_t iterations;
 145     uint64_t xbzrle_bytes;
 146     uint64_t xbzrle_pages;
 147     uint64_t xbzrle_cache_miss;
 148     double xbzrle_cache_miss_rate;
 149     uint64_t xbzrle_overflows;
 150 } AccountingInfo;
 151
 152 static AccountingInfo acct_info;
 153
 154 static void acct_clear(void)
 155 {
 156     memset(&acct_info, 0, sizeof(acct_info));
 157 }
 158
 159 uint64_t dup_mig_bytes_transferred(void)
 160 {
 161     return acct_info.dup_pages * TARGET_PAGE_SIZE;
 162 }
 163
 164 uint64_t dup_mig_pages_transferred(void)
 165 {
 166     return acct_info.dup_pages;
 167 }
 168
 169 uint64_t skipped_mig_bytes_transferred(void)
 170 {
 171     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 172 }
 173
 174 uint64_t skipped_mig_pages_transferred(void)
 175 {
 176     return acct_info.skipped_pages;
 177 }
 178
 179 uint64_t norm_mig_bytes_transferred(void)
 180 {
 181     return acct_info.norm_pages * TARGET_PAGE_SIZE;
 182 }
 183
 184 uint64_t norm_mig_pages_transferred(void)
 185 {
 186     return acct_info.norm_pages;
 187 }
 188
 189 uint64_t xbzrle_mig_bytes_transferred(void)
 190 {
 191     return acct_info.xbzrle_bytes;
 192 }
 193
 194 uint64_t xbzrle_mig_pages_transferred(void)
 195 {
 196     return acct_info.xbzrle_pages;
 197 }
 198
 199 uint64_t xbzrle_mig_pages_cache_miss(void)
 200 {
 201     return acct_info.xbzrle_cache_miss;
 202 }
 203
 204 double xbzrle_mig_cache_miss_rate(void)
 205 {
 206     return acct_info.xbzrle_cache_miss_rate;
 207 }
 208
 209 uint64_t xbzrle_mig_pages_overflow(void)
 210 {
 211     return acct_info.xbzrle_overflows;
 212 }
 213
/* This is the last block that we have visited searching for dirty pages
 */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
/* Held by migration_bitmap_sync() while it walks the RAM blocks */
static QemuMutex migration_bitmap_mutex;
/* Running count of dirty pages: increased by migration_bitmap_sync_range(),
 * decreased by migration_bitmap_clear_dirty() */
static uint64_t migration_dirty_pages;
static uint32_t last_version;
/* While set, migration_bitmap_find_dirty() can return the next page without
 * consulting the bitmap, and the XBZRLE paths in ram_save_page() and
 * xbzrle_cache_zero_page() are bypassed */
static bool ram_bulk_stage;
 224
 225 /* used by the search for pages to send */
 226 struct PageSearchStatus {
 227     /* Current block being searched */
 228     RAMBlock    *block;
 229     /* Current offset to search from */
 230     ram_addr_t   offset;
 231     /* Set once we wrap around */
 232     bool         complete_round;
 233 };
 234 typedef struct PageSearchStatus PageSearchStatus;
 235
 236 static struct BitmapRcu {
 237     struct rcu_head rcu;
 238     /* Main migration bitmap */
 239     unsigned long *bmap;
 240     /* bitmap of pages that haven't been sent even once
 241      * only maintained and used in postcopy at the moment
 242      * where it's used to send the dirtymap at the start
 243      * of the postcopy phase
 244      */
 245     unsigned long *unsentmap;
 246 } *migration_bitmap_rcu;
 247
 248 struct CompressParam {
 249     bool done;
 250     bool quit;
 251     QEMUFile *file;
 252     QemuMutex mutex;
 253     QemuCond cond;
 254     RAMBlock *block;
 255     ram_addr_t offset;
 256 };
 257 typedef struct CompressParam CompressParam;
 258
 259 struct DecompressParam {
 260     bool done;
 261     bool quit;
 262     QemuMutex mutex;
 263     QemuCond cond;
 264     void *des;
 265     uint8_t *compbuf;
 266     int len;
 267 };
 268 typedef struct DecompressParam DecompressParam;
 269
/* One CompressParam and one QemuThread per compression worker; both arrays
 * are allocated by migrate_compress_threads_create() and freed by
 * migrate_compress_threads_join(). */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Set to true by migrate_compress_threads_create() */
static bool compression_switch;
/* Decompression-side counterparts of the variables above.
 * NOTE(review): their users are outside the visible part of the file. */
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: do_data_compress() calls this before it is defined */
static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);
 289
 290 static void *do_data_compress(void *opaque)
 291 {
 292     CompressParam *param = opaque;
 293     RAMBlock *block;
 294     ram_addr_t offset;
 295
 296     qemu_mutex_lock(&param->mutex);
 297     while (!param->quit) {
 298         if (param->block) {
 299             block = param->block;
 300             offset = param->offset;
 301             param->block = NULL;
 302             qemu_mutex_unlock(&param->mutex);
 303
 304             do_compress_ram_page(param->file, block, offset);
 305
 306             qemu_mutex_lock(&comp_done_lock);
 307             param->done = true;
 308             qemu_cond_signal(&comp_done_cond);
 309             qemu_mutex_unlock(&comp_done_lock);
 310
 311             qemu_mutex_lock(&param->mutex);
 312         } else {
 313             qemu_cond_wait(&param->cond, &param->mutex);
 314         }
 315     }
 316     qemu_mutex_unlock(&param->mutex);
 317
 318     return NULL;
 319 }
 320
 321 static inline void terminate_compression_threads(void)
 322 {
 323     int idx, thread_count;
 324
 325     thread_count = migrate_compress_threads();
 326     for (idx = 0; idx < thread_count; idx++) {
 327         qemu_mutex_lock(&comp_param[idx].mutex);
 328         comp_param[idx].quit = true;
 329         qemu_cond_signal(&comp_param[idx].cond);
 330         qemu_mutex_unlock(&comp_param[idx].mutex);
 331     }
 332 }
 333
 334 void migrate_compress_threads_join(void)
 335 {
 336     int i, thread_count;
 337
 338     if (!migrate_use_compression()) {
 339         return;
 340     }
 341     terminate_compression_threads();
 342     thread_count = migrate_compress_threads();
 343     for (i = 0; i < thread_count; i++) {
 344         qemu_thread_join(compress_threads + i);
 345         qemu_fclose(comp_param[i].file);
 346         qemu_mutex_destroy(&comp_param[i].mutex);
 347         qemu_cond_destroy(&comp_param[i].cond);
 348     }
 349     qemu_mutex_destroy(&comp_done_lock);
 350     qemu_cond_destroy(&comp_done_cond);
 351     g_free(compress_threads);
 352     g_free(comp_param);
 353     compress_threads = NULL;
 354     comp_param = NULL;
 355 }
 356
 357 void migrate_compress_threads_create(void)
 358 {
 359     int i, thread_count;
 360
 361     if (!migrate_use_compression()) {
 362         return;
 363     }
 364     compression_switch = true;
 365     thread_count = migrate_compress_threads();
 366     compress_threads = g_new0(QemuThread, thread_count);
 367     comp_param = g_new0(CompressParam, thread_count);
 368     qemu_cond_init(&comp_done_cond);
 369     qemu_mutex_init(&comp_done_lock);
 370     for (i = 0; i < thread_count; i++) {
 371         /* comp_param[i].file is just used as a dummy buffer to save data,
 372          * set its ops to empty.
 373          */
 374         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 375         comp_param[i].done = true;
 376         comp_param[i].quit = false;
 377         qemu_mutex_init(&comp_param[i].mutex);
 378         qemu_cond_init(&comp_param[i].cond);
 379         qemu_thread_create(compress_threads + i, "compress",
 380                            do_data_compress, comp_param + i,
 381                            QEMU_THREAD_JOINABLE);
 382     }
 383 }
 384
 385 /**
 386  * save_page_header: Write page header to wire
 387  *
 388  * If this is the 1st block, it also writes the block identification
 389  *
 390  * Returns: Number of bytes written
 391  *
 392  * @f: QEMUFile where to send the data
 393  * @block: block that contains the page we want to send
 394  * @offset: offset inside the block for the page
 395  *          in the lower bits, it contains flags
 396  */
 397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 398 {
 399     size_t size, len;
 400
 401     qemu_put_be64(f, offset);
 402     size = 8;
 403
 404     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 405         len = strlen(block->idstr);
 406         qemu_put_byte(f, len);
 407         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 408         size += 1 + len;
 409     }
 410     return size;
 411 }
 412
 413 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 414  * If guest dirty memory rate is reduced below the rate at which we can
 415  * transfer pages to the destination then we should be able to complete
 416  * migration. Some workloads dirty memory way too fast and will not effectively
 417  * converge, even with auto-converge.
 418  */
 419 static void mig_throttle_guest_down(void)
 420 {
 421     MigrationState *s = migrate_get_current();
 422     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 423     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 424
 425     /* We have not started throttling yet. Let's start it. */
 426     if (!cpu_throttle_active()) {
 427         cpu_throttle_set(pct_initial);
 428     } else {
 429         /* Throttling already on, just increase the rate */
 430         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 431     }
 432 }
 433
 434 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
 435  * The important thing is that a stale (not-yet-0'd) page be replaced
 436  * by the new data.
 437  * As a bonus, if the page wasn't in the cache it gets added so that
 438  * when a small write is made into the 0'd page it gets XBZRLE sent
 439  */
 440 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 441 {
 442     if (ram_bulk_stage || !migrate_use_xbzrle()) {
 443         return;
 444     }
 445
 446     /* We don't care if this fails to allocate a new cache page
 447      * as long as it updated an old one */
 448     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 449                  bitmap_sync_count);
 450 }
 451
/* Encoding byte that save_xbzrle_page() writes right after the page header */
#define ENCODING_FLAG_XBZRLE 0x1
 453
 454 /**
 455  * save_xbzrle_page: compress and send current page
 456  *
 457  * Returns: 1 means that we wrote the page
 458  *          0 means that page is identical to the one already sent
 459  *          -1 means that xbzrle would be longer than normal
 460  *
 461  * @f: QEMUFile where to send the data
 462  * @current_data:
 463  * @current_addr:
 464  * @block: block that contains the page we want to send
 465  * @offset: offset inside the block for the page
 466  * @last_stage: if we are at the completion stage
 467  * @bytes_transferred: increase it with the number of transferred bytes
 468  */
 469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
 470                             ram_addr_t current_addr, RAMBlock *block,
 471                             ram_addr_t offset, bool last_stage,
 472                             uint64_t *bytes_transferred)
 473 {
 474     int encoded_len = 0, bytes_xbzrle;
 475     uint8_t *prev_cached_page;
 476
 477     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
 478         acct_info.xbzrle_cache_miss++;
 479         if (!last_stage) {
 480             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 481                              bitmap_sync_count) == -1) {
 482                 return -1;
 483             } else {
 484                 /* update *current_data when the page has been
 485                    inserted into cache */
 486                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 487             }
 488         }
 489         return -1;
 490     }
 491
 492     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 493
 494     /* save current buffer into memory */
 495     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 496
 497     /* XBZRLE encoding (if there is no overflow) */
 498     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 499                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 500                                        TARGET_PAGE_SIZE);
 501     if (encoded_len == 0) {
 502         trace_save_xbzrle_page_skipping();
 503         return 0;
 504     } else if (encoded_len == -1) {
 505         trace_save_xbzrle_page_overflow();
 506         acct_info.xbzrle_overflows++;
 507         /* update data in the cache */
 508         if (!last_stage) {
 509             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 510             *current_data = prev_cached_page;
 511         }
 512         return -1;
 513     }
 514
 515     /* we need to update the data in the cache, in order to get the same data */
 516     if (!last_stage) {
 517         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 518     }
 519
 520     /* Send XBZRLE based compressed page */
 521     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 522     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 523     qemu_put_be16(f, encoded_len);
 524     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 525     bytes_xbzrle += encoded_len + 1 + 2;
 526     acct_info.xbzrle_pages++;
 527     acct_info.xbzrle_bytes += bytes_xbzrle;
 528     *bytes_transferred += bytes_xbzrle;
 529
 530     return 1;
 531 }
 532
 533 /* Called with rcu_read_lock() to protect migration_bitmap
 534  * rb: The RAMBlock  to search for dirty pages in
 535  * start: Start address (typically so we can continue from previous page)
 536  * ram_addr_abs: Pointer into which to store the address of the dirty page
 537  *               within the global ram_addr space
 538  *
 539  * Returns: byte offset within memory region of the start of a dirty page
 540  */
 541 static inline
 542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 543                                        ram_addr_t start,
 544                                        ram_addr_t *ram_addr_abs)
 545 {
 546     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 547     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 548     uint64_t rb_size = rb->used_length;
 549     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 550     unsigned long *bitmap;
 551
 552     unsigned long next;
 553
 554     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 555     if (ram_bulk_stage && nr > base) {
 556         next = nr + 1;
 557     } else {
 558         next = find_next_bit(bitmap, size, nr);
 559     }
 560
 561     *ram_addr_abs = next << TARGET_PAGE_BITS;
 562     return (next - base) << TARGET_PAGE_BITS;
 563 }
 564
 565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 566 {
 567     bool ret;
 568     int nr = addr >> TARGET_PAGE_BITS;
 569     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 570
 571     ret = test_and_clear_bit(nr, bitmap);
 572
 573     if (ret) {
 574         migration_dirty_pages--;
 575     }
 576     return ret;
 577 }
 578
 579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 580 {
 581     unsigned long *bitmap;
 582     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 583     migration_dirty_pages +=
 584         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 585 }
 586
/* Fix me: there are too many global variables used in migration process. */
/* The five variables below are bookkeeping for migration_bitmap_sync() and
 * are zeroed by migration_bitmap_sync_init(). */
/* start of the current measurement period, in ms of QEMU_CLOCK_REALTIME */
static int64_t start_time;
/* ram_bytes_transferred() as sampled at the previous auto-converge check */
static int64_t bytes_xfer_prev;
/* pages found dirty since start_time */
static int64_t num_dirty_pages_period;
/* acct_info.xbzrle_cache_miss at the end of the previous period */
static uint64_t xbzrle_cache_miss_prev;
/* acct_info.iterations at the end of the previous period */
static uint64_t iterations_prev;
 593
 594 static void migration_bitmap_sync_init(void)
 595 {
 596     start_time = 0;
 597     bytes_xfer_prev = 0;
 598     num_dirty_pages_period = 0;
 599     xbzrle_cache_miss_prev = 0;
 600     iterations_prev = 0;
 601 }
 602
 603 static void migration_bitmap_sync(void)
 604 {
 605     RAMBlock *block;
 606     uint64_t num_dirty_pages_init = migration_dirty_pages;
 607     MigrationState *s = migrate_get_current();
 608     int64_t end_time;
 609     int64_t bytes_xfer_now;
 610
 611     bitmap_sync_count++;
 612
 613     if (!bytes_xfer_prev) {
 614         bytes_xfer_prev = ram_bytes_transferred();
 615     }
 616
 617     if (!start_time) {
 618         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 619     }
 620
 621     trace_migration_bitmap_sync_start();
 622     memory_global_dirty_log_sync();
 623
 624     qemu_mutex_lock(&migration_bitmap_mutex);
 625     rcu_read_lock();
 626     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 627         migration_bitmap_sync_range(block->offset, block->used_length);
 628     }
 629     rcu_read_unlock();
 630     qemu_mutex_unlock(&migration_bitmap_mutex);
 631
 632     trace_migration_bitmap_sync_end(migration_dirty_pages
 633                                     - num_dirty_pages_init);
 634     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 635     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 636
 637     /* more than 1 second = 1000 millisecons */
 638     if (end_time > start_time + 1000) {
 639         if (migrate_auto_converge()) {
 640             /* The following detection logic can be refined later. For now:
 641                Check to see if the dirtied bytes is 50% more than the approx.
 642                amount of bytes that just got transferred since the last time we
 643                were in this routine. If that happens twice, start or increase
 644                throttling */
 645             bytes_xfer_now = ram_bytes_transferred();
 646
 647             if (s->dirty_pages_rate &&
 648                (num_dirty_pages_period * TARGET_PAGE_SIZE >
 649                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
 650                (dirty_rate_high_cnt++ >= 2)) {
 651                     trace_migration_throttle();
 652                     dirty_rate_high_cnt = 0;
 653                     mig_throttle_guest_down();
 654              }
 655              bytes_xfer_prev = bytes_xfer_now;
 656         }
 657
 658         if (migrate_use_xbzrle()) {
 659             if (iterations_prev != acct_info.iterations) {
 660                 acct_info.xbzrle_cache_miss_rate =
 661                    (double)(acct_info.xbzrle_cache_miss -
 662                             xbzrle_cache_miss_prev) /
 663                    (acct_info.iterations - iterations_prev);
 664             }
 665             iterations_prev = acct_info.iterations;
 666             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 667         }
 668         s->dirty_pages_rate = num_dirty_pages_period * 1000
 669             / (end_time - start_time);
 670         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 671         start_time = end_time;
 672         num_dirty_pages_period = 0;
 673     }
 674     s->dirty_sync_count = bitmap_sync_count;
 675     if (migrate_use_events()) {
 676         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 677     }
 678 }
 679
 680 /**
 681  * save_zero_page: Send the zero page to the stream
 682  *
 683  * Returns: Number of pages written.
 684  *
 685  * @f: QEMUFile where to send the data
 686  * @block: block that contains the page we want to send
 687  * @offset: offset inside the block for the page
 688  * @p: pointer to the page
 689  * @bytes_transferred: increase it with the number of transferred bytes
 690  */
 691 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 692                           uint8_t *p, uint64_t *bytes_transferred)
 693 {
 694     int pages = -1;
 695
 696     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 697         acct_info.dup_pages++;
 698         *bytes_transferred += save_page_header(f, block,
 699                                                offset | RAM_SAVE_FLAG_COMPRESS);
 700         qemu_put_byte(f, 0);
 701         *bytes_transferred += 1;
 702         pages = 1;
 703     }
 704
 705     return pages;
 706 }
 707
 708 static void ram_release_pages(MigrationState *ms, const char *block_name,
 709                               uint64_t offset, int pages)
 710 {
 711     if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
 712         return;
 713     }
 714
 715     ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS);
 716 }
 717
 718 /**
 719  * ram_save_page: Send the given page to the stream
 720  *
 721  * Returns: Number of pages written.
 722  *          < 0 - error
 723  *          >=0 - Number of pages written - this might legally be 0
 724  *                if xbzrle noticed the page was the same.
 725  *
 726  * @ms: The current migration state.
 727  * @f: QEMUFile where to send the data
 728  * @block: block that contains the page we want to send
 729  * @offset: offset inside the block for the page
 730  * @last_stage: if we are at the completion stage
 731  * @bytes_transferred: increase it with the number of transferred bytes
 732  */
 733 static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
 734                          bool last_stage, uint64_t *bytes_transferred)
 735 {
 736     int pages = -1;
 737     uint64_t bytes_xmit;
 738     ram_addr_t current_addr;
 739     uint8_t *p;
 740     int ret;
 741     bool send_async = true;
 742     RAMBlock *block = pss->block;
 743     ram_addr_t offset = pss->offset;
 744
 745     p = block->host + offset;
 746
 747     /* In doubt sent page as normal */
 748     bytes_xmit = 0;
 749     ret = ram_control_save_page(f, block->offset,
 750                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
 751     if (bytes_xmit) {
 752         *bytes_transferred += bytes_xmit;
 753         pages = 1;
 754     }
 755
 756     XBZRLE_cache_lock();
 757
 758     current_addr = block->offset + offset;
 759
 760     if (block == last_sent_block) {
 761         offset |= RAM_SAVE_FLAG_CONTINUE;
 762     }
 763     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 764         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 765             if (bytes_xmit > 0) {
 766                 acct_info.norm_pages++;
 767             } else if (bytes_xmit == 0) {
 768                 acct_info.dup_pages++;
 769             }
 770         }
 771     } else {
 772         pages = save_zero_page(f, block, offset, p, bytes_transferred);
 773         if (pages > 0) {
 774             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 775              * page would be stale
 776              */
 777             xbzrle_cache_zero_page(current_addr);
 778             ram_release_pages(ms, block->idstr, pss->offset, pages);
 779         } else if (!ram_bulk_stage &&
 780                    !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
 781             pages = save_xbzrle_page(f, &p, current_addr, block,
 782                                      offset, last_stage, bytes_transferred);
 783             if (!last_stage) {
 784                 /* Can't send this cached data async, since the cache page
 785                  * might get updated before it gets to the wire
 786                  */
 787                 send_async = false;
 788             }
 789         }
 790     }
 791
 792     /* XBZRLE overflow or normal page */
 793     if (pages == -1) {
 794         *bytes_transferred += save_page_header(f, block,
 795                                                offset | RAM_SAVE_FLAG_PAGE);
 796         if (send_async) {
 797             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
 798                                   migrate_release_ram() &
 799                                   migration_in_postcopy(ms));
 800         } else {
 801             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 802         }
 803         *bytes_transferred += TARGET_PAGE_SIZE;
 804         pages = 1;
 805         acct_info.norm_pages++;
 806     }
 807
 808     XBZRLE_cache_unlock();
 809
 810     return pages;
 811 }
 812
 813 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 814                                 ram_addr_t offset)
 815 {
 816     int bytes_sent, blen;
 817     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 818
 819     bytes_sent = save_page_header(f, block, offset |
 820                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
 821     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 822                                      migrate_compress_level());
 823     if (blen < 0) {
 824         bytes_sent = 0;
 825         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 826         error_report("compressed data failed!");
 827     } else {
 828         bytes_sent += blen;
 829         ram_release_pages(migrate_get_current(), block->idstr,
 830                           offset & TARGET_PAGE_MASK, 1);
 831     }
 832
 833     return bytes_sent;
 834 }
 835
/* File-wide byte counter; flush_compressed_data() adds to it directly.
 * Note that several functions take a parameter with the same name, which
 * shadows this variable inside them. */
static uint64_t bytes_transferred;
 837
 838 static void flush_compressed_data(QEMUFile *f)
 839 {
 840     int idx, len, thread_count;
 841
 842     if (!migrate_use_compression()) {
 843         return;
 844     }
 845     thread_count = migrate_compress_threads();
 846
 847     qemu_mutex_lock(&comp_done_lock);
 848     for (idx = 0; idx < thread_count; idx++) {
 849         while (!comp_param[idx].done) {
 850             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 851         }
 852     }
 853     qemu_mutex_unlock(&comp_done_lock);
 854
 855     for (idx = 0; idx < thread_count; idx++) {
 856         qemu_mutex_lock(&comp_param[idx].mutex);
 857         if (!comp_param[idx].quit) {
 858             len = qemu_put_qemu_file(f, comp_param[idx].file);
 859             bytes_transferred += len;
 860         }
 861         qemu_mutex_unlock(&comp_param[idx].mutex);
 862     }
 863 }
 864
 865 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 866                                        ram_addr_t offset)
 867 {
 868     param->block = block;
 869     param->offset = offset;
 870 }
 871
 872 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 873                                            ram_addr_t offset,
 874                                            uint64_t *bytes_transferred)
 875 {
 876     int idx, thread_count, bytes_xmit = -1, pages = -1;
 877
 878     thread_count = migrate_compress_threads();
 879     qemu_mutex_lock(&comp_done_lock);
 880     while (true) {
 881         for (idx = 0; idx < thread_count; idx++) {
 882             if (comp_param[idx].done) {
 883                 comp_param[idx].done = false;
 884                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 885                 qemu_mutex_lock(&comp_param[idx].mutex);
 886                 set_compress_params(&comp_param[idx], block, offset);
 887                 qemu_cond_signal(&comp_param[idx].cond);
 888                 qemu_mutex_unlock(&comp_param[idx].mutex);
 889                 pages = 1;
 890                 acct_info.norm_pages++;
 891                 *bytes_transferred += bytes_xmit;
 892                 break;
 893             }
 894         }
 895         if (pages > 0) {
 896             break;
 897         } else {
 898             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 899         }
 900     }
 901     qemu_mutex_unlock(&comp_done_lock);
 902
 903     return pages;
 904 }
 905
 906 /**
 907  * ram_save_compressed_page: compress the given page and send it to the stream
 908  *
 909  * Returns: Number of pages written.
 910  *
 911  * @ms: The current migration state.
 912  * @f: QEMUFile where to send the data
 913  * @block: block that contains the page we want to send
 914  * @offset: offset inside the block for the page
 915  * @last_stage: if we are at the completion stage
 916  * @bytes_transferred: increase it with the number of transferred bytes
 917  */
 918 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
 919                                     PageSearchStatus *pss, bool last_stage,
 920                                     uint64_t *bytes_transferred)
 921 {
 922     int pages = -1;
 923     uint64_t bytes_xmit = 0;
 924     uint8_t *p;
 925     int ret, blen;
 926     RAMBlock *block = pss->block;
 927     ram_addr_t offset = pss->offset;
 928
 929     p = block->host + offset;
 930
 931     ret = ram_control_save_page(f, block->offset,
 932                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
 933     if (bytes_xmit) {
 934         *bytes_transferred += bytes_xmit;
 935         pages = 1;
 936     }
 937     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 938         if (ret != RAM_SAVE_CONTROL_DELAYED) {
 939             if (bytes_xmit > 0) {
 940                 acct_info.norm_pages++;
 941             } else if (bytes_xmit == 0) {
 942                 acct_info.dup_pages++;
 943             }
 944         }
 945     } else {
 946         /* When starting the process of a new block, the first page of
 947          * the block should be sent out before other pages in the same
 948          * block, and all the pages in last block should have been sent
 949          * out, keeping this order is important, because the 'cont' flag
 950          * is used to avoid resending the block name.
 951          */
 952         if (block != last_sent_block) {
 953             flush_compressed_data(f);
 954             pages = save_zero_page(f, block, offset, p, bytes_transferred);
 955             if (pages == -1) {
 956                 /* Make sure the first page is sent out before other pages */
 957                 bytes_xmit = save_page_header(f, block, offset |
 958                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
 959                 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 960                                                  migrate_compress_level());
 961                 if (blen > 0) {
 962                     *bytes_transferred += bytes_xmit + blen;
 963                     acct_info.norm_pages++;
 964                     pages = 1;
 965                 } else {
 966                     qemu_file_set_error(f, blen);
 967                     error_report("compressed data failed!");
 968                 }
 969             }
 970             if (pages > 0) {
 971                 ram_release_pages(ms, block->idstr, pss->offset, pages);
 972             }
 973         } else {
 974             offset |= RAM_SAVE_FLAG_CONTINUE;
 975             pages = save_zero_page(f, block, offset, p, bytes_transferred);
 976             if (pages == -1) {
 977                 pages = compress_page_with_multi_thread(f, block, offset,
 978                                                         bytes_transferred);
 979             } else {
 980                 ram_release_pages(ms, block->idstr, pss->offset, pages);
 981             }
 982         }
 983     }
 984
 985     return pages;
 986 }
 987
 988 /*
 989  * Find the next dirty page and update any state associated with
 990  * the search process.
 991  *
 992  * Returns: True if a page is found
 993  *
 994  * @f: Current migration stream.
 995  * @pss: Data about the state of the current dirty page scan.
 996  * @*again: Set to false if the search has scanned the whole of RAM
 997  * *ram_addr_abs: Pointer into which to store the address of the dirty page
 998  *               within the global ram_addr space
 999  */
1000 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
1001                              bool *again, ram_addr_t *ram_addr_abs)
1002 {
1003     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
1004                                               ram_addr_abs);
1005     if (pss->complete_round && pss->block == last_seen_block &&
1006         pss->offset >= last_offset) {
1007         /*
1008          * We've been once around the RAM and haven't found anything.
1009          * Give up.
1010          */
1011         *again = false;
1012         return false;
1013     }
1014     if (pss->offset >= pss->block->used_length) {
1015         /* Didn't find anything in this RAM Block */
1016         pss->offset = 0;
1017         pss->block = QLIST_NEXT_RCU(pss->block, next);
1018         if (!pss->block) {
1019             /* Hit the end of the list */
1020             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1021             /* Flag that we've looped */
1022             pss->complete_round = true;
1023             ram_bulk_stage = false;
1024             if (migrate_use_xbzrle()) {
1025                 /* If xbzrle is on, stop using the data compression at this
1026                  * point. In theory, xbzrle can do better than compression.
1027                  */
1028                 flush_compressed_data(f);
1029                 compression_switch = false;
1030             }
1031         }
1032         /* Didn't find anything this time, but try again on the new block */
1033         *again = true;
1034         return false;
1035     } else {
1036         /* Can go around again, but... */
1037         *again = true;
1038         /* We've found something so probably don't need to */
1039         return true;
1040     }
1041 }
1042
1043 /*
1044  * Helper for 'get_queued_page' - gets a page off the queue
1045  *      ms:      MigrationState in
1046  * *offset:      Used to return the offset within the RAMBlock
1047  * ram_addr_abs: global offset in the dirty/sent bitmaps
1048  *
1049  * Returns:      block (or NULL if none available)
1050  */
1051 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1052                               ram_addr_t *ram_addr_abs)
1053 {
1054     RAMBlock *block = NULL;
1055
1056     qemu_mutex_lock(&ms->src_page_req_mutex);
1057     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1058         struct MigrationSrcPageRequest *entry =
1059                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1060         block = entry->rb;
1061         *offset = entry->offset;
1062         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1063                         TARGET_PAGE_MASK;
1064
1065         if (entry->len > TARGET_PAGE_SIZE) {
1066             entry->len -= TARGET_PAGE_SIZE;
1067             entry->offset += TARGET_PAGE_SIZE;
1068         } else {
1069             memory_region_unref(block->mr);
1070             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1071             g_free(entry);
1072         }
1073     }
1074     qemu_mutex_unlock(&ms->src_page_req_mutex);
1075
1076     return block;
1077 }
1078
1079 /*
1080  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1081  * that are already sent (!dirty)
1082  *
1083  *      ms:      MigrationState in
1084  *     pss:      PageSearchStatus structure updated with found block/offset
1085  * ram_addr_abs: global offset in the dirty/sent bitmaps
1086  *
1087  * Returns:      true if a queued page is found
1088  */
1089 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1090                             ram_addr_t *ram_addr_abs)
1091 {
1092     RAMBlock  *block;
1093     ram_addr_t offset;
1094     bool dirty;
1095
1096     do {
1097         block = unqueue_page(ms, &offset, ram_addr_abs);
1098         /*
1099          * We're sending this page, and since it's postcopy nothing else
1100          * will dirty it, and we must make sure it doesn't get sent again
1101          * even if this queue request was received after the background
1102          * search already sent it.
1103          */
1104         if (block) {
1105             unsigned long *bitmap;
1106             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1107             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1108             if (!dirty) {
1109                 trace_get_queued_page_not_dirty(
1110                     block->idstr, (uint64_t)offset,
1111                     (uint64_t)*ram_addr_abs,
1112                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1113                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1114             } else {
1115                 trace_get_queued_page(block->idstr,
1116                                       (uint64_t)offset,
1117                                       (uint64_t)*ram_addr_abs);
1118             }
1119         }
1120
1121     } while (block && !dirty);
1122
1123     if (block) {
1124         /*
1125          * As soon as we start servicing pages out of order, then we have
1126          * to kill the bulk stage, since the bulk stage assumes
1127          * in (migration_bitmap_find_and_reset_dirty) that every page is
1128          * dirty, that's no longer true.
1129          */
1130         ram_bulk_stage = false;
1131
1132         /*
1133          * We want the background search to continue from the queued page
1134          * since the guest is likely to want other pages near to the page
1135          * it just requested.
1136          */
1137         pss->block = block;
1138         pss->offset = offset;
1139     }
1140
1141     return !!block;
1142 }
1143
1144 /**
1145  * flush_page_queue: Flush any remaining pages in the ram request queue
1146  *    it should be empty at the end anyway, but in error cases there may be
1147  *    some left.
1148  *
1149  * ms: MigrationState
1150  */
1151 void flush_page_queue(MigrationState *ms)
1152 {
1153     struct MigrationSrcPageRequest *mspr, *next_mspr;
1154     /* This queue generally should be empty - but in the case of a failed
1155      * migration might have some droppings in.
1156      */
1157     rcu_read_lock();
1158     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1159         memory_region_unref(mspr->rb->mr);
1160         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1161         g_free(mspr);
1162     }
1163     rcu_read_unlock();
1164 }
1165
1166 /**
1167  * Queue the pages for transmission, e.g. a request from postcopy destination
1168  *   ms: MigrationStatus in which the queue is held
1169  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1170  *   start: Offset from the start of the RAMBlock
1171  *   len: Length (in bytes) to send
1172  *   Return: 0 on success
1173  */
1174 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1175                          ram_addr_t start, ram_addr_t len)
1176 {
1177     RAMBlock *ramblock;
1178
1179     ms->postcopy_requests++;
1180     rcu_read_lock();
1181     if (!rbname) {
1182         /* Reuse last RAMBlock */
1183         ramblock = ms->last_req_rb;
1184
1185         if (!ramblock) {
1186             /*
1187              * Shouldn't happen, we can't reuse the last RAMBlock if
1188              * it's the 1st request.
1189              */
1190             error_report("ram_save_queue_pages no previous block");
1191             goto err;
1192         }
1193     } else {
1194         ramblock = qemu_ram_block_by_name(rbname);
1195
1196         if (!ramblock) {
1197             /* We shouldn't be asked for a non-existent RAMBlock */
1198             error_report("ram_save_queue_pages no block '%s'", rbname);
1199             goto err;
1200         }
1201         ms->last_req_rb = ramblock;
1202     }
1203     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1204     if (start+len > ramblock->used_length) {
1205         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1206                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1207                      __func__, start, len, ramblock->used_length);
1208         goto err;
1209     }
1210
1211     struct MigrationSrcPageRequest *new_entry =
1212         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1213     new_entry->rb = ramblock;
1214     new_entry->offset = start;
1215     new_entry->len = len;
1216
1217     memory_region_ref(ramblock->mr);
1218     qemu_mutex_lock(&ms->src_page_req_mutex);
1219     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1220     qemu_mutex_unlock(&ms->src_page_req_mutex);
1221     rcu_read_unlock();
1222
1223     return 0;
1224
1225 err:
1226     rcu_read_unlock();
1227     return -1;
1228 }
1229
1230 /**
1231  * ram_save_target_page: Save one target page
1232  *
1233  *
1234  * @f: QEMUFile where to send the data
1235  * @block: pointer to block that contains the page we want to send
1236  * @offset: offset inside the block for the page;
1237  * @last_stage: if we are at the completion stage
1238  * @bytes_transferred: increase it with the number of transferred bytes
1239  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1240  *
1241  * Returns: Number of pages written.
1242  */
1243 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1244                                 PageSearchStatus *pss,
1245                                 bool last_stage,
1246                                 uint64_t *bytes_transferred,
1247                                 ram_addr_t dirty_ram_abs)
1248 {
1249     int res = 0;
1250
1251     /* Check the pages is dirty and if it is send it */
1252     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1253         unsigned long *unsentmap;
1254         if (compression_switch && migrate_use_compression()) {
1255             res = ram_save_compressed_page(ms, f, pss,
1256                                            last_stage,
1257                                            bytes_transferred);
1258         } else {
1259             res = ram_save_page(ms, f, pss, last_stage,
1260                                 bytes_transferred);
1261         }
1262
1263         if (res < 0) {
1264             return res;
1265         }
1266         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1267         if (unsentmap) {
1268             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1269         }
1270         /* Only update last_sent_block if a block was actually sent; xbzrle
1271          * might have decided the page was identical so didn't bother writing
1272          * to the stream.
1273          */
1274         if (res > 0) {
1275             last_sent_block = pss->block;
1276         }
1277     }
1278
1279     return res;
1280 }
1281
1282 /**
1283  * ram_save_host_page: Starting at *offset send pages up to the end
1284  *                     of the current host page.  It's valid for the initial
1285  *                     offset to point into the middle of a host page
1286  *                     in which case the remainder of the hostpage is sent.
1287  *                     Only dirty target pages are sent.
1288  *
1289  * Returns: Number of pages written.
1290  *
1291  * @f: QEMUFile where to send the data
1292  * @block: pointer to block that contains the page we want to send
1293  * @offset: offset inside the block for the page; updated to last target page
1294  *          sent
1295  * @last_stage: if we are at the completion stage
1296  * @bytes_transferred: increase it with the number of transferred bytes
1297  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1298  */
1299 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1300                               PageSearchStatus *pss,
1301                               bool last_stage,
1302                               uint64_t *bytes_transferred,
1303                               ram_addr_t dirty_ram_abs)
1304 {
1305     int tmppages, pages = 0;
1306     do {
1307         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1308                                         bytes_transferred, dirty_ram_abs);
1309         if (tmppages < 0) {
1310             return tmppages;
1311         }
1312
1313         pages += tmppages;
1314         pss->offset += TARGET_PAGE_SIZE;
1315         dirty_ram_abs += TARGET_PAGE_SIZE;
1316     } while (pss->offset & (qemu_host_page_size - 1));
1317
1318     /* The offset we leave with is the last one we looked at */
1319     pss->offset -= TARGET_PAGE_SIZE;
1320     return pages;
1321 }
1322
1323 /**
1324  * ram_find_and_save_block: Finds a dirty page and sends it to f
1325  *
1326  * Called within an RCU critical section.
1327  *
1328  * Returns:  The number of pages written
1329  *           0 means no dirty pages
1330  *
1331  * @f: QEMUFile where to send the data
1332  * @last_stage: if we are at the completion stage
1333  * @bytes_transferred: increase it with the number of transferred bytes
1334  *
1335  * On systems where host-page-size > target-page-size it will send all the
1336  * pages in a host page that are dirty.
1337  */
1338
1339 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1340                                    uint64_t *bytes_transferred)
1341 {
1342     PageSearchStatus pss;
1343     MigrationState *ms = migrate_get_current();
1344     int pages = 0;
1345     bool again, found;
1346     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1347                                  ram_addr_t space */
1348
1349     /* No dirty page as there is zero RAM */
1350     if (!ram_bytes_total()) {
1351         return pages;
1352     }
1353
1354     pss.block = last_seen_block;
1355     pss.offset = last_offset;
1356     pss.complete_round = false;
1357
1358     if (!pss.block) {
1359         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1360     }
1361
1362     do {
1363         again = true;
1364         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1365
1366         if (!found) {
1367             /* priority queue empty, so just search for something dirty */
1368             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1369         }
1370
1371         if (found) {
1372             pages = ram_save_host_page(ms, f, &pss,
1373                                        last_stage, bytes_transferred,
1374                                        dirty_ram_abs);
1375         }
1376     } while (!pages && again);
1377
1378     last_seen_block = pss.block;
1379     last_offset = pss.offset;
1380
1381     return pages;
1382 }
1383
1384 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1385 {
1386     uint64_t pages = size / TARGET_PAGE_SIZE;
1387     if (zero) {
1388         acct_info.dup_pages += pages;
1389     } else {
1390         acct_info.norm_pages += pages;
1391         bytes_transferred += size;
1392         qemu_update_position(f, size);
1393     }
1394 }
1395
1396 static ram_addr_t ram_save_remaining(void)
1397 {
1398     return migration_dirty_pages;
1399 }
1400
1401 uint64_t ram_bytes_remaining(void)
1402 {
1403     return ram_save_remaining() * TARGET_PAGE_SIZE;
1404 }
1405
1406 uint64_t ram_bytes_transferred(void)
1407 {
1408     return bytes_transferred;
1409 }
1410
1411 uint64_t ram_bytes_total(void)
1412 {
1413     RAMBlock *block;
1414     uint64_t total = 0;
1415
1416     rcu_read_lock();
1417     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1418         total += block->used_length;
1419     rcu_read_unlock();
1420     return total;
1421 }
1422
1423 void free_xbzrle_decoded_buf(void)
1424 {
1425     g_free(xbzrle_decoded_buf);
1426     xbzrle_decoded_buf = NULL;
1427 }
1428
1429 static void migration_bitmap_free(struct BitmapRcu *bmap)
1430 {
1431     g_free(bmap->bmap);
1432     g_free(bmap->unsentmap);
1433     g_free(bmap);
1434 }
1435
1436 static void ram_migration_cleanup(void *opaque)
1437 {
1438     /* caller have hold iothread lock or is in a bh, so there is
1439      * no writing race against this migration_bitmap
1440      */
1441     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1442     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1443     if (bitmap) {
1444         memory_global_dirty_log_stop();
1445         call_rcu(bitmap, migration_bitmap_free, rcu);
1446     }
1447
1448     XBZRLE_cache_lock();
1449     if (XBZRLE.cache) {
1450         cache_fini(XBZRLE.cache);
1451         g_free(XBZRLE.encoded_buf);
1452         g_free(XBZRLE.current_buf);
1453         g_free(ZERO_TARGET_PAGE);
1454         XBZRLE.cache = NULL;
1455         XBZRLE.encoded_buf = NULL;
1456         XBZRLE.current_buf = NULL;
1457     }
1458     XBZRLE_cache_unlock();
1459 }
1460
1461 static void reset_ram_globals(void)
1462 {
1463     last_seen_block = NULL;
1464     last_sent_block = NULL;
1465     last_offset = 0;
1466     last_version = ram_list.version;
1467     ram_bulk_stage = true;
1468 }
1469
1470 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1471
1472 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1473 {
1474     /* called in qemu main thread, so there is
1475      * no writing race against this migration_bitmap
1476      */
1477     if (migration_bitmap_rcu) {
1478         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1479         bitmap = g_new(struct BitmapRcu, 1);
1480         bitmap->bmap = bitmap_new(new);
1481
1482         /* prevent migration_bitmap content from being set bit
1483          * by migration_bitmap_sync_range() at the same time.
1484          * it is safe to migration if migration_bitmap is cleared bit
1485          * at the same time.
1486          */
1487         qemu_mutex_lock(&migration_bitmap_mutex);
1488         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1489         bitmap_set(bitmap->bmap, old, new - old);
1490
1491         /* We don't have a way to safely extend the sentmap
1492          * with RCU; so mark it as missing, entry to postcopy
1493          * will fail.
1494          */
1495         bitmap->unsentmap = NULL;
1496
1497         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1498         qemu_mutex_unlock(&migration_bitmap_mutex);
1499         migration_dirty_pages += new - old;
1500         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1501     }
1502 }
1503
1504 /*
1505  * 'expected' is the value you expect the bitmap mostly to be full
1506  * of; it won't bother printing lines that are all this value.
1507  * If 'todump' is null the migration bitmap is dumped.
1508  */
1509 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1510 {
1511     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1512
1513     int64_t cur;
1514     int64_t linelen = 128;
1515     char linebuf[129];
1516
1517     if (!todump) {
1518         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1519     }
1520
1521     for (cur = 0; cur < ram_pages; cur += linelen) {
1522         int64_t curb;
1523         bool found = false;
1524         /*
1525          * Last line; catch the case where the line length
1526          * is longer than remaining ram
1527          */
1528         if (cur + linelen > ram_pages) {
1529             linelen = ram_pages - cur;
1530         }
1531         for (curb = 0; curb < linelen; curb++) {
1532             bool thisbit = test_bit(cur + curb, todump);
1533             linebuf[curb] = thisbit ? '1' : '.';
1534             found = found || (thisbit != expected);
1535         }
1536         if (found) {
1537             linebuf[curb] = '\0';
1538             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1539         }
1540     }
1541 }
1542
1543 /* **** functions for postcopy ***** */
1544
1545 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1546 {
1547     struct RAMBlock *block;
1548     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1549
1550     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1551         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1552         unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1553         unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1554
1555         while (run_start < range) {
1556             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1557             ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1558                               (run_end - run_start) << TARGET_PAGE_BITS);
1559             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1560         }
1561     }
1562 }
1563
1564 /*
1565  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1566  * Note: At this point the 'unsentmap' is the processed bitmap combined
1567  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1568  * start,length: Indexes into the bitmap for the first bit
1569  *            representing the named block and length in target-pages
1570  */
1571 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1572                                         PostcopyDiscardState *pds,
1573                                         unsigned long start,
1574                                         unsigned long length)
1575 {
1576     unsigned long end = start + length; /* one after the end */
1577     unsigned long current;
1578     unsigned long *unsentmap;
1579
1580     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1581     for (current = start; current < end; ) {
1582         unsigned long one = find_next_bit(unsentmap, end, current);
1583
1584         if (one <= end) {
1585             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1586             unsigned long discard_length;
1587
1588             if (zero >= end) {
1589                 discard_length = end - one;
1590             } else {
1591                 discard_length = zero - one;
1592             }
1593             if (discard_length) {
1594                 postcopy_discard_send_range(ms, pds, one, discard_length);
1595             }
1596             current = one + discard_length;
1597         } else {
1598             current = one;
1599         }
1600     }
1601
1602     return 0;
1603 }
1604
1605 /*
1606  * Utility for the outgoing postcopy code.
1607  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1608  *   passing it bitmap indexes and name.
1609  * Returns: 0 on success
1610  * (qemu_ram_foreach_block ends up passing unscaled lengths
1611  *  which would mean postcopy code would have to deal with target page)
1612  */
1613 static int postcopy_each_ram_send_discard(MigrationState *ms)
1614 {
1615     struct RAMBlock *block;
1616     int ret;
1617
1618     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1619         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1620         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1621                                                                first,
1622                                                                block->idstr);
1623
1624         /*
1625          * Postcopy sends chunks of bitmap over the wire, but it
1626          * just needs indexes at this point, avoids it having
1627          * target page specific code.
1628          */
1629         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1630                                     block->used_length >> TARGET_PAGE_BITS);
1631         postcopy_discard_send_finish(ms, pds);
1632         if (ret) {
1633             return ret;
1634         }
1635     }
1636
1637     return 0;
1638 }
1639
/*
 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
 *   the two bitmaps, that are similar, but one is inverted.
 *
 * We search for runs of target-pages that don't start or end on a
 * host page boundary;
 * unsent_pass=true: Cleans up partially unsent host pages by searching
 *                 the unsentmap
 * unsent_pass=false: Cleans up partially dirty host pages by searching
 *                 the main migration bitmap
 *
 * For every host page that is only partially covered by a run, all of
 * its target pages are marked unsent and dirty (migration_dirty_pages is
 * bumped for each page that was not dirty before), and a discard for
 * that host page may be sent through @pds.
 *
 * @ms: migration state, passed on to postcopy_discard_send_range
 * @unsent_pass: selects which bitmap is searched (see above)
 * @block: RAMBlock whose range of the bitmaps is processed
 * @pds: discard state, passed on to postcopy_discard_send_range
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
                                          RAMBlock *block,
                                          PostcopyDiscardState *pds)
{
    unsigned long *bitmap;
    unsigned long *unsentmap;
    /* Number of target pages per host page */
    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
    /* Bitmap indexes of the first and last target page of the block */
    unsigned long first = block->offset >> TARGET_PAGE_BITS;
    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
    unsigned long last = first + (len - 1);
    unsigned long run_start;

    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;

    if (unsent_pass) {
        /* Find a sent page */
        run_start = find_next_zero_bit(unsentmap, last + 1, first);
    } else {
        /* Find a dirty page */
        run_start = find_next_bit(bitmap, last + 1, first);
    }

    while (run_start <= last) {
        bool do_fixup = false;
        /* Only valid when do_fixup is true */
        unsigned long fixup_start_addr;
        unsigned long host_offset;

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        host_offset = run_start % host_ratio;
        if (host_offset) {
            do_fixup = true;
            /* Round down to the first target page of the host page */
            run_start -= host_offset;
            fixup_start_addr = run_start;
            /* For the next pass */
            run_start = run_start + host_ratio;
        } else {
            /* Find the end of this run */
            unsigned long run_end;
            if (unsent_pass) {
                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
            } else {
                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
            }
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
            host_offset = run_end % host_ratio;
            if (host_offset) {
                do_fixup = true;
                fixup_start_addr = run_end - host_offset;
                /*
                 * This host page has gone, the next loop iteration starts
                 * from after the fixup
                 */
                run_start = fixup_start_addr + host_ratio;
            } else {
                /*
                 * No discards on this iteration, next loop starts from
                 * next sent/dirty page
                 */
                run_start = run_end + 1;
            }
        }

        if (do_fixup) {
            unsigned long page;

            /* Tell the destination to discard this page */
            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
                /* For the unsent_pass we:
                 *     discard partially sent pages
                 * For the !unsent_pass (dirty) we:
                 *     discard partially dirty pages that were sent
                 *     (any partially sent pages were already discarded
                 *     by the previous unsent_pass)
                 */
                postcopy_discard_send_range(ms, pds, fixup_start_addr,
                                            host_ratio);
            }

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /* All pages in this host page are now not sent */
                set_bit(page, unsentmap);

                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Resume the search from the run_start chosen above */
        if (unsent_pass) {
            /* Find the next sent page for the next iteration */
            run_start = find_next_zero_bit(unsentmap, last + 1,
                                           run_start);
        } else {
            /* Find the next dirty page for the next iteration */
            run_start = find_next_bit(bitmap, last + 1, run_start);
        }
    }
}
1762
1763 /*
1764  * Utility for the outgoing postcopy code.
1765  *
1766  * Discard any partially sent host-page size chunks, mark any partially
1767  * dirty host-page size chunks as all dirty.
1768  *
1769  * Returns: 0 on success
1770  */
1771 static int postcopy_chunk_hostpages(MigrationState *ms)
1772 {
1773     struct RAMBlock *block;
1774
1775     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1776         /* Easy case - TPS==HPS - nothing to be done */
1777         return 0;
1778     }
1779
1780     /* Easiest way to make sure we don't resume in the middle of a host-page */
1781     last_seen_block = NULL;
1782     last_sent_block = NULL;
1783     last_offset     = 0;
1784
1785     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1786         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1787
1788         PostcopyDiscardState *pds =
1789                          postcopy_discard_send_init(ms, first, block->idstr);
1790
1791         /* First pass: Discard all partially sent host pages */
1792         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1793         /*
1794          * Second pass: Ensure that all partially dirty host pages are made
1795          * fully dirty.
1796          */
1797         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1798
1799         postcopy_discard_send_finish(ms, pds);
1800     } /* ram_list loop */
1801
1802     return 0;
1803 }
1804
1805 /*
1806  * Transmit the set of pages to be discarded after precopy to the target
1807  * these are pages that:
1808  *     a) Have been previously transmitted but are now dirty again
1809  *     b) Pages that have never been transmitted, this ensures that
1810  *        any pages on the destination that have been mapped by background
1811  *        tasks get discarded (transparent huge pages is the specific concern)
1812  * Hopefully this is pretty sparse
1813  */
1814 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1815 {
1816     int ret;
1817     unsigned long *bitmap, *unsentmap;
1818
1819     rcu_read_lock();
1820
1821     /* This should be our last sync, the src is now paused */
1822     migration_bitmap_sync();
1823
1824     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1825     if (!unsentmap) {
1826         /* We don't have a safe way to resize the sentmap, so
1827          * if the bitmap was resized it will be NULL at this
1828          * point.
1829          */
1830         error_report("migration ram resized during precopy phase");
1831         rcu_read_unlock();
1832         return -EINVAL;
1833     }
1834
1835     /* Deal with TPS != HPS */
1836     ret = postcopy_chunk_hostpages(ms);
1837     if (ret) {
1838         rcu_read_unlock();
1839         return ret;
1840     }
1841
1842     /*
1843      * Update the unsentmap to be unsentmap = unsentmap | dirty
1844      */
1845     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1846     bitmap_or(unsentmap, unsentmap, bitmap,
1847                last_ram_offset() >> TARGET_PAGE_BITS);
1848
1849
1850     trace_ram_postcopy_send_discard_bitmap();
1851 #ifdef DEBUG_POSTCOPY
1852     ram_debug_dump_bitmap(unsentmap, true);
1853 #endif
1854
1855     ret = postcopy_each_ram_send_discard(ms);
1856     rcu_read_unlock();
1857
1858     return ret;
1859 }
1860
1861 /*
1862  * At the start of the postcopy phase of migration, any now-dirty
1863  * precopied pages are discarded.
1864  *
1865  * start, length describe a byte address range within the RAMBlock
1866  *
1867  * Returns 0 on success.
1868  */
1869 int ram_discard_range(MigrationIncomingState *mis,
1870                       const char *block_name,
1871                       uint64_t start, size_t length)
1872 {
1873     int ret = -1;
1874
1875     rcu_read_lock();
1876     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1877
1878     if (!rb) {
1879         error_report("ram_discard_range: Failed to find block '%s'",
1880                      block_name);
1881         goto err;
1882     }
1883
1884     uint8_t *host_startaddr = rb->host + start;
1885
1886     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1887         error_report("ram_discard_range: Unaligned start address: %p",
1888                      host_startaddr);
1889         goto err;
1890     }
1891
1892     if ((start + length) <= rb->used_length) {
1893         uint8_t *host_endaddr = host_startaddr + length;
1894         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1895             error_report("ram_discard_range: Unaligned end address: %p",
1896                          host_endaddr);
1897             goto err;
1898         }
1899         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1900     } else {
1901         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1902                      "/%zx/" RAM_ADDR_FMT")",
1903                      block_name, start, length, rb->used_length);
1904     }
1905
1906 err:
1907     rcu_read_unlock();
1908
1909     return ret;
1910 }
1911
1912 static int ram_save_init_globals(void)
1913 {
1914     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1915
1916     dirty_rate_high_cnt = 0;
1917     bitmap_sync_count = 0;
1918     migration_bitmap_sync_init();
1919     qemu_mutex_init(&migration_bitmap_mutex);
1920
1921     if (migrate_use_xbzrle()) {
1922         XBZRLE_cache_lock();
1923         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1924         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1925                                   TARGET_PAGE_SIZE,
1926                                   TARGET_PAGE_SIZE);
1927         if (!XBZRLE.cache) {
1928             XBZRLE_cache_unlock();
1929             error_report("Error creating cache");
1930             return -1;
1931         }
1932         XBZRLE_cache_unlock();
1933
1934         /* We prefer not to abort if there is no memory */
1935         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1936         if (!XBZRLE.encoded_buf) {
1937             error_report("Error allocating encoded_buf");
1938             return -1;
1939         }
1940
1941         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1942         if (!XBZRLE.current_buf) {
1943             error_report("Error allocating current_buf");
1944             g_free(XBZRLE.encoded_buf);
1945             XBZRLE.encoded_buf = NULL;
1946             return -1;
1947         }
1948
1949         acct_clear();
1950     }
1951
1952     /* For memory_global_dirty_log_start below.  */
1953     qemu_mutex_lock_iothread();
1954
1955     qemu_mutex_lock_ramlist();
1956     rcu_read_lock();
1957     bytes_transferred = 0;
1958     reset_ram_globals();
1959
1960     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1961     /* Skip setting bitmap if there is no RAM */
1962     if (ram_bytes_total()) {
1963         ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1964         migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1965         bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1966
1967         if (migrate_postcopy_ram()) {
1968             migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1969             bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1970         }
1971     }
1972
1973     /*
1974      * Count the total number of pages used by ram blocks not including any
1975      * gaps due to alignment or unplugs.
1976      */
1977     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1978
1979     memory_global_dirty_log_start();
1980     migration_bitmap_sync();
1981     qemu_mutex_unlock_ramlist();
1982     qemu_mutex_unlock_iothread();
1983     rcu_read_unlock();
1984
1985     return 0;
1986 }
1987
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
1993
1994 static int ram_save_setup(QEMUFile *f, void *opaque)
1995 {
1996     RAMBlock *block;
1997
1998     /* migration has already setup the bitmap, reuse it. */
1999     if (!migration_in_colo_state()) {
2000         if (ram_save_init_globals() < 0) {
2001             return -1;
2002          }
2003     }
2004
2005     rcu_read_lock();
2006
2007     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2008
2009     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2010         qemu_put_byte(f, strlen(block->idstr));
2011         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2012         qemu_put_be64(f, block->used_length);
2013     }
2014
2015     rcu_read_unlock();
2016
2017     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2018     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2019
2020     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2021
2022     return 0;
2023 }
2024
2025 static int ram_save_iterate(QEMUFile *f, void *opaque)
2026 {
2027     int ret;
2028     int i;
2029     int64_t t0;
2030     int done = 0;
2031
2032     rcu_read_lock();
2033     if (ram_list.version != last_version) {
2034         reset_ram_globals();
2035     }
2036
2037     /* Read version before ram_list.blocks */
2038     smp_rmb();
2039
2040     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2041
2042     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2043     i = 0;
2044     while ((ret = qemu_file_rate_limit(f)) == 0) {
2045         int pages;
2046
2047         pages = ram_find_and_save_block(f, false, &bytes_transferred);
2048         /* no more pages to sent */
2049         if (pages == 0) {
2050             done = 1;
2051             break;
2052         }
2053         acct_info.iterations++;
2054
2055         /* we want to check in the 1st loop, just in case it was the 1st time
2056            and we had to sync the dirty bitmap.
2057            qemu_get_clock_ns() is a bit expensive, so we only check each some
2058            iterations
2059         */
2060         if ((i & 63) == 0) {
2061             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2062             if (t1 > MAX_WAIT) {
2063                 trace_ram_save_iterate_big_wait(t1, i);
2064                 break;
2065             }
2066         }
2067         i++;
2068     }
2069     flush_compressed_data(f);
2070     rcu_read_unlock();
2071
2072     /*
2073      * Must occur before EOS (or any QEMUFile operation)
2074      * because of RDMA protocol.
2075      */
2076     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2077
2078     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2079     bytes_transferred += 8;
2080
2081     ret = qemu_file_get_error(f);
2082     if (ret < 0) {
2083         return ret;
2084     }
2085
2086     return done;
2087 }
2088
2089 /* Called with iothread lock */
2090 static int ram_save_complete(QEMUFile *f, void *opaque)
2091 {
2092     rcu_read_lock();
2093
2094     if (!migration_in_postcopy(migrate_get_current())) {
2095         migration_bitmap_sync();
2096     }
2097
2098     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2099
2100     /* try transferring iterative blocks of memory */
2101
2102     /* flush all remaining blocks regardless of rate limiting */
2103     while (true) {
2104         int pages;
2105
2106         pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2107                                         &bytes_transferred);
2108         /* no more blocks to sent */
2109         if (pages == 0) {
2110             break;
2111         }
2112     }
2113
2114     flush_compressed_data(f);
2115     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2116
2117     rcu_read_unlock();
2118
2119     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2120
2121     return 0;
2122 }
2123
2124 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2125                              uint64_t *non_postcopiable_pending,
2126                              uint64_t *postcopiable_pending)
2127 {
2128     uint64_t remaining_size;
2129
2130     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2131
2132     if (!migration_in_postcopy(migrate_get_current()) &&
2133         remaining_size < max_size) {
2134         qemu_mutex_lock_iothread();
2135         rcu_read_lock();
2136         migration_bitmap_sync();
2137         rcu_read_unlock();
2138         qemu_mutex_unlock_iothread();
2139         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2140     }
2141
2142     /* We can do postcopy, and all the data is postcopiable */
2143     *postcopiable_pending += remaining_size;
2144 }
2145
2146 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2147 {
2148     unsigned int xh_len;
2149     int xh_flags;
2150     uint8_t *loaded_data;
2151
2152     if (!xbzrle_decoded_buf) {
2153         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2154     }
2155     loaded_data = xbzrle_decoded_buf;
2156
2157     /* extract RLE header */
2158     xh_flags = qemu_get_byte(f);
2159     xh_len = qemu_get_be16(f);
2160
2161     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2162         error_report("Failed to load XBZRLE page - wrong compression!");
2163         return -1;
2164     }
2165
2166     if (xh_len > TARGET_PAGE_SIZE) {
2167         error_report("Failed to load XBZRLE page - len overflow!");
2168         return -1;
2169     }
2170     /* load data and decode */
2171     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2172
2173     /* decode RLE */
2174     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2175                              TARGET_PAGE_SIZE) == -1) {
2176         error_report("Failed to load XBZRLE page - decode error!");
2177         return -1;
2178     }
2179
2180     return 0;
2181 }
2182
2183 /* Must be called from within a rcu critical section.
2184  * Returns a pointer from within the RCU-protected ram_list.
2185  */
2186 /*
2187  * Read a RAMBlock ID from the stream f.
2188  *
2189  * f: Stream to read from
2190  * flags: Page flags (mostly to see if it's a continuation of previous block)
2191  */
2192 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2193                                               int flags)
2194 {
2195     static RAMBlock *block = NULL;
2196     char id[256];
2197     uint8_t len;
2198
2199     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2200         if (!block) {
2201             error_report("Ack, bad migration stream!");
2202             return NULL;
2203         }
2204         return block;
2205     }
2206
2207     len = qemu_get_byte(f);
2208     qemu_get_buffer(f, (uint8_t *)id, len);
2209     id[len] = 0;
2210
2211     block = qemu_ram_block_by_name(id);
2212     if (!block) {
2213         error_report("Can't find block %s", id);
2214         return NULL;
2215     }
2216
2217     return block;
2218 }
2219
2220 static inline void *host_from_ram_block_offset(RAMBlock *block,
2221                                                ram_addr_t offset)
2222 {
2223     if (!offset_in_ramblock(block, offset)) {
2224         return NULL;
2225     }
2226
2227     return block->host + offset;
2228 }
2229
/*
 * Fill a page (or a whole RDMA chunk) with the byte ch.
 *
 * If ch is zero and the range is already all zero, the memory is left
 * untouched; otherwise size bytes at host are set to ch.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && is_zero_range(host, size)) {
        /* Already zero: avoid writing to the range */
        return;
    }

    memset(host, ch, size);
}
2240
2241 static void *do_data_decompress(void *opaque)
2242 {
2243     DecompressParam *param = opaque;
2244     unsigned long pagesize;
2245     uint8_t *des;
2246     int len;
2247
2248     qemu_mutex_lock(&param->mutex);
2249     while (!param->quit) {
2250         if (param->des) {
2251             des = param->des;
2252             len = param->len;
2253             param->des = 0;
2254             qemu_mutex_unlock(&param->mutex);
2255
2256             pagesize = TARGET_PAGE_SIZE;
2257             /* uncompress() will return failed in some case, especially
2258              * when the page is dirted when doing the compression, it's
2259              * not a problem because the dirty page will be retransferred
2260              * and uncompress() won't break the data in other pages.
2261              */
2262             uncompress((Bytef *)des, &pagesize,
2263                        (const Bytef *)param->compbuf, len);
2264
2265             qemu_mutex_lock(&decomp_done_lock);
2266             param->done = true;
2267             qemu_cond_signal(&decomp_done_cond);
2268             qemu_mutex_unlock(&decomp_done_lock);
2269
2270             qemu_mutex_lock(&param->mutex);
2271         } else {
2272             qemu_cond_wait(&param->cond, &param->mutex);
2273         }
2274     }
2275     qemu_mutex_unlock(&param->mutex);
2276
2277     return NULL;
2278 }
2279
2280 static void wait_for_decompress_done(void)
2281 {
2282     int idx, thread_count;
2283
2284     if (!migrate_use_compression()) {
2285         return;
2286     }
2287
2288     thread_count = migrate_decompress_threads();
2289     qemu_mutex_lock(&decomp_done_lock);
2290     for (idx = 0; idx < thread_count; idx++) {
2291         while (!decomp_param[idx].done) {
2292             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2293         }
2294     }
2295     qemu_mutex_unlock(&decomp_done_lock);
2296 }
2297
2298 void migrate_decompress_threads_create(void)
2299 {
2300     int i, thread_count;
2301
2302     thread_count = migrate_decompress_threads();
2303     decompress_threads = g_new0(QemuThread, thread_count);
2304     decomp_param = g_new0(DecompressParam, thread_count);
2305     qemu_mutex_init(&decomp_done_lock);
2306     qemu_cond_init(&decomp_done_cond);
2307     for (i = 0; i < thread_count; i++) {
2308         qemu_mutex_init(&decomp_param[i].mutex);
2309         qemu_cond_init(&decomp_param[i].cond);
2310         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2311         decomp_param[i].done = true;
2312         decomp_param[i].quit = false;
2313         qemu_thread_create(decompress_threads + i, "decompress",
2314                            do_data_decompress, decomp_param + i,
2315                            QEMU_THREAD_JOINABLE);
2316     }
2317 }
2318
2319 void migrate_decompress_threads_join(void)
2320 {
2321     int i, thread_count;
2322
2323     thread_count = migrate_decompress_threads();
2324     for (i = 0; i < thread_count; i++) {
2325         qemu_mutex_lock(&decomp_param[i].mutex);
2326         decomp_param[i].quit = true;
2327         qemu_cond_signal(&decomp_param[i].cond);
2328         qemu_mutex_unlock(&decomp_param[i].mutex);
2329     }
2330     for (i = 0; i < thread_count; i++) {
2331         qemu_thread_join(decompress_threads + i);
2332         qemu_mutex_destroy(&decomp_param[i].mutex);
2333         qemu_cond_destroy(&decomp_param[i].cond);
2334         g_free(decomp_param[i].compbuf);
2335     }
2336     g_free(decompress_threads);
2337     g_free(decomp_param);
2338     decompress_threads = NULL;
2339     decomp_param = NULL;
2340 }
2341
2342 static void decompress_data_with_multi_threads(QEMUFile *f,
2343                                                void *host, int len)
2344 {
2345     int idx, thread_count;
2346
2347     thread_count = migrate_decompress_threads();
2348     qemu_mutex_lock(&decomp_done_lock);
2349     while (true) {
2350         for (idx = 0; idx < thread_count; idx++) {
2351             if (decomp_param[idx].done) {
2352                 decomp_param[idx].done = false;
2353                 qemu_mutex_lock(&decomp_param[idx].mutex);
2354                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2355                 decomp_param[idx].des = host;
2356                 decomp_param[idx].len = len;
2357                 qemu_cond_signal(&decomp_param[idx].cond);
2358                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2359                 break;
2360             }
2361         }
2362         if (idx < thread_count) {
2363             break;
2364         } else {
2365             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2366         }
2367     }
2368     qemu_mutex_unlock(&decomp_done_lock);
2369 }
2370
2371 /*
2372  * Allocate data structures etc needed by incoming migration with postcopy-ram
2373  * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
2374  */
2375 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2376 {
2377     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2378
2379     return postcopy_ram_incoming_init(mis, ram_pages);
2380 }
2381
2382 /*
2383  * Called in postcopy mode by ram_load().
2384  * rcu_read_lock is taken prior to this being called.
2385  */
2386 static int ram_load_postcopy(QEMUFile *f)
2387 {
2388     int flags = 0, ret = 0;
2389     bool place_needed = false;
2390     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2391     MigrationIncomingState *mis = migration_incoming_get_current();
2392     /* Temporary page that is later 'placed' */
2393     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2394     void *last_host = NULL;
2395     bool all_zero = false;
2396
2397     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2398         ram_addr_t addr;
2399         void *host = NULL;
2400         void *page_buffer = NULL;
2401         void *place_source = NULL;
2402         uint8_t ch;
2403
2404         addr = qemu_get_be64(f);
2405         flags = addr & ~TARGET_PAGE_MASK;
2406         addr &= TARGET_PAGE_MASK;
2407
2408         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2409         place_needed = false;
2410         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2411             RAMBlock *block = ram_block_from_stream(f, flags);
2412
2413             host = host_from_ram_block_offset(block, addr);
2414             if (!host) {
2415                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2416                 ret = -EINVAL;
2417                 break;
2418             }
2419             /*
2420              * Postcopy requires that we place whole host pages atomically.
2421              * To make it atomic, the data is read into a temporary page
2422              * that's moved into place later.
2423              * The migration protocol uses,  possibly smaller, target-pages
2424              * however the source ensures it always sends all the components
2425              * of a host page in order.
2426              */
2427             page_buffer = postcopy_host_page +
2428                           ((uintptr_t)host & ~qemu_host_page_mask);
2429             /* If all TP are zero then we can optimise the place */
2430             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2431                 all_zero = true;
2432             } else {
2433                 /* not the 1st TP within the HP */
2434                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2435                     error_report("Non-sequential target page %p/%p",
2436                                   host, last_host);
2437                     ret = -EINVAL;
2438                     break;
2439                 }
2440             }
2441
2442
2443             /*
2444              * If it's the last part of a host page then we place the host
2445              * page
2446              */
2447             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2448                                      ~qemu_host_page_mask) == 0;
2449             place_source = postcopy_host_page;
2450         }
2451         last_host = host;
2452
2453         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2454         case RAM_SAVE_FLAG_COMPRESS:
2455             ch = qemu_get_byte(f);
2456             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2457             if (ch) {
2458                 all_zero = false;
2459             }
2460             break;
2461
2462         case RAM_SAVE_FLAG_PAGE:
2463             all_zero = false;
2464             if (!place_needed || !matching_page_sizes) {
2465                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2466             } else {
2467                 /* Avoids the qemu_file copy during postcopy, which is
2468                  * going to do a copy later; can only do it when we
2469                  * do this read in one go (matching page sizes)
2470                  */
2471                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2472                                          TARGET_PAGE_SIZE);
2473             }
2474             break;
2475         case RAM_SAVE_FLAG_EOS:
2476             /* normal exit */
2477             break;
2478         default:
2479             error_report("Unknown combination of migration flags: %#x"
2480                          " (postcopy mode)", flags);
2481             ret = -EINVAL;
2482         }
2483
2484         if (place_needed) {
2485             /* This gets called at the last target page in the host page */
2486             if (all_zero) {
2487                 ret = postcopy_place_page_zero(mis,
2488                                                host + TARGET_PAGE_SIZE -
2489                                                qemu_host_page_size);
2490             } else {
2491                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2492                                                qemu_host_page_size,
2493                                                place_source);
2494             }
2495         }
2496         if (!ret) {
2497             ret = qemu_file_get_error(f);
2498         }
2499     }
2500
2501     return ret;
2502 }
2503
2504 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2505 {
2506     int flags = 0, ret = 0;
2507     static uint64_t seq_iter;
2508     int len = 0;
2509     /*
2510      * If system is running in postcopy mode, page inserts to host memory must
2511      * be atomic
2512      */
2513     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2514
2515     seq_iter++;
2516
2517     if (version_id != 4) {
2518         ret = -EINVAL;
2519     }
2520
2521     /* This RCU critical section can be very long running.
2522      * When RCU reclaims in the code start to become numerous,
2523      * it will be necessary to reduce the granularity of this
2524      * critical section.
2525      */
2526     rcu_read_lock();
2527
2528     if (postcopy_running) {
2529         ret = ram_load_postcopy(f);
2530     }
2531
2532     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2533         ram_addr_t addr, total_ram_bytes;
2534         void *host = NULL;
2535         uint8_t ch;
2536
2537         addr = qemu_get_be64(f);
2538         flags = addr & ~TARGET_PAGE_MASK;
2539         addr &= TARGET_PAGE_MASK;
2540
2541         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2542                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2543             RAMBlock *block = ram_block_from_stream(f, flags);
2544
2545             host = host_from_ram_block_offset(block, addr);
2546             if (!host) {
2547                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2548                 ret = -EINVAL;
2549                 break;
2550             }
2551         }
2552
2553         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2554         case RAM_SAVE_FLAG_MEM_SIZE:
2555             /* Synchronize RAM block list */
2556             total_ram_bytes = addr;
2557             while (!ret && total_ram_bytes) {
2558                 RAMBlock *block;
2559                 char id[256];
2560                 ram_addr_t length;
2561
2562                 len = qemu_get_byte(f);
2563                 qemu_get_buffer(f, (uint8_t *)id, len);
2564                 id[len] = 0;
2565                 length = qemu_get_be64(f);
2566
2567                 block = qemu_ram_block_by_name(id);
2568                 if (block) {
2569                     if (length != block->used_length) {
2570                         Error *local_err = NULL;
2571
2572                         ret = qemu_ram_resize(block, length,
2573                                               &local_err);
2574                         if (local_err) {
2575                             error_report_err(local_err);
2576                         }
2577                     }
2578                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2579                                           block->idstr);
2580                 } else {
2581                     error_report("Unknown ramblock \"%s\", cannot "
2582                                  "accept migration", id);
2583                     ret = -EINVAL;
2584                 }
2585
2586                 total_ram_bytes -= length;
2587             }
2588             break;
2589
2590         case RAM_SAVE_FLAG_COMPRESS:
2591             ch = qemu_get_byte(f);
2592             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2593             break;
2594
2595         case RAM_SAVE_FLAG_PAGE:
2596             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2597             break;
2598
2599         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2600             len = qemu_get_be32(f);
2601             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2602                 error_report("Invalid compressed data length: %d", len);
2603                 ret = -EINVAL;
2604                 break;
2605             }
2606             decompress_data_with_multi_threads(f, host, len);
2607             break;
2608
2609         case RAM_SAVE_FLAG_XBZRLE:
2610             if (load_xbzrle(f, addr, host) < 0) {
2611                 error_report("Failed to decompress XBZRLE page at "
2612                              RAM_ADDR_FMT, addr);
2613                 ret = -EINVAL;
2614                 break;
2615             }
2616             break;
2617         case RAM_SAVE_FLAG_EOS:
2618             /* normal exit */
2619             break;
2620         default:
2621             if (flags & RAM_SAVE_FLAG_HOOK) {
2622                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2623             } else {
2624                 error_report("Unknown combination of migration flags: %#x",
2625                              flags);
2626                 ret = -EINVAL;
2627             }
2628         }
2629         if (!ret) {
2630             ret = qemu_file_get_error(f);
2631         }
2632     }
2633
2634     wait_for_decompress_done();
2635     rcu_read_unlock();
2636     trace_ram_load_complete(ret, seq_iter);
2637     return ret;
2638 }
2639
2640 static SaveVMHandlers savevm_ram_handlers = {
2641     .save_live_setup = ram_save_setup,
2642     .save_live_iterate = ram_save_iterate,
2643     .save_live_complete_postcopy = ram_save_complete,
2644     .save_live_complete_precopy = ram_save_complete,
2645     .save_live_pending = ram_save_pending,
2646     .load_state = ram_load,
2647     .cleanup = ram_migration_cleanup,
2648 };
2649
2650 void ram_mig_init(void)
2651 {
2652     qemu_mutex_init(&XBZRLE.lock);
2653     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2654 }