mirror_qemu.git / migration / ram.c
migration: add MigrationState arg for ram_save_/compressed_/page()
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47
48 static int dirty_rate_high_cnt;
49
50 static uint64_t bitmap_sync_count;
51
52 /***********************************************************/
53 /* ram save/restore */
54
55 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
56 #define RAM_SAVE_FLAG_COMPRESS 0x02
57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
58 #define RAM_SAVE_FLAG_PAGE 0x08
59 #define RAM_SAVE_FLAG_EOS 0x10
60 #define RAM_SAVE_FLAG_CONTINUE 0x20
61 #define RAM_SAVE_FLAG_XBZRLE 0x40
62 /* 0x80 is reserved in migration.h start with 0x100 next */
63 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
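/* These flags are OR'd into the low bits of the page offset that
 * save_page_header() writes on the wire; offsets are TARGET_PAGE_SIZE
 * aligned, so the low bits are always free. Illustrative example
 * (assuming 4 KiB target pages): a normal page at offset 0x2000 of the
 * block we are already sending goes out with the header
 *
 *     qemu_put_be64(f, 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 */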
64
65 static uint8_t *ZERO_TARGET_PAGE;
66
67 static inline bool is_zero_range(uint8_t *p, uint64_t size)
68 {
69 return buffer_is_zero(p, size);
70 }
71
 72 /* This struct contains the XBZRLE cache and a static page
 73 used by the compression */
74 static struct {
75 /* buffer used for XBZRLE encoding */
76 uint8_t *encoded_buf;
77 /* buffer for storing page content */
78 uint8_t *current_buf;
79 /* Cache for XBZRLE, Protected by lock. */
80 PageCache *cache;
81 QemuMutex lock;
82 } XBZRLE;
83
84 /* buffer used for XBZRLE decoding */
85 static uint8_t *xbzrle_decoded_buf;
86
87 static void XBZRLE_cache_lock(void)
88 {
89 if (migrate_use_xbzrle())
90 qemu_mutex_lock(&XBZRLE.lock);
91 }
92
93 static void XBZRLE_cache_unlock(void)
94 {
95 if (migrate_use_xbzrle())
96 qemu_mutex_unlock(&XBZRLE.lock);
97 }
98
99 /*
100 * called from qmp_migrate_set_cache_size in main thread, possibly while
101 * a migration is in progress.
 102 * A running migration may be using the cache and might finish during this
 103 * call, hence changes to the cache are protected by the XBZRLE.lock mutex.
104 */
105 int64_t xbzrle_cache_resize(int64_t new_size)
106 {
107 PageCache *new_cache;
108 int64_t ret;
109
110 if (new_size < TARGET_PAGE_SIZE) {
111 return -1;
112 }
113
114 XBZRLE_cache_lock();
115
116 if (XBZRLE.cache != NULL) {
117 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
118 goto out_new_size;
119 }
120 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
121 TARGET_PAGE_SIZE);
122 if (!new_cache) {
123 error_report("Error creating cache");
124 ret = -1;
125 goto out;
126 }
127
128 cache_fini(XBZRLE.cache);
129 XBZRLE.cache = new_cache;
130 }
131
132 out_new_size:
133 ret = pow2floor(new_size);
134 out:
135 XBZRLE_cache_unlock();
136 return ret;
137 }
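/* Illustrative example (not from the original source): on success the value
 * reported back to the caller is pow2floor() of the requested size, so a
 * request for 300 MiB reports an effective cache size of 256 MiB.
 */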
138
139 /* accounting for migration statistics */
140 typedef struct AccountingInfo {
141 uint64_t dup_pages;
142 uint64_t skipped_pages;
143 uint64_t norm_pages;
144 uint64_t iterations;
145 uint64_t xbzrle_bytes;
146 uint64_t xbzrle_pages;
147 uint64_t xbzrle_cache_miss;
148 double xbzrle_cache_miss_rate;
149 uint64_t xbzrle_overflows;
150 } AccountingInfo;
151
152 static AccountingInfo acct_info;
153
154 static void acct_clear(void)
155 {
156 memset(&acct_info, 0, sizeof(acct_info));
157 }
158
159 uint64_t dup_mig_bytes_transferred(void)
160 {
161 return acct_info.dup_pages * TARGET_PAGE_SIZE;
162 }
163
164 uint64_t dup_mig_pages_transferred(void)
165 {
166 return acct_info.dup_pages;
167 }
168
169 uint64_t skipped_mig_bytes_transferred(void)
170 {
171 return acct_info.skipped_pages * TARGET_PAGE_SIZE;
172 }
173
174 uint64_t skipped_mig_pages_transferred(void)
175 {
176 return acct_info.skipped_pages;
177 }
178
179 uint64_t norm_mig_bytes_transferred(void)
180 {
181 return acct_info.norm_pages * TARGET_PAGE_SIZE;
182 }
183
184 uint64_t norm_mig_pages_transferred(void)
185 {
186 return acct_info.norm_pages;
187 }
188
189 uint64_t xbzrle_mig_bytes_transferred(void)
190 {
191 return acct_info.xbzrle_bytes;
192 }
193
194 uint64_t xbzrle_mig_pages_transferred(void)
195 {
196 return acct_info.xbzrle_pages;
197 }
198
199 uint64_t xbzrle_mig_pages_cache_miss(void)
200 {
201 return acct_info.xbzrle_cache_miss;
202 }
203
204 double xbzrle_mig_cache_miss_rate(void)
205 {
206 return acct_info.xbzrle_cache_miss_rate;
207 }
208
209 uint64_t xbzrle_mig_pages_overflow(void)
210 {
211 return acct_info.xbzrle_overflows;
212 }
213
 214 /* This is the last block that we have visited searching for dirty pages
215 */
216 static RAMBlock *last_seen_block;
217 /* This is the last block from where we have sent data */
218 static RAMBlock *last_sent_block;
219 static ram_addr_t last_offset;
220 static QemuMutex migration_bitmap_mutex;
221 static uint64_t migration_dirty_pages;
222 static uint32_t last_version;
223 static bool ram_bulk_stage;
224
225 /* used by the search for pages to send */
226 struct PageSearchStatus {
227 /* Current block being searched */
228 RAMBlock *block;
229 /* Current offset to search from */
230 ram_addr_t offset;
231 /* Set once we wrap around */
232 bool complete_round;
233 };
234 typedef struct PageSearchStatus PageSearchStatus;
235
236 static struct BitmapRcu {
237 struct rcu_head rcu;
238 /* Main migration bitmap */
239 unsigned long *bmap;
240 /* bitmap of pages that haven't been sent even once
241 * only maintained and used in postcopy at the moment
242 * where it's used to send the dirtymap at the start
243 * of the postcopy phase
244 */
245 unsigned long *unsentmap;
246 } *migration_bitmap_rcu;
247
248 struct CompressParam {
249 bool done;
250 bool quit;
251 QEMUFile *file;
252 QemuMutex mutex;
253 QemuCond cond;
254 RAMBlock *block;
255 ram_addr_t offset;
256 };
257 typedef struct CompressParam CompressParam;
258
259 struct DecompressParam {
260 bool done;
261 bool quit;
262 QemuMutex mutex;
263 QemuCond cond;
264 void *des;
265 uint8_t *compbuf;
266 int len;
267 };
268 typedef struct DecompressParam DecompressParam;
269
270 static CompressParam *comp_param;
271 static QemuThread *compress_threads;
272 /* comp_done_cond is used to wake up the migration thread when
273 * one of the compression threads has finished the compression.
 274 * comp_done_lock is used together with comp_done_cond.
275 */
276 static QemuMutex comp_done_lock;
277 static QemuCond comp_done_cond;
278 /* The empty QEMUFileOps will be used by file in CompressParam */
279 static const QEMUFileOps empty_ops = { };
280
281 static bool compression_switch;
282 static DecompressParam *decomp_param;
283 static QemuThread *decompress_threads;
284 static QemuMutex decomp_done_lock;
285 static QemuCond decomp_done_cond;
286
287 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
288 ram_addr_t offset);
289
290 static void *do_data_compress(void *opaque)
291 {
292 CompressParam *param = opaque;
293 RAMBlock *block;
294 ram_addr_t offset;
295
296 qemu_mutex_lock(&param->mutex);
297 while (!param->quit) {
298 if (param->block) {
299 block = param->block;
300 offset = param->offset;
301 param->block = NULL;
302 qemu_mutex_unlock(&param->mutex);
303
304 do_compress_ram_page(param->file, block, offset);
305
306 qemu_mutex_lock(&comp_done_lock);
307 param->done = true;
308 qemu_cond_signal(&comp_done_cond);
309 qemu_mutex_unlock(&comp_done_lock);
310
311 qemu_mutex_lock(&param->mutex);
312 } else {
313 qemu_cond_wait(&param->cond, &param->mutex);
314 }
315 }
316 qemu_mutex_unlock(&param->mutex);
317
318 return NULL;
319 }
320
321 static inline void terminate_compression_threads(void)
322 {
323 int idx, thread_count;
324
325 thread_count = migrate_compress_threads();
326 for (idx = 0; idx < thread_count; idx++) {
327 qemu_mutex_lock(&comp_param[idx].mutex);
328 comp_param[idx].quit = true;
329 qemu_cond_signal(&comp_param[idx].cond);
330 qemu_mutex_unlock(&comp_param[idx].mutex);
331 }
332 }
333
334 void migrate_compress_threads_join(void)
335 {
336 int i, thread_count;
337
338 if (!migrate_use_compression()) {
339 return;
340 }
341 terminate_compression_threads();
342 thread_count = migrate_compress_threads();
343 for (i = 0; i < thread_count; i++) {
344 qemu_thread_join(compress_threads + i);
345 qemu_fclose(comp_param[i].file);
346 qemu_mutex_destroy(&comp_param[i].mutex);
347 qemu_cond_destroy(&comp_param[i].cond);
348 }
349 qemu_mutex_destroy(&comp_done_lock);
350 qemu_cond_destroy(&comp_done_cond);
351 g_free(compress_threads);
352 g_free(comp_param);
353 compress_threads = NULL;
354 comp_param = NULL;
355 }
356
357 void migrate_compress_threads_create(void)
358 {
359 int i, thread_count;
360
361 if (!migrate_use_compression()) {
362 return;
363 }
364 compression_switch = true;
365 thread_count = migrate_compress_threads();
366 compress_threads = g_new0(QemuThread, thread_count);
367 comp_param = g_new0(CompressParam, thread_count);
368 qemu_cond_init(&comp_done_cond);
369 qemu_mutex_init(&comp_done_lock);
370 for (i = 0; i < thread_count; i++) {
371 /* comp_param[i].file is just used as a dummy buffer to save data,
372 * set its ops to empty.
373 */
374 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375 comp_param[i].done = true;
376 comp_param[i].quit = false;
377 qemu_mutex_init(&comp_param[i].mutex);
378 qemu_cond_init(&comp_param[i].cond);
379 qemu_thread_create(compress_threads + i, "compress",
380 do_data_compress, comp_param + i,
381 QEMU_THREAD_JOINABLE);
382 }
383 }
384
385 /**
386 * save_page_header: Write page header to wire
387 *
 388 * If RAM_SAVE_FLAG_CONTINUE is not set in the offset, it also writes the block identification
389 *
390 * Returns: Number of bytes written
391 *
392 * @f: QEMUFile where to send the data
393 * @block: block that contains the page we want to send
394 * @offset: offset inside the block for the page
395 * in the lower bits, it contains flags
396 */
397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
398 {
399 size_t size, len;
400
401 qemu_put_be64(f, offset);
402 size = 8;
403
404 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
405 len = strlen(block->idstr);
406 qemu_put_byte(f, len);
407 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
408 size += 1 + len;
409 }
410 return size;
411 }
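/* Resulting wire format (derived from the code above):
 *
 *   [ be64: page offset | flags ]                       always
 *   [ u8: strlen(idstr) ][ idstr bytes, no NUL ]        only when
 *                                                       RAM_SAVE_FLAG_CONTINUE
 *                                                       is not set
 */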
412
413 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
414 * If guest dirty memory rate is reduced below the rate at which we can
415 * transfer pages to the destination then we should be able to complete
416 * migration. Some workloads dirty memory way too fast and will not effectively
417 * converge, even with auto-converge.
418 */
419 static void mig_throttle_guest_down(void)
420 {
421 MigrationState *s = migrate_get_current();
422 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 423 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
424
425 /* We have not started throttling yet. Let's start it. */
426 if (!cpu_throttle_active()) {
427 cpu_throttle_set(pct_initial);
428 } else {
429 /* Throttling already on, just increase the rate */
 430 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
431 }
432 }
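/* Illustrative example (the parameter values here are an assumption, not
 * taken from this file): with cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10, successive calls throttle the guest at
 * 20%, 30%, 40%, ... of its CPU time.
 */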
433
434 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
435 * The important thing is that a stale (not-yet-0'd) page be replaced
436 * by the new data.
437 * As a bonus, if the page wasn't in the cache it gets added so that
 438 * a later small write into the zeroed page can be sent with XBZRLE
439 */
440 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
441 {
442 if (ram_bulk_stage || !migrate_use_xbzrle()) {
443 return;
444 }
445
446 /* We don't care if this fails to allocate a new cache page
447 * as long as it updated an old one */
448 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
449 bitmap_sync_count);
450 }
451
452 #define ENCODING_FLAG_XBZRLE 0x1
453
454 /**
455 * save_xbzrle_page: compress and send current page
456 *
457 * Returns: 1 means that we wrote the page
458 * 0 means that page is identical to the one already sent
459 * -1 means that xbzrle would be longer than normal
460 *
461 * @f: QEMUFile where to send the data
 462 * @current_data: pointer to the page's host address; may be repointed at the cached copy
 463 * @current_addr: RAM address of the page, used as the cache key
464 * @block: block that contains the page we want to send
465 * @offset: offset inside the block for the page
466 * @last_stage: if we are at the completion stage
467 * @bytes_transferred: increase it with the number of transferred bytes
468 */
469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
470 ram_addr_t current_addr, RAMBlock *block,
471 ram_addr_t offset, bool last_stage,
472 uint64_t *bytes_transferred)
473 {
474 int encoded_len = 0, bytes_xbzrle;
475 uint8_t *prev_cached_page;
476
477 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
478 acct_info.xbzrle_cache_miss++;
479 if (!last_stage) {
480 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
481 bitmap_sync_count) == -1) {
482 return -1;
483 } else {
484 /* update *current_data when the page has been
485 inserted into cache */
486 *current_data = get_cached_data(XBZRLE.cache, current_addr);
487 }
488 }
489 return -1;
490 }
491
492 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
493
494 /* save current buffer into memory */
495 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
496
497 /* XBZRLE encoding (if there is no overflow) */
498 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
499 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
500 TARGET_PAGE_SIZE);
501 if (encoded_len == 0) {
502 trace_save_xbzrle_page_skipping();
503 return 0;
504 } else if (encoded_len == -1) {
505 trace_save_xbzrle_page_overflow();
506 acct_info.xbzrle_overflows++;
507 /* update data in the cache */
508 if (!last_stage) {
509 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
510 *current_data = prev_cached_page;
511 }
512 return -1;
513 }
514
 515 /* update the cached copy so that future deltas are computed against the data we just sent */
516 if (!last_stage) {
517 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
518 }
519
520 /* Send XBZRLE based compressed page */
521 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
522 qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
523 qemu_put_be16(f, encoded_len);
524 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
525 bytes_xbzrle += encoded_len + 1 + 2;
526 acct_info.xbzrle_pages++;
527 acct_info.xbzrle_bytes += bytes_xbzrle;
528 *bytes_transferred += bytes_xbzrle;
529
530 return 1;
531 }
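/* What save_xbzrle_page() puts on the wire for an encoded page:
 *
 *   save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 *   qemu_put_byte(f, ENCODING_FLAG_XBZRLE);                    1 byte
 *   qemu_put_be16(f, encoded_len);                             2 bytes
 *   qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 *
 * hence bytes_xbzrle = header size + 1 + 2 + encoded_len.
 */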
532
533 /* Called with rcu_read_lock() to protect migration_bitmap
534 * rb: The RAMBlock to search for dirty pages in
535 * start: Start address (typically so we can continue from previous page)
536 * ram_addr_abs: Pointer into which to store the address of the dirty page
537 * within the global ram_addr space
538 *
539 * Returns: byte offset within memory region of the start of a dirty page
540 */
541 static inline
542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
543 ram_addr_t start,
544 ram_addr_t *ram_addr_abs)
545 {
546 unsigned long base = rb->offset >> TARGET_PAGE_BITS;
547 unsigned long nr = base + (start >> TARGET_PAGE_BITS);
548 uint64_t rb_size = rb->used_length;
549 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
550 unsigned long *bitmap;
551
552 unsigned long next;
553
554 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
555 if (ram_bulk_stage && nr > base) {
556 next = nr + 1;
557 } else {
558 next = find_next_bit(bitmap, size, nr);
559 }
560
561 *ram_addr_abs = next << TARGET_PAGE_BITS;
562 return (next - base) << TARGET_PAGE_BITS;
563 }
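/* Worked example (assuming 4 KiB target pages): for a block at
 * rb->offset = 0x40000000 with start = 0x3000, base is 0x40000 and
 * nr is 0x40003. During the bulk stage every page after the first is
 * still assumed dirty, so the bitmap search is skipped and the next
 * page (nr + 1) is returned directly.
 */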
564
565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
566 {
567 bool ret;
568 int nr = addr >> TARGET_PAGE_BITS;
569 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
570
571 ret = test_and_clear_bit(nr, bitmap);
572
573 if (ret) {
574 migration_dirty_pages--;
575 }
576 return ret;
577 }
578
579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
580 {
581 unsigned long *bitmap;
582 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
583 migration_dirty_pages +=
584 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
585 }
586
587 /* Fix me: there are too many global variables used in migration process. */
588 static int64_t start_time;
589 static int64_t bytes_xfer_prev;
590 static int64_t num_dirty_pages_period;
591 static uint64_t xbzrle_cache_miss_prev;
592 static uint64_t iterations_prev;
593
594 static void migration_bitmap_sync_init(void)
595 {
596 start_time = 0;
597 bytes_xfer_prev = 0;
598 num_dirty_pages_period = 0;
599 xbzrle_cache_miss_prev = 0;
600 iterations_prev = 0;
601 }
602
603 static void migration_bitmap_sync(void)
604 {
605 RAMBlock *block;
606 uint64_t num_dirty_pages_init = migration_dirty_pages;
607 MigrationState *s = migrate_get_current();
608 int64_t end_time;
609 int64_t bytes_xfer_now;
610
611 bitmap_sync_count++;
612
613 if (!bytes_xfer_prev) {
614 bytes_xfer_prev = ram_bytes_transferred();
615 }
616
617 if (!start_time) {
618 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
619 }
620
621 trace_migration_bitmap_sync_start();
622 memory_global_dirty_log_sync();
623
624 qemu_mutex_lock(&migration_bitmap_mutex);
625 rcu_read_lock();
626 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
627 migration_bitmap_sync_range(block->offset, block->used_length);
628 }
629 rcu_read_unlock();
630 qemu_mutex_unlock(&migration_bitmap_mutex);
631
632 trace_migration_bitmap_sync_end(migration_dirty_pages
633 - num_dirty_pages_init);
634 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
635 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
636
 637 /* more than 1 second = 1000 milliseconds */
638 if (end_time > start_time + 1000) {
639 if (migrate_auto_converge()) {
 640 /* The following detection logic can be refined later. For now:
 641 check whether the bytes dirtied in this sync period exceed half of
 642 the bytes that were transferred in the same period. If that keeps
 643 happening, start or increase
 644 throttling */
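/* Walk-through of the trigger: dirty_rate_high_cnt is bumped in every sync
 * period in which the guest dirtied more than half of what was transferred,
 * and it is only reset once throttling is applied, so the guest gets
 * throttled (or throttled harder) on the third such period.
 */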
645 bytes_xfer_now = ram_bytes_transferred();
646
647 if (s->dirty_pages_rate &&
648 (num_dirty_pages_period * TARGET_PAGE_SIZE >
649 (bytes_xfer_now - bytes_xfer_prev)/2) &&
650 (dirty_rate_high_cnt++ >= 2)) {
651 trace_migration_throttle();
652 dirty_rate_high_cnt = 0;
653 mig_throttle_guest_down();
654 }
655 bytes_xfer_prev = bytes_xfer_now;
656 }
657
658 if (migrate_use_xbzrle()) {
659 if (iterations_prev != acct_info.iterations) {
660 acct_info.xbzrle_cache_miss_rate =
661 (double)(acct_info.xbzrle_cache_miss -
662 xbzrle_cache_miss_prev) /
663 (acct_info.iterations - iterations_prev);
664 }
665 iterations_prev = acct_info.iterations;
666 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
667 }
668 s->dirty_pages_rate = num_dirty_pages_period * 1000
669 / (end_time - start_time);
670 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
671 start_time = end_time;
672 num_dirty_pages_period = 0;
673 }
674 s->dirty_sync_count = bitmap_sync_count;
675 if (migrate_use_events()) {
676 qapi_event_send_migration_pass(bitmap_sync_count, NULL);
677 }
678 }
679
680 /**
681 * save_zero_page: Send the zero page to the stream
682 *
683 * Returns: Number of pages written.
684 *
685 * @f: QEMUFile where to send the data
686 * @block: block that contains the page we want to send
687 * @offset: offset inside the block for the page
688 * @p: pointer to the page
689 * @bytes_transferred: increase it with the number of transferred bytes
690 */
691 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
692 uint8_t *p, uint64_t *bytes_transferred)
693 {
694 int pages = -1;
695
696 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
697 acct_info.dup_pages++;
698 *bytes_transferred += save_page_header(f, block,
699 offset | RAM_SAVE_FLAG_COMPRESS);
700 qemu_put_byte(f, 0);
701 *bytes_transferred += 1;
702 pages = 1;
703 }
704
705 return pages;
706 }
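/* A zero page therefore costs only the page header plus a single literal
 * 0x00 byte on the wire (RAM_SAVE_FLAG_COMPRESS followed by the fill byte),
 * instead of a full TARGET_PAGE_SIZE payload.
 */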
707
708 /**
709 * ram_save_page: Send the given page to the stream
710 *
711 * Returns: Number of pages written.
712 * < 0 - error
713 * >=0 - Number of pages written - this might legally be 0
714 * if xbzrle noticed the page was the same.
715 *
716 * @ms: The current migration state.
717 * @f: QEMUFile where to send the data
 718 * @pss: data about the page we want to send (the block and the offset
 719 * inside the block)
720 * @last_stage: if we are at the completion stage
721 * @bytes_transferred: increase it with the number of transferred bytes
722 */
723 static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
724 bool last_stage, uint64_t *bytes_transferred)
725 {
726 int pages = -1;
727 uint64_t bytes_xmit;
728 ram_addr_t current_addr;
729 uint8_t *p;
730 int ret;
731 bool send_async = true;
732 RAMBlock *block = pss->block;
733 ram_addr_t offset = pss->offset;
734
735 p = block->host + offset;
736
 737 /* When in doubt, send the page as a normal page */
738 bytes_xmit = 0;
739 ret = ram_control_save_page(f, block->offset,
740 offset, TARGET_PAGE_SIZE, &bytes_xmit);
741 if (bytes_xmit) {
742 *bytes_transferred += bytes_xmit;
743 pages = 1;
744 }
745
746 XBZRLE_cache_lock();
747
748 current_addr = block->offset + offset;
749
750 if (block == last_sent_block) {
751 offset |= RAM_SAVE_FLAG_CONTINUE;
752 }
753 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
754 if (ret != RAM_SAVE_CONTROL_DELAYED) {
755 if (bytes_xmit > 0) {
756 acct_info.norm_pages++;
757 } else if (bytes_xmit == 0) {
758 acct_info.dup_pages++;
759 }
760 }
761 } else {
762 pages = save_zero_page(f, block, offset, p, bytes_transferred);
763 if (pages > 0) {
764 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
765 * page would be stale
766 */
767 xbzrle_cache_zero_page(current_addr);
768 } else if (!ram_bulk_stage &&
769 !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
770 pages = save_xbzrle_page(f, &p, current_addr, block,
771 offset, last_stage, bytes_transferred);
772 if (!last_stage) {
773 /* Can't send this cached data async, since the cache page
774 * might get updated before it gets to the wire
775 */
776 send_async = false;
777 }
778 }
779 }
780
781 /* XBZRLE overflow or normal page */
782 if (pages == -1) {
783 *bytes_transferred += save_page_header(f, block,
784 offset | RAM_SAVE_FLAG_PAGE);
785 if (send_async) {
786 qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
787 } else {
788 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
789 }
790 *bytes_transferred += TARGET_PAGE_SIZE;
791 pages = 1;
792 acct_info.norm_pages++;
793 }
794
795 XBZRLE_cache_unlock();
796
797 return pages;
798 }
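/* Summary of the decision order above: a hook (e.g. RDMA) gets the first
 * chance via ram_control_save_page(); if it doesn't handle the page, it is
 * sent as a zero page when possible, then via XBZRLE if enabled and past the
 * bulk stage, and finally as a full copy of TARGET_PAGE_SIZE bytes.
 */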
799
800 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
801 ram_addr_t offset)
802 {
803 int bytes_sent, blen;
804 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
805
806 bytes_sent = save_page_header(f, block, offset |
807 RAM_SAVE_FLAG_COMPRESS_PAGE);
808 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
809 migrate_compress_level());
810 if (blen < 0) {
811 bytes_sent = 0;
812 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
813 error_report("compressed data failed!");
814 } else {
815 bytes_sent += blen;
816 }
817
818 return bytes_sent;
819 }
820
821 static uint64_t bytes_transferred;
822
823 static void flush_compressed_data(QEMUFile *f)
824 {
825 int idx, len, thread_count;
826
827 if (!migrate_use_compression()) {
828 return;
829 }
830 thread_count = migrate_compress_threads();
831
832 qemu_mutex_lock(&comp_done_lock);
833 for (idx = 0; idx < thread_count; idx++) {
834 while (!comp_param[idx].done) {
835 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
836 }
837 }
838 qemu_mutex_unlock(&comp_done_lock);
839
840 for (idx = 0; idx < thread_count; idx++) {
841 qemu_mutex_lock(&comp_param[idx].mutex);
842 if (!comp_param[idx].quit) {
843 len = qemu_put_qemu_file(f, comp_param[idx].file);
844 bytes_transferred += len;
845 }
846 qemu_mutex_unlock(&comp_param[idx].mutex);
847 }
848 }
849
850 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
851 ram_addr_t offset)
852 {
853 param->block = block;
854 param->offset = offset;
855 }
856
857 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
858 ram_addr_t offset,
859 uint64_t *bytes_transferred)
860 {
861 int idx, thread_count, bytes_xmit = -1, pages = -1;
862
863 thread_count = migrate_compress_threads();
864 qemu_mutex_lock(&comp_done_lock);
865 while (true) {
866 for (idx = 0; idx < thread_count; idx++) {
867 if (comp_param[idx].done) {
868 comp_param[idx].done = false;
869 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
870 qemu_mutex_lock(&comp_param[idx].mutex);
871 set_compress_params(&comp_param[idx], block, offset);
872 qemu_cond_signal(&comp_param[idx].cond);
873 qemu_mutex_unlock(&comp_param[idx].mutex);
874 pages = 1;
875 acct_info.norm_pages++;
876 *bytes_transferred += bytes_xmit;
877 break;
878 }
879 }
880 if (pages > 0) {
881 break;
882 } else {
883 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
884 }
885 }
886 qemu_mutex_unlock(&comp_done_lock);
887
888 return pages;
889 }
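/* Hand-off scheme: the first idle worker (done == true) has its buffered
 * output flushed into the migration stream, is handed the new block/offset
 * and signalled; if no worker is idle we wait on comp_done_cond until one
 * finishes.
 */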
890
891 /**
892 * ram_save_compressed_page: compress the given page and send it to the stream
893 *
894 * Returns: Number of pages written.
895 *
896 * @ms: The current migration state.
897 * @f: QEMUFile where to send the data
 898 * @pss: data about the page we want to send (the block and the offset
 899 * inside the block)
900 * @last_stage: if we are at the completion stage
901 * @bytes_transferred: increase it with the number of transferred bytes
902 */
903 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
904 PageSearchStatus *pss, bool last_stage,
905 uint64_t *bytes_transferred)
906 {
907 int pages = -1;
908 uint64_t bytes_xmit = 0;
909 uint8_t *p;
910 int ret, blen;
911 RAMBlock *block = pss->block;
912 ram_addr_t offset = pss->offset;
913
914 p = block->host + offset;
915
916 ret = ram_control_save_page(f, block->offset,
917 offset, TARGET_PAGE_SIZE, &bytes_xmit);
918 if (bytes_xmit) {
919 *bytes_transferred += bytes_xmit;
920 pages = 1;
921 }
922 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
923 if (ret != RAM_SAVE_CONTROL_DELAYED) {
924 if (bytes_xmit > 0) {
925 acct_info.norm_pages++;
926 } else if (bytes_xmit == 0) {
927 acct_info.dup_pages++;
928 }
929 }
930 } else {
931 /* When starting the process of a new block, the first page of
932 * the block should be sent out before other pages in the same
 933 * block, and all the pages in the previous block should have been sent
 934 * out. Keeping this order is important, because the 'cont' flag
935 * is used to avoid resending the block name.
936 */
937 if (block != last_sent_block) {
938 flush_compressed_data(f);
939 pages = save_zero_page(f, block, offset, p, bytes_transferred);
940 if (pages == -1) {
941 /* Make sure the first page is sent out before other pages */
942 bytes_xmit = save_page_header(f, block, offset |
943 RAM_SAVE_FLAG_COMPRESS_PAGE);
944 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
945 migrate_compress_level());
946 if (blen > 0) {
947 *bytes_transferred += bytes_xmit + blen;
948 acct_info.norm_pages++;
949 pages = 1;
950 } else {
951 qemu_file_set_error(f, blen);
952 error_report("compressed data failed!");
953 }
954 }
955 } else {
956 offset |= RAM_SAVE_FLAG_CONTINUE;
957 pages = save_zero_page(f, block, offset, p, bytes_transferred);
958 if (pages == -1) {
959 pages = compress_page_with_multi_thread(f, block, offset,
960 bytes_transferred);
961 }
962 }
963 }
964
965 return pages;
966 }
967
968 /*
969 * Find the next dirty page and update any state associated with
970 * the search process.
971 *
972 * Returns: True if a page is found
973 *
974 * @f: Current migration stream.
975 * @pss: Data about the state of the current dirty page scan.
976 * @*again: Set to false if the search has scanned the whole of RAM
977 * *ram_addr_abs: Pointer into which to store the address of the dirty page
978 * within the global ram_addr space
979 */
980 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
981 bool *again, ram_addr_t *ram_addr_abs)
982 {
983 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
984 ram_addr_abs);
985 if (pss->complete_round && pss->block == last_seen_block &&
986 pss->offset >= last_offset) {
987 /*
988 * We've been once around the RAM and haven't found anything.
989 * Give up.
990 */
991 *again = false;
992 return false;
993 }
994 if (pss->offset >= pss->block->used_length) {
995 /* Didn't find anything in this RAM Block */
996 pss->offset = 0;
997 pss->block = QLIST_NEXT_RCU(pss->block, next);
998 if (!pss->block) {
999 /* Hit the end of the list */
1000 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1001 /* Flag that we've looped */
1002 pss->complete_round = true;
1003 ram_bulk_stage = false;
1004 if (migrate_use_xbzrle()) {
1005 /* If xbzrle is on, stop using the data compression at this
1006 * point. In theory, xbzrle can do better than compression.
1007 */
1008 flush_compressed_data(f);
1009 compression_switch = false;
1010 }
1011 }
1012 /* Didn't find anything this time, but try again on the new block */
1013 *again = true;
1014 return false;
1015 } else {
1016 /* Can go around again, but... */
1017 *again = true;
1018 /* We've found something so probably don't need to */
1019 return true;
1020 }
1021 }
1022
1023 /*
1024 * Helper for 'get_queued_page' - gets a page off the queue
1025 * ms: MigrationState in
1026 * *offset: Used to return the offset within the RAMBlock
1027 * ram_addr_abs: global offset in the dirty/sent bitmaps
1028 *
1029 * Returns: block (or NULL if none available)
1030 */
1031 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1032 ram_addr_t *ram_addr_abs)
1033 {
1034 RAMBlock *block = NULL;
1035
1036 qemu_mutex_lock(&ms->src_page_req_mutex);
1037 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1038 struct MigrationSrcPageRequest *entry =
1039 QSIMPLEQ_FIRST(&ms->src_page_requests);
1040 block = entry->rb;
1041 *offset = entry->offset;
1042 *ram_addr_abs = (entry->offset + entry->rb->offset) &
1043 TARGET_PAGE_MASK;
1044
1045 if (entry->len > TARGET_PAGE_SIZE) {
1046 entry->len -= TARGET_PAGE_SIZE;
1047 entry->offset += TARGET_PAGE_SIZE;
1048 } else {
1049 memory_region_unref(block->mr);
1050 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1051 g_free(entry);
1052 }
1053 }
1054 qemu_mutex_unlock(&ms->src_page_req_mutex);
1055
1056 return block;
1057 }
1058
1059 /*
1060 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1061 * that are already sent (!dirty)
1062 *
1063 * ms: MigrationState in
1064 * pss: PageSearchStatus structure updated with found block/offset
1065 * ram_addr_abs: global offset in the dirty/sent bitmaps
1066 *
1067 * Returns: true if a queued page is found
1068 */
1069 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1070 ram_addr_t *ram_addr_abs)
1071 {
1072 RAMBlock *block;
1073 ram_addr_t offset;
1074 bool dirty;
1075
1076 do {
1077 block = unqueue_page(ms, &offset, ram_addr_abs);
1078 /*
1079 * We're sending this page, and since it's postcopy nothing else
1080 * will dirty it, and we must make sure it doesn't get sent again
1081 * even if this queue request was received after the background
1082 * search already sent it.
1083 */
1084 if (block) {
1085 unsigned long *bitmap;
1086 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1087 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1088 if (!dirty) {
1089 trace_get_queued_page_not_dirty(
1090 block->idstr, (uint64_t)offset,
1091 (uint64_t)*ram_addr_abs,
1092 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1093 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1094 } else {
1095 trace_get_queued_page(block->idstr,
1096 (uint64_t)offset,
1097 (uint64_t)*ram_addr_abs);
1098 }
1099 }
1100
1101 } while (block && !dirty);
1102
1103 if (block) {
1104 /*
1105 * As soon as we start servicing pages out of order, then we have
1106 * to kill the bulk stage, since the bulk stage assumes
 1107 * in migration_bitmap_find_dirty() that every page is
1108 * dirty, that's no longer true.
1109 */
1110 ram_bulk_stage = false;
1111
1112 /*
1113 * We want the background search to continue from the queued page
1114 * since the guest is likely to want other pages near to the page
1115 * it just requested.
1116 */
1117 pss->block = block;
1118 pss->offset = offset;
1119 }
1120
1121 return !!block;
1122 }
1123
1124 /**
 1125 * flush_page_queue: Flush any remaining pages in the ram request queue.
 1126 * It should be empty at the end anyway, but in error cases there may be
1127 * some left.
1128 *
1129 * ms: MigrationState
1130 */
1131 void flush_page_queue(MigrationState *ms)
1132 {
1133 struct MigrationSrcPageRequest *mspr, *next_mspr;
1134 /* This queue generally should be empty - but in the case of a failed
1135 * migration might have some droppings in.
1136 */
1137 rcu_read_lock();
1138 QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1139 memory_region_unref(mspr->rb->mr);
1140 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1141 g_free(mspr);
1142 }
1143 rcu_read_unlock();
1144 }
1145
1146 /**
1147 * Queue the pages for transmission, e.g. a request from postcopy destination
 1148 * ms: MigrationState in which the queue is held
1149 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1150 * start: Offset from the start of the RAMBlock
1151 * len: Length (in bytes) to send
1152 * Return: 0 on success
1153 */
1154 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1155 ram_addr_t start, ram_addr_t len)
1156 {
1157 RAMBlock *ramblock;
1158
1159 ms->postcopy_requests++;
1160 rcu_read_lock();
1161 if (!rbname) {
1162 /* Reuse last RAMBlock */
1163 ramblock = ms->last_req_rb;
1164
1165 if (!ramblock) {
1166 /*
1167 * Shouldn't happen, we can't reuse the last RAMBlock if
1168 * it's the 1st request.
1169 */
1170 error_report("ram_save_queue_pages no previous block");
1171 goto err;
1172 }
1173 } else {
1174 ramblock = qemu_ram_block_by_name(rbname);
1175
1176 if (!ramblock) {
1177 /* We shouldn't be asked for a non-existent RAMBlock */
1178 error_report("ram_save_queue_pages no block '%s'", rbname);
1179 goto err;
1180 }
1181 ms->last_req_rb = ramblock;
1182 }
1183 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1184 if (start+len > ramblock->used_length) {
1185 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1186 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1187 __func__, start, len, ramblock->used_length);
1188 goto err;
1189 }
1190
1191 struct MigrationSrcPageRequest *new_entry =
1192 g_malloc0(sizeof(struct MigrationSrcPageRequest));
1193 new_entry->rb = ramblock;
1194 new_entry->offset = start;
1195 new_entry->len = len;
1196
1197 memory_region_ref(ramblock->mr);
1198 qemu_mutex_lock(&ms->src_page_req_mutex);
1199 QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1200 qemu_mutex_unlock(&ms->src_page_req_mutex);
1201 rcu_read_unlock();
1202
1203 return 0;
1204
1205 err:
1206 rcu_read_unlock();
1207 return -1;
1208 }
1209
1210 /**
1211 * ram_save_target_page: Save one target page
1212 *
1213 *
1214 * @f: QEMUFile where to send the data
 1215 * @pss: data about the page we want to send (the block and the offset
 1216 * inside the block)
1217 * @last_stage: if we are at the completion stage
1218 * @bytes_transferred: increase it with the number of transferred bytes
1219 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1220 *
1221 * Returns: Number of pages written.
1222 */
1223 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1224 PageSearchStatus *pss,
1225 bool last_stage,
1226 uint64_t *bytes_transferred,
1227 ram_addr_t dirty_ram_abs)
1228 {
1229 int res = 0;
1230
 1231 /* Check whether the page is dirty and, if so, send it */
1232 if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1233 unsigned long *unsentmap;
1234 if (compression_switch && migrate_use_compression()) {
1235 res = ram_save_compressed_page(ms, f, pss,
1236 last_stage,
1237 bytes_transferred);
1238 } else {
1239 res = ram_save_page(ms, f, pss, last_stage,
1240 bytes_transferred);
1241 }
1242
1243 if (res < 0) {
1244 return res;
1245 }
1246 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1247 if (unsentmap) {
1248 clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1249 }
1250 /* Only update last_sent_block if a block was actually sent; xbzrle
1251 * might have decided the page was identical so didn't bother writing
1252 * to the stream.
1253 */
1254 if (res > 0) {
1255 last_sent_block = pss->block;
1256 }
1257 }
1258
1259 return res;
1260 }
1261
1262 /**
1263 * ram_save_host_page: Starting at *offset send pages up to the end
1264 * of the current host page. It's valid for the initial
1265 * offset to point into the middle of a host page
1266 * in which case the remainder of the hostpage is sent.
1267 * Only dirty target pages are sent.
1268 *
1269 * Returns: Number of pages written.
1270 *
1271 * @f: QEMUFile where to send the data
 1272 * @pss: data about the page we want to send (the block and the offset
 1273 * inside the block); pss->offset is updated to the last target page
 1274 * sent
1275 * @last_stage: if we are at the completion stage
1276 * @bytes_transferred: increase it with the number of transferred bytes
1277 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1278 */
1279 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1280 PageSearchStatus *pss,
1281 bool last_stage,
1282 uint64_t *bytes_transferred,
1283 ram_addr_t dirty_ram_abs)
1284 {
1285 int tmppages, pages = 0;
1286 do {
1287 tmppages = ram_save_target_page(ms, f, pss, last_stage,
1288 bytes_transferred, dirty_ram_abs);
1289 if (tmppages < 0) {
1290 return tmppages;
1291 }
1292
1293 pages += tmppages;
1294 pss->offset += TARGET_PAGE_SIZE;
1295 dirty_ram_abs += TARGET_PAGE_SIZE;
1296 } while (pss->offset & (qemu_host_page_size - 1));
1297
1298 /* The offset we leave with is the last one we looked at */
1299 pss->offset -= TARGET_PAGE_SIZE;
1300 return pages;
1301 }
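/* Illustrative example (the page sizes are an assumption, not from this
 * file): with 64 KiB host pages and 4 KiB target pages one call covers up
 * to 16 target pages, sending only those whose dirty bit is still set.
 */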
1302
1303 /**
1304 * ram_find_and_save_block: Finds a dirty page and sends it to f
1305 *
1306 * Called within an RCU critical section.
1307 *
1308 * Returns: The number of pages written
1309 * 0 means no dirty pages
1310 *
1311 * @f: QEMUFile where to send the data
1312 * @last_stage: if we are at the completion stage
1313 * @bytes_transferred: increase it with the number of transferred bytes
1314 *
1315 * On systems where host-page-size > target-page-size it will send all the
1316 * pages in a host page that are dirty.
1317 */
1318
1319 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1320 uint64_t *bytes_transferred)
1321 {
1322 PageSearchStatus pss;
1323 MigrationState *ms = migrate_get_current();
1324 int pages = 0;
1325 bool again, found;
1326 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1327 ram_addr_t space */
1328
1329 pss.block = last_seen_block;
1330 pss.offset = last_offset;
1331 pss.complete_round = false;
1332
1333 if (!pss.block) {
1334 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1335 }
1336
1337 do {
1338 again = true;
1339 found = get_queued_page(ms, &pss, &dirty_ram_abs);
1340
1341 if (!found) {
1342 /* priority queue empty, so just search for something dirty */
1343 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1344 }
1345
1346 if (found) {
1347 pages = ram_save_host_page(ms, f, &pss,
1348 last_stage, bytes_transferred,
1349 dirty_ram_abs);
1350 }
1351 } while (!pages && again);
1352
1353 last_seen_block = pss.block;
1354 last_offset = pss.offset;
1355
1356 return pages;
1357 }
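/* Note on ordering: queued postcopy page requests are serviced before the
 * linear dirty-bitmap scan, and servicing one also drops us out of the bulk
 * stage, since pages are then sent out of order.
 */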
1358
1359 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1360 {
1361 uint64_t pages = size / TARGET_PAGE_SIZE;
1362 if (zero) {
1363 acct_info.dup_pages += pages;
1364 } else {
1365 acct_info.norm_pages += pages;
1366 bytes_transferred += size;
1367 qemu_update_position(f, size);
1368 }
1369 }
1370
1371 static ram_addr_t ram_save_remaining(void)
1372 {
1373 return migration_dirty_pages;
1374 }
1375
1376 uint64_t ram_bytes_remaining(void)
1377 {
1378 return ram_save_remaining() * TARGET_PAGE_SIZE;
1379 }
1380
1381 uint64_t ram_bytes_transferred(void)
1382 {
1383 return bytes_transferred;
1384 }
1385
1386 uint64_t ram_bytes_total(void)
1387 {
1388 RAMBlock *block;
1389 uint64_t total = 0;
1390
1391 rcu_read_lock();
1392 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1393 total += block->used_length;
1394 rcu_read_unlock();
1395 return total;
1396 }
1397
1398 void free_xbzrle_decoded_buf(void)
1399 {
1400 g_free(xbzrle_decoded_buf);
1401 xbzrle_decoded_buf = NULL;
1402 }
1403
1404 static void migration_bitmap_free(struct BitmapRcu *bmap)
1405 {
1406 g_free(bmap->bmap);
1407 g_free(bmap->unsentmap);
1408 g_free(bmap);
1409 }
1410
1411 static void ram_migration_cleanup(void *opaque)
1412 {
 1413 /* The caller must hold the iothread lock or be in a bottom half, so there is
1414 * no writing race against this migration_bitmap
1415 */
1416 struct BitmapRcu *bitmap = migration_bitmap_rcu;
1417 atomic_rcu_set(&migration_bitmap_rcu, NULL);
1418 if (bitmap) {
1419 memory_global_dirty_log_stop();
1420 call_rcu(bitmap, migration_bitmap_free, rcu);
1421 }
1422
1423 XBZRLE_cache_lock();
1424 if (XBZRLE.cache) {
1425 cache_fini(XBZRLE.cache);
1426 g_free(XBZRLE.encoded_buf);
1427 g_free(XBZRLE.current_buf);
1428 g_free(ZERO_TARGET_PAGE);
1429 XBZRLE.cache = NULL;
1430 XBZRLE.encoded_buf = NULL;
1431 XBZRLE.current_buf = NULL;
1432 }
1433 XBZRLE_cache_unlock();
1434 }
1435
1436 static void reset_ram_globals(void)
1437 {
1438 last_seen_block = NULL;
1439 last_sent_block = NULL;
1440 last_offset = 0;
1441 last_version = ram_list.version;
1442 ram_bulk_stage = true;
1443 }
1444
1445 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1446
1447 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1448 {
1449 /* called in qemu main thread, so there is
1450 * no writing race against this migration_bitmap
1451 */
1452 if (migration_bitmap_rcu) {
1453 struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1454 bitmap = g_new(struct BitmapRcu, 1);
1455 bitmap->bmap = bitmap_new(new);
1456
 1457 /* Prevent bits in the migration bitmap from being set
 1458 * by migration_bitmap_sync_range() while we swap bitmaps;
 1459 * it is safe for migration if bits are cleared
 1460 * at the same time.
1461 */
1462 qemu_mutex_lock(&migration_bitmap_mutex);
1463 bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1464 bitmap_set(bitmap->bmap, old, new - old);
1465
 1466 /* We don't have a way to safely extend the unsentmap
 1467 * with RCU, so mark it as missing; entry to postcopy
1468 * will fail.
1469 */
1470 bitmap->unsentmap = NULL;
1471
1472 atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1473 qemu_mutex_unlock(&migration_bitmap_mutex);
1474 migration_dirty_pages += new - old;
1475 call_rcu(old_bitmap, migration_bitmap_free, rcu);
1476 }
1477 }
1478
1479 /*
1480 * 'expected' is the value you expect the bitmap mostly to be full
1481 * of; it won't bother printing lines that are all this value.
1482 * If 'todump' is null the migration bitmap is dumped.
1483 */
1484 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1485 {
1486 int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1487
1488 int64_t cur;
1489 int64_t linelen = 128;
1490 char linebuf[129];
1491
1492 if (!todump) {
1493 todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1494 }
1495
1496 for (cur = 0; cur < ram_pages; cur += linelen) {
1497 int64_t curb;
1498 bool found = false;
1499 /*
1500 * Last line; catch the case where the line length
1501 * is longer than remaining ram
1502 */
1503 if (cur + linelen > ram_pages) {
1504 linelen = ram_pages - cur;
1505 }
1506 for (curb = 0; curb < linelen; curb++) {
1507 bool thisbit = test_bit(cur + curb, todump);
1508 linebuf[curb] = thisbit ? '1' : '.';
1509 found = found || (thisbit != expected);
1510 }
1511 if (found) {
1512 linebuf[curb] = '\0';
1513 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1514 }
1515 }
1516 }
1517
1518 /* **** functions for postcopy ***** */
1519
1520 /*
1521 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1522 * Note: At this point the 'unsentmap' is the processed bitmap combined
1523 * with the dirtymap; so a '1' means it's either dirty or unsent.
1524 * start,length: Indexes into the bitmap for the first bit
1525 * representing the named block and length in target-pages
1526 */
1527 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1528 PostcopyDiscardState *pds,
1529 unsigned long start,
1530 unsigned long length)
1531 {
1532 unsigned long end = start + length; /* one after the end */
1533 unsigned long current;
1534 unsigned long *unsentmap;
1535
1536 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1537 for (current = start; current < end; ) {
1538 unsigned long one = find_next_bit(unsentmap, end, current);
1539
1540 if (one <= end) {
1541 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1542 unsigned long discard_length;
1543
1544 if (zero >= end) {
1545 discard_length = end - one;
1546 } else {
1547 discard_length = zero - one;
1548 }
1549 if (discard_length) {
1550 postcopy_discard_send_range(ms, pds, one, discard_length);
1551 }
1552 current = one + discard_length;
1553 } else {
1554 current = one;
1555 }
1556 }
1557
1558 return 0;
1559 }
1560
1561 /*
1562 * Utility for the outgoing postcopy code.
1563 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1564 * passing it bitmap indexes and name.
1565 * Returns: 0 on success
1566 * (qemu_ram_foreach_block ends up passing unscaled lengths
1567 * which would mean postcopy code would have to deal with target page)
1568 */
1569 static int postcopy_each_ram_send_discard(MigrationState *ms)
1570 {
1571 struct RAMBlock *block;
1572 int ret;
1573
1574 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1575 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1576 PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1577 first,
1578 block->idstr);
1579
1580 /*
1581 * Postcopy sends chunks of bitmap over the wire, but it
1582 * just needs indexes at this point, avoids it having
1583 * target page specific code.
1584 */
1585 ret = postcopy_send_discard_bm_ram(ms, pds, first,
1586 block->used_length >> TARGET_PAGE_BITS);
1587 postcopy_discard_send_finish(ms, pds);
1588 if (ret) {
1589 return ret;
1590 }
1591 }
1592
1593 return 0;
1594 }
1595
1596 /*
1597 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
1598 * the two bitmaps, that are similar, but one is inverted.
1599 *
1600 * We search for runs of target-pages that don't start or end on a
1601 * host page boundary;
1602 * unsent_pass=true: Cleans up partially unsent host pages by searching
1603 * the unsentmap
1604 * unsent_pass=false: Cleans up partially dirty host pages by searching
1605 * the main migration bitmap
1606 *
1607 */
1608 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1609 RAMBlock *block,
1610 PostcopyDiscardState *pds)
1611 {
1612 unsigned long *bitmap;
1613 unsigned long *unsentmap;
1614 unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1615 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1616 unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1617 unsigned long last = first + (len - 1);
1618 unsigned long run_start;
1619
1620 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1621 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1622
1623 if (unsent_pass) {
1624 /* Find a sent page */
1625 run_start = find_next_zero_bit(unsentmap, last + 1, first);
1626 } else {
1627 /* Find a dirty page */
1628 run_start = find_next_bit(bitmap, last + 1, first);
1629 }
1630
1631 while (run_start <= last) {
1632 bool do_fixup = false;
1633 unsigned long fixup_start_addr;
1634 unsigned long host_offset;
1635
1636 /*
1637 * If the start of this run of pages is in the middle of a host
1638 * page, then we need to fixup this host page.
1639 */
1640 host_offset = run_start % host_ratio;
1641 if (host_offset) {
1642 do_fixup = true;
1643 run_start -= host_offset;
1644 fixup_start_addr = run_start;
1645 /* For the next pass */
1646 run_start = run_start + host_ratio;
1647 } else {
1648 /* Find the end of this run */
1649 unsigned long run_end;
1650 if (unsent_pass) {
1651 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1652 } else {
1653 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1654 }
1655 /*
1656 * If the end isn't at the start of a host page, then the
1657 * run doesn't finish at the end of a host page
1658 * and we need to discard.
1659 */
1660 host_offset = run_end % host_ratio;
1661 if (host_offset) {
1662 do_fixup = true;
1663 fixup_start_addr = run_end - host_offset;
1664 /*
1665 * This host page has gone, the next loop iteration starts
1666 * from after the fixup
1667 */
1668 run_start = fixup_start_addr + host_ratio;
1669 } else {
1670 /*
1671 * No discards on this iteration, next loop starts from
1672 * next sent/dirty page
1673 */
1674 run_start = run_end + 1;
1675 }
1676 }
1677
1678 if (do_fixup) {
1679 unsigned long page;
1680
1681 /* Tell the destination to discard this page */
1682 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1683 /* For the unsent_pass we:
1684 * discard partially sent pages
1685 * For the !unsent_pass (dirty) we:
1686 * discard partially dirty pages that were sent
1687 * (any partially sent pages were already discarded
1688 * by the previous unsent_pass)
1689 */
1690 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1691 host_ratio);
1692 }
1693
1694 /* Clean up the bitmap */
1695 for (page = fixup_start_addr;
1696 page < fixup_start_addr + host_ratio; page++) {
1697 /* All pages in this host page are now not sent */
1698 set_bit(page, unsentmap);
1699
1700 /*
1701 * Remark them as dirty, updating the count for any pages
1702 * that weren't previously dirty.
1703 */
1704 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1705 }
1706 }
1707
1708 if (unsent_pass) {
1709 /* Find the next sent page for the next iteration */
1710 run_start = find_next_zero_bit(unsentmap, last + 1,
1711 run_start);
1712 } else {
1713 /* Find the next dirty page for the next iteration */
1714 run_start = find_next_bit(bitmap, last + 1, run_start);
1715 }
1716 }
1717 }
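/* Illustrative example (the page sizes are an assumption): with a host_ratio
 * of 16 (64 KiB host pages, 4 KiB target pages), a run starting at target
 * page 18 begins in the middle of a host page (18 % 16 == 2), so a discard
 * is sent for the host page covering target pages 16..31 (unless it was
 * never sent at all) and all 16 pages are marked unsent and dirty again on
 * the source.
 */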
1718
1719 /*
1720 * Utility for the outgoing postcopy code.
1721 *
1722 * Discard any partially sent host-page size chunks, mark any partially
1723 * dirty host-page size chunks as all dirty.
1724 *
1725 * Returns: 0 on success
1726 */
1727 static int postcopy_chunk_hostpages(MigrationState *ms)
1728 {
1729 struct RAMBlock *block;
1730
1731 if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1732 /* Easy case - TPS==HPS - nothing to be done */
1733 return 0;
1734 }
1735
1736 /* Easiest way to make sure we don't resume in the middle of a host-page */
1737 last_seen_block = NULL;
1738 last_sent_block = NULL;
1739 last_offset = 0;
1740
1741 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1742 unsigned long first = block->offset >> TARGET_PAGE_BITS;
1743
1744 PostcopyDiscardState *pds =
1745 postcopy_discard_send_init(ms, first, block->idstr);
1746
1747 /* First pass: Discard all partially sent host pages */
1748 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1749 /*
1750 * Second pass: Ensure that all partially dirty host pages are made
1751 * fully dirty.
1752 */
1753 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1754
1755 postcopy_discard_send_finish(ms, pds);
1756 } /* ram_list loop */
1757
1758 return 0;
1759 }
1760
1761 /*
 1762 * Transmit the set of pages to be discarded after precopy to the target;
 1763 * these are pages that:
1764 * a) Have been previously transmitted but are now dirty again
1765 * b) Pages that have never been transmitted, this ensures that
1766 * any pages on the destination that have been mapped by background
1767 * tasks get discarded (transparent huge pages is the specific concern)
1768 * Hopefully this is pretty sparse
1769 */
1770 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1771 {
1772 int ret;
1773 unsigned long *bitmap, *unsentmap;
1774
1775 rcu_read_lock();
1776
1777 /* This should be our last sync, the src is now paused */
1778 migration_bitmap_sync();
1779
1780 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1781 if (!unsentmap) {
 1782 /* We don't have a safe way to resize the unsentmap, so
1783 * if the bitmap was resized it will be NULL at this
1784 * point.
1785 */
1786 error_report("migration ram resized during precopy phase");
1787 rcu_read_unlock();
1788 return -EINVAL;
1789 }
1790
1791 /* Deal with TPS != HPS */
1792 ret = postcopy_chunk_hostpages(ms);
1793 if (ret) {
1794 rcu_read_unlock();
1795 return ret;
1796 }
1797
1798 /*
1799 * Update the unsentmap to be unsentmap = unsentmap | dirty
1800 */
1801 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1802 bitmap_or(unsentmap, unsentmap, bitmap,
1803 last_ram_offset() >> TARGET_PAGE_BITS);
1804
1805
1806 trace_ram_postcopy_send_discard_bitmap();
1807 #ifdef DEBUG_POSTCOPY
1808 ram_debug_dump_bitmap(unsentmap, true);
1809 #endif
1810
1811 ret = postcopy_each_ram_send_discard(ms);
1812 rcu_read_unlock();
1813
1814 return ret;
1815 }
1816
1817 /*
1818 * At the start of the postcopy phase of migration, any now-dirty
1819 * precopied pages are discarded.
1820 *
1821 * start, length describe a byte address range within the RAMBlock
1822 *
1823 * Returns 0 on success.
1824 */
1825 int ram_discard_range(MigrationIncomingState *mis,
1826 const char *block_name,
1827 uint64_t start, size_t length)
1828 {
1829 int ret = -1;
1830
1831 rcu_read_lock();
1832 RAMBlock *rb = qemu_ram_block_by_name(block_name);
1833
1834 if (!rb) {
1835 error_report("ram_discard_range: Failed to find block '%s'",
1836 block_name);
1837 goto err;
1838 }
1839
1840 uint8_t *host_startaddr = rb->host + start;
1841
1842 if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1843 error_report("ram_discard_range: Unaligned start address: %p",
1844 host_startaddr);
1845 goto err;
1846 }
1847
1848 if ((start + length) <= rb->used_length) {
1849 uint8_t *host_endaddr = host_startaddr + length;
1850 if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1851 error_report("ram_discard_range: Unaligned end address: %p",
1852 host_endaddr);
1853 goto err;
1854 }
1855 ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1856 } else {
1857 error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1858 "/%zx/" RAM_ADDR_FMT")",
1859 block_name, start, length, rb->used_length);
1860 }
1861
1862 err:
1863 rcu_read_unlock();
1864
1865 return ret;
1866 }
1867
1868 static int ram_save_init_globals(void)
1869 {
1870 int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1871
1872 dirty_rate_high_cnt = 0;
1873 bitmap_sync_count = 0;
1874 migration_bitmap_sync_init();
1875 qemu_mutex_init(&migration_bitmap_mutex);
1876
1877 if (migrate_use_xbzrle()) {
1878 XBZRLE_cache_lock();
1879 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1880 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1881 TARGET_PAGE_SIZE,
1882 TARGET_PAGE_SIZE);
1883 if (!XBZRLE.cache) {
1884 XBZRLE_cache_unlock();
1885 error_report("Error creating cache");
1886 return -1;
1887 }
1888 XBZRLE_cache_unlock();
1889
1890 /* We prefer not to abort if there is no memory */
1891 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1892 if (!XBZRLE.encoded_buf) {
1893 error_report("Error allocating encoded_buf");
1894 return -1;
1895 }
1896
1897 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1898 if (!XBZRLE.current_buf) {
1899 error_report("Error allocating current_buf");
1900 g_free(XBZRLE.encoded_buf);
1901 XBZRLE.encoded_buf = NULL;
1902 return -1;
1903 }
1904
1905 acct_clear();
1906 }
1907
1908 /* For memory_global_dirty_log_start below. */
1909 qemu_mutex_lock_iothread();
1910
1911 qemu_mutex_lock_ramlist();
1912 rcu_read_lock();
1913 bytes_transferred = 0;
1914 reset_ram_globals();
1915
1916 ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1917 migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1918 migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1919 bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1920
1921 if (migrate_postcopy_ram()) {
1922 migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1923 bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1924 }
1925
1926 /*
1927 * Count the total number of pages used by ram blocks not including any
1928 * gaps due to alignment or unplugs.
1929 */
1930 migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1931
1932 memory_global_dirty_log_start();
1933 migration_bitmap_sync();
1934 qemu_mutex_unlock_ramlist();
1935 qemu_mutex_unlock_iothread();
1936 rcu_read_unlock();
1937
1938 return 0;
1939 }
1940
1941 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1942 * long-running RCU critical section. When RCU reclaims in the code
1943 * start to become numerous, it will be necessary to reduce the
1944 * granularity of these critical sections.
1945 */
1946
1947 static int ram_save_setup(QEMUFile *f, void *opaque)
1948 {
1949 RAMBlock *block;
1950
1951 /* In COLO state, migration has already set up the bitmap; reuse it. */
1952 if (!migration_in_colo_state()) {
1953 if (ram_save_init_globals() < 0) {
1954 return -1;
1955 }
1956 }
1957
1958 rcu_read_lock();
1959
1960 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
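/*
 * Setup-stage stream layout: a be64 carrying the total RAM size with
 * RAM_SAVE_FLAG_MEM_SIZE set, then (idstr length, idstr, used_length)
 * for every RAMBlock, and finally a RAM_SAVE_FLAG_EOS marker.
 */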
1961
1962 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1963 qemu_put_byte(f, strlen(block->idstr));
1964 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1965 qemu_put_be64(f, block->used_length);
1966 }
1967
1968 rcu_read_unlock();
1969
1970 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1971 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1972
1973 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1974
1975 return 0;
1976 }
1977
1978 static int ram_save_iterate(QEMUFile *f, void *opaque)
1979 {
1980 int ret;
1981 int i;
1982 int64_t t0;
1983 int done = 0;
1984
1985 rcu_read_lock();
1986 if (ram_list.version != last_version) {
1987 reset_ram_globals();
1988 }
1989
1990 /* Read version before ram_list.blocks */
1991 smp_rmb();
1992
1993 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1994
1995 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1996 i = 0;
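/*
 * Send dirty pages until the rate limit for this cycle is reached, no more
 * dirty pages are found, or the loop has run for more than MAX_WAIT ms.
 */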
1997 while ((ret = qemu_file_rate_limit(f)) == 0) {
1998 int pages;
1999
2000 pages = ram_find_and_save_block(f, false, &bytes_transferred);
2001 /* no more pages to send */
2002 if (pages == 0) {
2003 done = 1;
2004 break;
2005 }
2006 acct_info.iterations++;
2007
2008 /* We want to check in the first loop iteration, just in case it was
2009 the first time and we had to sync the dirty bitmap.
2010 qemu_clock_get_ns() is a bit expensive, so we only check once every
2011 few iterations.
2012 */
2013 if ((i & 63) == 0) {
2014 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2015 if (t1 > MAX_WAIT) {
2016 trace_ram_save_iterate_big_wait(t1, i);
2017 break;
2018 }
2019 }
2020 i++;
2021 }
2022 flush_compressed_data(f);
2023 rcu_read_unlock();
2024
2025 /*
2026 * Must occur before EOS (or any QEMUFile operation)
2027 * because of RDMA protocol.
2028 */
2029 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2030
2031 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2032 bytes_transferred += 8;
2033
2034 ret = qemu_file_get_error(f);
2035 if (ret < 0) {
2036 return ret;
2037 }
2038
2039 return done;
2040 }
2041
2042 /* Called with iothread lock */
2043 static int ram_save_complete(QEMUFile *f, void *opaque)
2044 {
2045 rcu_read_lock();
2046
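/*
 * In postcopy the final sync already happened in
 * ram_postcopy_send_discard_bitmap() while the source was paused, so only
 * the precopy completion path needs one more sync here.
 */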
2047 if (!migration_in_postcopy(migrate_get_current())) {
2048 migration_bitmap_sync();
2049 }
2050
2051 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2052
2053 /* try transferring iterative blocks of memory */
2054
2055 /* flush all remaining blocks regardless of rate limiting */
2056 while (true) {
2057 int pages;
2058
2059 pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2060 &bytes_transferred);
2061 /* no more blocks to send */
2062 if (pages == 0) {
2063 break;
2064 }
2065 }
2066
2067 flush_compressed_data(f);
2068 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2069
2070 rcu_read_unlock();
2071
2072 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2073
2074 return 0;
2075 }
2076
2077 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2078 uint64_t *non_postcopiable_pending,
2079 uint64_t *postcopiable_pending)
2080 {
2081 uint64_t remaining_size;
2082
2083 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2084
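/*
 * If what we think remains already fits under max_size, re-sync the dirty
 * bitmap (under the iothread lock) so the caller decides on an up-to-date
 * estimate rather than a stale one.
 */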
2085 if (!migration_in_postcopy(migrate_get_current()) &&
2086 remaining_size < max_size) {
2087 qemu_mutex_lock_iothread();
2088 rcu_read_lock();
2089 migration_bitmap_sync();
2090 rcu_read_unlock();
2091 qemu_mutex_unlock_iothread();
2092 remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2093 }
2094
2095 /* We can do postcopy, and all the data is postcopiable */
2096 *postcopiable_pending += remaining_size;
2097 }
2098
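/*
 * load_xbzrle: read and apply one XBZRLE-encoded page.
 *
 * Wire format: a one-byte ENCODING_FLAG_XBZRLE marker, a be16 length of the
 * encoded data (at most TARGET_PAGE_SIZE), and then the XBZRLE delta, which
 * is decoded against the current contents of @host.
 *
 * Returns 0 on success, -1 on a malformed page.
 */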
2099 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2100 {
2101 unsigned int xh_len;
2102 int xh_flags;
2103 uint8_t *loaded_data;
2104
2105 if (!xbzrle_decoded_buf) {
2106 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2107 }
2108 loaded_data = xbzrle_decoded_buf;
2109
2110 /* extract RLE header */
2111 xh_flags = qemu_get_byte(f);
2112 xh_len = qemu_get_be16(f);
2113
2114 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2115 error_report("Failed to load XBZRLE page - wrong compression!");
2116 return -1;
2117 }
2118
2119 if (xh_len > TARGET_PAGE_SIZE) {
2120 error_report("Failed to load XBZRLE page - len overflow!");
2121 return -1;
2122 }
2123 /* load data and decode */
2124 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2125
2126 /* decode RLE */
2127 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2128 TARGET_PAGE_SIZE) == -1) {
2129 error_report("Failed to load XBZRLE page - decode error!");
2130 return -1;
2131 }
2132
2133 return 0;
2134 }
2135
2136 /*
2137 * Read a RAMBlock ID from the stream f.
2138 *
2139 * Must be called from within an RCU critical section.
2140 * Returns a pointer from within the RCU-protected ram_list.
2141 *
2142 * f: Stream to read from
2143 * flags: Page flags (mostly to see if it's a continuation of a previous block)
2144 */
2145 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2146 int flags)
2147 {
2148 static RAMBlock *block = NULL;
2149 char id[256];
2150 uint8_t len;
2151
2152 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2153 if (!block) {
2154 error_report("Ack, bad migration stream!");
2155 return NULL;
2156 }
2157 return block;
2158 }
2159
2160 len = qemu_get_byte(f);
2161 qemu_get_buffer(f, (uint8_t *)id, len);
2162 id[len] = 0;
2163
2164 block = qemu_ram_block_by_name(id);
2165 if (!block) {
2166 error_report("Can't find block %s", id);
2167 return NULL;
2168 }
2169
2170 return block;
2171 }
2172
2173 static inline void *host_from_ram_block_offset(RAMBlock *block,
2174 ram_addr_t offset)
2175 {
2176 if (!offset_in_ramblock(block, offset)) {
2177 return NULL;
2178 }
2179
2180 return block->host + offset;
2181 }
2182
2183 /*
2184 * If a page (or a whole RDMA chunk) has been
2185 * determined to be zero, then zap it.
2186 */
2187 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2188 {
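/* Only write when something actually changes: memset()ing an already-zero
 * page would needlessly dirty it (and force allocation on the destination). */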
2189 if (ch != 0 || !is_zero_range(host, size)) {
2190 memset(host, ch, size);
2191 }
2192 }
2193
2194 static void *do_data_decompress(void *opaque)
2195 {
2196 DecompressParam *param = opaque;
2197 unsigned long pagesize;
2198 uint8_t *des;
2199 int len;
2200
2201 qemu_mutex_lock(&param->mutex);
2202 while (!param->quit) {
2203 if (param->des) {
2204 des = param->des;
2205 len = param->len;
2206 param->des = 0;
2207 qemu_mutex_unlock(&param->mutex);
2208
2209 pagesize = TARGET_PAGE_SIZE;
2210 /* uncompress() can fail in some cases, especially when the
2211 * page was dirtied while it was being compressed. That is not
2212 * a problem, because the dirty page will be retransferred and
2213 * uncompress() won't corrupt the data in other pages.
2214 */
2215 uncompress((Bytef *)des, &pagesize,
2216 (const Bytef *)param->compbuf, len);
2217
2218 qemu_mutex_lock(&decomp_done_lock);
2219 param->done = true;
2220 qemu_cond_signal(&decomp_done_cond);
2221 qemu_mutex_unlock(&decomp_done_lock);
2222
2223 qemu_mutex_lock(&param->mutex);
2224 } else {
2225 qemu_cond_wait(&param->cond, &param->mutex);
2226 }
2227 }
2228 qemu_mutex_unlock(&param->mutex);
2229
2230 return NULL;
2231 }
2232
2233 static void wait_for_decompress_done(void)
2234 {
2235 int idx, thread_count;
2236
2237 if (!migrate_use_compression()) {
2238 return;
2239 }
2240
2241 thread_count = migrate_decompress_threads();
2242 qemu_mutex_lock(&decomp_done_lock);
2243 for (idx = 0; idx < thread_count; idx++) {
2244 while (!decomp_param[idx].done) {
2245 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2246 }
2247 }
2248 qemu_mutex_unlock(&decomp_done_lock);
2249 }
2250
2251 void migrate_decompress_threads_create(void)
2252 {
2253 int i, thread_count;
2254
2255 thread_count = migrate_decompress_threads();
2256 decompress_threads = g_new0(QemuThread, thread_count);
2257 decomp_param = g_new0(DecompressParam, thread_count);
2258 qemu_mutex_init(&decomp_done_lock);
2259 qemu_cond_init(&decomp_done_cond);
2260 for (i = 0; i < thread_count; i++) {
2261 qemu_mutex_init(&decomp_param[i].mutex);
2262 qemu_cond_init(&decomp_param[i].cond);
2263 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2264 decomp_param[i].done = true;
2265 decomp_param[i].quit = false;
2266 qemu_thread_create(decompress_threads + i, "decompress",
2267 do_data_decompress, decomp_param + i,
2268 QEMU_THREAD_JOINABLE);
2269 }
2270 }
2271
2272 void migrate_decompress_threads_join(void)
2273 {
2274 int i, thread_count;
2275
2276 thread_count = migrate_decompress_threads();
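/* Two phases: first wake every worker with quit set, then join each
 * thread and free its resources. */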
2277 for (i = 0; i < thread_count; i++) {
2278 qemu_mutex_lock(&decomp_param[i].mutex);
2279 decomp_param[i].quit = true;
2280 qemu_cond_signal(&decomp_param[i].cond);
2281 qemu_mutex_unlock(&decomp_param[i].mutex);
2282 }
2283 for (i = 0; i < thread_count; i++) {
2284 qemu_thread_join(decompress_threads + i);
2285 qemu_mutex_destroy(&decomp_param[i].mutex);
2286 qemu_cond_destroy(&decomp_param[i].cond);
2287 g_free(decomp_param[i].compbuf);
2288 }
2289 g_free(decompress_threads);
2290 g_free(decomp_param);
2291 decompress_threads = NULL;
2292 decomp_param = NULL;
2293 }
2294
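/*
 * Hand one compressed page to an idle decompression thread: find a worker
 * with done == true, copy the compressed data into its compbuf, point it at
 * the destination page and signal its condition variable. If every worker is
 * busy, wait on decomp_done_cond until one finishes.
 */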
2295 static void decompress_data_with_multi_threads(QEMUFile *f,
2296 void *host, int len)
2297 {
2298 int idx, thread_count;
2299
2300 thread_count = migrate_decompress_threads();
2301 qemu_mutex_lock(&decomp_done_lock);
2302 while (true) {
2303 for (idx = 0; idx < thread_count; idx++) {
2304 if (decomp_param[idx].done) {
2305 decomp_param[idx].done = false;
2306 qemu_mutex_lock(&decomp_param[idx].mutex);
2307 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2308 decomp_param[idx].des = host;
2309 decomp_param[idx].len = len;
2310 qemu_cond_signal(&decomp_param[idx].cond);
2311 qemu_mutex_unlock(&decomp_param[idx].mutex);
2312 break;
2313 }
2314 }
2315 if (idx < thread_count) {
2316 break;
2317 } else {
2318 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2319 }
2320 }
2321 qemu_mutex_unlock(&decomp_done_lock);
2322 }
2323
2324 /*
2325 * Allocate data structures etc needed by incoming migration with postcopy-ram.
2326 * postcopy-ram's similarly named postcopy_ram_incoming_init() does the work.
2327 */
2328 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2329 {
2330 size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2331
2332 return postcopy_ram_incoming_init(mis, ram_pages);
2333 }
2334
2335 /*
2336 * Called in postcopy mode by ram_load().
2337 * rcu_read_lock is taken prior to this being called.
2338 */
2339 static int ram_load_postcopy(QEMUFile *f)
2340 {
2341 int flags = 0, ret = 0;
2342 bool place_needed = false;
2343 bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2344 MigrationIncomingState *mis = migration_incoming_get_current();
2345 /* Temporary page that is later 'placed' */
2346 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2347 void *last_host = NULL;
2348 bool all_zero = false;
2349
2350 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2351 ram_addr_t addr;
2352 void *host = NULL;
2353 void *page_buffer = NULL;
2354 void *place_source = NULL;
2355 uint8_t ch;
2356
2357 addr = qemu_get_be64(f);
2358 flags = addr & ~TARGET_PAGE_MASK;
2359 addr &= TARGET_PAGE_MASK;
2360
2361 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2362 place_needed = false;
2363 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2364 RAMBlock *block = ram_block_from_stream(f, flags);
2365
2366 host = host_from_ram_block_offset(block, addr);
2367 if (!host) {
2368 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2369 ret = -EINVAL;
2370 break;
2371 }
2372 /*
2373 * Postcopy requires that we place whole host pages atomically.
2374 * To make it atomic, the data is read into a temporary page
2375 * that's moved into place later.
2376 * The migration protocol uses (possibly smaller) target pages;
2377 * however, the source ensures it always sends all the components
2378 * of a host page in order.
2379 */
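/* For example, a 4 KiB target page size with a 64 KiB host page size means
 * 16 consecutive target pages are gathered into postcopy_host_page before
 * the host page is placed in one go. */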
2380 page_buffer = postcopy_host_page +
2381 ((uintptr_t)host & ~qemu_host_page_mask);
2382 /* If all TP are zero then we can optimise the place */
2383 if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2384 all_zero = true;
2385 } else {
2386 /* not the 1st TP within the HP */
2387 if (host != (last_host + TARGET_PAGE_SIZE)) {
2388 error_report("Non-sequential target page %p/%p",
2389 host, last_host);
2390 ret = -EINVAL;
2391 break;
2392 }
2393 }
2394
2395
2396 /*
2397 * If it's the last part of a host page then we place the host
2398 * page
2399 */
2400 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2401 ~qemu_host_page_mask) == 0;
2402 place_source = postcopy_host_page;
2403 }
2404 last_host = host;
2405
2406 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2407 case RAM_SAVE_FLAG_COMPRESS:
2408 ch = qemu_get_byte(f);
2409 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2410 if (ch) {
2411 all_zero = false;
2412 }
2413 break;
2414
2415 case RAM_SAVE_FLAG_PAGE:
2416 all_zero = false;
2417 if (!place_needed || !matching_page_sizes) {
2418 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2419 } else {
2420 /* Avoid an extra copy out of the qemu_file buffer; postcopy
2421 * is going to copy the page into place later anyway. This only
2422 * works when the read can be done in one go (matching page sizes).
2423 */
2424 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2425 TARGET_PAGE_SIZE);
2426 }
2427 break;
2428 case RAM_SAVE_FLAG_EOS:
2429 /* normal exit */
2430 break;
2431 default:
2432 error_report("Unknown combination of migration flags: %#x"
2433 " (postcopy mode)", flags);
2434 ret = -EINVAL;
2435 }
2436
2437 if (place_needed) {
2438 /* This gets called at the last target page in the host page */
2439 if (all_zero) {
2440 ret = postcopy_place_page_zero(mis,
2441 host + TARGET_PAGE_SIZE -
2442 qemu_host_page_size);
2443 } else {
2444 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2445 qemu_host_page_size,
2446 place_source);
2447 }
2448 }
2449 if (!ret) {
2450 ret = qemu_file_get_error(f);
2451 }
2452 }
2453
2454 return ret;
2455 }
2456
2457 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2458 {
2459 int flags = 0, ret = 0;
2460 static uint64_t seq_iter;
2461 int len = 0;
2462 /*
2463 * If the system is running in postcopy mode, page inserts into host
2464 * memory must be atomic.
2465 */
2466 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2467
2468 seq_iter++;
2469
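/* The only stream format produced and accepted by this code is version 4
 * (see register_savevm_live() in ram_mig_init()). */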
2470 if (version_id != 4) {
2471 ret = -EINVAL;
2472 }
2473
2474 /* This RCU critical section can be very long running.
2475 * When RCU reclaims in the code start to become numerous,
2476 * it will be necessary to reduce the granularity of this
2477 * critical section.
2478 */
2479 rcu_read_lock();
2480
2481 if (postcopy_running) {
2482 ret = ram_load_postcopy(f);
2483 }
2484
2485 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2486 ram_addr_t addr, total_ram_bytes;
2487 void *host = NULL;
2488 uint8_t ch;
2489
2490 addr = qemu_get_be64(f);
2491 flags = addr & ~TARGET_PAGE_MASK;
2492 addr &= TARGET_PAGE_MASK;
2493
2494 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2495 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2496 RAMBlock *block = ram_block_from_stream(f, flags);
2497
2498 host = host_from_ram_block_offset(block, addr);
2499 if (!host) {
2500 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2501 ret = -EINVAL;
2502 break;
2503 }
2504 }
2505
2506 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2507 case RAM_SAVE_FLAG_MEM_SIZE:
2508 /* Synchronize RAM block list */
2509 total_ram_bytes = addr;
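/* Walk the per-block records written by ram_save_setup(): a length
 * byte, the block idstr, then a be64 used_length to check (and, if
 * needed, resize) against the local RAMBlock. */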
2510 while (!ret && total_ram_bytes) {
2511 RAMBlock *block;
2512 char id[256];
2513 ram_addr_t length;
2514
2515 len = qemu_get_byte(f);
2516 qemu_get_buffer(f, (uint8_t *)id, len);
2517 id[len] = 0;
2518 length = qemu_get_be64(f);
2519
2520 block = qemu_ram_block_by_name(id);
2521 if (block) {
2522 if (length != block->used_length) {
2523 Error *local_err = NULL;
2524
2525 ret = qemu_ram_resize(block, length,
2526 &local_err);
2527 if (local_err) {
2528 error_report_err(local_err);
2529 }
2530 }
2531 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2532 block->idstr);
2533 } else {
2534 error_report("Unknown ramblock \"%s\", cannot "
2535 "accept migration", id);
2536 ret = -EINVAL;
2537 }
2538
2539 total_ram_bytes -= length;
2540 }
2541 break;
2542
2543 case RAM_SAVE_FLAG_COMPRESS:
2544 ch = qemu_get_byte(f);
2545 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2546 break;
2547
2548 case RAM_SAVE_FLAG_PAGE:
2549 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2550 break;
2551
2552 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2553 len = qemu_get_be32(f);
2554 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2555 error_report("Invalid compressed data length: %d", len);
2556 ret = -EINVAL;
2557 break;
2558 }
2559 decompress_data_with_multi_threads(f, host, len);
2560 break;
2561
2562 case RAM_SAVE_FLAG_XBZRLE:
2563 if (load_xbzrle(f, addr, host) < 0) {
2564 error_report("Failed to decompress XBZRLE page at "
2565 RAM_ADDR_FMT, addr);
2566 ret = -EINVAL;
2567 break;
2568 }
2569 break;
2570 case RAM_SAVE_FLAG_EOS:
2571 /* normal exit */
2572 break;
2573 default:
2574 if (flags & RAM_SAVE_FLAG_HOOK) {
2575 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2576 } else {
2577 error_report("Unknown combination of migration flags: %#x",
2578 flags);
2579 ret = -EINVAL;
2580 }
2581 }
2582 if (!ret) {
2583 ret = qemu_file_get_error(f);
2584 }
2585 }
2586
2587 wait_for_decompress_done();
2588 rcu_read_unlock();
2589 trace_ram_load_complete(ret, seq_iter);
2590 return ret;
2591 }
2592
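/*
 * RAM is registered as a live-savevm section: setup writes the block list,
 * iterate runs while the guest keeps executing, complete drains whatever is
 * left (both the precopy and postcopy completion paths end up here), and
 * ram_load handles the destination side.
 */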
2593 static SaveVMHandlers savevm_ram_handlers = {
2594 .save_live_setup = ram_save_setup,
2595 .save_live_iterate = ram_save_iterate,
2596 .save_live_complete_postcopy = ram_save_complete,
2597 .save_live_complete_precopy = ram_save_complete,
2598 .save_live_pending = ram_save_pending,
2599 .load_state = ram_load,
2600 .cleanup = ram_migration_cleanup,
2601 };
2602
2603 void ram_mig_init(void)
2604 {
2605 qemu_mutex_init(&XBZRLE.lock);
2606 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2607 }