migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "xbzrle.h"
39 #include "migration/migration.h"
40 #include "migration/qemu-file.h"
41 #include "migration/vmstate.h"
42 #include "postcopy-ram.h"
43 #include "exec/address-spaces.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "trace.h"
47 #include "exec/ram_addr.h"
48 #include "qemu/rcu_queue.h"
49 #include "migration/colo.h"
50
51 /***********************************************************/
52 /* ram save/restore */
53
54 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS: it
55 * worked for pages that were filled with the same char. We switched
56 * it to only search for the zero value, and renamed it to avoid
57 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
58 */
59
60 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
61 #define RAM_SAVE_FLAG_ZERO 0x02
62 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
63 #define RAM_SAVE_FLAG_PAGE 0x08
64 #define RAM_SAVE_FLAG_EOS 0x10
65 #define RAM_SAVE_FLAG_CONTINUE 0x20
66 #define RAM_SAVE_FLAG_XBZRLE 0x40
67 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
68 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
69
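/*
 * A note on the flags above (illustration, hypothetical values):
 * save_page_header() ORs them into the low bits of the page offset on the
 * wire, which works because offsets are always target-page aligned.  A
 * normal page at offset 0x3000 of a block that is being continued would
 * therefore be announced as
 *
 *     qemu_put_be64(f, 0x3000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE);
 *
 * i.e. the value 0x3028.
 */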
70 static uint8_t *ZERO_TARGET_PAGE;
71
72 static inline bool is_zero_range(uint8_t *p, uint64_t size)
73 {
74 return buffer_is_zero(p, size);
75 }
76
77 /* This struct contains the XBZRLE cache and a static page
78    used by the compression */
79 static struct {
80 /* buffer used for XBZRLE encoding */
81 uint8_t *encoded_buf;
82 /* buffer for storing page content */
83 uint8_t *current_buf;
84 /* Cache for XBZRLE, Protected by lock. */
85 PageCache *cache;
86 QemuMutex lock;
87 } XBZRLE;
88
89 /* buffer used for XBZRLE decoding */
90 static uint8_t *xbzrle_decoded_buf;
91
92 static void XBZRLE_cache_lock(void)
93 {
94 if (migrate_use_xbzrle())
95 qemu_mutex_lock(&XBZRLE.lock);
96 }
97
98 static void XBZRLE_cache_unlock(void)
99 {
100 if (migrate_use_xbzrle())
101 qemu_mutex_unlock(&XBZRLE.lock);
102 }
103
104 /**
105 * xbzrle_cache_resize: resize the xbzrle cache
106 *
107 * This function is called from qmp_migrate_set_cache_size in the main
108 * thread, possibly while a migration is in progress. A running
109 * migration may be using the cache and might finish during this call,
110 * hence changes to the cache are protected by the XBZRLE.lock mutex.
111 *
112 * Returns the new cache size (rounded down to a power of two) or negative on error.
113 *
114 * @new_size: new cache size
115 */
116 int64_t xbzrle_cache_resize(int64_t new_size)
117 {
118 PageCache *new_cache;
119 int64_t ret;
120
121 if (new_size < TARGET_PAGE_SIZE) {
122 return -1;
123 }
124
125 XBZRLE_cache_lock();
126
127 if (XBZRLE.cache != NULL) {
128 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
129 goto out_new_size;
130 }
131 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
132 TARGET_PAGE_SIZE);
133 if (!new_cache) {
134 error_report("Error creating cache");
135 ret = -1;
136 goto out;
137 }
138
139 cache_fini(XBZRLE.cache);
140 XBZRLE.cache = new_cache;
141 }
142
143 out_new_size:
144 ret = pow2floor(new_size);
145 out:
146 XBZRLE_cache_unlock();
147 return ret;
148 }
149
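/*
 * Usage sketch for xbzrle_cache_resize() (hypothetical numbers): a
 * cache-size request of 70 MiB arriving via qmp_migrate_set_cache_size()
 * rebuilds the cache with cache_init(70 MiB / TARGET_PAGE_SIZE,
 * TARGET_PAGE_SIZE) and reports pow2floor(70 MiB) = 64 MiB back to the
 * caller.
 */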
150 /*
151 * An outstanding page request, on the source, having been received
152 * and queued
153 */
154 struct RAMSrcPageRequest {
155 RAMBlock *rb;
156 hwaddr offset;
157 hwaddr len;
158
159 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
160 };
161
162 /* State of RAM for migration */
163 struct RAMState {
164 /* QEMUFile used for this migration */
165 QEMUFile *f;
166 /* Last block that we have visited searching for dirty pages */
167 RAMBlock *last_seen_block;
168 /* Last block from where we have sent data */
169 RAMBlock *last_sent_block;
170 /* Last dirty target page we have sent */
171 ram_addr_t last_page;
172 /* last ram version we have seen */
173 uint32_t last_version;
174 /* We are in the first round */
175 bool ram_bulk_stage;
176 /* How many times we have dirty too many pages */
177 int dirty_rate_high_cnt;
178 /* How many times we have synchronized the bitmap */
179 uint64_t bitmap_sync_count;
180 /* these variables are used for bitmap sync */
181 /* last time we did a full bitmap_sync */
182 int64_t time_last_bitmap_sync;
183 /* bytes transferred at start_time */
184 uint64_t bytes_xfer_prev;
185 /* number of dirty pages since start_time */
186 uint64_t num_dirty_pages_period;
187 /* xbzrle misses since the beginning of the period */
188 uint64_t xbzrle_cache_miss_prev;
189 /* number of iterations at the beginning of period */
190 uint64_t iterations_prev;
191 /* Accounting fields */
192 /* number of zero pages. It used to be pages filled by the same char. */
193 uint64_t zero_pages;
194 /* number of normal transferred pages */
195 uint64_t norm_pages;
196 /* Iterations since start */
197 uint64_t iterations;
198 /* xbzrle transmitted bytes. Notice that this is with
199 * compression, they can't be calculated from the pages */
200 uint64_t xbzrle_bytes;
201 /* xbzrle transmmited pages */
202 uint64_t xbzrle_pages;
203 /* xbzrle number of cache miss */
204 uint64_t xbzrle_cache_miss;
205 /* xbzrle miss rate */
206 double xbzrle_cache_miss_rate;
207 /* xbzrle number of overflows */
208 uint64_t xbzrle_overflows;
209 /* number of dirty bits in the bitmap */
210 uint64_t migration_dirty_pages;
211 /* total number of bytes transferred */
212 uint64_t bytes_transferred;
213 /* number of dirtied pages in the last second */
214 uint64_t dirty_pages_rate;
215 /* Count of requests incoming from destination */
216 uint64_t postcopy_requests;
217 /* protects modification of the bitmap */
218 QemuMutex bitmap_mutex;
219 /* The RAMBlock used in the last src_page_requests */
220 RAMBlock *last_req_rb;
221 /* Queue of outstanding page requests from the destination */
222 QemuMutex src_page_req_mutex;
223 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
224 };
225 typedef struct RAMState RAMState;
226
227 static RAMState ram_state;
228
229 uint64_t dup_mig_pages_transferred(void)
230 {
231 return ram_state.zero_pages;
232 }
233
234 uint64_t norm_mig_pages_transferred(void)
235 {
236 return ram_state.norm_pages;
237 }
238
239 uint64_t xbzrle_mig_bytes_transferred(void)
240 {
241 return ram_state.xbzrle_bytes;
242 }
243
244 uint64_t xbzrle_mig_pages_transferred(void)
245 {
246 return ram_state.xbzrle_pages;
247 }
248
249 uint64_t xbzrle_mig_pages_cache_miss(void)
250 {
251 return ram_state.xbzrle_cache_miss;
252 }
253
254 double xbzrle_mig_cache_miss_rate(void)
255 {
256 return ram_state.xbzrle_cache_miss_rate;
257 }
258
259 uint64_t xbzrle_mig_pages_overflow(void)
260 {
261 return ram_state.xbzrle_overflows;
262 }
263
264 uint64_t ram_bytes_transferred(void)
265 {
266 return ram_state.bytes_transferred;
267 }
268
269 uint64_t ram_bytes_remaining(void)
270 {
271 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
272 }
273
274 uint64_t ram_dirty_sync_count(void)
275 {
276 return ram_state.bitmap_sync_count;
277 }
278
279 uint64_t ram_dirty_pages_rate(void)
280 {
281 return ram_state.dirty_pages_rate;
282 }
283
284 uint64_t ram_postcopy_requests(void)
285 {
286 return ram_state.postcopy_requests;
287 }
288
289 /* used by the search for pages to send */
290 struct PageSearchStatus {
291 /* Current block being searched */
292 RAMBlock *block;
293 /* Current page to search from */
294 unsigned long page;
295 /* Set once we wrap around */
296 bool complete_round;
297 };
298 typedef struct PageSearchStatus PageSearchStatus;
299
300 struct CompressParam {
301 bool done;
302 bool quit;
303 QEMUFile *file;
304 QemuMutex mutex;
305 QemuCond cond;
306 RAMBlock *block;
307 ram_addr_t offset;
308 };
309 typedef struct CompressParam CompressParam;
310
311 struct DecompressParam {
312 bool done;
313 bool quit;
314 QemuMutex mutex;
315 QemuCond cond;
316 void *des;
317 uint8_t *compbuf;
318 int len;
319 };
320 typedef struct DecompressParam DecompressParam;
321
322 static CompressParam *comp_param;
323 static QemuThread *compress_threads;
324 /* comp_done_cond is used to wake up the migration thread when
325 * one of the compression threads has finished the compression.
326 * comp_done_lock is used together with comp_done_cond.
327 */
328 static QemuMutex comp_done_lock;
329 static QemuCond comp_done_cond;
330 /* The empty QEMUFileOps will be used by file in CompressParam */
331 static const QEMUFileOps empty_ops = { };
332
333 static DecompressParam *decomp_param;
334 static QemuThread *decompress_threads;
335 static QemuMutex decomp_done_lock;
336 static QemuCond decomp_done_cond;
337
338 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
339 ram_addr_t offset);
340
341 static void *do_data_compress(void *opaque)
342 {
343 CompressParam *param = opaque;
344 RAMBlock *block;
345 ram_addr_t offset;
346
347 qemu_mutex_lock(&param->mutex);
348 while (!param->quit) {
349 if (param->block) {
350 block = param->block;
351 offset = param->offset;
352 param->block = NULL;
353 qemu_mutex_unlock(&param->mutex);
354
355 do_compress_ram_page(param->file, block, offset);
356
357 qemu_mutex_lock(&comp_done_lock);
358 param->done = true;
359 qemu_cond_signal(&comp_done_cond);
360 qemu_mutex_unlock(&comp_done_lock);
361
362 qemu_mutex_lock(&param->mutex);
363 } else {
364 qemu_cond_wait(&param->cond, &param->mutex);
365 }
366 }
367 qemu_mutex_unlock(&param->mutex);
368
369 return NULL;
370 }
371
372 static inline void terminate_compression_threads(void)
373 {
374 int idx, thread_count;
375
376 thread_count = migrate_compress_threads();
377
378 for (idx = 0; idx < thread_count; idx++) {
379 qemu_mutex_lock(&comp_param[idx].mutex);
380 comp_param[idx].quit = true;
381 qemu_cond_signal(&comp_param[idx].cond);
382 qemu_mutex_unlock(&comp_param[idx].mutex);
383 }
384 }
385
386 void migrate_compress_threads_join(void)
387 {
388 int i, thread_count;
389
390 if (!migrate_use_compression()) {
391 return;
392 }
393 terminate_compression_threads();
394 thread_count = migrate_compress_threads();
395 for (i = 0; i < thread_count; i++) {
396 qemu_thread_join(compress_threads + i);
397 qemu_fclose(comp_param[i].file);
398 qemu_mutex_destroy(&comp_param[i].mutex);
399 qemu_cond_destroy(&comp_param[i].cond);
400 }
401 qemu_mutex_destroy(&comp_done_lock);
402 qemu_cond_destroy(&comp_done_cond);
403 g_free(compress_threads);
404 g_free(comp_param);
405 compress_threads = NULL;
406 comp_param = NULL;
407 }
408
409 void migrate_compress_threads_create(void)
410 {
411 int i, thread_count;
412
413 if (!migrate_use_compression()) {
414 return;
415 }
416 thread_count = migrate_compress_threads();
417 compress_threads = g_new0(QemuThread, thread_count);
418 comp_param = g_new0(CompressParam, thread_count);
419 qemu_cond_init(&comp_done_cond);
420 qemu_mutex_init(&comp_done_lock);
421 for (i = 0; i < thread_count; i++) {
422 /* comp_param[i].file is just used as a dummy buffer to save data,
423 * set its ops to empty.
424 */
425 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
426 comp_param[i].done = true;
427 comp_param[i].quit = false;
428 qemu_mutex_init(&comp_param[i].mutex);
429 qemu_cond_init(&comp_param[i].cond);
430 qemu_thread_create(compress_threads + i, "compress",
431 do_data_compress, comp_param + i,
432 QEMU_THREAD_JOINABLE);
433 }
434 }
435
436 /**
437 * save_page_header: write page header to wire
438 *
439 * If this is the first page sent from a new block, it also writes the block identification string
440 *
441 * Returns the number of bytes written
442 *
443 * @f: QEMUFile where to send the data
444 * @block: block that contains the page we want to send
445 * @offset: offset inside the block for the page
446 * in the lower bits, it contains flags
447 */
448 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
449 ram_addr_t offset)
450 {
451 size_t size, len;
452
453 if (block == rs->last_sent_block) {
454 offset |= RAM_SAVE_FLAG_CONTINUE;
455 }
456 qemu_put_be64(f, offset);
457 size = 8;
458
459 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
460 len = strlen(block->idstr);
461 qemu_put_byte(f, len);
462 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
463 size += 1 + len;
464 rs->last_sent_block = block;
465 }
466 return size;
467 }
468
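/*
 * Wire layout produced by save_page_header() (illustration, assuming a
 * hypothetical block named "pc.ram"):
 *
 *     be64   offset | flags            always present
 *     u8     strlen("pc.ram") = 6      only without RAM_SAVE_FLAG_CONTINUE
 *     bytes  "pc.ram"                  only without RAM_SAVE_FLAG_CONTINUE
 *
 * so the size returned is either 8 or 8 + 1 + strlen(idstr).
 */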
469 /**
470 * mig_throttle_guest_down: throttle down the guest
471 *
472 * Reduce amount of guest cpu execution to hopefully slow down memory
473 * writes. If guest dirty memory rate is reduced below the rate at
474 * which we can transfer pages to the destination then we should be
475 * able to complete migration. Some workloads dirty memory way too
476 * fast and will not effectively converge, even with auto-converge.
477 */
478 static void mig_throttle_guest_down(void)
479 {
480 MigrationState *s = migrate_get_current();
481 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
482 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
483
484 /* We have not started throttling yet. Let's start it. */
485 if (!cpu_throttle_active()) {
486 cpu_throttle_set(pct_initial);
487 } else {
488 /* Throttling already on, just increase the rate */
489 cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
490 }
491 }
492
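/*
 * Illustration for mig_throttle_guest_down(), assuming the default
 * parameters of 20% initial throttle and 10% increment: successive calls
 * throttle the guest at 20%, 30%, 40%, ... of its CPU time until the
 * migration manages to converge.
 */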
493 /**
494 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
495 *
496 * @rs: current RAM state
497 * @current_addr: address for the zero page
498 *
499 * Update the xbzrle cache to reflect a page that's been sent as all 0.
500 * The important thing is that a stale (not-yet-0'd) page be replaced
501 * by the new data.
502 * As a bonus, if the page wasn't in the cache it gets added so that
503 * when a small write is made into the 0'd page it gets XBZRLE sent.
504 */
505 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
506 {
507 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
508 return;
509 }
510
511 /* We don't care if this fails to allocate a new cache page
512 * as long as it updated an old one */
513 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
514 rs->bitmap_sync_count);
515 }
516
517 #define ENCODING_FLAG_XBZRLE 0x1
518
519 /**
520 * save_xbzrle_page: compress and send current page
521 *
522 * Returns: 1 means that we wrote the page
523 * 0 means that the page is identical to the one already sent
524 * -1 means that the page was not sent by XBZRLE (cache miss,
525 * insertion failure, or the encoding would be longer than a raw page)
525 *
526 * @rs: current RAM state
527 * @current_data: pointer to the address of the page contents
528 * @current_addr: addr of the page
529 * @block: block that contains the page we want to send
530 * @offset: offset inside the block for the page
531 * @last_stage: if we are at the completion stage
532 */
533 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
534 ram_addr_t current_addr, RAMBlock *block,
535 ram_addr_t offset, bool last_stage)
536 {
537 int encoded_len = 0, bytes_xbzrle;
538 uint8_t *prev_cached_page;
539
540 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
541 rs->xbzrle_cache_miss++;
542 if (!last_stage) {
543 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
544 rs->bitmap_sync_count) == -1) {
545 return -1;
546 } else {
547 /* update *current_data when the page has been
548 inserted into cache */
549 *current_data = get_cached_data(XBZRLE.cache, current_addr);
550 }
551 }
552 return -1;
553 }
554
555 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
556
557 /* save current buffer into memory */
558 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
559
560 /* XBZRLE encoding (if there is no overflow) */
561 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
562 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
563 TARGET_PAGE_SIZE);
564 if (encoded_len == 0) {
565 trace_save_xbzrle_page_skipping();
566 return 0;
567 } else if (encoded_len == -1) {
568 trace_save_xbzrle_page_overflow();
569 rs->xbzrle_overflows++;
570 /* update data in the cache */
571 if (!last_stage) {
572 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
573 *current_data = prev_cached_page;
574 }
575 return -1;
576 }
577
578 /* Update the cache so it stays in sync with what the destination now holds */
579 if (!last_stage) {
580 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
581 }
582
583 /* Send XBZRLE based compressed page */
584 bytes_xbzrle = save_page_header(rs, rs->f, block,
585 offset | RAM_SAVE_FLAG_XBZRLE);
586 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
587 qemu_put_be16(rs->f, encoded_len);
588 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
589 bytes_xbzrle += encoded_len + 1 + 2;
590 rs->xbzrle_pages++;
591 rs->xbzrle_bytes += bytes_xbzrle;
592 rs->bytes_transferred += bytes_xbzrle;
593
594 return 1;
595 }
596
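/*
 * On a cache hit that produces a usable delta, save_xbzrle_page() puts
 * the following on the wire (illustration):
 *
 *     save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *     u8     ENCODING_FLAG_XBZRLE
 *     be16   encoded_len
 *     bytes  encoded_buf[encoded_len]
 *
 * and accounts for header + 1 + 2 + encoded_len bytes in bytes_xbzrle.
 */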
597 /**
598 * migration_bitmap_find_dirty: find the next dirty page from start
599 *
600 * Called with rcu_read_lock() to protect migration_bitmap
601 *
602 * Returns the page index within the RAMBlock of the next dirty page
603 *
604 * @rs: current RAM state
605 * @rb: RAMBlock where to search for dirty pages
606 * @start: page where we start the search
607 */
608 static inline
609 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
610 unsigned long start)
611 {
612 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
613 unsigned long *bitmap = rb->bmap;
614 unsigned long next;
615
616 if (rs->ram_bulk_stage && start > 0) {
617 next = start + 1;
618 } else {
619 next = find_next_bit(bitmap, size, start);
620 }
621
622 return next;
623 }
624
625 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
626 RAMBlock *rb,
627 unsigned long page)
628 {
629 bool ret;
630
631 ret = test_and_clear_bit(page, rb->bmap);
632
633 if (ret) {
634 rs->migration_dirty_pages--;
635 }
636 return ret;
637 }
638
639 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
640 ram_addr_t start, ram_addr_t length)
641 {
642 rs->migration_dirty_pages +=
643 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
644 &rs->num_dirty_pages_period);
645 }
646
647 /**
648 * ram_pagesize_summary: calculate all the pagesizes of a VM
649 *
650 * Returns a summary bitmap of the page sizes of all RAMBlocks
651 *
652 * For VMs with just normal pages this is equivalent to the host page
653 * size. If it's got some huge pages then it's the OR of all the
654 * different page sizes.
655 */
656 uint64_t ram_pagesize_summary(void)
657 {
658 RAMBlock *block;
659 uint64_t summary = 0;
660
661 RAMBLOCK_FOREACH(block) {
662 summary |= block->page_size;
663 }
664
665 return summary;
666 }
667
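/*
 * Example (hypothetical layout): a guest with ordinary 4 KiB RAM plus one
 * RAMBlock backed by 2 MiB hugepages makes ram_pagesize_summary() return
 * 0x1000 | 0x200000 = 0x201000, from which a mix of page sizes can be
 * detected.
 */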
668 static void migration_bitmap_sync(RAMState *rs)
669 {
670 RAMBlock *block;
671 int64_t end_time;
672 uint64_t bytes_xfer_now;
673
674 rs->bitmap_sync_count++;
675
676 if (!rs->bytes_xfer_prev) {
677 rs->bytes_xfer_prev = ram_bytes_transferred();
678 }
679
680 if (!rs->time_last_bitmap_sync) {
681 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
682 }
683
684 trace_migration_bitmap_sync_start();
685 memory_global_dirty_log_sync();
686
687 qemu_mutex_lock(&rs->bitmap_mutex);
688 rcu_read_lock();
689 RAMBLOCK_FOREACH(block) {
690 migration_bitmap_sync_range(rs, block, 0, block->used_length);
691 }
692 rcu_read_unlock();
693 qemu_mutex_unlock(&rs->bitmap_mutex);
694
695 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
696
697 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
698
699 /* more than 1 second = 1000 milliseconds */
700 if (end_time > rs->time_last_bitmap_sync + 1000) {
701 if (migrate_auto_converge()) {
702 /* The following detection logic can be refined later. For now:
703 check whether the bytes dirtied in this period exceed 50% of the
704 bytes that got transferred since the last time we were in this
705 routine. If that keeps happening, start or increase throttling. */
707 bytes_xfer_now = ram_bytes_transferred();
708
709 if (rs->dirty_pages_rate &&
710 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
711 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
712 (rs->dirty_rate_high_cnt++ >= 2)) {
713 trace_migration_throttle();
714 rs->dirty_rate_high_cnt = 0;
715 mig_throttle_guest_down();
716 }
717 rs->bytes_xfer_prev = bytes_xfer_now;
718 }
719
720 if (migrate_use_xbzrle()) {
721 if (rs->iterations_prev != rs->iterations) {
722 rs->xbzrle_cache_miss_rate =
723 (double)(rs->xbzrle_cache_miss -
724 rs->xbzrle_cache_miss_prev) /
725 (rs->iterations - rs->iterations_prev);
726 }
727 rs->iterations_prev = rs->iterations;
728 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
729 }
730 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
731 / (end_time - rs->time_last_bitmap_sync);
732 rs->time_last_bitmap_sync = end_time;
733 rs->num_dirty_pages_period = 0;
734 }
735 if (migrate_use_events()) {
736 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
737 }
738 }
739
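/*
 * Auto-converge illustration for the check in migration_bitmap_sync()
 * above (hypothetical numbers): if roughly 300 MB were transferred during
 * the last one-second period but the guest dirtied 200 MB of pages in the
 * same period, then 200 MB > 300 MB / 2, so the high-dirty-rate counter is
 * bumped; once this has happened repeatedly, mig_throttle_guest_down()
 * starts or increases CPU throttling.
 */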
740 /**
741 * save_zero_page: send the zero page to the stream
742 *
743 * Returns the number of pages written (-1 if the page was not a zero page)
744 *
745 * @rs: current RAM state
746 * @block: block that contains the page we want to send
747 * @offset: offset inside the block for the page
748 * @p: pointer to the page
749 */
750 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
751 uint8_t *p)
752 {
753 int pages = -1;
754
755 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
756 rs->zero_pages++;
757 rs->bytes_transferred +=
758 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
759 qemu_put_byte(rs->f, 0);
760 rs->bytes_transferred += 1;
761 pages = 1;
762 }
763
764 return pages;
765 }
766
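/*
 * With save_zero_page(), an all-zero page costs only the page header plus
 * one byte on the wire (RAM_SAVE_FLAG_ZERO followed by a single 0 byte)
 * instead of TARGET_PAGE_SIZE bytes of payload.
 */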
767 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
768 {
769 if (!migrate_release_ram() || !migration_in_postcopy()) {
770 return;
771 }
772
773 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
774 }
775
776 /**
777 * ram_save_page: send the given page to the stream
778 *
779 * Returns the number of pages written.
780 * < 0 - error
781 * >=0 - Number of pages written - this might legally be 0
782 * if xbzrle noticed the page was the same.
783 *
784 * @rs: current RAM state
785 * @block: block that contains the page we want to send
786 * @offset: offset inside the block for the page
787 * @last_stage: if we are at the completion stage
788 */
789 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
790 {
791 int pages = -1;
792 uint64_t bytes_xmit;
793 ram_addr_t current_addr;
794 uint8_t *p;
795 int ret;
796 bool send_async = true;
797 RAMBlock *block = pss->block;
798 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
799
800 p = block->host + offset;
801 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
802
803 /* When in doubt, send the page as a normal page */
804 bytes_xmit = 0;
805 ret = ram_control_save_page(rs->f, block->offset,
806 offset, TARGET_PAGE_SIZE, &bytes_xmit);
807 if (bytes_xmit) {
808 rs->bytes_transferred += bytes_xmit;
809 pages = 1;
810 }
811
812 XBZRLE_cache_lock();
813
814 current_addr = block->offset + offset;
815
816 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
817 if (ret != RAM_SAVE_CONTROL_DELAYED) {
818 if (bytes_xmit > 0) {
819 rs->norm_pages++;
820 } else if (bytes_xmit == 0) {
821 rs->zero_pages++;
822 }
823 }
824 } else {
825 pages = save_zero_page(rs, block, offset, p);
826 if (pages > 0) {
827 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
828 * page would be stale
829 */
830 xbzrle_cache_zero_page(rs, current_addr);
831 ram_release_pages(block->idstr, offset, pages);
832 } else if (!rs->ram_bulk_stage &&
833 !migration_in_postcopy() && migrate_use_xbzrle()) {
834 pages = save_xbzrle_page(rs, &p, current_addr, block,
835 offset, last_stage);
836 if (!last_stage) {
837 /* Can't send this cached data async, since the cache page
838 * might get updated before it gets to the wire
839 */
840 send_async = false;
841 }
842 }
843 }
844
845 /* XBZRLE overflow or normal page */
846 if (pages == -1) {
847 rs->bytes_transferred += save_page_header(rs, rs->f, block,
848 offset | RAM_SAVE_FLAG_PAGE);
849 if (send_async) {
850 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
851 migrate_release_ram() &
852 migration_in_postcopy());
853 } else {
854 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
855 }
856 rs->bytes_transferred += TARGET_PAGE_SIZE;
857 pages = 1;
858 rs->norm_pages++;
859 }
860
861 XBZRLE_cache_unlock();
862
863 return pages;
864 }
865
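/*
 * Order of attempts in ram_save_page(), in short:
 *   1. ram_control_save_page()  - a hook (e.g. RDMA) may take the page
 *   2. save_zero_page()         - header + one byte for all-zero pages
 *   3. save_xbzrle_page()       - delta against the cached copy (only
 *                                 outside the bulk stage and postcopy)
 *   4. full copy                - RAM_SAVE_FLAG_PAGE + TARGET_PAGE_SIZE bytes
 */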
866 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
867 ram_addr_t offset)
868 {
869 RAMState *rs = &ram_state;
870 int bytes_sent, blen;
871 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
872
873 bytes_sent = save_page_header(rs, f, block, offset |
874 RAM_SAVE_FLAG_COMPRESS_PAGE);
875 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
876 migrate_compress_level());
877 if (blen < 0) {
878 bytes_sent = 0;
879 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
880 error_report("compressed data failed!");
881 } else {
882 bytes_sent += blen;
883 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
884 }
885
886 return bytes_sent;
887 }
888
889 static void flush_compressed_data(RAMState *rs)
890 {
891 int idx, len, thread_count;
892
893 if (!migrate_use_compression()) {
894 return;
895 }
896 thread_count = migrate_compress_threads();
897
898 qemu_mutex_lock(&comp_done_lock);
899 for (idx = 0; idx < thread_count; idx++) {
900 while (!comp_param[idx].done) {
901 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
902 }
903 }
904 qemu_mutex_unlock(&comp_done_lock);
905
906 for (idx = 0; idx < thread_count; idx++) {
907 qemu_mutex_lock(&comp_param[idx].mutex);
908 if (!comp_param[idx].quit) {
909 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
910 rs->bytes_transferred += len;
911 }
912 qemu_mutex_unlock(&comp_param[idx].mutex);
913 }
914 }
915
916 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
917 ram_addr_t offset)
918 {
919 param->block = block;
920 param->offset = offset;
921 }
922
923 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
924 ram_addr_t offset)
925 {
926 int idx, thread_count, bytes_xmit = -1, pages = -1;
927
928 thread_count = migrate_compress_threads();
929 qemu_mutex_lock(&comp_done_lock);
930 while (true) {
931 for (idx = 0; idx < thread_count; idx++) {
932 if (comp_param[idx].done) {
933 comp_param[idx].done = false;
934 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
935 qemu_mutex_lock(&comp_param[idx].mutex);
936 set_compress_params(&comp_param[idx], block, offset);
937 qemu_cond_signal(&comp_param[idx].cond);
938 qemu_mutex_unlock(&comp_param[idx].mutex);
939 pages = 1;
940 rs->norm_pages++;
941 rs->bytes_transferred += bytes_xmit;
942 break;
943 }
944 }
945 if (pages > 0) {
946 break;
947 } else {
948 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
949 }
950 }
951 qemu_mutex_unlock(&comp_done_lock);
952
953 return pages;
954 }
955
956 /**
957 * ram_save_compressed_page: compress the given page and send it to the stream
958 *
959 * Returns the number of pages written.
960 *
961 * @rs: current RAM state
962 * @block: block that contains the page we want to send
963 * @offset: offset inside the block for the page
964 * @last_stage: if we are at the completion stage
965 */
966 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
967 bool last_stage)
968 {
969 int pages = -1;
970 uint64_t bytes_xmit = 0;
971 uint8_t *p;
972 int ret, blen;
973 RAMBlock *block = pss->block;
974 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
975
976 p = block->host + offset;
977
978 ret = ram_control_save_page(rs->f, block->offset,
979 offset, TARGET_PAGE_SIZE, &bytes_xmit);
980 if (bytes_xmit) {
981 rs->bytes_transferred += bytes_xmit;
982 pages = 1;
983 }
984 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
985 if (ret != RAM_SAVE_CONTROL_DELAYED) {
986 if (bytes_xmit > 0) {
987 rs->norm_pages++;
988 } else if (bytes_xmit == 0) {
989 rs->zero_pages++;
990 }
991 }
992 } else {
993 /* When starting to process a new block, the first page of
994 * the block should be sent out before other pages in the same
995 * block, and all the pages in the previous block should have been
996 * sent out. Keeping this order is important, because the 'cont'
997 * flag is used to avoid resending the block name.
998 */
999 if (block != rs->last_sent_block) {
1000 flush_compressed_data(rs);
1001 pages = save_zero_page(rs, block, offset, p);
1002 if (pages == -1) {
1003 /* Make sure the first page is sent out before other pages */
1004 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1005 RAM_SAVE_FLAG_COMPRESS_PAGE);
1006 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1007 migrate_compress_level());
1008 if (blen > 0) {
1009 rs->bytes_transferred += bytes_xmit + blen;
1010 rs->norm_pages++;
1011 pages = 1;
1012 } else {
1013 qemu_file_set_error(rs->f, blen);
1014 error_report("compressed data failed!");
1015 }
1016 }
1017 if (pages > 0) {
1018 ram_release_pages(block->idstr, offset, pages);
1019 }
1020 } else {
1021 pages = save_zero_page(rs, block, offset, p);
1022 if (pages == -1) {
1023 pages = compress_page_with_multi_thread(rs, block, offset);
1024 } else {
1025 ram_release_pages(block->idstr, offset, pages);
1026 }
1027 }
1028 }
1029
1030 return pages;
1031 }
1032
1033 /**
1034 * find_dirty_block: find the next dirty page and update any state
1035 * associated with the search process.
1036 *
1037 * Returns whether a page was found
1038 *
1039 * @rs: current RAM state
1040 * @pss: data about the state of the current dirty page scan
1041 * @again: set to false if the search has scanned the whole of RAM
1042 */
1043 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1044 {
1045 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1046 if (pss->complete_round && pss->block == rs->last_seen_block &&
1047 pss->page >= rs->last_page) {
1048 /*
1049 * We've been once around the RAM and haven't found anything.
1050 * Give up.
1051 */
1052 *again = false;
1053 return false;
1054 }
1055 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1056 /* Didn't find anything in this RAM Block */
1057 pss->page = 0;
1058 pss->block = QLIST_NEXT_RCU(pss->block, next);
1059 if (!pss->block) {
1060 /* Hit the end of the list */
1061 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1062 /* Flag that we've looped */
1063 pss->complete_round = true;
1064 rs->ram_bulk_stage = false;
1065 if (migrate_use_xbzrle()) {
1066 /* If xbzrle is on, stop using the data compression at this
1067 * point. In theory, xbzrle can do better than compression.
1068 */
1069 flush_compressed_data(rs);
1070 }
1071 }
1072 /* Didn't find anything this time, but try again on the new block */
1073 *again = true;
1074 return false;
1075 } else {
1076 /* Can go around again, but... */
1077 *again = true;
1078 /* We've found something so probably don't need to */
1079 return true;
1080 }
1081 }
1082
1083 /**
1084 * unqueue_page: gets a page off the queue
1085 *
1086 * Helper for 'get_queued_page' - gets a page off the queue
1087 *
1088 * Returns the block of the page (or NULL if none available)
1089 *
1090 * @rs: current RAM state
1091 * @offset: used to return the offset within the RAMBlock
1092 */
1093 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1094 {
1095 RAMBlock *block = NULL;
1096
1097 qemu_mutex_lock(&rs->src_page_req_mutex);
1098 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1099 struct RAMSrcPageRequest *entry =
1100 QSIMPLEQ_FIRST(&rs->src_page_requests);
1101 block = entry->rb;
1102 *offset = entry->offset;
1103
1104 if (entry->len > TARGET_PAGE_SIZE) {
1105 entry->len -= TARGET_PAGE_SIZE;
1106 entry->offset += TARGET_PAGE_SIZE;
1107 } else {
1108 memory_region_unref(block->mr);
1109 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1110 g_free(entry);
1111 }
1112 }
1113 qemu_mutex_unlock(&rs->src_page_req_mutex);
1114
1115 return block;
1116 }
1117
1118 /**
1119 * get_queued_page: unqueue a page from the postcopy requests
1120 *
1121 * Skips pages that are already sent (!dirty)
1122 *
1123 * Returns whether a queued page was found
1124 *
1125 * @rs: current RAM state
1126 * @pss: data about the state of the current dirty page scan
1127 */
1128 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1129 {
1130 RAMBlock *block;
1131 ram_addr_t offset;
1132 bool dirty;
1133
1134 do {
1135 block = unqueue_page(rs, &offset);
1136 /*
1137 * We're sending this page, and since it's postcopy nothing else
1138 * will dirty it, and we must make sure it doesn't get sent again
1139 * even if this queue request was received after the background
1140 * search already sent it.
1141 */
1142 if (block) {
1143 unsigned long page;
1144
1145 page = offset >> TARGET_PAGE_BITS;
1146 dirty = test_bit(page, block->bmap);
1147 if (!dirty) {
1148 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1149 page, test_bit(page, block->unsentmap));
1150 } else {
1151 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1152 }
1153 }
1154
1155 } while (block && !dirty);
1156
1157 if (block) {
1158 /*
1159 * As soon as we start servicing pages out of order, then we have
1160 * to kill the bulk stage, since the bulk stage assumes
1161 * in (migration_bitmap_find_and_reset_dirty) that every page is
1162 * dirty, that's no longer true.
1163 */
1164 rs->ram_bulk_stage = false;
1165
1166 /*
1167 * We want the background search to continue from the queued page
1168 * since the guest is likely to want other pages near to the page
1169 * it just requested.
1170 */
1171 pss->block = block;
1172 pss->page = offset >> TARGET_PAGE_BITS;
1173 }
1174
1175 return !!block;
1176 }
1177
1178 /**
1179 * migration_page_queue_free: drop any remaining pages in the ram
1180 * request queue
1181 *
1182 * It should be empty at the end anyway, but in error cases there may
1183 * be some pages left over; if so, we drop them.
1184 *
1185 */
1186 void migration_page_queue_free(void)
1187 {
1188 struct RAMSrcPageRequest *mspr, *next_mspr;
1189 RAMState *rs = &ram_state;
1190 /* This queue generally should be empty - but in the case of a failed
1191 * migration might have some droppings in.
1192 */
1193 rcu_read_lock();
1194 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1195 memory_region_unref(mspr->rb->mr);
1196 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1197 g_free(mspr);
1198 }
1199 rcu_read_unlock();
1200 }
1201
1202 /**
1203 * ram_save_queue_pages: queue the page for transmission
1204 *
1205 * A request from postcopy destination for example.
1206 *
1207 * Returns zero on success or negative on error
1208 *
1209 * @rbname: Name of the RAMBlock of the request. NULL means the
1210 * same as the last one.
1211 * @start: starting address from the start of the RAMBlock
1212 * @len: length (in bytes) to send
1213 */
1214 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1215 {
1216 RAMBlock *ramblock;
1217 RAMState *rs = &ram_state;
1218
1219 rs->postcopy_requests++;
1220 rcu_read_lock();
1221 if (!rbname) {
1222 /* Reuse last RAMBlock */
1223 ramblock = rs->last_req_rb;
1224
1225 if (!ramblock) {
1226 /*
1227 * Shouldn't happen, we can't reuse the last RAMBlock if
1228 * it's the 1st request.
1229 */
1230 error_report("ram_save_queue_pages no previous block");
1231 goto err;
1232 }
1233 } else {
1234 ramblock = qemu_ram_block_by_name(rbname);
1235
1236 if (!ramblock) {
1237 /* We shouldn't be asked for a non-existent RAMBlock */
1238 error_report("ram_save_queue_pages no block '%s'", rbname);
1239 goto err;
1240 }
1241 rs->last_req_rb = ramblock;
1242 }
1243 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1244 if (start+len > ramblock->used_length) {
1245 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1246 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1247 __func__, start, len, ramblock->used_length);
1248 goto err;
1249 }
1250
1251 struct RAMSrcPageRequest *new_entry =
1252 g_malloc0(sizeof(struct RAMSrcPageRequest));
1253 new_entry->rb = ramblock;
1254 new_entry->offset = start;
1255 new_entry->len = len;
1256
1257 memory_region_ref(ramblock->mr);
1258 qemu_mutex_lock(&rs->src_page_req_mutex);
1259 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1260 qemu_mutex_unlock(&rs->src_page_req_mutex);
1261 rcu_read_unlock();
1262
1263 return 0;
1264
1265 err:
1266 rcu_read_unlock();
1267 return -1;
1268 }
1269
1270 /**
1271 * ram_save_target_page: save one target page
1272 *
1273 * Returns the number of pages written
1274 *
1275 * @rs: current RAM state
1277 * @pss: data about the page we want to send
1278 * @last_stage: if we are at the completion stage
1279 */
1280 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1281 bool last_stage)
1282 {
1283 int res = 0;
1284
1285 /* Check whether the page is dirty and, if so, send it */
1286 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1287 /*
1288 * If xbzrle is on, stop using the data compression after the first
1289 * round of migration even if compression is enabled. In theory,
1290 * xbzrle can do better than compression.
1291 */
1292 if (migrate_use_compression() &&
1293 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1294 res = ram_save_compressed_page(rs, pss, last_stage);
1295 } else {
1296 res = ram_save_page(rs, pss, last_stage);
1297 }
1298
1299 if (res < 0) {
1300 return res;
1301 }
1302 if (pss->block->unsentmap) {
1303 clear_bit(pss->page, pss->block->unsentmap);
1304 }
1305 }
1306
1307 return res;
1308 }
1309
1310 /**
1311 * ram_save_host_page: save a whole host page
1312 *
1313 * Starting at pss->page, send pages up to the end of the current host
1314 * page. It's valid for the initial offset to point into the middle of
1315 * a host page in which case the remainder of the hostpage is sent.
1316 * Only dirty target pages are sent. Note that the host page size may
1317 * be a huge page for this block.
1318 * The saving stops at the boundary of the used_length of the block
1319 * if the RAMBlock isn't a multiple of the host page size.
1320 *
1321 * Returns the number of pages written or negative on error
1322 *
1323 * @rs: current RAM state
1325 * @pss: data about the page we want to send
1326 * @last_stage: if we are at the completion stage
1327 */
1328 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1329 bool last_stage)
1330 {
1331 int tmppages, pages = 0;
1332 size_t pagesize_bits =
1333 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1334
1335 do {
1336 tmppages = ram_save_target_page(rs, pss, last_stage);
1337 if (tmppages < 0) {
1338 return tmppages;
1339 }
1340
1341 pages += tmppages;
1342 pss->page++;
1343 } while ((pss->page & (pagesize_bits - 1)) &&
1344 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1345
1346 /* The offset we leave with is the last one we looked at */
1347 pss->page--;
1348 return pages;
1349 }
1350
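/*
 * Illustration for ram_save_host_page(): with a RAMBlock backed by 2 MiB
 * hugepages and 4 KiB target pages, pagesize_bits is 512, so a single call
 * walks up to 512 consecutive target pages. Keeping whole host pages
 * together is what lets postcopy place a huge page atomically on the
 * destination.
 */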
1351 /**
1352 * ram_find_and_save_block: finds a dirty page and sends it to f
1353 *
1354 * Called within an RCU critical section.
1355 *
1356 * Returns the number of pages written where zero means no dirty pages
1357 *
1358 * @rs: current RAM state
1359 * @last_stage: if we are at the completion stage
1360 *
1361 * On systems where host-page-size > target-page-size it will send all the
1362 * pages in a host page that are dirty.
1363 */
1364
1365 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1366 {
1367 PageSearchStatus pss;
1368 int pages = 0;
1369 bool again, found;
1370
1371 /* No dirty page as there is zero RAM */
1372 if (!ram_bytes_total()) {
1373 return pages;
1374 }
1375
1376 pss.block = rs->last_seen_block;
1377 pss.page = rs->last_page;
1378 pss.complete_round = false;
1379
1380 if (!pss.block) {
1381 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1382 }
1383
1384 do {
1385 again = true;
1386 found = get_queued_page(rs, &pss);
1387
1388 if (!found) {
1389 /* priority queue empty, so just search for something dirty */
1390 found = find_dirty_block(rs, &pss, &again);
1391 }
1392
1393 if (found) {
1394 pages = ram_save_host_page(rs, &pss, last_stage);
1395 }
1396 } while (!pages && again);
1397
1398 rs->last_seen_block = pss.block;
1399 rs->last_page = pss.page;
1400
1401 return pages;
1402 }
1403
1404 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1405 {
1406 uint64_t pages = size / TARGET_PAGE_SIZE;
1407 RAMState *rs = &ram_state;
1408
1409 if (zero) {
1410 rs->zero_pages += pages;
1411 } else {
1412 rs->norm_pages += pages;
1413 rs->bytes_transferred += size;
1414 qemu_update_position(f, size);
1415 }
1416 }
1417
1418 uint64_t ram_bytes_total(void)
1419 {
1420 RAMBlock *block;
1421 uint64_t total = 0;
1422
1423 rcu_read_lock();
1424 RAMBLOCK_FOREACH(block) {
1425 total += block->used_length;
1426 }
1427 rcu_read_unlock();
1428 return total;
1429 }
1430
1431 void free_xbzrle_decoded_buf(void)
1432 {
1433 g_free(xbzrle_decoded_buf);
1434 xbzrle_decoded_buf = NULL;
1435 }
1436
1437 static void ram_migration_cleanup(void *opaque)
1438 {
1439 RAMBlock *block;
1440
1441 /* The caller must hold the iothread lock or be in a bottom half, so
1442 * there is no write race against this migration bitmap
1443 */
1444 memory_global_dirty_log_stop();
1445
1446 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1447 g_free(block->bmap);
1448 block->bmap = NULL;
1449 g_free(block->unsentmap);
1450 block->unsentmap = NULL;
1451 }
1452
1453 XBZRLE_cache_lock();
1454 if (XBZRLE.cache) {
1455 cache_fini(XBZRLE.cache);
1456 g_free(XBZRLE.encoded_buf);
1457 g_free(XBZRLE.current_buf);
1458 g_free(ZERO_TARGET_PAGE);
1459 XBZRLE.cache = NULL;
1460 XBZRLE.encoded_buf = NULL;
1461 XBZRLE.current_buf = NULL;
1462 }
1463 XBZRLE_cache_unlock();
1464 }
1465
1466 static void ram_state_reset(RAMState *rs)
1467 {
1468 rs->last_seen_block = NULL;
1469 rs->last_sent_block = NULL;
1470 rs->last_page = 0;
1471 rs->last_version = ram_list.version;
1472 rs->ram_bulk_stage = true;
1473 }
1474
1475 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1476
1477 /*
1478 * 'expected' is the value you expect the bitmap mostly to be full
1479 * of; it won't bother printing lines that are all this value.
1481 */
1482 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1483 unsigned long pages)
1484 {
1485 int64_t cur;
1486 int64_t linelen = 128;
1487 char linebuf[129];
1488
1489 for (cur = 0; cur < pages; cur += linelen) {
1490 int64_t curb;
1491 bool found = false;
1492 /*
1493 * Last line; catch the case where the line length
1494 * is longer than remaining ram
1495 */
1496 if (cur + linelen > pages) {
1497 linelen = pages - cur;
1498 }
1499 for (curb = 0; curb < linelen; curb++) {
1500 bool thisbit = test_bit(cur + curb, todump);
1501 linebuf[curb] = thisbit ? '1' : '.';
1502 found = found || (thisbit != expected);
1503 }
1504 if (found) {
1505 linebuf[curb] = '\0';
1506 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1507 }
1508 }
1509 }
1510
1511 /* **** functions for postcopy ***** */
1512
1513 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1514 {
1515 struct RAMBlock *block;
1516
1517 RAMBLOCK_FOREACH(block) {
1518 unsigned long *bitmap = block->bmap;
1519 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1520 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1521
1522 while (run_start < range) {
1523 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1524 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1525 (run_end - run_start) << TARGET_PAGE_BITS);
1526 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1527 }
1528 }
1529 }
1530
1531 /**
1532 * postcopy_send_discard_bm_ram: discard a RAMBlock
1533 *
1534 * Returns zero on success
1535 *
1536 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1537 * Note: At this point the 'unsentmap' is the processed bitmap combined
1538 * with the dirtymap; so a '1' means it's either dirty or unsent.
1539 *
1540 * @ms: current migration state
1541 * @pds: state for postcopy
1542 * @block: RAMBlock to discard
1544 */
1545 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1546 PostcopyDiscardState *pds,
1547 RAMBlock *block)
1548 {
1549 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1550 unsigned long current;
1551 unsigned long *unsentmap = block->unsentmap;
1552
1553 for (current = 0; current < end; ) {
1554 unsigned long one = find_next_bit(unsentmap, end, current);
1555
1556 if (one <= end) {
1557 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1558 unsigned long discard_length;
1559
1560 if (zero >= end) {
1561 discard_length = end - one;
1562 } else {
1563 discard_length = zero - one;
1564 }
1565 if (discard_length) {
1566 postcopy_discard_send_range(ms, pds, one, discard_length);
1567 }
1568 current = one + discard_length;
1569 } else {
1570 current = one;
1571 }
1572 }
1573
1574 return 0;
1575 }
1576
1577 /**
1578 * postcopy_each_ram_send_discard: discard all RAMBlocks
1579 *
1580 * Returns 0 for success or negative for error
1581 *
1582 * Utility for the outgoing postcopy code.
1583 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1584 * passing it bitmap indexes and name.
1585 * (qemu_ram_foreach_block ends up passing unscaled lengths
1586 * which would mean postcopy code would have to deal with target page)
1587 *
1588 * @ms: current migration state
1589 */
1590 static int postcopy_each_ram_send_discard(MigrationState *ms)
1591 {
1592 struct RAMBlock *block;
1593 int ret;
1594
1595 RAMBLOCK_FOREACH(block) {
1596 PostcopyDiscardState *pds =
1597 postcopy_discard_send_init(ms, block->idstr);
1598
1599 /*
1600 * Postcopy sends chunks of bitmap over the wire, but at this
1601 * point it just needs page indexes; this avoids having
1602 * target-page-specific code here.
1603 */
1604 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1605 postcopy_discard_send_finish(ms, pds);
1606 if (ret) {
1607 return ret;
1608 }
1609 }
1610
1611 return 0;
1612 }
1613
1614 /**
1615 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1616 *
1617 * Helper for postcopy_chunk_hostpages; it's called twice to
1618 * canonicalize the two bitmaps, that are similar, but one is
1619 * inverted.
1620 *
1621 * Postcopy requires that all target pages in a hostpage are dirty or
1622 * clean, not a mix. This function canonicalizes the bitmaps.
1623 *
1624 * @ms: current migration state
1625 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1626 * otherwise we need to canonicalize partially dirty host pages
1627 * @block: block that contains the page we want to canonicalize
1628 * @pds: state for postcopy
1629 */
1630 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1631 RAMBlock *block,
1632 PostcopyDiscardState *pds)
1633 {
1634 RAMState *rs = &ram_state;
1635 unsigned long *bitmap = block->bmap;
1636 unsigned long *unsentmap = block->unsentmap;
1637 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1638 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1639 unsigned long run_start;
1640
1641 if (block->page_size == TARGET_PAGE_SIZE) {
1642 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1643 return;
1644 }
1645
1646 if (unsent_pass) {
1647 /* Find a sent page */
1648 run_start = find_next_zero_bit(unsentmap, pages, 0);
1649 } else {
1650 /* Find a dirty page */
1651 run_start = find_next_bit(bitmap, pages, 0);
1652 }
1653
1654 while (run_start < pages) {
1655 bool do_fixup = false;
1656 unsigned long fixup_start_addr;
1657 unsigned long host_offset;
1658
1659 /*
1660 * If the start of this run of pages is in the middle of a host
1661 * page, then we need to fixup this host page.
1662 */
1663 host_offset = run_start % host_ratio;
1664 if (host_offset) {
1665 do_fixup = true;
1666 run_start -= host_offset;
1667 fixup_start_addr = run_start;
1668 /* For the next pass */
1669 run_start = run_start + host_ratio;
1670 } else {
1671 /* Find the end of this run */
1672 unsigned long run_end;
1673 if (unsent_pass) {
1674 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1675 } else {
1676 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1677 }
1678 /*
1679 * If the end isn't at the start of a host page, then the
1680 * run doesn't finish at the end of a host page
1681 * and we need to discard.
1682 */
1683 host_offset = run_end % host_ratio;
1684 if (host_offset) {
1685 do_fixup = true;
1686 fixup_start_addr = run_end - host_offset;
1687 /*
1688 * This host page has gone, the next loop iteration starts
1689 * from after the fixup
1690 */
1691 run_start = fixup_start_addr + host_ratio;
1692 } else {
1693 /*
1694 * No discards on this iteration, next loop starts from
1695 * next sent/dirty page
1696 */
1697 run_start = run_end + 1;
1698 }
1699 }
1700
1701 if (do_fixup) {
1702 unsigned long page;
1703
1704 /* Tell the destination to discard this page */
1705 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1706 /* For the unsent_pass we:
1707 * discard partially sent pages
1708 * For the !unsent_pass (dirty) we:
1709 * discard partially dirty pages that were sent
1710 * (any partially sent pages were already discarded
1711 * by the previous unsent_pass)
1712 */
1713 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1714 host_ratio);
1715 }
1716
1717 /* Clean up the bitmap */
1718 for (page = fixup_start_addr;
1719 page < fixup_start_addr + host_ratio; page++) {
1720 /* All pages in this host page are now not sent */
1721 set_bit(page, unsentmap);
1722
1723 /*
1724 * Remark them as dirty, updating the count for any pages
1725 * that weren't previously dirty.
1726 */
1727 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1728 }
1729 }
1730
1731 if (unsent_pass) {
1732 /* Find the next sent page for the next iteration */
1733 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1734 } else {
1735 /* Find the next dirty page for the next iteration */
1736 run_start = find_next_bit(bitmap, pages, run_start);
1737 }
1738 }
1739 }
1740
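/*
 * Fixup illustration for postcopy_chunk_hostpages_pass() (hypothetical
 * numbers): with 2 MiB host pages and 4 KiB target pages, host_ratio is
 * 512. A dirty run starting at target page 700 gives
 * host_offset = 700 % 512 = 188, so the run is pulled back to page 512 and
 * the whole host page [512, 1023] is discarded and re-marked as
 * dirty/unsent.
 */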
1741 /**
1742 * postcopy_chunk_hostpages: discard any partially sent host page
1743 *
1744 * Utility for the outgoing postcopy code.
1745 *
1746 * Discard any partially sent host-page size chunks, mark any partially
1747 * dirty host-page size chunks as all dirty. In this case the host-page
1748 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1749 *
1750 * Returns zero on success
1751 *
1752 * @ms: current migration state
1753 * @block: block we want to work with
1754 */
1755 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1756 {
1757 PostcopyDiscardState *pds =
1758 postcopy_discard_send_init(ms, block->idstr);
1759
1760 /* First pass: Discard all partially sent host pages */
1761 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1762 /*
1763 * Second pass: Ensure that all partially dirty host pages are made
1764 * fully dirty.
1765 */
1766 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1767
1768 postcopy_discard_send_finish(ms, pds);
1769 return 0;
1770 }
1771
1772 /**
1773 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1774 *
1775 * Returns zero on success
1776 *
1777 * Transmit the set of pages to be discarded after precopy to the
1778 * target. These are pages that:
1779 * a) have been previously transmitted but are now dirty again
1780 * b) have never been transmitted; this ensures that any pages on the
1781 * destination that have been mapped by background tasks get
1782 * discarded (transparent huge pages are the specific concern)
1783 * Hopefully this is pretty sparse.
1784 *
1785 * @ms: current migration state
1786 */
1787 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1788 {
1789 RAMState *rs = &ram_state;
1790 RAMBlock *block;
1791 int ret;
1792
1793 rcu_read_lock();
1794
1795 /* This should be our last sync, the src is now paused */
1796 migration_bitmap_sync(rs);
1797
1798 /* Easiest way to make sure we don't resume in the middle of a host-page */
1799 rs->last_seen_block = NULL;
1800 rs->last_sent_block = NULL;
1801 rs->last_page = 0;
1802
1803 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1804 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1805 unsigned long *bitmap = block->bmap;
1806 unsigned long *unsentmap = block->unsentmap;
1807
1808 if (!unsentmap) {
1809 /* We don't have a safe way to resize the sentmap, so
1810 * if the bitmap was resized it will be NULL at this
1811 * point.
1812 */
1813 error_report("migration ram resized during precopy phase");
1814 rcu_read_unlock();
1815 return -EINVAL;
1816 }
1817 /* Deal with TPS != HPS and huge pages */
1818 ret = postcopy_chunk_hostpages(ms, block);
1819 if (ret) {
1820 rcu_read_unlock();
1821 return ret;
1822 }
1823
1824 /*
1825 * Update the unsentmap to be unsentmap = unsentmap | dirty
1826 */
1827 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1828 #ifdef DEBUG_POSTCOPY
1829 ram_debug_dump_bitmap(unsentmap, true, pages);
1830 #endif
1831 }
1832 trace_ram_postcopy_send_discard_bitmap();
1833
1834 ret = postcopy_each_ram_send_discard(ms);
1835 rcu_read_unlock();
1836
1837 return ret;
1838 }
1839
1840 /**
1841 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1842 *
1843 * Returns zero on success
1844 *
1845 * @rbname: name of the RAMBlock of the request
1846 * @start: byte offset within the RAMBlock of the first byte to discard
1847 * @length: number of bytes to discard
1849 */
1850 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1851 {
1852 int ret = -1;
1853
1854 trace_ram_discard_range(rbname, start, length);
1855
1856 rcu_read_lock();
1857 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1858
1859 if (!rb) {
1860 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1861 goto err;
1862 }
1863
1864 ret = ram_block_discard_range(rb, start, length);
1865
1866 err:
1867 rcu_read_unlock();
1868
1869 return ret;
1870 }
1871
1872 static int ram_state_init(RAMState *rs)
1873 {
1874 memset(rs, 0, sizeof(*rs));
1875 qemu_mutex_init(&rs->bitmap_mutex);
1876 qemu_mutex_init(&rs->src_page_req_mutex);
1877 QSIMPLEQ_INIT(&rs->src_page_requests);
1878
1879 if (migrate_use_xbzrle()) {
1880 XBZRLE_cache_lock();
1881 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1882 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1883 TARGET_PAGE_SIZE,
1884 TARGET_PAGE_SIZE);
1885 if (!XBZRLE.cache) {
1886 XBZRLE_cache_unlock();
1887 error_report("Error creating cache");
1888 return -1;
1889 }
1890 XBZRLE_cache_unlock();
1891
1892 /* We prefer not to abort if there is no memory */
1893 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1894 if (!XBZRLE.encoded_buf) {
1895 error_report("Error allocating encoded_buf");
1896 return -1;
1897 }
1898
1899 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1900 if (!XBZRLE.current_buf) {
1901 error_report("Error allocating current_buf");
1902 g_free(XBZRLE.encoded_buf);
1903 XBZRLE.encoded_buf = NULL;
1904 return -1;
1905 }
1906 }
1907
1908 /* For memory_global_dirty_log_start below. */
1909 qemu_mutex_lock_iothread();
1910
1911 qemu_mutex_lock_ramlist();
1912 rcu_read_lock();
1913 ram_state_reset(rs);
1914
1915 /* Skip setting bitmap if there is no RAM */
1916 if (ram_bytes_total()) {
1917 RAMBlock *block;
1918
1919 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1920 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1921
1922 block->bmap = bitmap_new(pages);
1923 bitmap_set(block->bmap, 0, pages);
1924 if (migrate_postcopy_ram()) {
1925 block->unsentmap = bitmap_new(pages);
1926 bitmap_set(block->unsentmap, 0, pages);
1927 }
1928 }
1929 }
1930
1931 /*
1932 * Count the total number of pages used by ram blocks not including any
1933 * gaps due to alignment or unplugs.
1934 */
1935 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1936
1937 memory_global_dirty_log_start();
1938 migration_bitmap_sync(rs);
1939 qemu_mutex_unlock_ramlist();
1940 qemu_mutex_unlock_iothread();
1941 rcu_read_unlock();
1942
1943 return 0;
1944 }
1945
1946 /*
1947 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1948 * long-running RCU critical section. When RCU reclaims in the code
1949 * start to become numerous it will be necessary to reduce the
1950 * granularity of these critical sections.
1951 */
1952
1953 /**
1954 * ram_save_setup: Setup RAM for migration
1955 *
1956 * Returns zero to indicate success and negative for error
1957 *
1958 * @f: QEMUFile where to send the data
1959 * @opaque: RAMState pointer
1960 */
1961 static int ram_save_setup(QEMUFile *f, void *opaque)
1962 {
1963 RAMState *rs = opaque;
1964 RAMBlock *block;
1965
1966 /* In the COLO state, migration has already set up the bitmap; reuse it. */
1967 if (!migration_in_colo_state()) {
1968 if (ram_state_init(rs) < 0) {
1969 return -1;
1970 }
1971 }
1972 rs->f = f;
1973
1974 rcu_read_lock();
1975
1976 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1977
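/*
 * Per-block header records written below; ram_load() parses the same
 * layout under RAM_SAVE_FLAG_MEM_SIZE:
 *   byte   strlen(idstr)
 *   bytes  idstr (not NUL-terminated)
 *   be64   used_length
 *   be64   page_size (only when postcopy is enabled and the block's
 *          page size differs from qemu_host_page_size)
 */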
1978 RAMBLOCK_FOREACH(block) {
1979 qemu_put_byte(f, strlen(block->idstr));
1980 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1981 qemu_put_be64(f, block->used_length);
1982 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1983 qemu_put_be64(f, block->page_size);
1984 }
1985 }
1986
1987 rcu_read_unlock();
1988
1989 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1990 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1991
1992 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1993
1994 return 0;
1995 }
1996
1997 /**
1998 * ram_save_iterate: iterative stage for migration
1999 *
2000 * Returns zero to indicate success and negative for error
2001 *
2002 * @f: QEMUFile where to send the data
2003 * @opaque: RAMState pointer
2004 */
2005 static int ram_save_iterate(QEMUFile *f, void *opaque)
2006 {
2007 RAMState *rs = opaque;
2008 int ret;
2009 int i;
2010 int64_t t0;
2011 int done = 0;
2012
2013 rcu_read_lock();
2014 if (ram_list.version != rs->last_version) {
2015 ram_state_reset(rs);
2016 }
2017
2018 /* Read version before ram_list.blocks */
2019 smp_rmb();
2020
2021 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2022
2023 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2024 i = 0;
2025 while ((ret = qemu_file_rate_limit(f)) == 0) {
2026 int pages;
2027
2028 pages = ram_find_and_save_block(rs, false);
2029 /* no more pages to send */
2030 if (pages == 0) {
2031 done = 1;
2032 break;
2033 }
2034 rs->iterations++;
2035
2036 /* we want to check in the 1st loop, just in case it was the 1st time
2037 and we had to sync the dirty bitmap.
2038 qemu_clock_get_ns() is a bit expensive, so we only check every few
2039 iterations
2040 */
2041 if ((i & 63) == 0) {
2042 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2043 if (t1 > MAX_WAIT) {
2044 trace_ram_save_iterate_big_wait(t1, i);
2045 break;
2046 }
2047 }
2048 i++;
2049 }
2050 flush_compressed_data(rs);
2051 rcu_read_unlock();
2052
2053 /*
2054 * Must occur before EOS (or any QEMUFile operation)
2055 * because of the RDMA protocol.
2056 */
2057 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2058
2059 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2060 rs->bytes_transferred += 8;
2061
2062 ret = qemu_file_get_error(f);
2063 if (ret < 0) {
2064 return ret;
2065 }
2066
2067 return done;
2068 }
2069
2070 /**
2071 * ram_save_complete: function called to send the remaining amount of RAM
2072 *
2073 * Returns zero to indicate success
2074 *
2075 * Called with the iothread lock held
2076 *
2077 * @f: QEMUFile where to send the data
2078 * @opaque: RAMState pointer
2079 */
2080 static int ram_save_complete(QEMUFile *f, void *opaque)
2081 {
2082 RAMState *rs = opaque;
2083
2084 rcu_read_lock();
2085
2086 if (!migration_in_postcopy()) {
2087 migration_bitmap_sync(rs);
2088 }
2089
2090 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2091
2092 /* try transferring iterative blocks of memory */
2093
2094 /* flush all remaining blocks regardless of rate limiting */
2095 while (true) {
2096 int pages;
2097
2098 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2099 /* no more blocks to send */
2100 if (pages == 0) {
2101 break;
2102 }
2103 }
2104
2105 flush_compressed_data(rs);
2106 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2107
2108 rcu_read_unlock();
2109
2110 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2111
2112 return 0;
2113 }
2114
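/*
 * ram_save_pending: report how much data is still to be sent
 *
 * If the remaining dirty data is below @max_size and we are not in
 * postcopy, the dirty bitmap is re-synced (under the iothread lock)
 * before recomputing the estimate. All remaining RAM is reported as
 * postcopiable.
 *
 * @f: QEMUFile where the data is being sent
 * @opaque: RAMState pointer
 * @max_size: threshold below which the bitmap is re-synced
 * @non_postcopiable_pending: amount that cannot be sent during postcopy
 * @postcopiable_pending: amount that can be sent during postcopy
 */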
2115 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2116 uint64_t *non_postcopiable_pending,
2117 uint64_t *postcopiable_pending)
2118 {
2119 RAMState *rs = opaque;
2120 uint64_t remaining_size;
2121
2122 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2123
2124 if (!migration_in_postcopy() &&
2125 remaining_size < max_size) {
2126 qemu_mutex_lock_iothread();
2127 rcu_read_lock();
2128 migration_bitmap_sync(rs);
2129 rcu_read_unlock();
2130 qemu_mutex_unlock_iothread();
2131 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2132 }
2133
2134 /* We can do postcopy, and all the data is postcopiable */
2135 *postcopiable_pending += remaining_size;
2136 }
2137
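/*
 * load_xbzrle: load and decode one XBZRLE-compressed page
 *
 * Returns 0 on success and -1 on error
 *
 * Stream layout read here: a one-byte header that must be
 * ENCODING_FLAG_XBZRLE, a be16 encoded length (at most TARGET_PAGE_SIZE),
 * then the encoded data, which is decoded against the current contents
 * of the page at @host.
 *
 * @f: QEMUFile to read the page from
 * @addr: guest address of the page
 * @host: host address of the page being loaded
 */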
2138 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2139 {
2140 unsigned int xh_len;
2141 int xh_flags;
2142 uint8_t *loaded_data;
2143
2144 if (!xbzrle_decoded_buf) {
2145 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 }
2147 loaded_data = xbzrle_decoded_buf;
2148
2149 /* extract RLE header */
2150 xh_flags = qemu_get_byte(f);
2151 xh_len = qemu_get_be16(f);
2152
2153 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2154 error_report("Failed to load XBZRLE page - wrong compression!");
2155 return -1;
2156 }
2157
2158 if (xh_len > TARGET_PAGE_SIZE) {
2159 error_report("Failed to load XBZRLE page - len overflow!");
2160 return -1;
2161 }
2162 /* load data and decode */
2163 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2164
2165 /* decode RLE */
2166 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2167 TARGET_PAGE_SIZE) == -1) {
2168 error_report("Failed to load XBZRLE page - decode error!");
2169 return -1;
2170 }
2171
2172 return 0;
2173 }
2174
2175 /**
2176 * ram_block_from_stream: read a RAMBlock id from the migration stream
2177 *
2178 * Must be called from within an RCU critical section.
2179 *
2180 * Returns a pointer from within the RCU-protected ram_list.
2181 *
2182 * @f: QEMUFile where to read the data from
2183 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2184 */
2185 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2186 {
2187 static RAMBlock *block = NULL;
2188 char id[256];
2189 uint8_t len;
2190
2191 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2192 if (!block) {
2193 error_report("Ack, bad migration stream!");
2194 return NULL;
2195 }
2196 return block;
2197 }
2198
2199 len = qemu_get_byte(f);
2200 qemu_get_buffer(f, (uint8_t *)id, len);
2201 id[len] = 0;
2202
2203 block = qemu_ram_block_by_name(id);
2204 if (!block) {
2205 error_report("Can't find block %s", id);
2206 return NULL;
2207 }
2208
2209 return block;
2210 }
2211
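/*
 * host_from_ram_block_offset: return the host address for @offset within
 * @block, or NULL if the offset does not lie within the block.
 */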
2212 static inline void *host_from_ram_block_offset(RAMBlock *block,
2213 ram_addr_t offset)
2214 {
2215 if (!offset_in_ramblock(block, offset)) {
2216 return NULL;
2217 }
2218
2219 return block->host + offset;
2220 }
2221
2222 /**
2223 * ram_handle_compressed: handle the zero page case
2224 *
2225 * If a page (or a whole RDMA chunk) has been
2226 * determined to be zero, then zap it.
2227 *
2228 * @host: host address for the zero page
2229 * @ch: the byte the page is filled with; we only support zero
2230 * @size: size of the zero page
2231 */
2232 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2233 {
2234 if (ch != 0 || !is_zero_range(host, size)) {
2235 memset(host, ch, size);
2236 }
2237 }
2238
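/*
 * do_data_decompress: body of a decompression worker thread
 *
 * Waits on the per-thread condition for a compressed buffer, zlib
 * uncompress()es it straight into the destination page, then marks
 * itself done and signals decomp_done_cond. Exits when ->quit is set.
 */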
2239 static void *do_data_decompress(void *opaque)
2240 {
2241 DecompressParam *param = opaque;
2242 unsigned long pagesize;
2243 uint8_t *des;
2244 int len;
2245
2246 qemu_mutex_lock(&param->mutex);
2247 while (!param->quit) {
2248 if (param->des) {
2249 des = param->des;
2250 len = param->len;
2251 param->des = 0;
2252 qemu_mutex_unlock(&param->mutex);
2253
2254 pagesize = TARGET_PAGE_SIZE;
2255 /* uncompress() may fail in some cases, especially
2256 * when the page was dirtied while being compressed. That is
2257 * not a problem, because the dirty page will be retransferred
2258 * and uncompress() won't corrupt the data in other pages.
2259 */
2260 uncompress((Bytef *)des, &pagesize,
2261 (const Bytef *)param->compbuf, len);
2262
2263 qemu_mutex_lock(&decomp_done_lock);
2264 param->done = true;
2265 qemu_cond_signal(&decomp_done_cond);
2266 qemu_mutex_unlock(&decomp_done_lock);
2267
2268 qemu_mutex_lock(&param->mutex);
2269 } else {
2270 qemu_cond_wait(&param->cond, &param->mutex);
2271 }
2272 }
2273 qemu_mutex_unlock(&param->mutex);
2274
2275 return NULL;
2276 }
2277
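/*
 * wait_for_decompress_done: wait until every decompression thread has
 * finished its current page. No-op when compression is not in use.
 */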
2278 static void wait_for_decompress_done(void)
2279 {
2280 int idx, thread_count;
2281
2282 if (!migrate_use_compression()) {
2283 return;
2284 }
2285
2286 thread_count = migrate_decompress_threads();
2287 qemu_mutex_lock(&decomp_done_lock);
2288 for (idx = 0; idx < thread_count; idx++) {
2289 while (!decomp_param[idx].done) {
2290 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2291 }
2292 }
2293 qemu_mutex_unlock(&decomp_done_lock);
2294 }
2295
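/*
 * migrate_decompress_threads_create: allocate the per-thread state and
 * spawn migrate_decompress_threads() joinable "decompress" worker threads.
 */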
2296 void migrate_decompress_threads_create(void)
2297 {
2298 int i, thread_count;
2299
2300 thread_count = migrate_decompress_threads();
2301 decompress_threads = g_new0(QemuThread, thread_count);
2302 decomp_param = g_new0(DecompressParam, thread_count);
2303 qemu_mutex_init(&decomp_done_lock);
2304 qemu_cond_init(&decomp_done_cond);
2305 for (i = 0; i < thread_count; i++) {
2306 qemu_mutex_init(&decomp_param[i].mutex);
2307 qemu_cond_init(&decomp_param[i].cond);
2308 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2309 decomp_param[i].done = true;
2310 decomp_param[i].quit = false;
2311 qemu_thread_create(decompress_threads + i, "decompress",
2312 do_data_decompress, decomp_param + i,
2313 QEMU_THREAD_JOINABLE);
2314 }
2315 }
2316
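/*
 * migrate_decompress_threads_join: ask every worker to quit, join the
 * threads and free the per-thread state.
 */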
2317 void migrate_decompress_threads_join(void)
2318 {
2319 int i, thread_count;
2320
2321 thread_count = migrate_decompress_threads();
2322 for (i = 0; i < thread_count; i++) {
2323 qemu_mutex_lock(&decomp_param[i].mutex);
2324 decomp_param[i].quit = true;
2325 qemu_cond_signal(&decomp_param[i].cond);
2326 qemu_mutex_unlock(&decomp_param[i].mutex);
2327 }
2328 for (i = 0; i < thread_count; i++) {
2329 qemu_thread_join(decompress_threads + i);
2330 qemu_mutex_destroy(&decomp_param[i].mutex);
2331 qemu_cond_destroy(&decomp_param[i].cond);
2332 g_free(decomp_param[i].compbuf);
2333 }
2334 g_free(decompress_threads);
2335 g_free(decomp_param);
2336 decompress_threads = NULL;
2337 decomp_param = NULL;
2338 }
2339
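/*
 * decompress_data_with_multi_threads: hand one compressed page to an idle
 * decompression thread
 *
 * Reads @len compressed bytes from @f into the chosen thread's buffer and
 * wakes that thread; if no thread is idle, waits on decomp_done_cond.
 */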
2340 static void decompress_data_with_multi_threads(QEMUFile *f,
2341 void *host, int len)
2342 {
2343 int idx, thread_count;
2344
2345 thread_count = migrate_decompress_threads();
2346 qemu_mutex_lock(&decomp_done_lock);
2347 while (true) {
2348 for (idx = 0; idx < thread_count; idx++) {
2349 if (decomp_param[idx].done) {
2350 decomp_param[idx].done = false;
2351 qemu_mutex_lock(&decomp_param[idx].mutex);
2352 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2353 decomp_param[idx].des = host;
2354 decomp_param[idx].len = len;
2355 qemu_cond_signal(&decomp_param[idx].cond);
2356 qemu_mutex_unlock(&decomp_param[idx].mutex);
2357 break;
2358 }
2359 }
2360 if (idx < thread_count) {
2361 break;
2362 } else {
2363 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2364 }
2365 }
2366 qemu_mutex_unlock(&decomp_done_lock);
2367 }
2368
2369 /**
2370 * ram_postcopy_incoming_init: allocate postcopy data structures
2371 *
2372 * Returns 0 for success and negative on error
2373 *
2374 * @mis: current migration incoming state
2375 *
2376 * Allocate the data structures etc. needed by incoming migration with
2377 * postcopy-ram. postcopy-ram's similarly named
2378 * postcopy_ram_incoming_init() does the work.
2379 */
2380 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2381 {
2382 unsigned long ram_pages = last_ram_page();
2383
2384 return postcopy_ram_incoming_init(mis, ram_pages);
2385 }
2386
2387 /**
2388 * ram_load_postcopy: load a page in postcopy case
2389 *
2390 * Returns 0 for success or -errno in case of error
2391 *
2392 * Called in postcopy mode by ram_load().
2393 * rcu_read_lock is taken prior to this being called.
2394 *
2395 * @f: QEMUFile to receive the data from
2396 */
2397 static int ram_load_postcopy(QEMUFile *f)
2398 {
2399 int flags = 0, ret = 0;
2400 bool place_needed = false;
2401 bool matching_page_sizes = false;
2402 MigrationIncomingState *mis = migration_incoming_get_current();
2403 /* Temporary page that is later 'placed' */
2404 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2405 void *last_host = NULL;
2406 bool all_zero = false;
2407
2408 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2409 ram_addr_t addr;
2410 void *host = NULL;
2411 void *page_buffer = NULL;
2412 void *place_source = NULL;
2413 RAMBlock *block = NULL;
2414 uint8_t ch;
2415
2416 addr = qemu_get_be64(f);
2417 flags = addr & ~TARGET_PAGE_MASK;
2418 addr &= TARGET_PAGE_MASK;
2419
2420 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2421 place_needed = false;
2422 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2423 block = ram_block_from_stream(f, flags);
2424
2425 host = host_from_ram_block_offset(block, addr);
2426 if (!host) {
2427 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2428 ret = -EINVAL;
2429 break;
2430 }
2431 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2432 /*
2433 * Postcopy requires that we place whole host pages atomically;
2434 * these may be huge pages for RAMBlocks that are backed by
2435 * hugetlbfs.
2436 * To make it atomic, the data is read into a temporary page
2437 * that's moved into place later.
2438 * The migration protocol uses (possibly smaller) target pages;
2439 * however, the source ensures it always sends all the components
2440 * of a host page in order.
2441 */
2442 page_buffer = postcopy_host_page +
2443 ((uintptr_t)host & (block->page_size - 1));
2444 /* If every target page in this host page is zero we can optimise the placement; assume so at the first target page */
2445 if (!((uintptr_t)host & (block->page_size - 1))) {
2446 all_zero = true;
2447 } else {
2448 /* not the first target page within the host page */
2449 if (host != (last_host + TARGET_PAGE_SIZE)) {
2450 error_report("Non-sequential target page %p/%p",
2451 host, last_host);
2452 ret = -EINVAL;
2453 break;
2454 }
2455 }
2456
2457
2458 /*
2459 * If it's the last part of a host page then we place the host
2460 * page
2461 */
2462 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2463 (block->page_size - 1)) == 0;
2464 place_source = postcopy_host_page;
2465 }
2466 last_host = host;
2467
2468 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2469 case RAM_SAVE_FLAG_ZERO:
2470 ch = qemu_get_byte(f);
2471 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2472 if (ch) {
2473 all_zero = false;
2474 }
2475 break;
2476
2477 case RAM_SAVE_FLAG_PAGE:
2478 all_zero = false;
2479 if (!place_needed || !matching_page_sizes) {
2480 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2481 } else {
2482 /* Avoids the extra copy out of the qemu_file buffer, since
2483 * postcopy is going to copy the page into place later anyway;
2484 * can only do it when the read is done in one go (matching page sizes)
2485 */
2486 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2487 TARGET_PAGE_SIZE);
2488 }
2489 break;
2490 case RAM_SAVE_FLAG_EOS:
2491 /* normal exit */
2492 break;
2493 default:
2494 error_report("Unknown combination of migration flags: %#x"
2495 " (postcopy mode)", flags);
2496 ret = -EINVAL;
2497 }
2498
2499 if (place_needed) {
2500 /* This gets called at the last target page in the host page */
2501 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2502
2503 if (all_zero) {
2504 ret = postcopy_place_page_zero(mis, place_dest,
2505 block->page_size);
2506 } else {
2507 ret = postcopy_place_page(mis, place_dest,
2508 place_source, block->page_size);
2509 }
2510 }
2511 if (!ret) {
2512 ret = qemu_file_get_error(f);
2513 }
2514 }
2515
2516 return ret;
2517 }
2518
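/*
 * ram_load: load RAM pages sent by ram_save_*
 *
 * Returns 0 on success or a negative errno on error
 *
 * Once the destination has reached the postcopy listening state the work
 * is delegated to ram_load_postcopy(). Otherwise each chunk starts with a
 * be64 of guest address plus flags, followed by flag-dependent data:
 * the RAMBlock list (MEM_SIZE), a zero byte (ZERO), a full page (PAGE),
 * a compressed page (COMPRESS_PAGE) or an XBZRLE-encoded page (XBZRLE),
 * until RAM_SAVE_FLAG_EOS is seen.
 *
 * @f: QEMUFile to read the pages from
 * @opaque: RAMState pointer
 * @version_id: stream version, must be 4
 */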
2519 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2520 {
2521 int flags = 0, ret = 0;
2522 static uint64_t seq_iter;
2523 int len = 0;
2524 /*
2525 * If the system is running in postcopy mode, page inserts into host memory
2526 * must be atomic
2527 */
2528 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2529 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
2530 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2531
2532 seq_iter++;
2533
2534 if (version_id != 4) {
2535 ret = -EINVAL;
2536 }
2537
2538 /* This RCU critical section can be very long running.
2539 * When RCU reclaims in the code start to become numerous,
2540 * it will be necessary to reduce the granularity of this
2541 * critical section.
2542 */
2543 rcu_read_lock();
2544
2545 if (postcopy_running) {
2546 ret = ram_load_postcopy(f);
2547 }
2548
2549 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2550 ram_addr_t addr, total_ram_bytes;
2551 void *host = NULL;
2552 uint8_t ch;
2553
2554 addr = qemu_get_be64(f);
2555 flags = addr & ~TARGET_PAGE_MASK;
2556 addr &= TARGET_PAGE_MASK;
2557
2558 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2559 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2560 RAMBlock *block = ram_block_from_stream(f, flags);
2561
2562 host = host_from_ram_block_offset(block, addr);
2563 if (!host) {
2564 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2565 ret = -EINVAL;
2566 break;
2567 }
2568 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2569 }
2570
2571 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2572 case RAM_SAVE_FLAG_MEM_SIZE:
2573 /* Synchronize RAM block list */
2574 total_ram_bytes = addr;
2575 while (!ret && total_ram_bytes) {
2576 RAMBlock *block;
2577 char id[256];
2578 ram_addr_t length;
2579
2580 len = qemu_get_byte(f);
2581 qemu_get_buffer(f, (uint8_t *)id, len);
2582 id[len] = 0;
2583 length = qemu_get_be64(f);
2584
2585 block = qemu_ram_block_by_name(id);
2586 if (block) {
2587 if (length != block->used_length) {
2588 Error *local_err = NULL;
2589
2590 ret = qemu_ram_resize(block, length,
2591 &local_err);
2592 if (local_err) {
2593 error_report_err(local_err);
2594 }
2595 }
2596 /* For postcopy we need to check that hugepage sizes match */
2597 if (postcopy_advised &&
2598 block->page_size != qemu_host_page_size) {
2599 uint64_t remote_page_size = qemu_get_be64(f);
2600 if (remote_page_size != block->page_size) {
2601 error_report("Mismatched RAM page size %s "
2602 "(local) %zd != %" PRId64,
2603 id, block->page_size,
2604 remote_page_size);
2605 ret = -EINVAL;
2606 }
2607 }
2608 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2609 block->idstr);
2610 } else {
2611 error_report("Unknown ramblock \"%s\", cannot "
2612 "accept migration", id);
2613 ret = -EINVAL;
2614 }
2615
2616 total_ram_bytes -= length;
2617 }
2618 break;
2619
2620 case RAM_SAVE_FLAG_ZERO:
2621 ch = qemu_get_byte(f);
2622 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2623 break;
2624
2625 case RAM_SAVE_FLAG_PAGE:
2626 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2627 break;
2628
2629 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2630 len = qemu_get_be32(f);
2631 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2632 error_report("Invalid compressed data length: %d", len);
2633 ret = -EINVAL;
2634 break;
2635 }
2636 decompress_data_with_multi_threads(f, host, len);
2637 break;
2638
2639 case RAM_SAVE_FLAG_XBZRLE:
2640 if (load_xbzrle(f, addr, host) < 0) {
2641 error_report("Failed to decompress XBZRLE page at "
2642 RAM_ADDR_FMT, addr);
2643 ret = -EINVAL;
2644 break;
2645 }
2646 break;
2647 case RAM_SAVE_FLAG_EOS:
2648 /* normal exit */
2649 break;
2650 default:
2651 if (flags & RAM_SAVE_FLAG_HOOK) {
2652 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2653 } else {
2654 error_report("Unknown combination of migration flags: %#x",
2655 flags);
2656 ret = -EINVAL;
2657 }
2658 }
2659 if (!ret) {
2660 ret = qemu_file_get_error(f);
2661 }
2662 }
2663
2664 wait_for_decompress_done();
2665 rcu_read_unlock();
2666 trace_ram_load_complete(ret, seq_iter);
2667 return ret;
2668 }
2669
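/*
 * RAM's SaveVMHandlers: ram_mig_init() below registers these under the
 * "ram" section with stream version 4, passing &ram_state as opaque.
 */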
2670 static SaveVMHandlers savevm_ram_handlers = {
2671 .save_live_setup = ram_save_setup,
2672 .save_live_iterate = ram_save_iterate,
2673 .save_live_complete_postcopy = ram_save_complete,
2674 .save_live_complete_precopy = ram_save_complete,
2675 .save_live_pending = ram_save_pending,
2676 .load_state = ram_load,
2677 .cleanup = ram_migration_cleanup,
2678 };
2679
2680 void ram_mig_init(void)
2681 {
2682 qemu_mutex_init(&XBZRLE.lock);
2683 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2684 }