migration/ram.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
6 *
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47
48 /***********************************************************/
49 /* ram save/restore */
50
51 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
52 #define RAM_SAVE_FLAG_COMPRESS 0x02
53 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
54 #define RAM_SAVE_FLAG_PAGE 0x08
55 #define RAM_SAVE_FLAG_EOS 0x10
56 #define RAM_SAVE_FLAG_CONTINUE 0x20
57 #define RAM_SAVE_FLAG_XBZRLE 0x40
58 /* 0x80 is reserved in migration.h; start the next flag at 0x100 */
58 #define RAM_SAVE_FLAG_XBZRLE 0x40
59 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
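/* Illustrative note (not in the original source): these flags are OR'ed into
 * the low bits of the target-page-aligned offset written by save_page_header(),
 * so a single be64 value on the wire carries both the page offset and the
 * flags, e.g. (offset | RAM_SAVE_FLAG_PAGE). */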
60
61 static uint8_t *ZERO_TARGET_PAGE;
62
63 static inline bool is_zero_range(uint8_t *p, uint64_t size)
64 {
65 return buffer_is_zero(p, size);
66 }
67
68 /* This struct contains the XBZRLE cache and a static page
69 used by the compression */
70 static struct {
71 /* buffer used for XBZRLE encoding */
72 uint8_t *encoded_buf;
73 /* buffer for storing page content */
74 uint8_t *current_buf;
75 /* Cache for XBZRLE, Protected by lock. */
76 PageCache *cache;
77 QemuMutex lock;
78 } XBZRLE;
79
80 /* buffer used for XBZRLE decoding */
81 static uint8_t *xbzrle_decoded_buf;
82
83 static void XBZRLE_cache_lock(void)
84 {
85 if (migrate_use_xbzrle())
86 qemu_mutex_lock(&XBZRLE.lock);
87 }
88
89 static void XBZRLE_cache_unlock(void)
90 {
91 if (migrate_use_xbzrle())
92 qemu_mutex_unlock(&XBZRLE.lock);
93 }
94
95 /**
96 * xbzrle_cache_resize: resize the xbzrle cache
97 *
98 * This function is called from qmp_migrate_set_cache_size in the main
99 * thread, possibly while a migration is in progress. A running
100 * migration may be using the cache and might finish during this call,
101 * hence changes to the cache are protected by XBZRLE.lock.
102 *
103 * Returns the new_size or negative in case of error.
104 *
105 * @new_size: new cache size
106 */
107 int64_t xbzrle_cache_resize(int64_t new_size)
108 {
109 PageCache *new_cache;
110 int64_t ret;
111
112 if (new_size < TARGET_PAGE_SIZE) {
113 return -1;
114 }
115
116 XBZRLE_cache_lock();
117
118 if (XBZRLE.cache != NULL) {
119 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
120 goto out_new_size;
121 }
122 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
123 TARGET_PAGE_SIZE);
124 if (!new_cache) {
125 error_report("Error creating cache");
126 ret = -1;
127 goto out;
128 }
129
130 cache_fini(XBZRLE.cache);
131 XBZRLE.cache = new_cache;
132 }
133
134 out_new_size:
135 ret = pow2floor(new_size);
136 out:
137 XBZRLE_cache_unlock();
138 return ret;
139 }
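/* Example (illustrative): a request of 70 MiB is accepted but the effective
 * size returned is pow2floor(new_size) = 64 MiB; if that already equals
 * migrate_xbzrle_cache_size(), the existing cache is left untouched. */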
140
141 /*
142 * An outstanding page request, on the source, having been received
143 * and queued
144 */
145 struct RAMSrcPageRequest {
146 RAMBlock *rb;
147 hwaddr offset;
148 hwaddr len;
149
150 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
151 };
152
153 /* State of RAM for migration */
154 struct RAMState {
155 /* QEMUFile used for this migration */
156 QEMUFile *f;
157 /* Last block that we have visited searching for dirty pages */
158 RAMBlock *last_seen_block;
159 /* Last block from where we have sent data */
160 RAMBlock *last_sent_block;
161 /* Last dirty target page we have sent */
162 ram_addr_t last_page;
163 /* last ram version we have seen */
164 uint32_t last_version;
165 /* We are in the first round */
166 bool ram_bulk_stage;
167 /* How many times we have dirtied too many pages */
168 int dirty_rate_high_cnt;
169 /* How many times we have synchronized the bitmap */
170 uint64_t bitmap_sync_count;
171 /* these variables are used for bitmap sync */
172 /* last time we did a full bitmap_sync */
173 int64_t time_last_bitmap_sync;
174 /* bytes transferred at start_time */
175 uint64_t bytes_xfer_prev;
176 /* number of dirty pages since start_time */
177 uint64_t num_dirty_pages_period;
178 /* xbzrle misses since the beginning of the period */
179 uint64_t xbzrle_cache_miss_prev;
180 /* number of iterations at the beginning of period */
181 uint64_t iterations_prev;
182 /* Accounting fields */
183 /* number of zero pages; this used to count pages filled with the same char */
184 uint64_t zero_pages;
185 /* number of normal transferred pages */
186 uint64_t norm_pages;
187 /* Iterations since start */
188 uint64_t iterations;
189 /* xbzrle transmitted bytes. Note that these include the
190 * compression overhead, so they can't be derived from the page count */
191 uint64_t xbzrle_bytes;
192 /* xbzrle transmitted pages */
193 uint64_t xbzrle_pages;
194 /* number of xbzrle cache misses */
195 uint64_t xbzrle_cache_miss;
196 /* xbzrle cache miss rate */
197 double xbzrle_cache_miss_rate;
198 /* number of xbzrle overflows */
199 uint64_t xbzrle_overflows;
200 /* number of dirty bits in the bitmap */
201 uint64_t migration_dirty_pages;
202 /* total number of bytes transferred */
203 uint64_t bytes_transferred;
204 /* number of dirtied pages in the last second */
205 uint64_t dirty_pages_rate;
206 /* Count of requests incoming from destination */
207 uint64_t postcopy_requests;
208 /* protects modification of the bitmap */
209 QemuMutex bitmap_mutex;
210 /* The RAMBlock used in the last src_page_requests */
211 RAMBlock *last_req_rb;
212 /* Queue of outstanding page requests from the destination */
213 QemuMutex src_page_req_mutex;
214 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
215 };
216 typedef struct RAMState RAMState;
217
218 static RAMState ram_state;
219
220 uint64_t dup_mig_pages_transferred(void)
221 {
222 return ram_state.zero_pages;
223 }
224
225 uint64_t norm_mig_pages_transferred(void)
226 {
227 return ram_state.norm_pages;
228 }
229
230 uint64_t xbzrle_mig_bytes_transferred(void)
231 {
232 return ram_state.xbzrle_bytes;
233 }
234
235 uint64_t xbzrle_mig_pages_transferred(void)
236 {
237 return ram_state.xbzrle_pages;
238 }
239
240 uint64_t xbzrle_mig_pages_cache_miss(void)
241 {
242 return ram_state.xbzrle_cache_miss;
243 }
244
245 double xbzrle_mig_cache_miss_rate(void)
246 {
247 return ram_state.xbzrle_cache_miss_rate;
248 }
249
250 uint64_t xbzrle_mig_pages_overflow(void)
251 {
252 return ram_state.xbzrle_overflows;
253 }
254
255 uint64_t ram_bytes_transferred(void)
256 {
257 return ram_state.bytes_transferred;
258 }
259
260 uint64_t ram_bytes_remaining(void)
261 {
262 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
263 }
264
265 uint64_t ram_dirty_sync_count(void)
266 {
267 return ram_state.bitmap_sync_count;
268 }
269
270 uint64_t ram_dirty_pages_rate(void)
271 {
272 return ram_state.dirty_pages_rate;
273 }
274
275 uint64_t ram_postcopy_requests(void)
276 {
277 return ram_state.postcopy_requests;
278 }
279
280 /* used by the search for pages to send */
281 struct PageSearchStatus {
282 /* Current block being searched */
283 RAMBlock *block;
284 /* Current page to search from */
285 unsigned long page;
286 /* Set once we wrap around */
287 bool complete_round;
288 };
289 typedef struct PageSearchStatus PageSearchStatus;
290
291 struct CompressParam {
292 bool done;
293 bool quit;
294 QEMUFile *file;
295 QemuMutex mutex;
296 QemuCond cond;
297 RAMBlock *block;
298 ram_addr_t offset;
299 };
300 typedef struct CompressParam CompressParam;
301
302 struct DecompressParam {
303 bool done;
304 bool quit;
305 QemuMutex mutex;
306 QemuCond cond;
307 void *des;
308 uint8_t *compbuf;
309 int len;
310 };
311 typedef struct DecompressParam DecompressParam;
312
313 static CompressParam *comp_param;
314 static QemuThread *compress_threads;
315 /* comp_done_cond is used to wake up the migration thread when
316 * one of the compression threads has finished the compression.
317 * comp_done_lock is used together with comp_done_cond.
318 */
319 static QemuMutex comp_done_lock;
320 static QemuCond comp_done_cond;
321 /* The empty QEMUFileOps is used by the file member of CompressParam */
322 static const QEMUFileOps empty_ops = { };
323
324 static DecompressParam *decomp_param;
325 static QemuThread *decompress_threads;
326 static QemuMutex decomp_done_lock;
327 static QemuCond decomp_done_cond;
328
329 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
330 ram_addr_t offset);
331
332 static void *do_data_compress(void *opaque)
333 {
334 CompressParam *param = opaque;
335 RAMBlock *block;
336 ram_addr_t offset;
337
338 qemu_mutex_lock(&param->mutex);
339 while (!param->quit) {
340 if (param->block) {
341 block = param->block;
342 offset = param->offset;
343 param->block = NULL;
344 qemu_mutex_unlock(&param->mutex);
345
346 do_compress_ram_page(param->file, block, offset);
347
348 qemu_mutex_lock(&comp_done_lock);
349 param->done = true;
350 qemu_cond_signal(&comp_done_cond);
351 qemu_mutex_unlock(&comp_done_lock);
352
353 qemu_mutex_lock(&param->mutex);
354 } else {
355 qemu_cond_wait(&param->cond, &param->mutex);
356 }
357 }
358 qemu_mutex_unlock(&param->mutex);
359
360 return NULL;
361 }
362
363 static inline void terminate_compression_threads(void)
364 {
365 int idx, thread_count;
366
367 thread_count = migrate_compress_threads();
368
369 for (idx = 0; idx < thread_count; idx++) {
370 qemu_mutex_lock(&comp_param[idx].mutex);
371 comp_param[idx].quit = true;
372 qemu_cond_signal(&comp_param[idx].cond);
373 qemu_mutex_unlock(&comp_param[idx].mutex);
374 }
375 }
376
377 void migrate_compress_threads_join(void)
378 {
379 int i, thread_count;
380
381 if (!migrate_use_compression()) {
382 return;
383 }
384 terminate_compression_threads();
385 thread_count = migrate_compress_threads();
386 for (i = 0; i < thread_count; i++) {
387 qemu_thread_join(compress_threads + i);
388 qemu_fclose(comp_param[i].file);
389 qemu_mutex_destroy(&comp_param[i].mutex);
390 qemu_cond_destroy(&comp_param[i].cond);
391 }
392 qemu_mutex_destroy(&comp_done_lock);
393 qemu_cond_destroy(&comp_done_cond);
394 g_free(compress_threads);
395 g_free(comp_param);
396 compress_threads = NULL;
397 comp_param = NULL;
398 }
399
400 void migrate_compress_threads_create(void)
401 {
402 int i, thread_count;
403
404 if (!migrate_use_compression()) {
405 return;
406 }
407 thread_count = migrate_compress_threads();
408 compress_threads = g_new0(QemuThread, thread_count);
409 comp_param = g_new0(CompressParam, thread_count);
410 qemu_cond_init(&comp_done_cond);
411 qemu_mutex_init(&comp_done_lock);
412 for (i = 0; i < thread_count; i++) {
413 /* comp_param[i].file is just used as a dummy buffer to save data,
414 * so set its ops to empty.
415 */
416 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
417 comp_param[i].done = true;
418 comp_param[i].quit = false;
419 qemu_mutex_init(&comp_param[i].mutex);
420 qemu_cond_init(&comp_param[i].cond);
421 qemu_thread_create(compress_threads + i, "compress",
422 do_data_compress, comp_param + i,
423 QEMU_THREAD_JOINABLE);
424 }
425 }
426
427 /**
428 * save_page_header: write page header to wire
429 *
430 * If this block differs from the last one sent, it also writes the
431 * block identification
432 *
433 * Returns the number of bytes written
434 *
435 * @rs: current RAM state
436 * @block: block that contains the page we want to send
437 * @offset: offset inside the block for the page; the lower bits contain flags
438 */
439 static size_t save_page_header(RAMState *rs, RAMBlock *block, ram_addr_t offset)
440 {
441 size_t size, len;
442
443 if (block == rs->last_sent_block) {
444 offset |= RAM_SAVE_FLAG_CONTINUE;
445 }
446 qemu_put_be64(rs->f, offset);
447 size = 8;
448
449 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
450 len = strlen(block->idstr);
451 qemu_put_byte(rs->f, len);
452 qemu_put_buffer(rs->f, (uint8_t *)block->idstr, len);
453 size += 1 + len;
454 rs->last_sent_block = block;
455 }
456 return size;
457 }
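/* Sketch of the resulting wire layout (derived from the code above, not a
 * normative spec):
 *   8 bytes  be64: page offset within the RAMBlock, with RAM_SAVE_FLAG_*
 *            bits OR'ed into the low bits
 *   and only when RAM_SAVE_FLAG_CONTINUE is not set:
 *   1 byte   length of the RAMBlock idstr
 *   N bytes  the idstr itself
 */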
458
459 /**
460 * mig_throttle_guest_down: throttle down the guest
461 *
462 * Reduce the amount of guest CPU execution to hopefully slow down memory
463 * writes. If the guest dirty memory rate is reduced below the rate at
464 * which we can transfer pages to the destination then we should be
465 * able to complete migration. Some workloads dirty memory way too
466 * fast and will not effectively converge, even with auto-converge.
467 */
468 static void mig_throttle_guest_down(void)
469 {
470 MigrationState *s = migrate_get_current();
471 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
472 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
473
474 /* We have not started throttling yet. Let's start it. */
475 if (!cpu_throttle_active()) {
476 cpu_throttle_set(pct_initial);
477 } else {
478 /* Throttling already on, just increase the rate */
479 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
480 }
481 }
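/* Example (illustrative values): with cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10, successive calls throttle the guest to
 * 20%, then 30%, 40%, ... of its CPU time. */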
482
483 /**
484 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
485 *
486 * @rs: current RAM state
487 * @current_addr: address for the zero page
488 *
489 * Update the xbzrle cache to reflect a page that's been sent as all 0.
490 * The important thing is that a stale (not-yet-0'd) page be replaced
491 * by the new data.
492 * As a bonus, if the page wasn't in the cache it gets added so that
493 * when a small write is made into the 0'd page it gets XBZRLE sent.
494 */
495 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
496 {
497 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
498 return;
499 }
500
501 /* We don't care if this fails to allocate a new cache page
502 * as long as it updates an old one */
503 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
504 rs->bitmap_sync_count);
505 }
506
507 #define ENCODING_FLAG_XBZRLE 0x1
508
509 /**
510 * save_xbzrle_page: compress and send current page
511 *
512 * Returns: 1 means that we wrote the page
513 * 0 means that page is identical to the one already sent
514 * -1 means that xbzrle would be longer than normal
515 *
516 * @rs: current RAM state
517 * @current_data: pointer to the address of the page contents
518 * @current_addr: addr of the page
519 * @block: block that contains the page we want to send
520 * @offset: offset inside the block for the page
521 * @last_stage: if we are at the completion stage
522 */
523 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
524 ram_addr_t current_addr, RAMBlock *block,
525 ram_addr_t offset, bool last_stage)
526 {
527 int encoded_len = 0, bytes_xbzrle;
528 uint8_t *prev_cached_page;
529
530 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
531 rs->xbzrle_cache_miss++;
532 if (!last_stage) {
533 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
534 rs->bitmap_sync_count) == -1) {
535 return -1;
536 } else {
537 /* update *current_data when the page has been
538 inserted into cache */
539 *current_data = get_cached_data(XBZRLE.cache, current_addr);
540 }
541 }
542 return -1;
543 }
544
545 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
546
547 /* save current buffer into memory */
548 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
549
550 /* XBZRLE encoding (if there is no overflow) */
551 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
552 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
553 TARGET_PAGE_SIZE);
554 if (encoded_len == 0) {
555 trace_save_xbzrle_page_skipping();
556 return 0;
557 } else if (encoded_len == -1) {
558 trace_save_xbzrle_page_overflow();
559 rs->xbzrle_overflows++;
560 /* update data in the cache */
561 if (!last_stage) {
562 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
563 *current_data = prev_cached_page;
564 }
565 return -1;
566 }
567
568 /* update the cache so that future deltas are computed against the data that was sent */
569 if (!last_stage) {
570 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
571 }
572
573 /* Send XBZRLE based compressed page */
574 bytes_xbzrle = save_page_header(rs, block,
575 offset | RAM_SAVE_FLAG_XBZRLE);
576 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
577 qemu_put_be16(rs->f, encoded_len);
578 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
579 bytes_xbzrle += encoded_len + 1 + 2;
580 rs->xbzrle_pages++;
581 rs->xbzrle_bytes += bytes_xbzrle;
582 rs->bytes_transferred += bytes_xbzrle;
583
584 return 1;
585 }
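/* Sketch of an XBZRLE page on the wire (derived from the code above):
 *   save_page_header() with RAM_SAVE_FLAG_XBZRLE set, then
 *   1 byte   ENCODING_FLAG_XBZRLE
 *   2 bytes  be16 encoded_len
 *   N bytes  the XBZRLE-encoded delta (encoded_len bytes)
 * hence the accounting of encoded_len + 1 + 2 above. */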
586
587 /**
588 * migration_bitmap_find_dirty: find the next dirty page from start
589 *
590 * Called with rcu_read_lock() to protect migration_bitmap
591 *
592 * Returns the page offset within the memory region of the next dirty page
593 *
594 * @rs: current RAM state
595 * @rb: RAMBlock where to search for dirty pages
596 * @start: page where we start the search
597 */
598 static inline
599 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
600 unsigned long start)
601 {
602 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
603 unsigned long *bitmap = rb->bmap;
604 unsigned long next;
605
606 if (rs->ram_bulk_stage && start > 0) {
607 next = start + 1;
608 } else {
609 next = find_next_bit(bitmap, size, start);
610 }
611
612 return next;
613 }
614
615 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
616 RAMBlock *rb,
617 unsigned long page)
618 {
619 bool ret;
620
621 ret = test_and_clear_bit(page, rb->bmap);
622
623 if (ret) {
624 rs->migration_dirty_pages--;
625 }
626 return ret;
627 }
628
629 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
630 ram_addr_t start, ram_addr_t length)
631 {
632 rs->migration_dirty_pages +=
633 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
634 &rs->num_dirty_pages_period);
635 }
636
637 /**
638 * ram_pagesize_summary: calculate all the pagesizes of a VM
639 *
640 * Returns a summary bitmap of the page sizes of all RAMBlocks
641 *
642 * For VMs with just normal pages this is equivalent to the host page
643 * size. If it's got some huge pages then it's the OR of all the
644 * different page sizes.
645 */
646 uint64_t ram_pagesize_summary(void)
647 {
648 RAMBlock *block;
649 uint64_t summary = 0;
650
651 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
652 summary |= block->page_size;
653 }
654
655 return summary;
656 }
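/* Example (illustrative): a VM with ordinary 4 KiB pages plus one RAMBlock
 * backed by 2 MiB huge pages yields 0x1000 | 0x200000 = 0x201000. */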
657
658 static void migration_bitmap_sync(RAMState *rs)
659 {
660 RAMBlock *block;
661 int64_t end_time;
662 uint64_t bytes_xfer_now;
663
664 rs->bitmap_sync_count++;
665
666 if (!rs->bytes_xfer_prev) {
667 rs->bytes_xfer_prev = ram_bytes_transferred();
668 }
669
670 if (!rs->time_last_bitmap_sync) {
671 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
672 }
673
674 trace_migration_bitmap_sync_start();
675 memory_global_dirty_log_sync();
676
677 qemu_mutex_lock(&rs->bitmap_mutex);
678 rcu_read_lock();
679 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
680 migration_bitmap_sync_range(rs, block, 0, block->used_length);
681 }
682 rcu_read_unlock();
683 qemu_mutex_unlock(&rs->bitmap_mutex);
684
685 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
686
687 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
688
689 /* more than 1 second = 1000 milliseconds */
690 if (end_time > rs->time_last_bitmap_sync + 1000) {
691 if (migrate_auto_converge()) {
692 /* The following detection logic can be refined later. For now:
693 check whether the dirtied bytes are 50% more than the approximate
694 amount of bytes that just got transferred since the last time we
695 were in this routine. If that happens twice, start or increase
696 throttling. */
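/* Note (not in the original source): as written, the test below fires when
 * the bytes dirtied in the period exceed half of the bytes transferred since
 * the previous sync, two syncs in a row. */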
697 bytes_xfer_now = ram_bytes_transferred();
698
699 if (rs->dirty_pages_rate &&
700 (rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
701 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
702 (rs->dirty_rate_high_cnt++ >= 2)) {
703 trace_migration_throttle();
704 rs->dirty_rate_high_cnt = 0;
705 mig_throttle_guest_down();
706 }
707 rs->bytes_xfer_prev = bytes_xfer_now;
708 }
709
710 if (migrate_use_xbzrle()) {
711 if (rs->iterations_prev != rs->iterations) {
712 rs->xbzrle_cache_miss_rate =
713 (double)(rs->xbzrle_cache_miss -
714 rs->xbzrle_cache_miss_prev) /
715 (rs->iterations - rs->iterations_prev);
716 }
717 rs->iterations_prev = rs->iterations;
718 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
719 }
720 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
721 / (end_time - rs->time_last_bitmap_sync);
722 rs->time_last_bitmap_sync = end_time;
723 rs->num_dirty_pages_period = 0;
724 }
725 if (migrate_use_events()) {
726 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
727 }
728 }
729
730 /**
731 * save_zero_page: send the zero page to the stream
732 *
733 * Returns the number of pages written.
734 *
735 * @rs: current RAM state
736 * @block: block that contains the page we want to send
737 * @offset: offset inside the block for the page
738 * @p: pointer to the page
739 */
740 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
741 uint8_t *p)
742 {
743 int pages = -1;
744
745 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
746 rs->zero_pages++;
747 rs->bytes_transferred +=
748 save_page_header(rs, block, offset | RAM_SAVE_FLAG_COMPRESS);
749 qemu_put_byte(rs->f, 0);
750 rs->bytes_transferred += 1;
751 pages = 1;
752 }
753
754 return pages;
755 }
756
757 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
758 {
759 if (!migrate_release_ram() || !migration_in_postcopy()) {
760 return;
761 }
762
763 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
764 }
765
766 /**
767 * ram_save_page: send the given page to the stream
768 *
769 * Returns the number of pages written.
770 * < 0 - error
771 * >=0 - Number of pages written - this might legally be 0
772 * if xbzrle noticed the page was the same.
773 *
774 * @rs: current RAM state
775 * @pss: data about the page we want to send, including the block
776 * that contains the page and the offset inside the block
777 * @last_stage: if we are at the completion stage
778 */
779 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
780 {
781 int pages = -1;
782 uint64_t bytes_xmit;
783 ram_addr_t current_addr;
784 uint8_t *p;
785 int ret;
786 bool send_async = true;
787 RAMBlock *block = pss->block;
788 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
789
790 p = block->host + offset;
791 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
792
793 /* When in doubt, send the page as normal */
794 bytes_xmit = 0;
795 ret = ram_control_save_page(rs->f, block->offset,
796 offset, TARGET_PAGE_SIZE, &bytes_xmit);
797 if (bytes_xmit) {
798 rs->bytes_transferred += bytes_xmit;
799 pages = 1;
800 }
801
802 XBZRLE_cache_lock();
803
804 current_addr = block->offset + offset;
805
806 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
807 if (ret != RAM_SAVE_CONTROL_DELAYED) {
808 if (bytes_xmit > 0) {
809 rs->norm_pages++;
810 } else if (bytes_xmit == 0) {
811 rs->zero_pages++;
812 }
813 }
814 } else {
815 pages = save_zero_page(rs, block, offset, p);
816 if (pages > 0) {
817 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
818 * page would be stale
819 */
820 xbzrle_cache_zero_page(rs, current_addr);
821 ram_release_pages(block->idstr, offset, pages);
822 } else if (!rs->ram_bulk_stage &&
823 !migration_in_postcopy() && migrate_use_xbzrle()) {
824 pages = save_xbzrle_page(rs, &p, current_addr, block,
825 offset, last_stage);
826 if (!last_stage) {
827 /* Can't send this cached data async, since the cache page
828 * might get updated before it gets to the wire
829 */
830 send_async = false;
831 }
832 }
833 }
834
835 /* XBZRLE overflow or normal page */
836 if (pages == -1) {
837 rs->bytes_transferred += save_page_header(rs, block,
838 offset | RAM_SAVE_FLAG_PAGE);
839 if (send_async) {
840 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
841 migrate_release_ram() &
842 migration_in_postcopy());
843 } else {
844 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
845 }
846 rs->bytes_transferred += TARGET_PAGE_SIZE;
847 pages = 1;
848 rs->norm_pages++;
849 }
850
851 XBZRLE_cache_unlock();
852
853 return pages;
854 }
855
856 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
857 ram_addr_t offset)
858 {
859 RAMState *rs = &ram_state;
860 int bytes_sent, blen;
861 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
862
863 bytes_sent = save_page_header(rs, block, offset |
864 RAM_SAVE_FLAG_COMPRESS_PAGE);
865 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
866 migrate_compress_level());
867 if (blen < 0) {
868 bytes_sent = 0;
869 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
870 error_report("compressed data failed!");
871 } else {
872 bytes_sent += blen;
873 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
874 }
875
876 return bytes_sent;
877 }
878
879 static void flush_compressed_data(RAMState *rs)
880 {
881 int idx, len, thread_count;
882
883 if (!migrate_use_compression()) {
884 return;
885 }
886 thread_count = migrate_compress_threads();
887
888 qemu_mutex_lock(&comp_done_lock);
889 for (idx = 0; idx < thread_count; idx++) {
890 while (!comp_param[idx].done) {
891 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
892 }
893 }
894 qemu_mutex_unlock(&comp_done_lock);
895
896 for (idx = 0; idx < thread_count; idx++) {
897 qemu_mutex_lock(&comp_param[idx].mutex);
898 if (!comp_param[idx].quit) {
899 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
900 rs->bytes_transferred += len;
901 }
902 qemu_mutex_unlock(&comp_param[idx].mutex);
903 }
904 }
905
906 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
907 ram_addr_t offset)
908 {
909 param->block = block;
910 param->offset = offset;
911 }
912
913 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
914 ram_addr_t offset)
915 {
916 int idx, thread_count, bytes_xmit = -1, pages = -1;
917
918 thread_count = migrate_compress_threads();
919 qemu_mutex_lock(&comp_done_lock);
920 while (true) {
921 for (idx = 0; idx < thread_count; idx++) {
922 if (comp_param[idx].done) {
923 comp_param[idx].done = false;
924 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
925 qemu_mutex_lock(&comp_param[idx].mutex);
926 set_compress_params(&comp_param[idx], block, offset);
927 qemu_cond_signal(&comp_param[idx].cond);
928 qemu_mutex_unlock(&comp_param[idx].mutex);
929 pages = 1;
930 rs->norm_pages++;
931 rs->bytes_transferred += bytes_xmit;
932 break;
933 }
934 }
935 if (pages > 0) {
936 break;
937 } else {
938 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
939 }
940 }
941 qemu_mutex_unlock(&comp_done_lock);
942
943 return pages;
944 }
945
946 /**
947 * ram_save_compressed_page: compress the given page and send it to the stream
948 *
949 * Returns the number of pages written.
950 *
951 * @rs: current RAM state
952 * @pss: data about the page we want to send, including the block
953 * that contains the page and the offset inside the block
954 * @last_stage: if we are at the completion stage
955 */
956 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
957 bool last_stage)
958 {
959 int pages = -1;
960 uint64_t bytes_xmit = 0;
961 uint8_t *p;
962 int ret, blen;
963 RAMBlock *block = pss->block;
964 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
965
966 p = block->host + offset;
967
968 ret = ram_control_save_page(rs->f, block->offset,
969 offset, TARGET_PAGE_SIZE, &bytes_xmit);
970 if (bytes_xmit) {
971 rs->bytes_transferred += bytes_xmit;
972 pages = 1;
973 }
974 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
975 if (ret != RAM_SAVE_CONTROL_DELAYED) {
976 if (bytes_xmit > 0) {
977 rs->norm_pages++;
978 } else if (bytes_xmit == 0) {
979 rs->zero_pages++;
980 }
981 }
982 } else {
983 /* When starting the process of a new block, the first page of
984 * the block should be sent out before other pages in the same
985 * block, and all the pages in the last block should have been sent
986 * out. Keeping this order is important, because the 'cont' flag
987 * is used to avoid resending the block name.
988 */
989 if (block != rs->last_sent_block) {
990 flush_compressed_data(rs);
991 pages = save_zero_page(rs, block, offset, p);
992 if (pages == -1) {
993 /* Make sure the first page is sent out before other pages */
994 bytes_xmit = save_page_header(rs, block, offset |
995 RAM_SAVE_FLAG_COMPRESS_PAGE);
996 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
997 migrate_compress_level());
998 if (blen > 0) {
999 rs->bytes_transferred += bytes_xmit + blen;
1000 rs->norm_pages++;
1001 pages = 1;
1002 } else {
1003 qemu_file_set_error(rs->f, blen);
1004 error_report("compressed data failed!");
1005 }
1006 }
1007 if (pages > 0) {
1008 ram_release_pages(block->idstr, offset, pages);
1009 }
1010 } else {
1011 pages = save_zero_page(rs, block, offset, p);
1012 if (pages == -1) {
1013 pages = compress_page_with_multi_thread(rs, block, offset);
1014 } else {
1015 ram_release_pages(block->idstr, offset, pages);
1016 }
1017 }
1018 }
1019
1020 return pages;
1021 }
1022
1023 /**
1024 * find_dirty_block: find the next dirty page and update any state
1025 * associated with the search process.
1026 *
1027 * Returns true if a page is found
1028 *
1029 * @rs: current RAM state
1030 * @pss: data about the state of the current dirty page scan
1031 * @again: set to false if the search has scanned the whole of RAM
1032 */
1033 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1034 {
1035 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1036 if (pss->complete_round && pss->block == rs->last_seen_block &&
1037 pss->page >= rs->last_page) {
1038 /*
1039 * We've been once around the RAM and haven't found anything.
1040 * Give up.
1041 */
1042 *again = false;
1043 return false;
1044 }
1045 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1046 /* Didn't find anything in this RAM Block */
1047 pss->page = 0;
1048 pss->block = QLIST_NEXT_RCU(pss->block, next);
1049 if (!pss->block) {
1050 /* Hit the end of the list */
1051 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1052 /* Flag that we've looped */
1053 pss->complete_round = true;
1054 rs->ram_bulk_stage = false;
1055 if (migrate_use_xbzrle()) {
1056 /* If xbzrle is on, stop using the data compression at this
1057 * point. In theory, xbzrle can do better than compression.
1058 */
1059 flush_compressed_data(rs);
1060 }
1061 }
1062 /* Didn't find anything this time, but try again on the new block */
1063 *again = true;
1064 return false;
1065 } else {
1066 /* Can go around again, but... */
1067 *again = true;
1068 /* We've found something so probably don't need to */
1069 return true;
1070 }
1071 }
1072
1073 /**
1074 * unqueue_page: gets a page of the queue
1075 *
1076 * Helper for 'get_queued_page' - gets a page off the queue
1077 *
1078 * Returns the block of the page (or NULL if none available)
1079 *
1080 * @rs: current RAM state
1081 * @offset: used to return the offset within the RAMBlock
1082 */
1083 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1084 {
1085 RAMBlock *block = NULL;
1086
1087 qemu_mutex_lock(&rs->src_page_req_mutex);
1088 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1089 struct RAMSrcPageRequest *entry =
1090 QSIMPLEQ_FIRST(&rs->src_page_requests);
1091 block = entry->rb;
1092 *offset = entry->offset;
1093
1094 if (entry->len > TARGET_PAGE_SIZE) {
1095 entry->len -= TARGET_PAGE_SIZE;
1096 entry->offset += TARGET_PAGE_SIZE;
1097 } else {
1098 memory_region_unref(block->mr);
1099 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1100 g_free(entry);
1101 }
1102 }
1103 qemu_mutex_unlock(&rs->src_page_req_mutex);
1104
1105 return block;
1106 }
1107
1108 /**
1109 * get_queued_page: unqueue a page from the postcopy requests
1110 *
1111 * Skips pages that are already sent (!dirty)
1112 *
1113 * Returns true if a queued page is found
1114 *
1115 * @rs: current RAM state
1116 * @pss: data about the state of the current dirty page scan
1117 */
1118 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1119 {
1120 RAMBlock *block;
1121 ram_addr_t offset;
1122 bool dirty;
1123
1124 do {
1125 block = unqueue_page(rs, &offset);
1126 /*
1127 * We're sending this page, and since it's postcopy nothing else
1128 * will dirty it, and we must make sure it doesn't get sent again
1129 * even if this queue request was received after the background
1130 * search already sent it.
1131 */
1132 if (block) {
1133 unsigned long page;
1134
1135 page = offset >> TARGET_PAGE_BITS;
1136 dirty = test_bit(page, block->bmap);
1137 if (!dirty) {
1138 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1139 page, test_bit(page, block->unsentmap));
1140 } else {
1141 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1142 }
1143 }
1144
1145 } while (block && !dirty);
1146
1147 if (block) {
1148 /*
1149 * As soon as we start servicing pages out of order, we have
1150 * to kill the bulk stage, since the bulk stage assumes
1151 * (in migration_bitmap_find_dirty) that every page is
1152 * dirty, which is no longer true.
1153 */
1154 rs->ram_bulk_stage = false;
1155
1156 /*
1157 * We want the background search to continue from the queued page
1158 * since the guest is likely to want other pages near to the page
1159 * it just requested.
1160 */
1161 pss->block = block;
1162 pss->page = offset >> TARGET_PAGE_BITS;
1163 }
1164
1165 return !!block;
1166 }
1167
1168 /**
1169 * migration_page_queue_free: drop any remaining pages in the ram
1170 * request queue
1171 *
1172 * It should be empty at the end anyway, but in error cases there may
1173 * be some left; in that case we drop them.
1174 *
1175 */
1176 void migration_page_queue_free(void)
1177 {
1178 struct RAMSrcPageRequest *mspr, *next_mspr;
1179 RAMState *rs = &ram_state;
1180 /* This queue generally should be empty - but in the case of a failed
1181 * migration it might have some leftover entries.
1182 */
1183 rcu_read_lock();
1184 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1185 memory_region_unref(mspr->rb->mr);
1186 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1187 g_free(mspr);
1188 }
1189 rcu_read_unlock();
1190 }
1191
1192 /**
1193 * ram_save_queue_pages: queue the page for transmission
1194 *
1195 * A request from postcopy destination for example.
1196 *
1197 * Returns zero on success or negative on error
1198 *
1199 * @rbname: Name of the RAMBlock of the request. NULL means the
1200 * same as the last one.
1201 * @start: starting address from the start of the RAMBlock
1202 * @len: length (in bytes) to send
1203 */
1204 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1205 {
1206 RAMBlock *ramblock;
1207 RAMState *rs = &ram_state;
1208
1209 rs->postcopy_requests++;
1210 rcu_read_lock();
1211 if (!rbname) {
1212 /* Reuse last RAMBlock */
1213 ramblock = rs->last_req_rb;
1214
1215 if (!ramblock) {
1216 /*
1217 * Shouldn't happen, we can't reuse the last RAMBlock if
1218 * it's the 1st request.
1219 */
1220 error_report("ram_save_queue_pages no previous block");
1221 goto err;
1222 }
1223 } else {
1224 ramblock = qemu_ram_block_by_name(rbname);
1225
1226 if (!ramblock) {
1227 /* We shouldn't be asked for a non-existent RAMBlock */
1228 error_report("ram_save_queue_pages no block '%s'", rbname);
1229 goto err;
1230 }
1231 rs->last_req_rb = ramblock;
1232 }
1233 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1234 if (start+len > ramblock->used_length) {
1235 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1236 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1237 __func__, start, len, ramblock->used_length);
1238 goto err;
1239 }
1240
1241 struct RAMSrcPageRequest *new_entry =
1242 g_malloc0(sizeof(struct RAMSrcPageRequest));
1243 new_entry->rb = ramblock;
1244 new_entry->offset = start;
1245 new_entry->len = len;
1246
1247 memory_region_ref(ramblock->mr);
1248 qemu_mutex_lock(&rs->src_page_req_mutex);
1249 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1250 qemu_mutex_unlock(&rs->src_page_req_mutex);
1251 rcu_read_unlock();
1252
1253 return 0;
1254
1255 err:
1256 rcu_read_unlock();
1257 return -1;
1258 }
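/* Usage sketch (illustrative): the postcopy destination asks for, say, 1 MiB
 * starting at some offset of RAMBlock "pc.ram"; a single RAMSrcPageRequest
 * covering that range is queued by the function above, and unqueue_page()
 * later hands it back to the migration thread one target page at a time. */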
1259
1260 /**
1261 * ram_save_target_page: save one target page
1262 *
1263 * Returns the number of pages written
1264 *
1265 * @rs: current RAM state
1267 * @pss: data about the page we want to send
1268 * @last_stage: if we are at the completion stage
1269 */
1270 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1271 bool last_stage)
1272 {
1273 int res = 0;
1274
1275 /* Check if the page is dirty and if it is, send it */
1276 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1277 /*
1278 * If xbzrle is on, stop using the data compression after first
1279 * round of migration even if compression is enabled. In theory,
1280 * xbzrle can do better than compression.
1281 */
1282 if (migrate_use_compression() &&
1283 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1284 res = ram_save_compressed_page(rs, pss, last_stage);
1285 } else {
1286 res = ram_save_page(rs, pss, last_stage);
1287 }
1288
1289 if (res < 0) {
1290 return res;
1291 }
1292 if (pss->block->unsentmap) {
1293 clear_bit(pss->page, pss->block->unsentmap);
1294 }
1295 }
1296
1297 return res;
1298 }
1299
1300 /**
1301 * ram_save_host_page: save a whole host page
1302 *
1303 * Starting at the page in @pss, send pages up to the end of the current host
1304 * page. It's valid for the initial offset to point into the middle of
1305 * a host page in which case the remainder of the hostpage is sent.
1306 * Only dirty target pages are sent. Note that the host page size may
1307 * be a huge page for this block.
1308 *
1309 * Returns the number of pages written or negative on error
1310 *
1311 * @rs: current RAM state
1313 * @pss: data about the page we want to send
1314 * @last_stage: if we are at the completion stage
1315 */
1316 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1317 bool last_stage)
1318 {
1319 int tmppages, pages = 0;
1320 size_t pagesize_bits =
1321 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1322
1323 do {
1324 tmppages = ram_save_target_page(rs, pss, last_stage);
1325 if (tmppages < 0) {
1326 return tmppages;
1327 }
1328
1329 pages += tmppages;
1330 pss->page++;
1331 } while (pss->page & (pagesize_bits - 1));
1332
1333 /* The offset we leave with is the last one we looked at */
1334 pss->page--;
1335 return pages;
1336 }
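/* Example (illustrative): for a RAMBlock backed by 2 MiB huge pages with
 * 4 KiB target pages, pagesize_bits is 512, so one call walks the 512 target
 * pages of a single host page, sending the dirty ones, and leaves pss->page
 * on the last target page of that host page. */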
1337
1338 /**
1339 * ram_find_and_save_block: finds a dirty page and sends it to the stream
1340 *
1341 * Called within an RCU critical section.
1342 *
1343 * Returns the number of pages written where zero means no dirty pages
1344 *
1345 * @rs: current RAM state
1346 * @last_stage: if we are at the completion stage
1347 *
1348 * On systems where host-page-size > target-page-size it will send all the
1349 * pages in a host page that are dirty.
1350 */
1351
1352 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1353 {
1354 PageSearchStatus pss;
1355 int pages = 0;
1356 bool again, found;
1357
1358 /* No dirty page as there is zero RAM */
1359 if (!ram_bytes_total()) {
1360 return pages;
1361 }
1362
1363 pss.block = rs->last_seen_block;
1364 pss.page = rs->last_page;
1365 pss.complete_round = false;
1366
1367 if (!pss.block) {
1368 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1369 }
1370
1371 do {
1372 again = true;
1373 found = get_queued_page(rs, &pss);
1374
1375 if (!found) {
1376 /* priority queue empty, so just search for something dirty */
1377 found = find_dirty_block(rs, &pss, &again);
1378 }
1379
1380 if (found) {
1381 pages = ram_save_host_page(rs, &pss, last_stage);
1382 }
1383 } while (!pages && again);
1384
1385 rs->last_seen_block = pss.block;
1386 rs->last_page = pss.page;
1387
1388 return pages;
1389 }
1390
1391 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1392 {
1393 uint64_t pages = size / TARGET_PAGE_SIZE;
1394 RAMState *rs = &ram_state;
1395
1396 if (zero) {
1397 rs->zero_pages += pages;
1398 } else {
1399 rs->norm_pages += pages;
1400 rs->bytes_transferred += size;
1401 qemu_update_position(f, size);
1402 }
1403 }
1404
1405 uint64_t ram_bytes_total(void)
1406 {
1407 RAMBlock *block;
1408 uint64_t total = 0;
1409
1410 rcu_read_lock();
1411 QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1412 total += block->used_length;
1413 rcu_read_unlock();
1414 return total;
1415 }
1416
1417 void free_xbzrle_decoded_buf(void)
1418 {
1419 g_free(xbzrle_decoded_buf);
1420 xbzrle_decoded_buf = NULL;
1421 }
1422
1423 static void ram_migration_cleanup(void *opaque)
1424 {
1425 RAMBlock *block;
1426
1427 /* the caller must hold the iothread lock or be in a bottom half, so there
1428 * is no write race against this migration bitmap
1429 */
1430 memory_global_dirty_log_stop();
1431
1432 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1433 g_free(block->bmap);
1434 block->bmap = NULL;
1435 g_free(block->unsentmap);
1436 block->unsentmap = NULL;
1437 }
1438
1439 XBZRLE_cache_lock();
1440 if (XBZRLE.cache) {
1441 cache_fini(XBZRLE.cache);
1442 g_free(XBZRLE.encoded_buf);
1443 g_free(XBZRLE.current_buf);
1444 g_free(ZERO_TARGET_PAGE);
1445 XBZRLE.cache = NULL;
1446 XBZRLE.encoded_buf = NULL;
1447 XBZRLE.current_buf = NULL;
1448 }
1449 XBZRLE_cache_unlock();
1450 }
1451
1452 static void ram_state_reset(RAMState *rs)
1453 {
1454 rs->last_seen_block = NULL;
1455 rs->last_sent_block = NULL;
1456 rs->last_page = 0;
1457 rs->last_version = ram_list.version;
1458 rs->ram_bulk_stage = true;
1459 }
1460
1461 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1462
1463 /*
1464 * 'expected' is the value you expect the bitmap mostly to be full
1465 * of; it won't bother printing lines that are all this value.
1466 * 'todump' is the bitmap to dump and 'pages' is its length in pages.
1467 */
1468 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1469 unsigned long pages)
1470 {
1471 int64_t cur;
1472 int64_t linelen = 128;
1473 char linebuf[129];
1474
1475 for (cur = 0; cur < pages; cur += linelen) {
1476 int64_t curb;
1477 bool found = false;
1478 /*
1479 * Last line; catch the case where the line length
1480 * is longer than remaining ram
1481 */
1482 if (cur + linelen > pages) {
1483 linelen = pages - cur;
1484 }
1485 for (curb = 0; curb < linelen; curb++) {
1486 bool thisbit = test_bit(cur + curb, todump);
1487 linebuf[curb] = thisbit ? '1' : '.';
1488 found = found || (thisbit != expected);
1489 }
1490 if (found) {
1491 linebuf[curb] = '\0';
1492 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1493 }
1494 }
1495 }
1496
1497 /* **** functions for postcopy ***** */
1498
1499 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1500 {
1501 struct RAMBlock *block;
1502
1503 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1504 unsigned long *bitmap = block->bmap;
1505 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1506 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1507
1508 while (run_start < range) {
1509 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1510 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1511 (run_end - run_start) << TARGET_PAGE_BITS);
1512 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1513 }
1514 }
1515 }
1516
1517 /**
1518 * postcopy_send_discard_bm_ram: discard a RAMBlock
1519 *
1520 * Returns zero on success
1521 *
1522 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1523 * Note: At this point the 'unsentmap' is the processed bitmap combined
1524 * with the dirtymap; so a '1' means it's either dirty or unsent.
1525 *
1526 * @ms: current migration state
1527 * @pds: state for postcopy
1528 * @block: RAMBlock to discard
1530 */
1531 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1532 PostcopyDiscardState *pds,
1533 RAMBlock *block)
1534 {
1535 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1536 unsigned long current;
1537 unsigned long *unsentmap = block->unsentmap;
1538
1539 for (current = 0; current < end; ) {
1540 unsigned long one = find_next_bit(unsentmap, end, current);
1541
1542 if (one <= end) {
1543 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1544 unsigned long discard_length;
1545
1546 if (zero >= end) {
1547 discard_length = end - one;
1548 } else {
1549 discard_length = zero - one;
1550 }
1551 if (discard_length) {
1552 postcopy_discard_send_range(ms, pds, one, discard_length);
1553 }
1554 current = one + discard_length;
1555 } else {
1556 current = one;
1557 }
1558 }
1559
1560 return 0;
1561 }
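/* Example (illustrative): if unsentmap has bits 3..7 set and bit 8 clear,
 * the loop above emits a single discard range of start page 3, length 5. */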
1562
1563 /**
1564 * postcopy_each_ram_send_discard: discard all RAMBlocks
1565 *
1566 * Returns 0 for success or negative for error
1567 *
1568 * Utility for the outgoing postcopy code.
1569 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1570 * passing it bitmap indexes and name.
1571 * (qemu_ram_foreach_block ends up passing unscaled lengths
1572 * which would mean postcopy code would have to deal with target page)
1573 *
1574 * @ms: current migration state
1575 */
1576 static int postcopy_each_ram_send_discard(MigrationState *ms)
1577 {
1578 struct RAMBlock *block;
1579 int ret;
1580
1581 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1582 PostcopyDiscardState *pds =
1583 postcopy_discard_send_init(ms, block->idstr);
1584
1585 /*
1586 * Postcopy sends chunks of bitmap over the wire, but it
1587 * just needs indexes at this point, avoids it having
1588 * target page specific code.
1589 */
1590 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1591 postcopy_discard_send_finish(ms, pds);
1592 if (ret) {
1593 return ret;
1594 }
1595 }
1596
1597 return 0;
1598 }
1599
1600 /**
1601 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1602 *
1603 * Helper for postcopy_chunk_hostpages; it's called twice to
1604 * canonicalize the two bitmaps, that are similar, but one is
1605 * inverted.
1606 *
1607 * Postcopy requires that all target pages in a hostpage are dirty or
1608 * clean, not a mix. This function canonicalizes the bitmaps.
1609 *
1610 * @ms: current migration state
1611 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1612 * otherwise we need to canonicalize partially dirty host pages
1613 * @block: block that contains the page we want to canonicalize
1614 * @pds: state for postcopy
1615 */
1616 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1617 RAMBlock *block,
1618 PostcopyDiscardState *pds)
1619 {
1620 RAMState *rs = &ram_state;
1621 unsigned long *bitmap = block->bmap;
1622 unsigned long *unsentmap = block->unsentmap;
1623 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1624 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1625 unsigned long run_start;
1626
1627 if (block->page_size == TARGET_PAGE_SIZE) {
1628 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1629 return;
1630 }
1631
1632 if (unsent_pass) {
1633 /* Find a sent page */
1634 run_start = find_next_zero_bit(unsentmap, pages, 0);
1635 } else {
1636 /* Find a dirty page */
1637 run_start = find_next_bit(bitmap, pages, 0);
1638 }
1639
1640 while (run_start < pages) {
1641 bool do_fixup = false;
1642 unsigned long fixup_start_addr;
1643 unsigned long host_offset;
1644
1645 /*
1646 * If the start of this run of pages is in the middle of a host
1647 * page, then we need to fixup this host page.
1648 */
1649 host_offset = run_start % host_ratio;
1650 if (host_offset) {
1651 do_fixup = true;
1652 run_start -= host_offset;
1653 fixup_start_addr = run_start;
1654 /* For the next pass */
1655 run_start = run_start + host_ratio;
1656 } else {
1657 /* Find the end of this run */
1658 unsigned long run_end;
1659 if (unsent_pass) {
1660 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1661 } else {
1662 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1663 }
1664 /*
1665 * If the end isn't at the start of a host page, then the
1666 * run doesn't finish at the end of a host page
1667 * and we need to discard.
1668 */
1669 host_offset = run_end % host_ratio;
1670 if (host_offset) {
1671 do_fixup = true;
1672 fixup_start_addr = run_end - host_offset;
1673 /*
1674 * This host page has gone, the next loop iteration starts
1675 * from after the fixup
1676 */
1677 run_start = fixup_start_addr + host_ratio;
1678 } else {
1679 /*
1680 * No discards on this iteration, next loop starts from
1681 * next sent/dirty page
1682 */
1683 run_start = run_end + 1;
1684 }
1685 }
1686
1687 if (do_fixup) {
1688 unsigned long page;
1689
1690 /* Tell the destination to discard this page */
1691 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1692 /* For the unsent_pass we:
1693 * discard partially sent pages
1694 * For the !unsent_pass (dirty) we:
1695 * discard partially dirty pages that were sent
1696 * (any partially sent pages were already discarded
1697 * by the previous unsent_pass)
1698 */
1699 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1700 host_ratio);
1701 }
1702
1703 /* Clean up the bitmap */
1704 for (page = fixup_start_addr;
1705 page < fixup_start_addr + host_ratio; page++) {
1706 /* All pages in this host page are now not sent */
1707 set_bit(page, unsentmap);
1708
1709 /*
1710 * Remark them as dirty, updating the count for any pages
1711 * that weren't previously dirty.
1712 */
1713 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1714 }
1715 }
1716
1717 if (unsent_pass) {
1718 /* Find the next sent page for the next iteration */
1719 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1720 } else {
1721 /* Find the next dirty page for the next iteration */
1722 run_start = find_next_bit(bitmap, pages, run_start);
1723 }
1724 }
1725 }
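/* Rough example (illustrative): with 2 MiB host pages (host_ratio = 512), a
 * run of dirty target pages that starts or ends in the middle of a host page
 * causes the whole host page to be fixed up: all 512 target pages are set in
 * unsentmap and re-marked dirty (bumping migration_dirty_pages for pages that
 * were clean), and a discard for that host page is sent where needed. */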
1726
1727 /**
1728 * postcopy_chunk_hostpages: discard any partially sent host page
1729 *
1730 * Utility for the outgoing postcopy code.
1731 *
1732 * Discard any partially sent host-page size chunks, mark any partially
1733 * dirty host-page size chunks as all dirty. In this case the host-page
1734 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1735 *
1736 * Returns zero on success
1737 *
1738 * @ms: current migration state
1739 * @block: block we want to work with
1740 */
1741 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1742 {
1743 PostcopyDiscardState *pds =
1744 postcopy_discard_send_init(ms, block->idstr);
1745
1746 /* First pass: Discard all partially sent host pages */
1747 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1748 /*
1749 * Second pass: Ensure that all partially dirty host pages are made
1750 * fully dirty.
1751 */
1752 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1753
1754 postcopy_discard_send_finish(ms, pds);
1755 return 0;
1756 }
1757
1758 /**
1759 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1760 *
1761 * Returns zero on success
1762 *
1763 * Transmit the set of pages to be discarded after precopy to the target;
1764 * these are pages that:
1765 * a) Have been previously transmitted but are now dirty again
1766 * b) Pages that have never been transmitted, this ensures that
1767 * any pages on the destination that have been mapped by background
1768 * tasks get discarded (transparent huge pages is the specific concern)
1769 * Hopefully this is pretty sparse
1770 *
1771 * @ms: current migration state
1772 */
1773 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1774 {
1775 RAMState *rs = &ram_state;
1776 RAMBlock *block;
1777 int ret;
1778
1779 rcu_read_lock();
1780
1781 /* This should be our last sync, the src is now paused */
1782 migration_bitmap_sync(rs);
1783
1784 /* Easiest way to make sure we don't resume in the middle of a host-page */
1785 rs->last_seen_block = NULL;
1786 rs->last_sent_block = NULL;
1787 rs->last_page = 0;
1788
1789 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1790 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1791 unsigned long *bitmap = block->bmap;
1792 unsigned long *unsentmap = block->unsentmap;
1793
1794 if (!unsentmap) {
1795 /* We don't have a safe way to resize the sentmap, so
1796 * if the bitmap was resized it will be NULL at this
1797 * point.
1798 */
1799 error_report("migration ram resized during precopy phase");
1800 rcu_read_unlock();
1801 return -EINVAL;
1802 }
1803 /* Deal with TPS != HPS and huge pages */
1804 ret = postcopy_chunk_hostpages(ms, block);
1805 if (ret) {
1806 rcu_read_unlock();
1807 return ret;
1808 }
1809
1810 /*
1811 * Update the unsentmap to be unsentmap = unsentmap | dirty
1812 */
1813 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1814 #ifdef DEBUG_POSTCOPY
1815 ram_debug_dump_bitmap(unsentmap, true, pages);
1816 #endif
1817 }
1818 trace_ram_postcopy_send_discard_bitmap();
1819
1820 ret = postcopy_each_ram_send_discard(ms);
1821 rcu_read_unlock();
1822
1823 return ret;
1824 }
1825
1826 /**
1827 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1828 *
1829 * Returns zero on success
1830 *
1831 * @rbname: name of the RAMBlock of the request. NULL means the
1832 * same as the last one.
1833 * @start: byte offset within the RAMBlock
1834 * @length: number of bytes to discard
1835 */
1836 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1837 {
1838 int ret = -1;
1839
1840 trace_ram_discard_range(rbname, start, length);
1841
1842 rcu_read_lock();
1843 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1844
1845 if (!rb) {
1846 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1847 goto err;
1848 }
1849
1850 ret = ram_block_discard_range(rb, start, length);
1851
1852 err:
1853 rcu_read_unlock();
1854
1855 return ret;
1856 }
1857
1858 static int ram_state_init(RAMState *rs)
1859 {
1860 memset(rs, 0, sizeof(*rs));
1861 qemu_mutex_init(&rs->bitmap_mutex);
1862 qemu_mutex_init(&rs->src_page_req_mutex);
1863 QSIMPLEQ_INIT(&rs->src_page_requests);
1864
1865 if (migrate_use_xbzrle()) {
1866 XBZRLE_cache_lock();
1867 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1868 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1869 TARGET_PAGE_SIZE,
1870 TARGET_PAGE_SIZE);
1871 if (!XBZRLE.cache) {
1872 XBZRLE_cache_unlock();
1873 error_report("Error creating cache");
1874 return -1;
1875 }
1876 XBZRLE_cache_unlock();
1877
1878 /* We prefer not to abort if there is no memory */
1879 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1880 if (!XBZRLE.encoded_buf) {
1881 error_report("Error allocating encoded_buf");
1882 return -1;
1883 }
1884
1885 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1886 if (!XBZRLE.current_buf) {
1887 error_report("Error allocating current_buf");
1888 g_free(XBZRLE.encoded_buf);
1889 XBZRLE.encoded_buf = NULL;
1890 return -1;
1891 }
1892 }
1893
1894 /* For memory_global_dirty_log_start below. */
1895 qemu_mutex_lock_iothread();
1896
1897 qemu_mutex_lock_ramlist();
1898 rcu_read_lock();
1899 ram_state_reset(rs);
1900
1901 /* Skip setting bitmap if there is no RAM */
1902 if (ram_bytes_total()) {
1903 RAMBlock *block;
1904
1905 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1906 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1907
1908 block->bmap = bitmap_new(pages);
1909 bitmap_set(block->bmap, 0, pages);
1910 if (migrate_postcopy_ram()) {
1911 block->unsentmap = bitmap_new(pages);
1912 bitmap_set(block->unsentmap, 0, pages);
1913 }
1914 }
1915 }
1916
1917 /*
1918 * Count the total number of pages used by ram blocks not including any
1919 * gaps due to alignment or unplugs.
1920 */
1921 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1922
1923 memory_global_dirty_log_start();
1924 migration_bitmap_sync(rs);
1925 qemu_mutex_unlock_ramlist();
1926 qemu_mutex_unlock_iothread();
1927 rcu_read_unlock();
1928
1929 return 0;
1930 }
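/*
 * Editorial note: a worked example of the initial accounting above.  With
 * 4KiB target pages, a guest with a single 4GiB RAM block starts with
 *
 *     rs->migration_dirty_pages = (4ULL << 30) >> TARGET_PAGE_BITS
 *                               = 1048576 pages
 *
 * and that block's bmap/unsentmap are bitmaps of
 * block->max_length >> TARGET_PAGE_BITS bits, all initially set.
 */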
1931
1932 /*
1933 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1934 * long-running RCU critical section. When RCU reclaims in the code
1935 * start to become numerous it will be necessary to reduce the
1936 * granularity of these critical sections.
1937 */
1938
1939 /**
1940 * ram_save_setup: Setup RAM for migration
1941 *
1942 * Returns zero to indicate success and negative for error
1943 *
1944 * @f: QEMUFile where to send the data
1945 * @opaque: RAMState pointer
1946 */
1947 static int ram_save_setup(QEMUFile *f, void *opaque)
1948 {
1949 RAMState *rs = opaque;
1950 RAMBlock *block;
1951
1952 /* migration has already set up the bitmap, reuse it. */
1953 if (!migration_in_colo_state()) {
1954 if (ram_state_init(rs) < 0) {
1955 return -1;
1956 }
1957 }
1958 rs->f = f;
1959
1960 rcu_read_lock();
1961
1962 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1963
1964 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1965 qemu_put_byte(f, strlen(block->idstr));
1966 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1967 qemu_put_be64(f, block->used_length);
1968 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1969 qemu_put_be64(f, block->page_size);
1970 }
1971 }
1972
1973 rcu_read_unlock();
1974
1975 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1976 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1977
1978 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1979
1980 return 0;
1981 }
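/*
 * Editorial summary of the setup stream written above (derived from the
 * qemu_put_* calls; the authoritative reader is the RAM_SAVE_FLAG_MEM_SIZE
 * case in ram_load()):
 *
 *     be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *     for each RAMBlock:
 *         byte : strlen(idstr)
 *         bytes: idstr (not NUL-terminated)
 *         be64 : used_length
 *         be64 : page_size   (only if postcopy is enabled and the block's
 *                             page_size != qemu_host_page_size)
 *     be64: RAM_SAVE_FLAG_EOS
 */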
1982
1983 /**
1984 * ram_save_iterate: iterative stage for migration
1985 *
1986 * Returns zero to indicate success and negative for error
1987 *
1988 * @f: QEMUFile where to send the data
1989 * @opaque: RAMState pointer
1990 */
1991 static int ram_save_iterate(QEMUFile *f, void *opaque)
1992 {
1993 RAMState *rs = opaque;
1994 int ret;
1995 int i;
1996 int64_t t0;
1997 int done = 0;
1998
1999 rcu_read_lock();
2000 if (ram_list.version != rs->last_version) {
2001 ram_state_reset(rs);
2002 }
2003
2004 /* Read version before ram_list.blocks */
2005 smp_rmb();
2006
2007 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2008
2009 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2010 i = 0;
2011 while ((ret = qemu_file_rate_limit(f)) == 0) {
2012 int pages;
2013
2014 pages = ram_find_and_save_block(rs, false);
2015 /* no more pages to send */
2016 if (pages == 0) {
2017 done = 1;
2018 break;
2019 }
2020 rs->iterations++;
2021
2022 /* We want to check in the 1st loop, just in case it was the 1st time
2023 and we had to sync the dirty bitmap.
2024 qemu_clock_get_ns() is a bit expensive, so we only check every
2025 64 iterations.
2026 */
2027 if ((i & 63) == 0) {
2028 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2029 if (t1 > MAX_WAIT) {
2030 trace_ram_save_iterate_big_wait(t1, i);
2031 break;
2032 }
2033 }
2034 i++;
2035 }
2036 flush_compressed_data(rs);
2037 rcu_read_unlock();
2038
2039 /*
2040 * Must occur before EOS (or any QEMUFile operation)
2041 * because of the RDMA protocol.
2042 */
2043 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2044
2045 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2046 rs->bytes_transferred += 8;
2047
2048 ret = qemu_file_get_error(f);
2049 if (ret < 0) {
2050 return ret;
2051 }
2052
2053 return done;
2054 }
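/*
 * Editorial note: a sketch of the throttling pattern used in the loop
 * above (more_work()/do_one_unit() are hypothetical stand-ins).  The clock
 * is only sampled every 64 iterations because qemu_clock_get_ns() is
 * comparatively expensive.
 *
 *     int i = 0;
 *     int64_t t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 *     while (more_work()) {
 *         do_one_unit();
 *         if ((i & 63) == 0 &&
 *             (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000 >
 *                 MAX_WAIT) {
 *             break;              // give control back to the main loop
 *         }
 *         i++;
 *     }
 */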
2055
2056 /**
2057 * ram_save_complete: function called to send the remaining amount of RAM
2058 *
2059 * Returns zero to indicate success
2060 *
2061 * Called with the iothread lock held
2062 *
2063 * @f: QEMUFile where to send the data
2064 * @opaque: RAMState pointer
2065 */
2066 static int ram_save_complete(QEMUFile *f, void *opaque)
2067 {
2068 RAMState *rs = opaque;
2069
2070 rcu_read_lock();
2071
2072 if (!migration_in_postcopy()) {
2073 migration_bitmap_sync(rs);
2074 }
2075
2076 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2077
2078 /* try transferring iterative blocks of memory */
2079
2080 /* flush all remaining blocks regardless of rate limiting */
2081 while (true) {
2082 int pages;
2083
2084 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2085 /* no more blocks to send */
2086 if (pages == 0) {
2087 break;
2088 }
2089 }
2090
2091 flush_compressed_data(rs);
2092 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2093
2094 rcu_read_unlock();
2095
2096 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2097
2098 return 0;
2099 }
2100
2101 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2102 uint64_t *non_postcopiable_pending,
2103 uint64_t *postcopiable_pending)
2104 {
2105 RAMState *rs = opaque;
2106 uint64_t remaining_size;
2107
2108 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2109
2110 if (!migration_in_postcopy() &&
2111 remaining_size < max_size) {
2112 qemu_mutex_lock_iothread();
2113 rcu_read_lock();
2114 migration_bitmap_sync(rs);
2115 rcu_read_unlock();
2116 qemu_mutex_unlock_iothread();
2117 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2118 }
2119
2120 /* We can do postcopy, and all the data is postcopiable */
2121 *postcopiable_pending += remaining_size;
2122 }
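/*
 * Editorial note: a worked example of the estimate above.  With 4KiB
 * target pages and 10000 dirty pages, remaining_size is
 * 10000 * 4096 = 40960000 bytes (~39MiB); only once that falls below
 * max_size is it worth taking the iothread lock and re-syncing the dirty
 * bitmap for a more precise answer.
 */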
2123
2124 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2125 {
2126 unsigned int xh_len;
2127 int xh_flags;
2128 uint8_t *loaded_data;
2129
2130 if (!xbzrle_decoded_buf) {
2131 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2132 }
2133 loaded_data = xbzrle_decoded_buf;
2134
2135 /* extract RLE header */
2136 xh_flags = qemu_get_byte(f);
2137 xh_len = qemu_get_be16(f);
2138
2139 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2140 error_report("Failed to load XBZRLE page - wrong compression!");
2141 return -1;
2142 }
2143
2144 if (xh_len > TARGET_PAGE_SIZE) {
2145 error_report("Failed to load XBZRLE page - len overflow!");
2146 return -1;
2147 }
2148 /* load data and decode */
2149 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2150
2151 /* decode RLE */
2152 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2153 TARGET_PAGE_SIZE) == -1) {
2154 error_report("Failed to load XBZRLE page - decode error!");
2155 return -1;
2156 }
2157
2158 return 0;
2159 }
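/*
 * Editorial summary of the XBZRLE page format consumed above (derived
 * from the qemu_get_* calls):
 *
 *     byte : xh_flags  (must be ENCODING_FLAG_XBZRLE)
 *     be16 : xh_len    (must be <= TARGET_PAGE_SIZE)
 *     bytes: xh_len bytes of XBZRLE-encoded delta, decoded on top of the
 *            existing TARGET_PAGE_SIZE page at @host
 */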
2160
2161 /**
2162 * ram_block_from_stream: read a RAMBlock id from the migration stream
2163 *
2164 * Must be called from within a rcu critical section.
2165 *
2166 * Returns a pointer from within the RCU-protected ram_list.
2167 *
2168 * @f: QEMUFile where to read the data from
2169 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2170 */
2171 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2172 {
2173 static RAMBlock *block = NULL;
2174 char id[256];
2175 uint8_t len;
2176
2177 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2178 if (!block) {
2179 error_report("Ack, bad migration stream!");
2180 return NULL;
2181 }
2182 return block;
2183 }
2184
2185 len = qemu_get_byte(f);
2186 qemu_get_buffer(f, (uint8_t *)id, len);
2187 id[len] = 0;
2188
2189 block = qemu_ram_block_by_name(id);
2190 if (!block) {
2191 error_report("Can't find block %s", id);
2192 return NULL;
2193 }
2194
2195 return block;
2196 }
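/*
 * Editorial note: a sketch of the two page-header shapes this reader
 * expects.  The addr|flags word itself is read by the callers
 * (ram_load()/ram_load_postcopy()) before they call in here.
 *
 *     be64 : addr | flags
 *     if !(flags & RAM_SAVE_FLAG_CONTINUE):
 *         byte : strlen(idstr)
 *         bytes: idstr                  // looked up and cached here
 *     // with RAM_SAVE_FLAG_CONTINUE the cached block is reused
 */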
2197
2198 static inline void *host_from_ram_block_offset(RAMBlock *block,
2199 ram_addr_t offset)
2200 {
2201 if (!offset_in_ramblock(block, offset)) {
2202 return NULL;
2203 }
2204
2205 return block->host + offset;
2206 }
2207
2208 /**
2209 * ram_handle_compressed: handle the zero page case
2210 *
2211 * If a page (or a whole RDMA chunk) has been
2212 * determined to be zero, then zap it; pages that are already zero are left untouched.
2213 *
2214 * @host: host address for the zero page
2215 * @ch: the byte the page is filled with; only zero is expected
2216 * @size: size of the zero page
2217 */
2218 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2219 {
2220 if (ch != 0 || !is_zero_range(host, size)) {
2221 memset(host, ch, size);
2222 }
2223 }
2224
2225 static void *do_data_decompress(void *opaque)
2226 {
2227 DecompressParam *param = opaque;
2228 unsigned long pagesize;
2229 uint8_t *des;
2230 int len;
2231
2232 qemu_mutex_lock(&param->mutex);
2233 while (!param->quit) {
2234 if (param->des) {
2235 des = param->des;
2236 len = param->len;
2237 param->des = 0;
2238 qemu_mutex_unlock(&param->mutex);
2239
2240 pagesize = TARGET_PAGE_SIZE;
2241 /* uncompress() may fail in some cases, especially when the
2242 * page was dirtied while being compressed. That is not a
2243 * problem because the dirty page will be retransmitted and
2244 * uncompress() won't corrupt the data in other pages.
2245 */
2246 uncompress((Bytef *)des, &pagesize,
2247 (const Bytef *)param->compbuf, len);
2248
2249 qemu_mutex_lock(&decomp_done_lock);
2250 param->done = true;
2251 qemu_cond_signal(&decomp_done_cond);
2252 qemu_mutex_unlock(&decomp_done_lock);
2253
2254 qemu_mutex_lock(&param->mutex);
2255 } else {
2256 qemu_cond_wait(&param->cond, &param->mutex);
2257 }
2258 }
2259 qemu_mutex_unlock(&param->mutex);
2260
2261 return NULL;
2262 }
2263
2264 static void wait_for_decompress_done(void)
2265 {
2266 int idx, thread_count;
2267
2268 if (!migrate_use_compression()) {
2269 return;
2270 }
2271
2272 thread_count = migrate_decompress_threads();
2273 qemu_mutex_lock(&decomp_done_lock);
2274 for (idx = 0; idx < thread_count; idx++) {
2275 while (!decomp_param[idx].done) {
2276 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2277 }
2278 }
2279 qemu_mutex_unlock(&decomp_done_lock);
2280 }
2281
2282 void migrate_decompress_threads_create(void)
2283 {
2284 int i, thread_count;
2285
2286 thread_count = migrate_decompress_threads();
2287 decompress_threads = g_new0(QemuThread, thread_count);
2288 decomp_param = g_new0(DecompressParam, thread_count);
2289 qemu_mutex_init(&decomp_done_lock);
2290 qemu_cond_init(&decomp_done_cond);
2291 for (i = 0; i < thread_count; i++) {
2292 qemu_mutex_init(&decomp_param[i].mutex);
2293 qemu_cond_init(&decomp_param[i].cond);
2294 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2295 decomp_param[i].done = true;
2296 decomp_param[i].quit = false;
2297 qemu_thread_create(decompress_threads + i, "decompress",
2298 do_data_decompress, decomp_param + i,
2299 QEMU_THREAD_JOINABLE);
2300 }
2301 }
2302
2303 void migrate_decompress_threads_join(void)
2304 {
2305 int i, thread_count;
2306
2307 thread_count = migrate_decompress_threads();
2308 for (i = 0; i < thread_count; i++) {
2309 qemu_mutex_lock(&decomp_param[i].mutex);
2310 decomp_param[i].quit = true;
2311 qemu_cond_signal(&decomp_param[i].cond);
2312 qemu_mutex_unlock(&decomp_param[i].mutex);
2313 }
2314 for (i = 0; i < thread_count; i++) {
2315 qemu_thread_join(decompress_threads + i);
2316 qemu_mutex_destroy(&decomp_param[i].mutex);
2317 qemu_cond_destroy(&decomp_param[i].cond);
2318 g_free(decomp_param[i].compbuf);
2319 }
2320 g_free(decompress_threads);
2321 g_free(decomp_param);
2322 decompress_threads = NULL;
2323 decomp_param = NULL;
2324 }
2325
2326 static void decompress_data_with_multi_threads(QEMUFile *f,
2327 void *host, int len)
2328 {
2329 int idx, thread_count;
2330
2331 thread_count = migrate_decompress_threads();
2332 qemu_mutex_lock(&decomp_done_lock);
2333 while (true) {
2334 for (idx = 0; idx < thread_count; idx++) {
2335 if (decomp_param[idx].done) {
2336 decomp_param[idx].done = false;
2337 qemu_mutex_lock(&decomp_param[idx].mutex);
2338 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2339 decomp_param[idx].des = host;
2340 decomp_param[idx].len = len;
2341 qemu_cond_signal(&decomp_param[idx].cond);
2342 qemu_mutex_unlock(&decomp_param[idx].mutex);
2343 break;
2344 }
2345 }
2346 if (idx < thread_count) {
2347 break;
2348 } else {
2349 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2350 }
2351 }
2352 qemu_mutex_unlock(&decomp_done_lock);
2353 }
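/*
 * Editorial note: a sketch of the handshake between this feeder and
 * do_data_decompress() above.
 *
 *     feeder (this function)             worker (do_data_decompress)
 *     ----------------------             ---------------------------
 *     done = false                       waits on param->cond
 *     lock param->mutex
 *     fill compbuf, des, len
 *     signal param->cond        ----->   wakes, uncompress()es into des
 *     unlock param->mutex                lock decomp_done_lock
 *                                        done = true
 *     wait on decomp_done_cond  <-----   signal decomp_done_cond
 */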
2354
2355 /**
2356 * ram_postcopy_incoming_init: allocate postcopy data structures
2357 *
2358 * Returns 0 for success and negative on error
2359 *
2360 * @mis: current migration incoming state
2361 *
2362 * Allocate data structures etc needed by incoming migration with
2363 * postcopy-ram. postcopy-ram's similarly named
2364 * postcopy_ram_incoming_init does the work.
2365 */
2366 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2367 {
2368 unsigned long ram_pages = last_ram_page();
2369
2370 return postcopy_ram_incoming_init(mis, ram_pages);
2371 }
2372
2373 /**
2374 * ram_load_postcopy: load a page in postcopy case
2375 *
2376 * Returns 0 for success or -errno in case of error
2377 *
2378 * Called in postcopy mode by ram_load().
2379 * rcu_read_lock is taken prior to this being called.
2380 *
2381 * @f: QEMUFile to receive the data from
2382 */
2383 static int ram_load_postcopy(QEMUFile *f)
2384 {
2385 int flags = 0, ret = 0;
2386 bool place_needed = false;
2387 bool matching_page_sizes = false;
2388 MigrationIncomingState *mis = migration_incoming_get_current();
2389 /* Temporary page that is later 'placed' */
2390 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2391 void *last_host = NULL;
2392 bool all_zero = false;
2393
2394 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2395 ram_addr_t addr;
2396 void *host = NULL;
2397 void *page_buffer = NULL;
2398 void *place_source = NULL;
2399 RAMBlock *block = NULL;
2400 uint8_t ch;
2401
2402 addr = qemu_get_be64(f);
2403 flags = addr & ~TARGET_PAGE_MASK;
2404 addr &= TARGET_PAGE_MASK;
2405
2406 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2407 place_needed = false;
2408 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2409 block = ram_block_from_stream(f, flags);
2410
2411 host = host_from_ram_block_offset(block, addr);
2412 if (!host) {
2413 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2414 ret = -EINVAL;
2415 break;
2416 }
2417 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2418 /*
2419 * Postcopy requires that we place whole host pages atomically;
2420 * these may be huge pages for RAMBlocks that are backed by
2421 * hugetlbfs.
2422 * To make it atomic, the data is read into a temporary page
2423 * that's moved into place later.
2424 * The migration protocol uses, possibly smaller, target pages;
2425 * however, the source ensures it always sends all the components
2426 * of a host page in order.
2427 */
2428 page_buffer = postcopy_host_page +
2429 ((uintptr_t)host & (block->page_size - 1));
2430 /* Start of a host page: assume all target pages (TP) are zero so the place can be optimised */
2431 if (!((uintptr_t)host & (block->page_size - 1))) {
2432 all_zero = true;
2433 } else {
2434 /* not the 1st TP within the HP */
2435 if (host != (last_host + TARGET_PAGE_SIZE)) {
2436 error_report("Non-sequential target page %p/%p",
2437 host, last_host);
2438 ret = -EINVAL;
2439 break;
2440 }
2441 }
2442
2443
2444 /*
2445 * If it's the last part of a host page then we place the host
2446 * page
2447 */
2448 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2449 (block->page_size - 1)) == 0;
2450 place_source = postcopy_host_page;
2451 }
2452 last_host = host;
2453
2454 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2455 case RAM_SAVE_FLAG_COMPRESS:
2456 ch = qemu_get_byte(f);
2457 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2458 if (ch) {
2459 all_zero = false;
2460 }
2461 break;
2462
2463 case RAM_SAVE_FLAG_PAGE:
2464 all_zero = false;
2465 if (!place_needed || !matching_page_sizes) {
2466 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2467 } else {
2468 /* Avoid an extra copy out of the qemu_file buffer; postcopy
2469 * will copy the page into place later anyway. This is only
2470 * possible when the read can be done in one go (matching
2471 * page sizes). */
2472 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2473 TARGET_PAGE_SIZE);
2474 }
2475 break;
2476 case RAM_SAVE_FLAG_EOS:
2477 /* normal exit */
2478 break;
2479 default:
2480 error_report("Unknown combination of migration flags: %#x"
2481 " (postcopy mode)", flags);
2482 ret = -EINVAL;
2483 }
2484
2485 if (place_needed) {
2486 /* This gets called at the last target page in the host page */
2487 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2488
2489 if (all_zero) {
2490 ret = postcopy_place_page_zero(mis, place_dest,
2491 block->page_size);
2492 } else {
2493 ret = postcopy_place_page(mis, place_dest,
2494 place_source, block->page_size);
2495 }
2496 }
2497 if (!ret) {
2498 ret = qemu_file_get_error(f);
2499 }
2500 }
2501
2502 return ret;
2503 }
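/*
 * Editorial note: a worked example of the host-page assembly above.  With
 * 4KiB target pages and a 2MiB hugetlbfs-backed block, each host page is
 * assembled from 512 target pages.  For target page i of a host page that
 * starts at hp (so host == hp + i * TARGET_PAGE_SIZE):
 *
 *     page_buffer  = postcopy_host_page + i * TARGET_PAGE_SIZE
 *     place_needed = (i == 511), i.e.
 *                    ((host + TARGET_PAGE_SIZE) & (block->page_size - 1)) == 0
 *     place_dest   = host + TARGET_PAGE_SIZE - block->page_size == hp
 *
 * so the whole 2MiB page is placed atomically once its last 4KiB
 * component has been read into the temporary page.
 */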
2504
2505 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2506 {
2507 int flags = 0, ret = 0;
2508 static uint64_t seq_iter;
2509 int len = 0;
2510 /*
2511 * If the system is running in postcopy mode, page inserts into host
2512 * memory must be atomic.
2513 */
2514 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2515 /* ADVISE comes earlier; it indicates the source has the postcopy capability enabled */
2516 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2517
2518 seq_iter++;
2519
2520 if (version_id != 4) {
2521 ret = -EINVAL;
2522 }
2523
2524 /* This RCU critical section can be very long running.
2525 * When RCU reclaims in the code start to become numerous,
2526 * it will be necessary to reduce the granularity of this
2527 * critical section.
2528 */
2529 rcu_read_lock();
2530
2531 if (postcopy_running) {
2532 ret = ram_load_postcopy(f);
2533 }
2534
2535 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2536 ram_addr_t addr, total_ram_bytes;
2537 void *host = NULL;
2538 uint8_t ch;
2539
2540 addr = qemu_get_be64(f);
2541 flags = addr & ~TARGET_PAGE_MASK;
2542 addr &= TARGET_PAGE_MASK;
2543
2544 if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2545 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2546 RAMBlock *block = ram_block_from_stream(f, flags);
2547
2548 host = host_from_ram_block_offset(block, addr);
2549 if (!host) {
2550 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2551 ret = -EINVAL;
2552 break;
2553 }
2554 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2555 }
2556
2557 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2558 case RAM_SAVE_FLAG_MEM_SIZE:
2559 /* Synchronize RAM block list */
2560 total_ram_bytes = addr;
2561 while (!ret && total_ram_bytes) {
2562 RAMBlock *block;
2563 char id[256];
2564 ram_addr_t length;
2565
2566 len = qemu_get_byte(f);
2567 qemu_get_buffer(f, (uint8_t *)id, len);
2568 id[len] = 0;
2569 length = qemu_get_be64(f);
2570
2571 block = qemu_ram_block_by_name(id);
2572 if (block) {
2573 if (length != block->used_length) {
2574 Error *local_err = NULL;
2575
2576 ret = qemu_ram_resize(block, length,
2577 &local_err);
2578 if (local_err) {
2579 error_report_err(local_err);
2580 }
2581 }
2582 /* For postcopy we need to check that hugepage sizes match */
2583 if (postcopy_advised &&
2584 block->page_size != qemu_host_page_size) {
2585 uint64_t remote_page_size = qemu_get_be64(f);
2586 if (remote_page_size != block->page_size) {
2587 error_report("Mismatched RAM page size %s "
2588 "(local) %zd != %" PRId64,
2589 id, block->page_size,
2590 remote_page_size);
2591 ret = -EINVAL;
2592 }
2593 }
2594 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2595 block->idstr);
2596 } else {
2597 error_report("Unknown ramblock \"%s\", cannot "
2598 "accept migration", id);
2599 ret = -EINVAL;
2600 }
2601
2602 total_ram_bytes -= length;
2603 }
2604 break;
2605
2606 case RAM_SAVE_FLAG_COMPRESS:
2607 ch = qemu_get_byte(f);
2608 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2609 break;
2610
2611 case RAM_SAVE_FLAG_PAGE:
2612 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2613 break;
2614
2615 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2616 len = qemu_get_be32(f);
2617 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2618 error_report("Invalid compressed data length: %d", len);
2619 ret = -EINVAL;
2620 break;
2621 }
2622 decompress_data_with_multi_threads(f, host, len);
2623 break;
2624
2625 case RAM_SAVE_FLAG_XBZRLE:
2626 if (load_xbzrle(f, addr, host) < 0) {
2627 error_report("Failed to decompress XBZRLE page at "
2628 RAM_ADDR_FMT, addr);
2629 ret = -EINVAL;
2630 break;
2631 }
2632 break;
2633 case RAM_SAVE_FLAG_EOS:
2634 /* normal exit */
2635 break;
2636 default:
2637 if (flags & RAM_SAVE_FLAG_HOOK) {
2638 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2639 } else {
2640 error_report("Unknown combination of migration flags: %#x",
2641 flags);
2642 ret = -EINVAL;
2643 }
2644 }
2645 if (!ret) {
2646 ret = qemu_file_get_error(f);
2647 }
2648 }
2649
2650 wait_for_decompress_done();
2651 rcu_read_unlock();
2652 trace_ram_load_complete(ret, seq_iter);
2653 return ret;
2654 }
2655
2656 static SaveVMHandlers savevm_ram_handlers = {
2657 .save_live_setup = ram_save_setup,
2658 .save_live_iterate = ram_save_iterate,
2659 .save_live_complete_postcopy = ram_save_complete,
2660 .save_live_complete_precopy = ram_save_complete,
2661 .save_live_pending = ram_save_pending,
2662 .load_state = ram_load,
2663 .cleanup = ram_migration_cleanup,
2664 };
2665
2666 void ram_mig_init(void)
2667 {
2668 qemu_mutex_init(&XBZRLE.lock);
2669 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2670 }