migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "socket.h"
  40 #include "migration/register.h"
  41 #include "migration/misc.h"
  42 #include "qemu-file.h"
  43 #include "postcopy-ram.h"
  44 #include "page_cache.h"
  45 #include "qemu/error-report.h"
  46 #include "qapi/error.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/sysemu.h"
  56 #include "qemu/uuid.h"
  57 #include "savevm.h"
  58 #include "qemu/iov.h"
  59
  60 /***********************************************************/
  61 /* ram save/restore */
  62
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE, it was renamed.
 */
  68
/*
 * Flag values used by the RAM save/restore code in this file.  Every flag
 * is a distinct single bit.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
/*
 * is_zero_range: thin wrapper around buffer_is_zero()
 *
 * Returns whatever buffer_is_zero() reports for the @size bytes at @p.
 */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}
  83
/*
 * Global XBZRLE statistics (non-static, so visible to other files).
 * NOTE(review): not referenced in this part of the file -- see its users
 * for the meaning of the individual counters.
 */
XBZRLECacheStats xbzrle_counters;
  85
/*
 * Global XBZRLE state: the page cache plus the scratch buffers used for
 * encoding and decoding, and a static page used by the compression.
 */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Guards @cache; taken through XBZRLE_cache_lock()/_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 101
 102 static void XBZRLE_cache_lock(void)
 103 {
 104     if (migrate_use_xbzrle())
 105         qemu_mutex_lock(&XBZRLE.lock);
 106 }
 107
 108 static void XBZRLE_cache_unlock(void)
 109 {
 110     if (migrate_use_xbzrle())
 111         qemu_mutex_unlock(&XBZRLE.lock);
 112 }
 113
 114 /**
 115  * xbzrle_cache_resize: resize the xbzrle cache
 116  *
 117  * This function is called from qmp_migrate_set_cache_size in main
 118  * thread, possibly while a migration is in progress.  A running
 119  * migration may be using the cache and might finish during this call,
 120  * hence changes to the cache are protected by XBZRLE.lock().
 121  *
 122  * Returns 0 for success or -1 for error
 123  *
 124  * @new_size: new cache size
 125  * @errp: set *errp if the check failed, with reason
 126  */
 127 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128 {
 129     PageCache *new_cache;
 130     int64_t ret = 0;
 131
 132     /* Check for truncation */
 133     if (new_size != (size_t)new_size) {
 134         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                    "exceeding address space");
 136         return -1;
 137     }
 138
 139     if (new_size == migrate_xbzrle_cache_size()) {
 140         /* nothing to do */
 141         return 0;
 142     }
 143
 144     XBZRLE_cache_lock();
 145
 146     if (XBZRLE.cache != NULL) {
 147         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148         if (!new_cache) {
 149             ret = -1;
 150             goto out;
 151         }
 152
 153         cache_fini(XBZRLE.cache);
 154         XBZRLE.cache = new_cache;
 155     }
 156 out:
 157     XBZRLE_cache_unlock();
 158     return ret;
 159 }
 160
 161 static bool ramblock_is_ignored(RAMBlock *block)
 162 {
 163     return !qemu_ram_is_migratable(block) ||
 164            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 165 }
 166
/*
 * Iterate over every RAMBlock for which ramblock_is_ignored() is false.
 * The "if (...) {} else" tail lets the macro be followed by an ordinary
 * loop body.
 *
 * Should be holding either ram_list.mutex, or the RCU lock.
 */
#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (ramblock_is_ignored(block)) {} else

/* Iterate over every RAMBlock for which qemu_ram_is_migratable() is true. */
#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
    INTERNAL_RAMBLOCK_FOREACH(block)                   \
        if (!qemu_ram_is_migratable(block)) {} else

/*
 * NOTE(review): presumably undefined so that the code below has to pick one
 * of the filtered iterators above -- confirm.
 */
#undef RAMBLOCK_FOREACH
 177
 178 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 179 {
 180     RAMBlock *block;
 181     int ret = 0;
 182
 183     RCU_READ_LOCK_GUARD();
 184
 185     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 186         ret = func(block, opaque);
 187         if (ret) {
 188             break;
 189         }
 190     }
 191     return ret;
 192 }
 193
 194 static void ramblock_recv_map_init(void)
 195 {
 196     RAMBlock *rb;
 197
 198     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 199         assert(!rb->receivedmap);
 200         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 201     }
 202 }
 203
/*
 * Test the bit of @rb's receivedmap at the index that
 * ramblock_recv_bitmap_offset() computes for @host_addr.
 *
 * Returns the result of test_bit().
 */
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}
 209
/*
 * Same test as ramblock_recv_bitmap_test(), but addressed by a byte offset:
 * @byte_offset is converted to a page index by shifting it right by
 * TARGET_PAGE_BITS before testing that bit in @rb's receivedmap.
 */
bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}
 214
/*
 * Atomically set the bit of @rb's receivedmap at the index that
 * ramblock_recv_bitmap_offset() computes for @host_addr.
 */
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}
 219
/*
 * Atomically set @nr consecutive bits of @rb's receivedmap, starting at the
 * index that ramblock_recv_bitmap_offset() computes for @host_addr.
 */
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}
 227
/* End marker written after the bitmap by ramblock_recv_bitmap_send() */
#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 229
 230 /*
 231  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 232  *
 233  * Returns >0 if success with sent bytes, or <0 if error.
 234  */
 235 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 236                                   const char *block_name)
 237 {
 238     RAMBlock *block = qemu_ram_block_by_name(block_name);
 239     unsigned long *le_bitmap, nbits;
 240     uint64_t size;
 241
 242     if (!block) {
 243         error_report("%s: invalid block name: %s", __func__, block_name);
 244         return -1;
 245     }
 246
 247     nbits = block->used_length >> TARGET_PAGE_BITS;
 248
 249     /*
 250      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 251      * machines we may need 4 more bytes for padding (see below
 252      * comment). So extend it a bit before hand.
 253      */
 254     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 255
 256     /*
 257      * Always use little endian when sending the bitmap. This is
 258      * required that when source and destination VMs are not using the
 259      * same endianess. (Note: big endian won't work.)
 260      */
 261     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 262
 263     /* Size of the bitmap, in bytes */
 264     size = DIV_ROUND_UP(nbits, 8);
 265
 266     /*
 267      * size is always aligned to 8 bytes for 64bit machines, but it
 268      * may not be true for 32bit machines. We need this padding to
 269      * make sure the migration can survive even between 32bit and
 270      * 64bit machines.
 271      */
 272     size = ROUND_UP(size, 8);
 273
 274     qemu_put_be64(file, size);
 275     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 276     /*
 277      * Mark as an end, in case the middle part is screwed up due to
 278      * some "misterious" reason.
 279      */
 280     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 281     qemu_fflush(file);
 282
 283     g_free(le_bitmap);
 284
 285     if (qemu_file_get_error(file)) {
 286         return qemu_file_get_error(file);
 287     }
 288
 289     return size + sizeof(size);
 290 }
 291
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* RAMBlock the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably the start offset within @rb -- confirm */
    hwaddr    offset;
    /* NOTE(review): presumably the length in bytes -- confirm */
    hwaddr    len;

    /* Linkage for RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 303
/*
 * State of RAM for migration.
 *
 * A single instance is reachable through the file-scope ram_state pointer.
 */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;

    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 355
/*
 * The RAM migration state.  May be NULL: precopy_enable_free_page_optimization()
 * and ram_bytes_remaining() both check for that before dereferencing it.
 */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 359
/* Initialise the (initially empty) list of precopy notifiers. */
void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}
 364
/* Register @n on the precopy notifier list used by precopy_notify(). */
void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}
 369
/*
 * Unregister @n by handing it to notifier_with_return_remove(); the list it
 * is on is not named here.
 */
void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
 374
 375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376 {
 377     PrecopyNotifyData pnd;
 378     pnd.reason = reason;
 379     pnd.errp = errp;
 380
 381     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382 }
 383
 384 void precopy_enable_free_page_optimization(void)
 385 {
 386     if (!ram_state) {
 387         return;
 388     }
 389
 390     ram_state->fpo_enabled = true;
 391 }
 392
 393 uint64_t ram_bytes_remaining(void)
 394 {
 395     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 396                        0;
 397 }
 398
/*
 * Global RAM migration statistics.  In this part of the file,
 * multifd_send_pages() adds to its multifd_bytes and transferred fields.
 */
MigrationStats ram_counters;
 400
/* Cursor used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
 411
/*
 * Global compression statistics (non-static, so visible to other files).
 * NOTE(review): not referenced in this part of the file.
 */
CompressionStats compression_counters;
 413
/*
 * Per-thread state of one compression worker (see do_data_compress()).
 */
struct CompressParam {
    /* set by the worker, under comp_done_lock, when a page is finished */
    bool done;
    /* set under @mutex to make the worker leave its loop */
    bool quit;
    /* result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* dummy QEMUFile that buffers the worker's output */
    QEMUFile *file;
    /* protects @quit, @block and @offset; paired with @cond */
    QemuMutex mutex;
    /* the worker waits here for a new request or for @quit */
    QemuCond cond;
    /* page request: block, reset to NULL once the worker picked it up */
    RAMBlock *block;
    /* page request: offset, read together with @block */
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 429
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the worker itself is outside this part of the file; the
 * field roles are presumed to mirror CompressParam -- confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    /* NOTE(review): presumably the destination of the decompressed data */
    void *des;
    /* NOTE(review): presumably the compressed input and its length */
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 441
/* One CompressParam / QemuThread per compression thread (arrays) */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/*
 * Decompression side.
 * NOTE(review): users are outside this part of the file; presumably these
 * mirror the compression variables above -- confirm.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
 458
 459 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 460                                  ram_addr_t offset, uint8_t *source_buf);
 461
/*
 * Thread function of a compression worker.
 *
 * @opaque is the worker's CompressParam.  The worker loops until
 * param->quit is set: whenever param->block is non-NULL it takes the
 * (block, offset) request, compresses that page with
 * do_compress_ram_page() and then reports completion; otherwise it sleeps
 * on param->cond.
 *
 * Always returns NULL.
 */
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            /* Copy the request and clear the slot while holding the mutex */
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            /* The compression itself runs without param->mutex held */
            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            /* Publish the result and signal comp_done_cond */
            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            /* Re-take the mutex before re-checking quit/block */
            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
 495
 496 static void compress_threads_save_cleanup(void)
 497 {
 498     int i, thread_count;
 499
 500     if (!migrate_use_compression() || !comp_param) {
 501         return;
 502     }
 503
 504     thread_count = migrate_compress_threads();
 505     for (i = 0; i < thread_count; i++) {
 506         /*
 507          * we use it as a indicator which shows if the thread is
 508          * properly init'd or not
 509          */
 510         if (!comp_param[i].file) {
 511             break;
 512         }
 513
 514         qemu_mutex_lock(&comp_param[i].mutex);
 515         comp_param[i].quit = true;
 516         qemu_cond_signal(&comp_param[i].cond);
 517         qemu_mutex_unlock(&comp_param[i].mutex);
 518
 519         qemu_thread_join(compress_threads + i);
 520         qemu_mutex_destroy(&comp_param[i].mutex);
 521         qemu_cond_destroy(&comp_param[i].cond);
 522         deflateEnd(&comp_param[i].stream);
 523         g_free(comp_param[i].originbuf);
 524         qemu_fclose(comp_param[i].file);
 525         comp_param[i].file = NULL;
 526     }
 527     qemu_mutex_destroy(&comp_done_lock);
 528     qemu_cond_destroy(&comp_done_cond);
 529     g_free(compress_threads);
 530     g_free(comp_param);
 531     compress_threads = NULL;
 532     comp_param = NULL;
 533 }
 534
 535 static int compress_threads_save_setup(void)
 536 {
 537     int i, thread_count;
 538
 539     if (!migrate_use_compression()) {
 540         return 0;
 541     }
 542     thread_count = migrate_compress_threads();
 543     compress_threads = g_new0(QemuThread, thread_count);
 544     comp_param = g_new0(CompressParam, thread_count);
 545     qemu_cond_init(&comp_done_cond);
 546     qemu_mutex_init(&comp_done_lock);
 547     for (i = 0; i < thread_count; i++) {
 548         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 549         if (!comp_param[i].originbuf) {
 550             goto exit;
 551         }
 552
 553         if (deflateInit(&comp_param[i].stream,
 554                         migrate_compress_level()) != Z_OK) {
 555             g_free(comp_param[i].originbuf);
 556             goto exit;
 557         }
 558
 559         /* comp_param[i].file is just used as a dummy buffer to save data,
 560          * set its ops to empty.
 561          */
 562         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 563         comp_param[i].done = true;
 564         comp_param[i].quit = false;
 565         qemu_mutex_init(&comp_param[i].mutex);
 566         qemu_cond_init(&comp_param[i].cond);
 567         qemu_thread_create(compress_threads + i, "compress",
 568                            do_data_compress, comp_param + i,
 569                            QEMU_THREAD_JOINABLE);
 570     }
 571     return 0;
 572
 573 exit:
 574     compress_threads_save_cleanup();
 575     return -1;
 576 }
 577
/* Multiple fd's */

/* Magic number and version carried by MultiFDInit_t and MultiFDPacket_t */
#define MULTIFD_MAGIC 0x11223344U
#define MULTIFD_VERSION 1

/* Bit for the multifd "flags" fields */
#define MULTIFD_FLAG_SYNC (1 << 0)

/* This value needs to be a multiple of qemu_target_page_size() */
#define MULTIFD_PACKET_SIZE (512 * 1024)
 587
/*
 * First message sent on a multifd channel (see
 * multifd_send_initial_packet()).  magic and version travel big endian.
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    unsigned char uuid[16]; /* QemuUUID */
    /* channel number of the sender */
    uint8_t id;
    uint8_t unused1[7];     /* Reserved for future use */
    uint64_t unused2[4];    /* Reserved for future use */
} __attribute__((packed)) MultiFDInit_t;
 596
/*
 * Header of a multifd packet.  The integer fields are converted to and
 * from big endian by multifd_send_fill_packet() and
 * multifd_recv_unfill_packet().
 */
typedef struct {
    uint32_t magic;
    uint32_t version;
    uint32_t flags;
    /* maximum number of allocated pages */
    uint32_t pages_alloc;
    /* number of valid entries in offset[] */
    uint32_t pages_used;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    uint64_t packet_num;
    uint64_t unused[4];    /* Reserved for future use */
    /* idstr of the RAMBlock that the offsets refer to */
    char ramblock[256];
    /* offset of each page within the RAMBlock */
    uint64_t offset[];
} __attribute__((packed)) MultiFDPacket_t;
 611
/*
 * A batch of pages, all from the same RAMBlock, that is sent or received
 * as one multifd packet.
 */
typedef struct {
    /* number of used pages */
    uint32_t used;
    /* number of allocated pages */
    uint32_t allocated;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* offset of each page */
    ram_addr_t *offset;
    /* pointer to each page */
    struct iovec *iov;
    /* RAMBlock the pages belong to, NULL while the batch is empty */
    RAMBlock *block;
} MultiFDPages_t;
 625
/* Per-channel state of a multifd send channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* sem where to wait for more work */
    QemuSemaphore sem;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* thread has work to do */
    int pending_job;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
}  MultiFDSendParams;
 666
/* Per-channel state of a multifd receive channel */
typedef struct {
    /* these fields are not changed once the thread is created */
    /* channel number */
    uint8_t id;
    /* channel thread name */
    char *name;
    /* channel thread id */
    QemuThread thread;
    /* communication channel */
    QIOChannel *c;
    /* this mutex protects the following parameters */
    QemuMutex mutex;
    /* is this channel thread running */
    bool running;
    /* should this thread finish */
    bool quit;
    /* array of pages to receive */
    MultiFDPages_t *pages;
    /* packet allocated len */
    uint32_t packet_len;
    /* pointer to the packet */
    MultiFDPacket_t *packet;
    /* multifd flags for each packet */
    uint32_t flags;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* thread local variables */
    /* size of the next packet that contains pages */
    uint32_t next_packet_size;
    /* NOTE(review): on the receive side presumably packets *received* */
    /* packets sent through this channel */
    uint64_t num_packets;
    /* pages sent through this channel */
    uint64_t num_pages;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
} MultiFDRecvParams;
 703
 704 static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 705 {
 706     MultiFDInit_t msg = {};
 707     int ret;
 708
 709     msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 710     msg.version = cpu_to_be32(MULTIFD_VERSION);
 711     msg.id = p->id;
 712     memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 713
 714     ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 715     if (ret != 0) {
 716         return -1;
 717     }
 718     return 0;
 719 }
 720
 721 static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 722 {
 723     MultiFDInit_t msg;
 724     int ret;
 725
 726     ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 727     if (ret != 0) {
 728         return -1;
 729     }
 730
 731     msg.magic = be32_to_cpu(msg.magic);
 732     msg.version = be32_to_cpu(msg.version);
 733
 734     if (msg.magic != MULTIFD_MAGIC) {
 735         error_setg(errp, "multifd: received packet magic %x "
 736                    "expected %x", msg.magic, MULTIFD_MAGIC);
 737         return -1;
 738     }
 739
 740     if (msg.version != MULTIFD_VERSION) {
 741         error_setg(errp, "multifd: received packet version %d "
 742                    "expected %d", msg.version, MULTIFD_VERSION);
 743         return -1;
 744     }
 745
 746     if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 747         char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 748         char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 749
 750         error_setg(errp, "multifd: received uuid '%s' and expected "
 751                    "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 752         g_free(uuid);
 753         g_free(msg_uuid);
 754         return -1;
 755     }
 756
 757     if (msg.id > migrate_multifd_channels()) {
 758         error_setg(errp, "multifd: received channel version %d "
 759                    "expected %d", msg.version, MULTIFD_VERSION);
 760         return -1;
 761     }
 762
 763     return msg.id;
 764 }
 765
 766 static MultiFDPages_t *multifd_pages_init(size_t size)
 767 {
 768     MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 769
 770     pages->allocated = size;
 771     pages->iov = g_new0(struct iovec, size);
 772     pages->offset = g_new0(ram_addr_t, size);
 773
 774     return pages;
 775 }
 776
 777 static void multifd_pages_clear(MultiFDPages_t *pages)
 778 {
 779     pages->used = 0;
 780     pages->allocated = 0;
 781     pages->packet_num = 0;
 782     pages->block = NULL;
 783     g_free(pages->iov);
 784     pages->iov = NULL;
 785     g_free(pages->offset);
 786     pages->offset = NULL;
 787     g_free(pages);
 788 }
 789
 790 static void multifd_send_fill_packet(MultiFDSendParams *p)
 791 {
 792     MultiFDPacket_t *packet = p->packet;
 793     int i;
 794
 795     packet->flags = cpu_to_be32(p->flags);
 796     packet->pages_alloc = cpu_to_be32(p->pages->allocated);
 797     packet->pages_used = cpu_to_be32(p->pages->used);
 798     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 799     packet->packet_num = cpu_to_be64(p->packet_num);
 800
 801     if (p->pages->block) {
 802         strncpy(packet->ramblock, p->pages->block->idstr, 256);
 803     }
 804
 805     for (i = 0; i < p->pages->used; i++) {
 806         packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 807     }
 808 }
 809
 810 static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 811 {
 812     MultiFDPacket_t *packet = p->packet;
 813     uint32_t pages_max = MULTIFD_PACKET_SIZE / qemu_target_page_size();
 814     RAMBlock *block;
 815     int i;
 816
 817     packet->magic = be32_to_cpu(packet->magic);
 818     if (packet->magic != MULTIFD_MAGIC) {
 819         error_setg(errp, "multifd: received packet "
 820                    "magic %x and expected magic %x",
 821                    packet->magic, MULTIFD_MAGIC);
 822         return -1;
 823     }
 824
 825     packet->version = be32_to_cpu(packet->version);
 826     if (packet->version != MULTIFD_VERSION) {
 827         error_setg(errp, "multifd: received packet "
 828                    "version %d and expected version %d",
 829                    packet->version, MULTIFD_VERSION);
 830         return -1;
 831     }
 832
 833     p->flags = be32_to_cpu(packet->flags);
 834
 835     packet->pages_alloc = be32_to_cpu(packet->pages_alloc);
 836     /*
 837      * If we received a packet that is 100 times bigger than expected
 838      * just stop migration.  It is a magic number.
 839      */
 840     if (packet->pages_alloc > pages_max * 100) {
 841         error_setg(errp, "multifd: received packet "
 842                    "with size %d and expected a maximum size of %d",
 843                    packet->pages_alloc, pages_max * 100) ;
 844         return -1;
 845     }
 846     /*
 847      * We received a packet that is bigger than expected but inside
 848      * reasonable limits (see previous comment).  Just reallocate.
 849      */
 850     if (packet->pages_alloc > p->pages->allocated) {
 851         multifd_pages_clear(p->pages);
 852         p->pages = multifd_pages_init(packet->pages_alloc);
 853     }
 854
 855     p->pages->used = be32_to_cpu(packet->pages_used);
 856     if (p->pages->used > packet->pages_alloc) {
 857         error_setg(errp, "multifd: received packet "
 858                    "with %d pages and expected maximum pages are %d",
 859                    p->pages->used, packet->pages_alloc) ;
 860         return -1;
 861     }
 862
 863     p->next_packet_size = be32_to_cpu(packet->next_packet_size);
 864     p->packet_num = be64_to_cpu(packet->packet_num);
 865
 866     if (p->pages->used == 0) {
 867         return 0;
 868     }
 869
 870     /* make sure that ramblock is 0 terminated */
 871     packet->ramblock[255] = 0;
 872     block = qemu_ram_block_by_name(packet->ramblock);
 873     if (!block) {
 874         error_setg(errp, "multifd: unknown ram block %s",
 875                    packet->ramblock);
 876         return -1;
 877     }
 878
 879     for (i = 0; i < p->pages->used; i++) {
 880         ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 881
 882         if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 883             error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 884                        " (max " RAM_ADDR_FMT ")",
 885                        offset, block->max_length);
 886             return -1;
 887         }
 888         p->pages->iov[i].iov_base = block->host + offset;
 889         p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 890     }
 891
 892     return 0;
 893 }
 894
/* Global state of the multifd send side, shared by all send channels */
struct {
    /* per-channel state, indexed by channel number */
    MultiFDSendParams *params;
    /* array of pages to send */
    MultiFDPages_t *pages;
    /* global number of generated multifd packets */
    uint64_t packet_num;
    /* send channels ready */
    QemuSemaphore channels_ready;
} *multifd_send_state;
 904
/*
 * How do we use multifd_send_state->pages and channel->pages?
 *
 * We create a pages struct for each channel, and a main one.  Each time
 * that we need to send a batch of pages we swap the one held by
 * multifd_send_state with the one held by the channel that is sending
 * it.  There are two reasons for that:
 *    - to avoid doing so many mallocs during migration
 *    - to make it easier to know what to free at the end of migration
 *
 * This way we always know who is the owner of each "pages" struct,
 * and we don't need any locking.  It belongs to the migration thread
 * or to the channel thread.  Switching is safe because the migration
 * thread holds the channel mutex when changing it, and the channel
 * must have finished with its own, otherwise pending_job can't be
 * false.
 */
 922
 923 static int multifd_send_pages(RAMState *rs)
 924 {
 925     int i;
 926     static int next_channel;
 927     MultiFDSendParams *p = NULL; /* make happy gcc */
 928     MultiFDPages_t *pages = multifd_send_state->pages;
 929     uint64_t transferred;
 930
 931     qemu_sem_wait(&multifd_send_state->channels_ready);
 932     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 933         p = &multifd_send_state->params[i];
 934
 935         qemu_mutex_lock(&p->mutex);
 936         if (p->quit) {
 937             error_report("%s: channel %d has already quit!", __func__, i);
 938             qemu_mutex_unlock(&p->mutex);
 939             return -1;
 940         }
 941         if (!p->pending_job) {
 942             p->pending_job++;
 943             next_channel = (i + 1) % migrate_multifd_channels();
 944             break;
 945         }
 946         qemu_mutex_unlock(&p->mutex);
 947     }
 948     p->pages->used = 0;
 949
 950     p->packet_num = multifd_send_state->packet_num++;
 951     p->pages->block = NULL;
 952     multifd_send_state->pages = p->pages;
 953     p->pages = pages;
 954     transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 955     qemu_file_update_transfer(rs->f, transferred);
 956     ram_counters.multifd_bytes += transferred;
 957     ram_counters.transferred += transferred;;
 958     qemu_mutex_unlock(&p->mutex);
 959     qemu_sem_post(&p->sem);
 960
 961     return 1;
 962 }
 963
 964 static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 965 {
 966     MultiFDPages_t *pages = multifd_send_state->pages;
 967
 968     if (!pages->block) {
 969         pages->block = block;
 970     }
 971
 972     if (pages->block == block) {
 973         pages->offset[pages->used] = offset;
 974         pages->iov[pages->used].iov_base = block->host + offset;
 975         pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 976         pages->used++;
 977
 978         if (pages->used < pages->allocated) {
 979             return 1;
 980         }
 981     }
 982
 983     if (multifd_send_pages(rs) < 0) {
 984         return -1;
 985     }
 986
 987     if (pages->block != block) {
 988         return  multifd_queue_page(rs, block, offset);
 989     }
 990
 991     return 1;
 992 }
 993
 994 static void multifd_send_terminate_threads(Error *err)
 995 {
 996     int i;
 997
 998     trace_multifd_send_terminate_threads(err != NULL);
 999
1000     if (err) {
1001         MigrationState *s = migrate_get_current();
1002         migrate_set_error(s, err);
1003         if (s->state == MIGRATION_STATUS_SETUP ||
1004             s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
1005             s->state == MIGRATION_STATUS_DEVICE ||
1006             s->state == MIGRATION_STATUS_ACTIVE) {
1007             migrate_set_state(&s->state, s->state,
1008                               MIGRATION_STATUS_FAILED);
1009         }
1010     }
1011
1012     for (i = 0; i < migrate_multifd_channels(); i++) {
1013         MultiFDSendParams *p = &multifd_send_state->params[i];
1014
1015         qemu_mutex_lock(&p->mutex);
1016         p->quit = true;
1017         qemu_sem_post(&p->sem);
1018         qemu_mutex_unlock(&p->mutex);
1019     }
1020 }
1021
1022 void multifd_save_cleanup(void)
1023 {
1024     int i;
1025
1026     if (!migrate_use_multifd()) {
1027         return;
1028     }
1029     multifd_send_terminate_threads(NULL);
1030     for (i = 0; i < migrate_multifd_channels(); i++) {
1031         MultiFDSendParams *p = &multifd_send_state->params[i];
1032
1033         if (p->running) {
1034             qemu_thread_join(&p->thread);
1035         }
1036         socket_send_channel_destroy(p->c);
1037         p->c = NULL;
1038         qemu_mutex_destroy(&p->mutex);
1039         qemu_sem_destroy(&p->sem);
1040         qemu_sem_destroy(&p->sem_sync);
1041         g_free(p->name);
1042         p->name = NULL;
1043         multifd_pages_clear(p->pages);
1044         p->pages = NULL;
1045         p->packet_len = 0;
1046         g_free(p->packet);
1047         p->packet = NULL;
1048     }
1049     qemu_sem_destroy(&multifd_send_state->channels_ready);
1050     g_free(multifd_send_state->params);
1051     multifd_send_state->params = NULL;
1052     multifd_pages_clear(multifd_send_state->pages);
1053     multifd_send_state->pages = NULL;
1054     g_free(multifd_send_state);
1055     multifd_send_state = NULL;
1056 }
1057
1058 static void multifd_send_sync_main(RAMState *rs)
1059 {
1060     int i;
1061
1062     if (!migrate_use_multifd()) {
1063         return;
1064     }
1065     if (multifd_send_state->pages->used) {
1066         if (multifd_send_pages(rs) < 0) {
1067             error_report("%s: multifd_send_pages fail", __func__);
1068             return;
1069         }
1070     }
1071     for (i = 0; i < migrate_multifd_channels(); i++) {
1072         MultiFDSendParams *p = &multifd_send_state->params[i];
1073
1074         trace_multifd_send_sync_main_signal(p->id);
1075
1076         qemu_mutex_lock(&p->mutex);
1077
1078         if (p->quit) {
1079             error_report("%s: channel %d has already quit", __func__, i);
1080             qemu_mutex_unlock(&p->mutex);
1081             return;
1082         }
1083
1084         p->packet_num = multifd_send_state->packet_num++;
1085         p->flags |= MULTIFD_FLAG_SYNC;
1086         p->pending_job++;
1087         qemu_file_update_transfer(rs->f, p->packet_len);
1088         ram_counters.multifd_bytes += p->packet_len;
1089         ram_counters.transferred += p->packet_len;
1090         qemu_mutex_unlock(&p->mutex);
1091         qemu_sem_post(&p->sem);
1092     }
1093     for (i = 0; i < migrate_multifd_channels(); i++) {
1094         MultiFDSendParams *p = &multifd_send_state->params[i];
1095
1096         trace_multifd_send_sync_main_wait(p->id);
1097         qemu_sem_wait(&p->sem_sync);
1098     }
1099     trace_multifd_send_sync_main(multifd_send_state->packet_num);
1100 }
1101
1102 static void *multifd_send_thread(void *opaque)
1103 {
1104     MultiFDSendParams *p = opaque;
1105     Error *local_err = NULL;
1106     int ret = 0;
1107     uint32_t flags = 0;
1108
1109     trace_multifd_send_thread_start(p->id);
1110     rcu_register_thread();
1111
1112     if (multifd_send_initial_packet(p, &local_err) < 0) {
1113         ret = -1;
1114         goto out;
1115     }
1116     /* initial packet */
1117     p->num_packets = 1;
1118
1119     while (true) {
1120         qemu_sem_wait(&p->sem);
1121         qemu_mutex_lock(&p->mutex);
1122
1123         if (p->pending_job) {
1124             uint32_t used = p->pages->used;
1125             uint64_t packet_num = p->packet_num;
1126             flags = p->flags;
1127
1128             p->next_packet_size = used * qemu_target_page_size();
1129             multifd_send_fill_packet(p);
1130             p->flags = 0;
1131             p->num_packets++;
1132             p->num_pages += used;
1133             qemu_mutex_unlock(&p->mutex);
1134
1135             trace_multifd_send(p->id, packet_num, used, flags,
1136                                p->next_packet_size);
1137
1138             ret = qio_channel_write_all(p->c, (void *)p->packet,
1139                                         p->packet_len, &local_err);
1140             if (ret != 0) {
1141                 break;
1142             }
1143
1144             if (used) {
1145                 ret = qio_channel_writev_all(p->c, p->pages->iov,
1146                                              used, &local_err);
1147                 if (ret != 0) {
1148                     break;
1149                 }
1150             }
1151
1152             qemu_mutex_lock(&p->mutex);
1153             p->pending_job--;
1154             qemu_mutex_unlock(&p->mutex);
1155
1156             if (flags & MULTIFD_FLAG_SYNC) {
1157                 qemu_sem_post(&p->sem_sync);
1158             }
1159             qemu_sem_post(&multifd_send_state->channels_ready);
1160         } else if (p->quit) {
1161             qemu_mutex_unlock(&p->mutex);
1162             break;
1163         } else {
1164             qemu_mutex_unlock(&p->mutex);
1165             /* sometimes there are spurious wakeups */
1166         }
1167     }
1168
1169 out:
1170     if (local_err) {
1171         trace_multifd_send_error(p->id);
1172         multifd_send_terminate_threads(local_err);
1173     }
1174
1175     /*
1176      * Error happen, I will exit, but I can't just leave, tell
1177      * who pay attention to me.
1178      */
1179     if (ret != 0) {
1180         qemu_sem_post(&p->sem_sync);
1181         qemu_sem_post(&multifd_send_state->channels_ready);
1182     }
1183
1184     qemu_mutex_lock(&p->mutex);
1185     p->running = false;
1186     qemu_mutex_unlock(&p->mutex);
1187
1188     rcu_unregister_thread();
1189     trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1190
1191     return NULL;
1192 }
1193
1194 static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1195 {
1196     MultiFDSendParams *p = opaque;
1197     QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1198     Error *local_err = NULL;
1199
1200     trace_multifd_new_send_channel_async(p->id);
1201     if (qio_task_propagate_error(task, &local_err)) {
1202         migrate_set_error(migrate_get_current(), local_err);
1203         multifd_save_cleanup();
1204     } else {
1205         p->c = QIO_CHANNEL(sioc);
1206         qio_channel_set_delay(p->c, false);
1207         p->running = true;
1208         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1209                            QEMU_THREAD_JOINABLE);
1210     }
1211 }
1212
1213 int multifd_save_setup(void)
1214 {
1215     int thread_count;
1216     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1217     uint8_t i;
1218
1219     if (!migrate_use_multifd()) {
1220         return 0;
1221     }
1222     thread_count = migrate_multifd_channels();
1223     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1224     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1225     multifd_send_state->pages = multifd_pages_init(page_count);
1226     qemu_sem_init(&multifd_send_state->channels_ready, 0);
1227
1228     for (i = 0; i < thread_count; i++) {
1229         MultiFDSendParams *p = &multifd_send_state->params[i];
1230
1231         qemu_mutex_init(&p->mutex);
1232         qemu_sem_init(&p->sem, 0);
1233         qemu_sem_init(&p->sem_sync, 0);
1234         p->quit = false;
1235         p->pending_job = 0;
1236         p->id = i;
1237         p->pages = multifd_pages_init(page_count);
1238         p->packet_len = sizeof(MultiFDPacket_t)
1239                       + sizeof(ram_addr_t) * page_count;
1240         p->packet = g_malloc0(p->packet_len);
1241         p->packet->magic = cpu_to_be32(MULTIFD_MAGIC);
1242         p->packet->version = cpu_to_be32(MULTIFD_VERSION);
1243         p->name = g_strdup_printf("multifdsend_%d", i);
1244         socket_send_channel_create(multifd_new_send_channel_async, p);
1245     }
1246     return 0;
1247 }
1248
/*
 * Receiver-side multifd state.  A single instance is allocated in
 * multifd_load_setup() and released in multifd_load_cleanup().
 */
struct {
    /* per-channel parameters, one entry per multifd channel */
    MultiFDRecvParams *params;
    /* number of created threads */
    int count;
    /* syncs main thread and channels */
    QemuSemaphore sem_sync;
    /* global number of generated multifd packets */
    uint64_t packet_num;
} *multifd_recv_state;
1258
1259 static void multifd_recv_terminate_threads(Error *err)
1260 {
1261     int i;
1262
1263     trace_multifd_recv_terminate_threads(err != NULL);
1264
1265     if (err) {
1266         MigrationState *s = migrate_get_current();
1267         migrate_set_error(s, err);
1268         if (s->state == MIGRATION_STATUS_SETUP ||
1269             s->state == MIGRATION_STATUS_ACTIVE) {
1270             migrate_set_state(&s->state, s->state,
1271                               MIGRATION_STATUS_FAILED);
1272         }
1273     }
1274
1275     for (i = 0; i < migrate_multifd_channels(); i++) {
1276         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1277
1278         qemu_mutex_lock(&p->mutex);
1279         p->quit = true;
1280         /* We could arrive here for two reasons:
1281            - normal quit, i.e. everything went fine, just finished
1282            - error quit: We close the channels so the channel threads
1283              finish the qio_channel_read_all_eof() */
1284         qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1285         qemu_mutex_unlock(&p->mutex);
1286     }
1287 }
1288
1289 int multifd_load_cleanup(Error **errp)
1290 {
1291     int i;
1292     int ret = 0;
1293
1294     if (!migrate_use_multifd()) {
1295         return 0;
1296     }
1297     multifd_recv_terminate_threads(NULL);
1298     for (i = 0; i < migrate_multifd_channels(); i++) {
1299         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1300
1301         if (p->running) {
1302             p->quit = true;
1303             /*
1304              * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code,
1305              * however try to wakeup it without harm in cleanup phase.
1306              */
1307             qemu_sem_post(&p->sem_sync);
1308             qemu_thread_join(&p->thread);
1309         }
1310         object_unref(OBJECT(p->c));
1311         p->c = NULL;
1312         qemu_mutex_destroy(&p->mutex);
1313         qemu_sem_destroy(&p->sem_sync);
1314         g_free(p->name);
1315         p->name = NULL;
1316         multifd_pages_clear(p->pages);
1317         p->pages = NULL;
1318         p->packet_len = 0;
1319         g_free(p->packet);
1320         p->packet = NULL;
1321     }
1322     qemu_sem_destroy(&multifd_recv_state->sem_sync);
1323     g_free(multifd_recv_state->params);
1324     multifd_recv_state->params = NULL;
1325     g_free(multifd_recv_state);
1326     multifd_recv_state = NULL;
1327
1328     return ret;
1329 }
1330
1331 static void multifd_recv_sync_main(void)
1332 {
1333     int i;
1334
1335     if (!migrate_use_multifd()) {
1336         return;
1337     }
1338     for (i = 0; i < migrate_multifd_channels(); i++) {
1339         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1340
1341         trace_multifd_recv_sync_main_wait(p->id);
1342         qemu_sem_wait(&multifd_recv_state->sem_sync);
1343     }
1344     for (i = 0; i < migrate_multifd_channels(); i++) {
1345         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1346
1347         qemu_mutex_lock(&p->mutex);
1348         if (multifd_recv_state->packet_num < p->packet_num) {
1349             multifd_recv_state->packet_num = p->packet_num;
1350         }
1351         qemu_mutex_unlock(&p->mutex);
1352         trace_multifd_recv_sync_main_signal(p->id);
1353         qemu_sem_post(&p->sem_sync);
1354     }
1355     trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1356 }
1357
1358 static void *multifd_recv_thread(void *opaque)
1359 {
1360     MultiFDRecvParams *p = opaque;
1361     Error *local_err = NULL;
1362     int ret;
1363
1364     trace_multifd_recv_thread_start(p->id);
1365     rcu_register_thread();
1366
1367     while (true) {
1368         uint32_t used;
1369         uint32_t flags;
1370
1371         if (p->quit) {
1372             break;
1373         }
1374
1375         ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1376                                        p->packet_len, &local_err);
1377         if (ret == 0) {   /* EOF */
1378             break;
1379         }
1380         if (ret == -1) {   /* Error */
1381             break;
1382         }
1383
1384         qemu_mutex_lock(&p->mutex);
1385         ret = multifd_recv_unfill_packet(p, &local_err);
1386         if (ret) {
1387             qemu_mutex_unlock(&p->mutex);
1388             break;
1389         }
1390
1391         used = p->pages->used;
1392         flags = p->flags;
1393         trace_multifd_recv(p->id, p->packet_num, used, flags,
1394                            p->next_packet_size);
1395         p->num_packets++;
1396         p->num_pages += used;
1397         qemu_mutex_unlock(&p->mutex);
1398
1399         if (used) {
1400             ret = qio_channel_readv_all(p->c, p->pages->iov,
1401                                         used, &local_err);
1402             if (ret != 0) {
1403                 break;
1404             }
1405         }
1406
1407         if (flags & MULTIFD_FLAG_SYNC) {
1408             qemu_sem_post(&multifd_recv_state->sem_sync);
1409             qemu_sem_wait(&p->sem_sync);
1410         }
1411     }
1412
1413     if (local_err) {
1414         multifd_recv_terminate_threads(local_err);
1415     }
1416     qemu_mutex_lock(&p->mutex);
1417     p->running = false;
1418     qemu_mutex_unlock(&p->mutex);
1419
1420     rcu_unregister_thread();
1421     trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1422
1423     return NULL;
1424 }
1425
1426 int multifd_load_setup(void)
1427 {
1428     int thread_count;
1429     uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
1430     uint8_t i;
1431
1432     if (!migrate_use_multifd()) {
1433         return 0;
1434     }
1435     thread_count = migrate_multifd_channels();
1436     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1437     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1438     atomic_set(&multifd_recv_state->count, 0);
1439     qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1440
1441     for (i = 0; i < thread_count; i++) {
1442         MultiFDRecvParams *p = &multifd_recv_state->params[i];
1443
1444         qemu_mutex_init(&p->mutex);
1445         qemu_sem_init(&p->sem_sync, 0);
1446         p->quit = false;
1447         p->id = i;
1448         p->pages = multifd_pages_init(page_count);
1449         p->packet_len = sizeof(MultiFDPacket_t)
1450                       + sizeof(ram_addr_t) * page_count;
1451         p->packet = g_malloc0(p->packet_len);
1452         p->name = g_strdup_printf("multifdrecv_%d", i);
1453     }
1454     return 0;
1455 }
1456
1457 bool multifd_recv_all_channels_created(void)
1458 {
1459     int thread_count = migrate_multifd_channels();
1460
1461     if (!migrate_use_multifd()) {
1462         return true;
1463     }
1464
1465     return thread_count == atomic_read(&multifd_recv_state->count);
1466 }
1467
1468 /*
1469  * Try to receive all multifd channels to get ready for the migration.
1470  * - Return true and do not set @errp when correctly receving all channels;
1471  * - Return false and do not set @errp when correctly receiving the current one;
1472  * - Return false and set @errp when failing to receive the current channel.
1473  */
1474 bool multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
1475 {
1476     MultiFDRecvParams *p;
1477     Error *local_err = NULL;
1478     int id;
1479
1480     id = multifd_recv_initial_packet(ioc, &local_err);
1481     if (id < 0) {
1482         multifd_recv_terminate_threads(local_err);
1483         error_propagate_prepend(errp, local_err,
1484                                 "failed to receive packet"
1485                                 " via multifd channel %d: ",
1486                                 atomic_read(&multifd_recv_state->count));
1487         return false;
1488     }
1489     trace_multifd_recv_new_channel(id);
1490
1491     p = &multifd_recv_state->params[id];
1492     if (p->c != NULL) {
1493         error_setg(&local_err, "multifd: received id '%d' already setup'",
1494                    id);
1495         multifd_recv_terminate_threads(local_err);
1496         error_propagate(errp, local_err);
1497         return false;
1498     }
1499     p->c = ioc;
1500     object_ref(OBJECT(ioc));
1501     /* initial packet */
1502     p->num_packets = 1;
1503
1504     p->running = true;
1505     qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1506                        QEMU_THREAD_JOINABLE);
1507     atomic_inc(&multifd_recv_state->count);
1508     return atomic_read(&multifd_recv_state->count) ==
1509            migrate_multifd_channels();
1510 }
1511
1512 /**
1513  * save_page_header: write page header to wire
1514  *
1515  * If this is the 1st block, it also writes the block identification
1516  *
1517  * Returns the number of bytes written
1518  *
1519  * @f: QEMUFile where to send the data
1520  * @block: block that contains the page we want to send
1521  * @offset: offset inside the block for the page
1522  *          in the lower bits, it contains flags
1523  */
1524 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1525                                ram_addr_t offset)
1526 {
1527     size_t size, len;
1528
1529     if (block == rs->last_sent_block) {
1530         offset |= RAM_SAVE_FLAG_CONTINUE;
1531     }
1532     qemu_put_be64(f, offset);
1533     size = 8;
1534
1535     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1536         len = strlen(block->idstr);
1537         qemu_put_byte(f, len);
1538         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1539         size += 1 + len;
1540         rs->last_sent_block = block;
1541     }
1542     return size;
1543 }
1544
1545 /**
1546  * mig_throttle_guest_down: throotle down the guest
1547  *
1548  * Reduce amount of guest cpu execution to hopefully slow down memory
1549  * writes. If guest dirty memory rate is reduced below the rate at
1550  * which we can transfer pages to the destination then we should be
1551  * able to complete migration. Some workloads dirty memory way too
1552  * fast and will not effectively converge, even with auto-converge.
1553  */
1554 static void mig_throttle_guest_down(void)
1555 {
1556     MigrationState *s = migrate_get_current();
1557     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1558     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
1559     int pct_max = s->parameters.max_cpu_throttle;
1560
1561     /* We have not started throttling yet. Let's start it. */
1562     if (!cpu_throttle_active()) {
1563         cpu_throttle_set(pct_initial);
1564     } else {
1565         /* Throttling already on, just increase the rate */
1566         cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_icrement,
1567                          pct_max));
1568     }
1569 }
1570
1571 /**
1572  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1573  *
1574  * @rs: current RAM state
1575  * @current_addr: address for the zero page
1576  *
1577  * Update the xbzrle cache to reflect a page that's been sent as all 0.
1578  * The important thing is that a stale (not-yet-0'd) page be replaced
1579  * by the new data.
1580  * As a bonus, if the page wasn't in the cache it gets added so that
1581  * when a small write is made into the 0'd page it gets XBZRLE sent.
1582  */
1583 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1584 {
1585     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1586         return;
1587     }
1588
1589     /* We don't care if this fails to allocate a new cache page
1590      * as long as it updated an old one */
1591     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1592                  ram_counters.dirty_sync_count);
1593 }
1594
1595 #define ENCODING_FLAG_XBZRLE 0x1
1596
1597 /**
1598  * save_xbzrle_page: compress and send current page
1599  *
1600  * Returns: 1 means that we wrote the page
1601  *          0 means that page is identical to the one already sent
1602  *          -1 means that xbzrle would be longer than normal
1603  *
1604  * @rs: current RAM state
1605  * @current_data: pointer to the address of the page contents
1606  * @current_addr: addr of the page
1607  * @block: block that contains the page we want to send
1608  * @offset: offset inside the block for the page
1609  * @last_stage: if we are at the completion stage
1610  */
1611 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1612                             ram_addr_t current_addr, RAMBlock *block,
1613                             ram_addr_t offset, bool last_stage)
1614 {
1615     int encoded_len = 0, bytes_xbzrle;
1616     uint8_t *prev_cached_page;
1617
1618     if (!cache_is_cached(XBZRLE.cache, current_addr,
1619                          ram_counters.dirty_sync_count)) {
1620         xbzrle_counters.cache_miss++;
1621         if (!last_stage) {
1622             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1623                              ram_counters.dirty_sync_count) == -1) {
1624                 return -1;
1625             } else {
1626                 /* update *current_data when the page has been
1627                    inserted into cache */
1628                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
1629             }
1630         }
1631         return -1;
1632     }
1633
1634     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1635
1636     /* save current buffer into memory */
1637     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1638
1639     /* XBZRLE encoding (if there is no overflow) */
1640     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1641                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1642                                        TARGET_PAGE_SIZE);
1643
1644     /*
1645      * Update the cache contents, so that it corresponds to the data
1646      * sent, in all cases except where we skip the page.
1647      */
1648     if (!last_stage && encoded_len != 0) {
1649         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1650         /*
1651          * In the case where we couldn't compress, ensure that the caller
1652          * sends the data from the cache, since the guest might have
1653          * changed the RAM since we copied it.
1654          */
1655         *current_data = prev_cached_page;
1656     }
1657
1658     if (encoded_len == 0) {
1659         trace_save_xbzrle_page_skipping();
1660         return 0;
1661     } else if (encoded_len == -1) {
1662         trace_save_xbzrle_page_overflow();
1663         xbzrle_counters.overflow++;
1664         return -1;
1665     }
1666
1667     /* Send XBZRLE based compressed page */
1668     bytes_xbzrle = save_page_header(rs, rs->f, block,
1669                                     offset | RAM_SAVE_FLAG_XBZRLE);
1670     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1671     qemu_put_be16(rs->f, encoded_len);
1672     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1673     bytes_xbzrle += encoded_len + 1 + 2;
1674     xbzrle_counters.pages++;
1675     xbzrle_counters.bytes += bytes_xbzrle;
1676     ram_counters.transferred += bytes_xbzrle;
1677
1678     return 1;
1679 }
1680
1681 /**
1682  * migration_bitmap_find_dirty: find the next dirty page from start
1683  *
1684  * Returns the page offset within memory region of the start of a dirty page
1685  *
1686  * @rs: current RAM state
1687  * @rb: RAMBlock where to search for dirty pages
1688  * @start: page where we start the search
1689  */
1690 static inline
1691 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1692                                           unsigned long start)
1693 {
1694     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1695     unsigned long *bitmap = rb->bmap;
1696     unsigned long next;
1697
1698     if (ramblock_is_ignored(rb)) {
1699         return size;
1700     }
1701
1702     /*
1703      * When the free page optimization is enabled, we need to check the bitmap
1704      * to send the non-free pages rather than all the pages in the bulk stage.
1705      */
1706     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
1707         next = start + 1;
1708     } else {
1709         next = find_next_bit(bitmap, size, start);
1710     }
1711
1712     return next;
1713 }
1714
1715 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1716                                                 RAMBlock *rb,
1717                                                 unsigned long page)
1718 {
1719     bool ret;
1720
1721     qemu_mutex_lock(&rs->bitmap_mutex);
1722
1723     /*
1724      * Clear dirty bitmap if needed.  This _must_ be called before we
1725      * send any of the page in the chunk because we need to make sure
1726      * we can capture further page content changes when we sync dirty
1727      * log the next time.  So as long as we are going to send any of
1728      * the page in the chunk we clear the remote dirty bitmap for all.
1729      * Clearing it earlier won't be a problem, but too late will.
1730      */
1731     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
1732         uint8_t shift = rb->clear_bmap_shift;
1733         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
1734         hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
1735
1736         /*
1737          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
1738          * can make things easier sometimes since then start address
1739          * of the small chunk will always be 64 pages aligned so the
1740          * bitmap will always be aligned to unsigned long.  We should
1741          * even be able to remove this restriction but I'm simply
1742          * keeping it.
1743          */
1744         assert(shift >= 6);
1745         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
1746         memory_region_clear_dirty_bitmap(rb->mr, start, size);
1747     }
1748
1749     ret = test_and_clear_bit(page, rb->bmap);
1750
1751     if (ret) {
1752         rs->migration_dirty_pages--;
1753     }
1754     qemu_mutex_unlock(&rs->bitmap_mutex);
1755
1756     return ret;
1757 }
1758
/*
 * Sync the dirty bitmap over the whole used length of @rb and add the
 * value returned by cpu_physical_memory_sync_dirty_bitmap() to
 * rs->migration_dirty_pages.  rs->num_dirty_pages_period is handed to the
 * helper as well (presumably to accumulate the per-period count -- confirm
 * against the helper).
 *
 * Called with RCU critical section
 */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    rs->migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
                                              &rs->num_dirty_pages_period);
}
1766
1767 /**
1768  * ram_pagesize_summary: calculate all the pagesizes of a VM
1769  *
1770  * Returns a summary bitmap of the page sizes of all RAMBlocks
1771  *
1772  * For VMs with just normal pages this is equivalent to the host page
1773  * size. If it's got some huge pages then it's the OR of all the
1774  * different page sizes.
1775  */
1776 uint64_t ram_pagesize_summary(void)
1777 {
1778     RAMBlock *block;
1779     uint64_t summary = 0;
1780
1781     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1782         summary |= block->page_size;
1783     }
1784
1785     return summary;
1786 }
1787
1788 uint64_t ram_get_total_transferred_pages(void)
1789 {
1790     return  ram_counters.normal + ram_counters.duplicate +
1791                 compression_counters.pages + xbzrle_counters.pages;
1792 }
1793
1794 static void migration_update_rates(RAMState *rs, int64_t end_time)
1795 {
1796     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1797     double compressed_size;
1798
1799     /* calculate period counters */
1800     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1801                 / (end_time - rs->time_last_bitmap_sync);
1802
1803     if (!page_count) {
1804         return;
1805     }
1806
1807     if (migrate_use_xbzrle()) {
1808         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1809             rs->xbzrle_cache_miss_prev) / page_count;
1810         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1811     }
1812
1813     if (migrate_use_compression()) {
1814         compression_counters.busy_rate = (double)(compression_counters.busy -
1815             rs->compress_thread_busy_prev) / page_count;
1816         rs->compress_thread_busy_prev = compression_counters.busy;
1817
1818         compressed_size = compression_counters.compressed_size -
1819                           rs->compressed_size_prev;
1820         if (compressed_size) {
1821             double uncompressed_size = (compression_counters.pages -
1822                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1823
1824             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1825             compression_counters.compression_rate =
1826                                         uncompressed_size / compressed_size;
1827
1828             rs->compress_pages_prev = compression_counters.pages;
1829             rs->compressed_size_prev = compression_counters.compressed_size;
1830         }
1831     }
1832 }
1833
/*
 * Synchronize the dirty bitmap of every non-ignored RAMBlock and, when
 * more than one second has passed since the last period reset, run the
 * auto-converge check, update the rate statistics and reset the period
 * counters.
 *
 * @rs: current RAM state
 */
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;
    uint64_t bytes_xfer_now;

    ram_counters.dirty_sync_count++;

    /* first call: start the measurement period now */
    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    /* the per-block sync runs under the bitmap mutex and the RCU read lock */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        bytes_xfer_now = ram_counters.transferred;

        /* During block migration the auto-converge logic incorrectly detects
         * that ram migration makes no progress. Avoid this by disabling the
         * throttling logic during the bulk phase of block migration. */
        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes exceed 50% of the approx.
               amount of bytes that just got transferred since the last time
               we were in this routine. If that happens twice, start or
               increase throttling */

            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
                (++rs->dirty_rate_high_cnt >= 2)) {
                    trace_migration_throttle();
                    rs->dirty_rate_high_cnt = 0;
                    mig_throttle_guest_down();
            }
        }

        /* must run before the period counters below are reset */
        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = bytes_xfer_now;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
1899
1900 static void migration_bitmap_sync_precopy(RAMState *rs)
1901 {
1902     Error *local_err = NULL;
1903
1904     /*
1905      * The current notifier usage is just an optimization to migration, so we
1906      * don't stop the normal migration process in the error case.
1907      */
1908     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1909         error_report_err(local_err);
1910     }
1911
1912     migration_bitmap_sync(rs);
1913
1914     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1915         error_report_err(local_err);
1916     }
1917 }
1918
1919 /**
1920  * save_zero_page_to_file: send the zero page to the file
1921  *
1922  * Returns the size of data written to the file, 0 means the page is not
1923  * a zero page
1924  *
1925  * @rs: current RAM state
1926  * @file: the file where the data is saved
1927  * @block: block that contains the page we want to send
1928  * @offset: offset inside the block for the page
1929  */
1930 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1931                                   RAMBlock *block, ram_addr_t offset)
1932 {
1933     uint8_t *p = block->host + offset;
1934     int len = 0;
1935
1936     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1937         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1938         qemu_put_byte(file, 0);
1939         len += 1;
1940     }
1941     return len;
1942 }
1943
1944 /**
1945  * save_zero_page: send the zero page to the stream
1946  *
1947  * Returns the number of pages written.
1948  *
1949  * @rs: current RAM state
1950  * @block: block that contains the page we want to send
1951  * @offset: offset inside the block for the page
1952  */
1953 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1954 {
1955     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1956
1957     if (len) {
1958         ram_counters.duplicate++;
1959         ram_counters.transferred += len;
1960         return 1;
1961     }
1962     return -1;
1963 }
1964
1965 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1966 {
1967     if (!migrate_release_ram() || !migration_in_postcopy()) {
1968         return;
1969     }
1970
1971     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1972 }
1973
1974 /*
1975  * @pages: the number of pages written by the control path,
1976  *        < 0 - error
1977  *        > 0 - number of pages written
1978  *
1979  * Return true if the pages has been saved, otherwise false is returned.
1980  */
1981 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1982                               int *pages)
1983 {
1984     uint64_t bytes_xmit = 0;
1985     int ret;
1986
1987     *pages = -1;
1988     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1989                                 &bytes_xmit);
1990     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1991         return false;
1992     }
1993
1994     if (bytes_xmit) {
1995         ram_counters.transferred += bytes_xmit;
1996         *pages = 1;
1997     }
1998
1999     if (ret == RAM_SAVE_CONTROL_DELAYED) {
2000         return true;
2001     }
2002
2003     if (bytes_xmit > 0) {
2004         ram_counters.normal++;
2005     } else if (bytes_xmit == 0) {
2006         ram_counters.duplicate++;
2007     }
2008
2009     return true;
2010 }
2011
2012 /*
2013  * directly send the page to the stream
2014  *
2015  * Returns the number of pages written.
2016  *
2017  * @rs: current RAM state
2018  * @block: block that contains the page we want to send
2019  * @offset: offset inside the block for the page
2020  * @buf: the page to be sent
2021  * @async: send to page asyncly
2022  */
2023 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
2024                             uint8_t *buf, bool async)
2025 {
2026     ram_counters.transferred += save_page_header(rs, rs->f, block,
2027                                                  offset | RAM_SAVE_FLAG_PAGE);
2028     if (async) {
2029         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
2030                               migrate_release_ram() &
2031                               migration_in_postcopy());
2032     } else {
2033         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
2034     }
2035     ram_counters.transferred += TARGET_PAGE_SIZE;
2036     ram_counters.normal++;
2037     return 1;
2038 }
2039
2040 /**
2041  * ram_save_page: send the given page to the stream
2042  *
2043  * Returns the number of pages written.
2044  *          < 0 - error
2045  *          >=0 - Number of pages written - this might legally be 0
2046  *                if xbzrle noticed the page was the same.
2047  *
2048  * @rs: current RAM state
2049  * @block: block that contains the page we want to send
2050  * @offset: offset inside the block for the page
2051  * @last_stage: if we are at the completion stage
2052  */
2053 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
2054 {
2055     int pages = -1;
2056     uint8_t *p;
2057     bool send_async = true;
2058     RAMBlock *block = pss->block;
2059     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2060     ram_addr_t current_addr = block->offset + offset;
2061
2062     p = block->host + offset;
2063     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
2064
2065     XBZRLE_cache_lock();
2066     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
2067         migrate_use_xbzrle()) {
2068         pages = save_xbzrle_page(rs, &p, current_addr, block,
2069                                  offset, last_stage);
2070         if (!last_stage) {
2071             /* Can't send this cached data async, since the cache page
2072              * might get updated before it gets to the wire
2073              */
2074             send_async = false;
2075         }
2076     }
2077
2078     /* XBZRLE overflow or normal page */
2079     if (pages == -1) {
2080         pages = save_normal_page(rs, block, offset, p, send_async);
2081     }
2082
2083     XBZRLE_cache_unlock();
2084
2085     return pages;
2086 }
2087
2088 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
2089                                  ram_addr_t offset)
2090 {
2091     if (multifd_queue_page(rs, block, offset) < 0) {
2092         return -1;
2093     }
2094     ram_counters.normal++;
2095
2096     return 1;
2097 }
2098
2099 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
2100                                  ram_addr_t offset, uint8_t *source_buf)
2101 {
2102     RAMState *rs = ram_state;
2103     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
2104     bool zero_page = false;
2105     int ret;
2106
2107     if (save_zero_page_to_file(rs, f, block, offset)) {
2108         zero_page = true;
2109         goto exit;
2110     }
2111
2112     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
2113
2114     /*
2115      * copy it to a internal buffer to avoid it being modified by VM
2116      * so that we can catch up the error during compression and
2117      * decompression
2118      */
2119     memcpy(source_buf, p, TARGET_PAGE_SIZE);
2120     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
2121     if (ret < 0) {
2122         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
2123         error_report("compressed data failed!");
2124         return false;
2125     }
2126
2127 exit:
2128     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
2129     return zero_page;
2130 }
2131
2132 static void
2133 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
2134 {
2135     ram_counters.transferred += bytes_xmit;
2136
2137     if (param->zero_page) {
2138         ram_counters.duplicate++;
2139         return;
2140     }
2141
2142     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
2143     compression_counters.compressed_size += bytes_xmit - 8;
2144     compression_counters.pages++;
2145 }
2146
2147 static bool save_page_use_compression(RAMState *rs);
2148
2149 static void flush_compressed_data(RAMState *rs)
2150 {
2151     int idx, len, thread_count;
2152
2153     if (!save_page_use_compression(rs)) {
2154         return;
2155     }
2156     thread_count = migrate_compress_threads();
2157
2158     qemu_mutex_lock(&comp_done_lock);
2159     for (idx = 0; idx < thread_count; idx++) {
2160         while (!comp_param[idx].done) {
2161             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2162         }
2163     }
2164     qemu_mutex_unlock(&comp_done_lock);
2165
2166     for (idx = 0; idx < thread_count; idx++) {
2167         qemu_mutex_lock(&comp_param[idx].mutex);
2168         if (!comp_param[idx].quit) {
2169             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2170             /*
2171              * it's safe to fetch zero_page without holding comp_done_lock
2172              * as there is no further request submitted to the thread,
2173              * i.e, the thread should be waiting for a request at this point.
2174              */
2175             update_compress_thread_counts(&comp_param[idx], len);
2176         }
2177         qemu_mutex_unlock(&comp_param[idx].mutex);
2178     }
2179 }
2180
/*
 * Record in @param the page (@block, @offset) it should work on next.
 */
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
2187
2188 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
2189                                            ram_addr_t offset)
2190 {
2191     int idx, thread_count, bytes_xmit = -1, pages = -1;
2192     bool wait = migrate_compress_wait_thread();
2193
2194     thread_count = migrate_compress_threads();
2195     qemu_mutex_lock(&comp_done_lock);
2196 retry:
2197     for (idx = 0; idx < thread_count; idx++) {
2198         if (comp_param[idx].done) {
2199             comp_param[idx].done = false;
2200             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
2201             qemu_mutex_lock(&comp_param[idx].mutex);
2202             set_compress_params(&comp_param[idx], block, offset);
2203             qemu_cond_signal(&comp_param[idx].cond);
2204             qemu_mutex_unlock(&comp_param[idx].mutex);
2205             pages = 1;
2206             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
2207             break;
2208         }
2209     }
2210
2211     /*
2212      * wait for the free thread if the user specifies 'compress-wait-thread',
2213      * otherwise we will post the page out in the main thread as normal page.
2214      */
2215     if (pages < 0 && wait) {
2216         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
2217         goto retry;
2218     }
2219     qemu_mutex_unlock(&comp_done_lock);
2220
2221     return pages;
2222 }
2223
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Searches only the current block (pss->block) from pss->page onwards.
 * When nothing is found there, @pss is advanced to the start of the next
 * block (wrapping to the first block at the end of the list) and the
 * caller is asked to call again.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    /* wrapped once and back at (or past) the position we started from */
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exists in compression threads's ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also If xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* one full pass over the block list ends the bulk stage */
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
2278
2279 /**
2280  * unqueue_page: gets a page of the queue
2281  *
2282  * Helper for 'get_queued_page' - gets a page off the queue
2283  *
2284  * Returns the block of the page (or NULL if none available)
2285  *
2286  * @rs: current RAM state
2287  * @offset: used to return the offset within the RAMBlock
2288  */
2289 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
2290 {
2291     RAMBlock *block = NULL;
2292
2293     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
2294         return NULL;
2295     }
2296
2297     qemu_mutex_lock(&rs->src_page_req_mutex);
2298     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2299         struct RAMSrcPageRequest *entry =
2300                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
2301         block = entry->rb;
2302         *offset = entry->offset;
2303
2304         if (entry->len > TARGET_PAGE_SIZE) {
2305             entry->len -= TARGET_PAGE_SIZE;
2306             entry->offset += TARGET_PAGE_SIZE;
2307         } else {
2308             memory_region_unref(block->mr);
2309             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2310             g_free(entry);
2311             migration_consume_urgent_request();
2312         }
2313     }
2314     qemu_mutex_unlock(&rs->src_page_req_mutex);
2315
2316     return block;
2317 }
2318
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * On success @pss is repositioned to the queued page, its complete_round
 * flag is cleared and the bulk stage is ended.
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock  *block;
    ram_addr_t offset;
    /* only meaningful while block != NULL; the loop condition checks that first */
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         * NOTE(review): no function of that name is visible in this part
         * of the file; the search here goes through
         * migration_bitmap_find_dirty() - confirm which one is meant.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * that is really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}
2384
2385 /**
2386  * migration_page_queue_free: drop any remaining pages in the ram
2387  * request queue
2388  *
2389  * It should be empty at the end anyway, but in error cases there may
2390  * be some left.  in case that there is any page left, we drop it.
2391  *
2392  */
2393 static void migration_page_queue_free(RAMState *rs)
2394 {
2395     struct RAMSrcPageRequest *mspr, *next_mspr;
2396     /* This queue generally should be empty - but in the case of a failed
2397      * migration might have some droppings in.
2398      */
2399     RCU_READ_LOCK_GUARD();
2400     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2401         memory_region_unref(mspr->rb->mr);
2402         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2403         g_free(mspr);
2404     }
2405 }
2406
2407 /**
2408  * ram_save_queue_pages: queue the page for transmission
2409  *
2410  * A request from postcopy destination for example.
2411  *
2412  * Returns zero on success or negative on error
2413  *
2414  * @rbname: Name of the RAMBLock of the request. NULL means the
2415  *          same that last one.
2416  * @start: starting address from the start of the RAMBlock
2417  * @len: length (in bytes) to send
2418  */
2419 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2420 {
2421     RAMBlock *ramblock;
2422     RAMState *rs = ram_state;
2423
2424     ram_counters.postcopy_requests++;
2425     RCU_READ_LOCK_GUARD();
2426
2427     if (!rbname) {
2428         /* Reuse last RAMBlock */
2429         ramblock = rs->last_req_rb;
2430
2431         if (!ramblock) {
2432             /*
2433              * Shouldn't happen, we can't reuse the last RAMBlock if
2434              * it's the 1st request.
2435              */
2436             error_report("ram_save_queue_pages no previous block");
2437             goto err;
2438         }
2439     } else {
2440         ramblock = qemu_ram_block_by_name(rbname);
2441
2442         if (!ramblock) {
2443             /* We shouldn't be asked for a non-existent RAMBlock */
2444             error_report("ram_save_queue_pages no block '%s'", rbname);
2445             goto err;
2446         }
2447         rs->last_req_rb = ramblock;
2448     }
2449     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2450     if (start+len > ramblock->used_length) {
2451         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2452                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2453                      __func__, start, len, ramblock->used_length);
2454         goto err;
2455     }
2456
2457     struct RAMSrcPageRequest *new_entry =
2458         g_malloc0(sizeof(struct RAMSrcPageRequest));
2459     new_entry->rb = ramblock;
2460     new_entry->offset = start;
2461     new_entry->len = len;
2462
2463     memory_region_ref(ramblock->mr);
2464     qemu_mutex_lock(&rs->src_page_req_mutex);
2465     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2466     migration_make_urgent_request();
2467     qemu_mutex_unlock(&rs->src_page_req_mutex);
2468
2469     return 0;
2470
2471 err:
2472     return -1;
2473 }
2474
2475 static bool save_page_use_compression(RAMState *rs)
2476 {
2477     if (!migrate_use_compression()) {
2478         return false;
2479     }
2480
2481     /*
2482      * If xbzrle is on, stop using the data compression after first
2483      * round of migration even if compression is enabled. In theory,
2484      * xbzrle can do better than compression.
2485      */
2486     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2487         return true;
2488     }
2489
2490     return false;
2491 }
2492
2493 /*
2494  * try to compress the page before posting it out, return true if the page
2495  * has been properly handled by compression, otherwise needs other
2496  * paths to handle it
2497  */
2498 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2499 {
2500     if (!save_page_use_compression(rs)) {
2501         return false;
2502     }
2503
2504     /*
2505      * When starting the process of a new block, the first page of
2506      * the block should be sent out before other pages in the same
2507      * block, and all the pages in last block should have been sent
2508      * out, keeping this order is important, because the 'cont' flag
2509      * is used to avoid resending the block name.
2510      *
2511      * We post the fist page as normal page as compression will take
2512      * much CPU resource.
2513      */
2514     if (block != rs->last_sent_block) {
2515         flush_compressed_data(rs);
2516         return false;
2517     }
2518
2519     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2520         return true;
2521     }
2522
2523     compression_counters.busy++;
2524     return false;
2525 }
2526
2527 /**
2528  * ram_save_target_page: save one target page
2529  *
2530  * Returns the number of pages written
2531  *
2532  * @rs: current RAM state
2533  * @pss: data about the page we want to send
2534  * @last_stage: if we are at the completion stage
2535  */
2536 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2537                                 bool last_stage)
2538 {
2539     RAMBlock *block = pss->block;
2540     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2541     int res;
2542
2543     if (control_save_page(rs, block, offset, &res)) {
2544         return res;
2545     }
2546
2547     if (save_compress_page(rs, block, offset)) {
2548         return 1;
2549     }
2550
2551     res = save_zero_page(rs, block, offset);
2552     if (res > 0) {
2553         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2554          * page would be stale
2555          */
2556         if (!save_page_use_compression(rs)) {
2557             XBZRLE_cache_lock();
2558             xbzrle_cache_zero_page(rs, block->offset + offset);
2559             XBZRLE_cache_unlock();
2560         }
2561         ram_release_pages(block->idstr, offset, res);
2562         return res;
2563     }
2564
2565     /*
2566      * do not use multifd for compression as the first page in the new
2567      * block should be posted out before sending the compressed page
2568      */
2569     if (!save_page_use_compression(rs) && migrate_use_multifd()) {
2570         return ram_save_multifd_page(rs, block, offset);
2571     }
2572
2573     return ram_save_page(rs, pss, last_stage);
2574 }
2575
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at pss->page send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    /* number of target pages per host page of this block */
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check the pages is dirty and if it is send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            /* 'continue' in a do-while still evaluates the loop condition */
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));

    /* The offset we leave with is the last one we looked at */
    pss->page--;
    return pages;
}
2627
2628 /**
2629  * ram_find_and_save_block: finds a dirty page and sends it to f
2630  *
2631  * Called within an RCU critical section.
2632  *
2633  * Returns the number of pages written where zero means no dirty pages,
2634  * or negative on error
2635  *
2636  * @rs: current RAM state
2637  * @last_stage: if we are at the completion stage
2638  *
2639  * On systems where host-page-size > target-page-size it will send all the
2640  * pages in a host page that are dirty.
2641  */
2642
2643 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2644 {
2645     PageSearchStatus pss;
2646     int pages = 0;
2647     bool again, found;
2648
2649     /* No dirty page as there is zero RAM */
2650     if (!ram_bytes_total()) {
2651         return pages;
2652     }
2653
2654     pss.block = rs->last_seen_block;
2655     pss.page = rs->last_page;
2656     pss.complete_round = false;
2657
2658     if (!pss.block) {
2659         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2660     }
2661
2662     do {
2663         again = true;
2664         found = get_queued_page(rs, &pss);
2665
2666         if (!found) {
2667             /* priority queue empty, so just search for something dirty */
2668             found = find_dirty_block(rs, &pss, &again);
2669         }
2670
2671         if (found) {
2672             pages = ram_save_host_page(rs, &pss, last_stage);
2673         }
2674     } while (!pages && again);
2675
2676     rs->last_seen_block = pss.block;
2677     rs->last_page = pss.page;
2678
2679     return pages;
2680 }
2681
2682 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2683 {
2684     uint64_t pages = size / TARGET_PAGE_SIZE;
2685
2686     if (zero) {
2687         ram_counters.duplicate += pages;
2688     } else {
2689         ram_counters.normal += pages;
2690         ram_counters.transferred += size;
2691         qemu_update_position(f, size);
2692     }
2693 }
2694
2695 static uint64_t ram_bytes_total_common(bool count_ignored)
2696 {
2697     RAMBlock *block;
2698     uint64_t total = 0;
2699
2700     RCU_READ_LOCK_GUARD();
2701
2702     if (count_ignored) {
2703         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2704             total += block->used_length;
2705         }
2706     } else {
2707         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2708             total += block->used_length;
2709         }
2710     }
2711     return total;
2712 }
2713
/*
 * Total used length, in bytes, of the RAM blocks as computed by
 * ram_bytes_total_common() with count_ignored set to false.
 */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
2718
/* Allocate the XBZRLE decode buffer, one target page in size. */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2723
/* Free the XBZRLE decode buffer and clear the pointer to it. */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
2729
2730 static void ram_state_cleanup(RAMState **rsp)
2731 {
2732     if (*rsp) {
2733         migration_page_queue_free(*rsp);
2734         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2735         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2736         g_free(*rsp);
2737         *rsp = NULL;
2738     }
2739 }
2740
2741 static void xbzrle_cleanup(void)
2742 {
2743     XBZRLE_cache_lock();
2744     if (XBZRLE.cache) {
2745         cache_fini(XBZRLE.cache);
2746         g_free(XBZRLE.encoded_buf);
2747         g_free(XBZRLE.current_buf);
2748         g_free(XBZRLE.zero_target_page);
2749         XBZRLE.cache = NULL;
2750         XBZRLE.encoded_buf = NULL;
2751         XBZRLE.current_buf = NULL;
2752         XBZRLE.zero_target_page = NULL;
2753     }
2754     XBZRLE_cache_unlock();
2755 }
2756
2757 static void ram_save_cleanup(void *opaque)
2758 {
2759     RAMState **rsp = opaque;
2760     RAMBlock *block;
2761
2762     /* caller have hold iothread lock or is in a bh, so there is
2763      * no writing race against the migration bitmap
2764      */
2765     memory_global_dirty_log_stop();
2766
2767     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2768         g_free(block->clear_bmap);
2769         block->clear_bmap = NULL;
2770         g_free(block->bmap);
2771         block->bmap = NULL;
2772     }
2773
2774     xbzrle_cleanup();
2775     compress_threads_save_cleanup();
2776     ram_state_cleanup(rsp);
2777 }
2778
2779 static void ram_state_reset(RAMState *rs)
2780 {
2781     rs->last_seen_block = NULL;
2782     rs->last_sent_block = NULL;
2783     rs->last_page = 0;
2784     rs->last_version = ram_list.version;
2785     rs->ram_bulk_stage = true;
2786     rs->fpo_enabled = false;
2787 }
2788
2789 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2790
/*
 * Dump a bitmap to stderr for debugging, up to 128 bits per output line.
 * Each printed line is prefixed with the index of its first bit; set bits
 * are shown as '1' and clear bits as '.'.
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 *
 * @todump: bitmap to dump; if it is NULL nothing is dumped
 * @expected: bit value that is considered uninteresting
 * @pages: number of bits of @todump to walk
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /*
     * There is no implicit bitmap to fall back to in this function, and
     * test_bit() below would dereference the NULL pointer.
     */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        /* Only lines holding at least one unexpected bit are printed */
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2824
2825 /* **** functions for postcopy ***** */
2826
2827 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2828 {
2829     struct RAMBlock *block;
2830
2831     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2832         unsigned long *bitmap = block->bmap;
2833         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2834         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2835
2836         while (run_start < range) {
2837             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2838             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2839                               (run_end - run_start) << TARGET_PAGE_BITS);
2840             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2841         }
2842     }
2843 }
2844
2845 /**
2846  * postcopy_send_discard_bm_ram: discard a RAMBlock
2847  *
2848  * Returns zero on success
2849  *
2850  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2851  *
2852  * @ms: current migration state
2853  * @block: RAMBlock to discard
2854  */
2855 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2856 {
2857     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2858     unsigned long current;
2859     unsigned long *bitmap = block->bmap;
2860
2861     for (current = 0; current < end; ) {
2862         unsigned long one = find_next_bit(bitmap, end, current);
2863         unsigned long zero, discard_length;
2864
2865         if (one >= end) {
2866             break;
2867         }
2868
2869         zero = find_next_zero_bit(bitmap, end, one + 1);
2870
2871         if (zero >= end) {
2872             discard_length = end - one;
2873         } else {
2874             discard_length = zero - one;
2875         }
2876         postcopy_discard_send_range(ms, one, discard_length);
2877         current = one + discard_length;
2878     }
2879
2880     return 0;
2881 }
2882
2883 /**
2884  * postcopy_each_ram_send_discard: discard all RAMBlocks
2885  *
2886  * Returns 0 for success or negative for error
2887  *
2888  * Utility for the outgoing postcopy code.
2889  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2890  *   passing it bitmap indexes and name.
2891  * (qemu_ram_foreach_block ends up passing unscaled lengths
2892  *  which would mean postcopy code would have to deal with target page)
2893  *
2894  * @ms: current migration state
2895  */
2896 static int postcopy_each_ram_send_discard(MigrationState *ms)
2897 {
2898     struct RAMBlock *block;
2899     int ret;
2900
2901     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2902         postcopy_discard_send_init(ms, block->idstr);
2903
2904         /*
2905          * Postcopy sends chunks of bitmap over the wire, but it
2906          * just needs indexes at this point, avoids it having
2907          * target page specific code.
2908          */
2909         ret = postcopy_send_discard_bm_ram(ms, block);
2910         postcopy_discard_send_finish(ms);
2911         if (ret) {
2912             return ret;
2913         }
2914     }
2915
2916     return 0;
2917 }
2918
2919 /**
2920  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2921  *
2922  * Helper for postcopy_chunk_hostpages; it's called twice to
2923  * canonicalize the two bitmaps, that are similar, but one is
2924  * inverted.
2925  *
2926  * Postcopy requires that all target pages in a hostpage are dirty or
2927  * clean, not a mix.  This function canonicalizes the bitmaps.
2928  *
2929  * @ms: current migration state
2930  * @block: block that contains the page we want to canonicalize
2931  */
2932 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2933 {
2934     RAMState *rs = ram_state;
2935     unsigned long *bitmap = block->bmap;
2936     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2937     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2938     unsigned long run_start;
2939
2940     if (block->page_size == TARGET_PAGE_SIZE) {
2941         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2942         return;
2943     }
2944
2945     /* Find a dirty page */
2946     run_start = find_next_bit(bitmap, pages, 0);
2947
2948     while (run_start < pages) {
2949
2950         /*
2951          * If the start of this run of pages is in the middle of a host
2952          * page, then we need to fixup this host page.
2953          */
2954         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2955             /* Find the end of this run */
2956             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2957             /*
2958              * If the end isn't at the start of a host page, then the
2959              * run doesn't finish at the end of a host page
2960              * and we need to discard.
2961              */
2962         }
2963
2964         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2965             unsigned long page;
2966             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2967                                                              host_ratio);
2968             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2969
2970             /* Clean up the bitmap */
2971             for (page = fixup_start_addr;
2972                  page < fixup_start_addr + host_ratio; page++) {
2973                 /*
2974                  * Remark them as dirty, updating the count for any pages
2975                  * that weren't previously dirty.
2976                  */
2977                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2978             }
2979         }
2980
2981         /* Find the next dirty page for the next iteration */
2982         run_start = find_next_bit(bitmap, pages, run_start);
2983     }
2984 }
2985
2986 /**
2987  * postcopy_chunk_hostpages: discard any partially sent host page
2988  *
2989  * Utility for the outgoing postcopy code.
2990  *
2991  * Discard any partially sent host-page size chunks, mark any partially
2992  * dirty host-page size chunks as all dirty.  In this case the host-page
2993  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2994  *
2995  * Returns zero on success
2996  *
2997  * @ms: current migration state
2998  * @block: block we want to work with
2999  */
3000 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
3001 {
3002     postcopy_discard_send_init(ms, block->idstr);
3003
3004     /*
3005      * Ensure that all partially dirty host pages are made fully dirty.
3006      */
3007     postcopy_chunk_hostpages_pass(ms, block);
3008
3009     postcopy_discard_send_finish(ms);
3010     return 0;
3011 }
3012
3013 /**
3014  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3015  *
3016  * Returns zero on success
3017  *
3018  * Transmit the set of pages to be discarded after precopy to the target
3019  * these are pages that:
3020  *     a) Have been previously transmitted but are now dirty again
3021  *     b) Pages that have never been transmitted, this ensures that
3022  *        any pages on the destination that have been mapped by background
3023  *        tasks get discarded (transparent huge pages is the specific concern)
3024  * Hopefully this is pretty sparse
3025  *
3026  * @ms: current migration state
3027  */
3028 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
3029 {
3030     RAMState *rs = ram_state;
3031     RAMBlock *block;
3032     int ret;
3033
3034     RCU_READ_LOCK_GUARD();
3035
3036     /* This should be our last sync, the src is now paused */
3037     migration_bitmap_sync(rs);
3038
3039     /* Easiest way to make sure we don't resume in the middle of a host-page */
3040     rs->last_seen_block = NULL;
3041     rs->last_sent_block = NULL;
3042     rs->last_page = 0;
3043
3044     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3045         /* Deal with TPS != HPS and huge pages */
3046         ret = postcopy_chunk_hostpages(ms, block);
3047         if (ret) {
3048             return ret;
3049         }
3050
3051 #ifdef DEBUG_POSTCOPY
3052         ram_debug_dump_bitmap(block->bmap, true,
3053                               block->used_length >> TARGET_PAGE_BITS);
3054 #endif
3055     }
3056     trace_ram_postcopy_send_discard_bitmap();
3057
3058     ret = postcopy_each_ram_send_discard(ms);
3059
3060     return ret;
3061 }
3062
3063 /**
3064  * ram_discard_range: discard dirtied pages at the beginning of postcopy
3065  *
3066  * Returns zero on success
3067  *
3068  * @rbname: name of the RAMBlock of the request. NULL means the
3069  *          same that last one.
3070  * @start: RAMBlock starting page
3071  * @length: RAMBlock size
3072  */
3073 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3074 {
3075     int ret = -1;
3076
3077     trace_ram_discard_range(rbname, start, length);
3078
3079     RCU_READ_LOCK_GUARD();
3080     RAMBlock *rb = qemu_ram_block_by_name(rbname);
3081
3082     if (!rb) {
3083         error_report("ram_discard_range: Failed to find block '%s'", rbname);
3084         goto err;
3085     }
3086
3087     /*
3088      * On source VM, we don't need to update the received bitmap since
3089      * we don't even have one.
3090      */
3091     if (rb->receivedmap) {
3092         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3093                      length >> qemu_target_page_bits());
3094     }
3095
3096     ret = ram_block_discard_range(rb, start, length);
3097
3098 err:
3099     return ret;
3100 }
3101
3102 /*
3103  * For every allocation, we will try not to crash the VM if the
3104  * allocation failed.
3105  */
3106 static int xbzrle_init(void)
3107 {
3108     Error *local_err = NULL;
3109
3110     if (!migrate_use_xbzrle()) {
3111         return 0;
3112     }
3113
3114     XBZRLE_cache_lock();
3115
3116     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3117     if (!XBZRLE.zero_target_page) {
3118         error_report("%s: Error allocating zero page", __func__);
3119         goto err_out;
3120     }
3121
3122     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3123                               TARGET_PAGE_SIZE, &local_err);
3124     if (!XBZRLE.cache) {
3125         error_report_err(local_err);
3126         goto free_zero_page;
3127     }
3128
3129     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3130     if (!XBZRLE.encoded_buf) {
3131         error_report("%s: Error allocating encoded_buf", __func__);
3132         goto free_cache;
3133     }
3134
3135     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3136     if (!XBZRLE.current_buf) {
3137         error_report("%s: Error allocating current_buf", __func__);
3138         goto free_encoded_buf;
3139     }
3140
3141     /* We are all good */
3142     XBZRLE_cache_unlock();
3143     return 0;
3144
3145 free_encoded_buf:
3146     g_free(XBZRLE.encoded_buf);
3147     XBZRLE.encoded_buf = NULL;
3148 free_cache:
3149     cache_fini(XBZRLE.cache);
3150     XBZRLE.cache = NULL;
3151 free_zero_page:
3152     g_free(XBZRLE.zero_target_page);
3153     XBZRLE.zero_target_page = NULL;
3154 err_out:
3155     XBZRLE_cache_unlock();
3156     return -ENOMEM;
3157 }
3158
3159 static int ram_state_init(RAMState **rsp)
3160 {
3161     *rsp = g_try_new0(RAMState, 1);
3162
3163     if (!*rsp) {
3164         error_report("%s: Init ramstate fail", __func__);
3165         return -1;
3166     }
3167
3168     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3169     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3170     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3171
3172     /*
3173      * Count the total number of pages used by ram blocks not including any
3174      * gaps due to alignment or unplugs.
3175      * This must match with the initial values of dirty bitmap.
3176      */
3177     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3178     ram_state_reset(*rsp);
3179
3180     return 0;
3181 }
3182
3183 static void ram_list_init_bitmaps(void)
3184 {
3185     MigrationState *ms = migrate_get_current();
3186     RAMBlock *block;
3187     unsigned long pages;
3188     uint8_t shift;
3189
3190     /* Skip setting bitmap if there is no RAM */
3191     if (ram_bytes_total()) {
3192         shift = ms->clear_bitmap_shift;
3193         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3194             error_report("clear_bitmap_shift (%u) too big, using "
3195                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3196             shift = CLEAR_BITMAP_SHIFT_MAX;
3197         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3198             error_report("clear_bitmap_shift (%u) too small, using "
3199                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3200             shift = CLEAR_BITMAP_SHIFT_MIN;
3201         }
3202
3203         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3204             pages = block->max_length >> TARGET_PAGE_BITS;
3205             /*
3206              * The initial dirty bitmap for migration must be set with all
3207              * ones to make sure we'll migrate every guest RAM page to
3208              * destination.
3209              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3210              * new migration after a failed migration, ram_list.
3211              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3212              * guest memory.
3213              */
3214             block->bmap = bitmap_new(pages);
3215             bitmap_set(block->bmap, 0, pages);
3216             block->clear_bmap_shift = shift;
3217             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3218         }
3219     }
3220 }
3221
3222 static void ram_init_bitmaps(RAMState *rs)
3223 {
3224     /* For memory_global_dirty_log_start below.  */
3225     qemu_mutex_lock_iothread();
3226     qemu_mutex_lock_ramlist();
3227
3228     WITH_RCU_READ_LOCK_GUARD() {
3229         ram_list_init_bitmaps();
3230         memory_global_dirty_log_start();
3231         migration_bitmap_sync_precopy(rs);
3232     }
3233     qemu_mutex_unlock_ramlist();
3234     qemu_mutex_unlock_iothread();
3235 }
3236
3237 static int ram_init_all(RAMState **rsp)
3238 {
3239     if (ram_state_init(rsp)) {
3240         return -1;
3241     }
3242
3243     if (xbzrle_init()) {
3244         ram_state_cleanup(rsp);
3245         return -1;
3246     }
3247
3248     ram_init_bitmaps(*rsp);
3249
3250     return 0;
3251 }
3252
3253 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3254 {
3255     RAMBlock *block;
3256     uint64_t pages = 0;
3257
3258     /*
3259      * Postcopy is not using xbzrle/compression, so no need for that.
3260      * Also, since source are already halted, we don't need to care
3261      * about dirty page logging as well.
3262      */
3263
3264     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3265         pages += bitmap_count_one(block->bmap,
3266                                   block->used_length >> TARGET_PAGE_BITS);
3267     }
3268
3269     /* This may not be aligned with current bitmaps. Recalculate. */
3270     rs->migration_dirty_pages = pages;
3271
3272     rs->last_seen_block = NULL;
3273     rs->last_sent_block = NULL;
3274     rs->last_page = 0;
3275     rs->last_version = ram_list.version;
3276     /*
3277      * Disable the bulk stage, otherwise we'll resend the whole RAM no
3278      * matter what we have sent.
3279      */
3280     rs->ram_bulk_stage = false;
3281
3282     /* Update RAMState cache of output QEMUFile */
3283     rs->f = out;
3284
3285     trace_ram_state_resume_prepare(pages);
3286 }
3287
3288 /*
3289  * This function clears bits of the free pages reported by the caller from the
3290  * migration dirty bitmap. @addr is the host address corresponding to the
3291  * start of the continuous guest free pages, and @len is the total bytes of
3292  * those pages.
3293  */
3294 void qemu_guest_free_page_hint(void *addr, size_t len)
3295 {
3296     RAMBlock *block;
3297     ram_addr_t offset;
3298     size_t used_len, start, npages;
3299     MigrationState *s = migrate_get_current();
3300
3301     /* This function is currently expected to be used during live migration */
3302     if (!migration_is_setup_or_active(s->state)) {
3303         return;
3304     }
3305
3306     for (; len > 0; len -= used_len, addr += used_len) {
3307         block = qemu_ram_block_from_host(addr, false, &offset);
3308         if (unlikely(!block || offset >= block->used_length)) {
3309             /*
3310              * The implementation might not support RAMBlock resize during
3311              * live migration, but it could happen in theory with future
3312              * updates. So we add a check here to capture that case.
3313              */
3314             error_report_once("%s unexpected error", __func__);
3315             return;
3316         }
3317
3318         if (len <= block->used_length - offset) {
3319             used_len = len;
3320         } else {
3321             used_len = block->used_length - offset;
3322         }
3323
3324         start = offset >> TARGET_PAGE_BITS;
3325         npages = used_len >> TARGET_PAGE_BITS;
3326
3327         qemu_mutex_lock(&ram_state->bitmap_mutex);
3328         ram_state->migration_dirty_pages -=
3329                       bitmap_count_one_with_offset(block->bmap, start, npages);
3330         bitmap_clear(block->bmap, start, npages);
3331         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3332     }
3333 }
3334
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */
3341
3342 /**
3343  * ram_save_setup: Setup RAM for migration
3344  *
3345  * Returns zero to indicate success and negative for error
3346  *
3347  * @f: QEMUFile where to send the data
3348  * @opaque: RAMState pointer
3349  */
3350 static int ram_save_setup(QEMUFile *f, void *opaque)
3351 {
3352     RAMState **rsp = opaque;
3353     RAMBlock *block;
3354
3355     if (compress_threads_save_setup()) {
3356         return -1;
3357     }
3358
3359     /* migration has already setup the bitmap, reuse it. */
3360     if (!migration_in_colo_state()) {
3361         if (ram_init_all(rsp) != 0) {
3362             compress_threads_save_cleanup();
3363             return -1;
3364         }
3365     }
3366     (*rsp)->f = f;
3367
3368     WITH_RCU_READ_LOCK_GUARD() {
3369         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3370
3371         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3372             qemu_put_byte(f, strlen(block->idstr));
3373             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3374             qemu_put_be64(f, block->used_length);
3375             if (migrate_postcopy_ram() && block->page_size !=
3376                                           qemu_host_page_size) {
3377                 qemu_put_be64(f, block->page_size);
3378             }
3379             if (migrate_ignore_shared()) {
3380                 qemu_put_be64(f, block->mr->addr);
3381             }
3382         }
3383     }
3384
3385     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3386     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3387
3388     multifd_send_sync_main(*rsp);
3389     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3390     qemu_fflush(f);
3391
3392     return 0;
3393 }
3394
3395 /**
3396  * ram_save_iterate: iterative stage for migration
3397  *
3398  * Returns zero to indicate success and negative for error
3399  *
3400  * @f: QEMUFile where to send the data
3401  * @opaque: RAMState pointer
3402  */
3403 static int ram_save_iterate(QEMUFile *f, void *opaque)
3404 {
3405     RAMState **temp = opaque;
3406     RAMState *rs = *temp;
3407     int ret;
3408     int i;
3409     int64_t t0;
3410     int done = 0;
3411
3412     if (blk_mig_bulk_active()) {
3413         /* Avoid transferring ram during bulk phase of block migration as
3414          * the bulk phase will usually take a long time and transferring
3415          * ram updates during that time is pointless. */
3416         goto out;
3417     }
3418
3419     WITH_RCU_READ_LOCK_GUARD() {
3420         if (ram_list.version != rs->last_version) {
3421             ram_state_reset(rs);
3422         }
3423
3424         /* Read version before ram_list.blocks */
3425         smp_rmb();
3426
3427         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3428
3429         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3430         i = 0;
3431         while ((ret = qemu_file_rate_limit(f)) == 0 ||
3432                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3433             int pages;
3434
3435             if (qemu_file_get_error(f)) {
3436                 break;
3437             }
3438
3439             pages = ram_find_and_save_block(rs, false);
3440             /* no more pages to sent */
3441             if (pages == 0) {
3442                 done = 1;
3443                 break;
3444             }
3445
3446             if (pages < 0) {
3447                 qemu_file_set_error(f, pages);
3448                 break;
3449             }
3450
3451             rs->target_page_count += pages;
3452
3453             /*
3454              * we want to check in the 1st loop, just in case it was the 1st
3455              * time and we had to sync the dirty bitmap.
3456              * qemu_clock_get_ns() is a bit expensive, so we only check each
3457              * some iterations
3458              */
3459             if ((i & 63) == 0) {
3460                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3461                               1000000;
3462                 if (t1 > MAX_WAIT) {
3463                     trace_ram_save_iterate_big_wait(t1, i);
3464                     break;
3465                 }
3466             }
3467             i++;
3468         }
3469     }
3470
3471     /*
3472      * Must occur before EOS (or any QEMUFile operation)
3473      * because of RDMA protocol.
3474      */
3475     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3476
3477 out:
3478     multifd_send_sync_main(rs);
3479     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3480     qemu_fflush(f);
3481     ram_counters.transferred += 8;
3482
3483     ret = qemu_file_get_error(f);
3484     if (ret < 0) {
3485         return ret;
3486     }
3487
3488     return done;
3489 }
3490
3491 /**
3492  * ram_save_complete: function called to send the remaining amount of ram
3493  *
3494  * Returns zero to indicate success or negative on error
3495  *
3496  * Called with iothread lock
3497  *
3498  * @f: QEMUFile where to send the data
3499  * @opaque: RAMState pointer
3500  */
3501 static int ram_save_complete(QEMUFile *f, void *opaque)
3502 {
3503     RAMState **temp = opaque;
3504     RAMState *rs = *temp;
3505     int ret = 0;
3506
3507     WITH_RCU_READ_LOCK_GUARD() {
3508         if (!migration_in_postcopy()) {
3509             migration_bitmap_sync_precopy(rs);
3510         }
3511
3512         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3513
3514         /* try transferring iterative blocks of memory */
3515
3516         /* flush all remaining blocks regardless of rate limiting */
3517         while (true) {
3518             int pages;
3519
3520             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3521             /* no more blocks to sent */
3522             if (pages == 0) {
3523                 break;
3524             }
3525             if (pages < 0) {
3526                 ret = pages;
3527                 break;
3528             }
3529         }
3530
3531         flush_compressed_data(rs);
3532         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3533     }
3534
3535     multifd_send_sync_main(rs);
3536     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3537     qemu_fflush(f);
3538
3539     return ret;
3540 }
3541
3542 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3543                              uint64_t *res_precopy_only,
3544                              uint64_t *res_compatible,
3545                              uint64_t *res_postcopy_only)
3546 {
3547     RAMState **temp = opaque;
3548     RAMState *rs = *temp;
3549     uint64_t remaining_size;
3550
3551     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3552
3553     if (!migration_in_postcopy() &&
3554         remaining_size < max_size) {
3555         qemu_mutex_lock_iothread();
3556         WITH_RCU_READ_LOCK_GUARD() {
3557             migration_bitmap_sync_precopy(rs);
3558         }
3559         qemu_mutex_unlock_iothread();
3560         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3561     }
3562
3563     if (migrate_postcopy_ram()) {
3564         /* We can do postcopy, and all the data is postcopiable */
3565         *res_compatible += remaining_size;
3566     } else {
3567         *res_precopy_only += remaining_size;
3568     }
3569 }
3570
3571 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3572 {
3573     unsigned int xh_len;
3574     int xh_flags;
3575     uint8_t *loaded_data;
3576
3577     /* extract RLE header */
3578     xh_flags = qemu_get_byte(f);
3579     xh_len = qemu_get_be16(f);
3580
3581     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3582         error_report("Failed to load XBZRLE page - wrong compression!");
3583         return -1;
3584     }
3585
3586     if (xh_len > TARGET_PAGE_SIZE) {
3587         error_report("Failed to load XBZRLE page - len overflow!");
3588         return -1;
3589     }
3590     loaded_data = XBZRLE.decoded_buf;
3591     /* load data and decode */
3592     /* it can change loaded_data to point to an internal buffer */
3593     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3594
3595     /* decode RLE */
3596     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3597                              TARGET_PAGE_SIZE) == -1) {
3598         error_report("Failed to load XBZRLE page - decode error!");
3599         return -1;
3600     }
3601
3602     return 0;
3603 }
3604
3605 /**
3606  * ram_block_from_stream: read a RAMBlock id from the migration stream
3607  *
3608  * Must be called from within a rcu critical section.
3609  *
3610  * Returns a pointer from within the RCU-protected ram_list.
3611  *
3612  * @f: QEMUFile where to read the data from
3613  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3614  */
3615 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3616 {
3617     static RAMBlock *block = NULL;
3618     char id[256];
3619     uint8_t len;
3620
3621     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3622         if (!block) {
3623             error_report("Ack, bad migration stream!");
3624             return NULL;
3625         }
3626         return block;
3627     }
3628
3629     len = qemu_get_byte(f);
3630     qemu_get_buffer(f, (uint8_t *)id, len);
3631     id[len] = 0;
3632
3633     block = qemu_ram_block_by_name(id);
3634     if (!block) {
3635         error_report("Can't find block %s", id);
3636         return NULL;
3637     }
3638
3639     if (ramblock_is_ignored(block)) {
3640         error_report("block %s should not be migrated !", id);
3641         return NULL;
3642     }
3643
3644     return block;
3645 }
3646
3647 static inline void *host_from_ram_block_offset(RAMBlock *block,
3648                                                ram_addr_t offset)
3649 {
3650     if (!offset_in_ramblock(block, offset)) {
3651         return NULL;
3652     }
3653
3654     return block->host + offset;
3655 }
3656
3657 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3658                                                  ram_addr_t offset)
3659 {
3660     if (!offset_in_ramblock(block, offset)) {
3661         return NULL;
3662     }
3663     if (!block->colo_cache) {
3664         error_report("%s: colo_cache is NULL in block :%s",
3665                      __func__, block->idstr);
3666         return NULL;
3667     }
3668
3669     /*
3670     * During colo checkpoint, we need bitmap of these migrated pages.
3671     * It help us to decide which pages in ram cache should be flushed
3672     * into VM's RAM later.
3673     */
3674     if (!test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3675         ram_state->migration_dirty_pages++;
3676     }
3677     return block->colo_cache + offset;
3678 }
3679
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.  The memory is left untouched when
 * @ch is zero and the range already reads as all zeroes.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and asked to zero it: skip the write */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
3696
3697 /* return the size after decompression, or negative value on error */
3698 static int
3699 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3700                      const uint8_t *source, size_t source_len)
3701 {
3702     int err;
3703
3704     err = inflateReset(stream);
3705     if (err != Z_OK) {
3706         return -1;
3707     }
3708
3709     stream->avail_in = source_len;
3710     stream->next_in = (uint8_t *)source;
3711     stream->avail_out = dest_len;
3712     stream->next_out = dest;
3713
3714     err = inflate(stream, Z_NO_FLUSH);
3715     if (err != Z_STREAM_END) {
3716         return -1;
3717     }
3718
3719     return stream->total_out;
3720 }
3721
3722 static void *do_data_decompress(void *opaque)
3723 {
3724     DecompressParam *param = opaque;
3725     unsigned long pagesize;
3726     uint8_t *des;
3727     int len, ret;
3728
3729     qemu_mutex_lock(&param->mutex);
3730     while (!param->quit) {
3731         if (param->des) {
3732             des = param->des;
3733             len = param->len;
3734             param->des = 0;
3735             qemu_mutex_unlock(&param->mutex);
3736
3737             pagesize = TARGET_PAGE_SIZE;
3738
3739             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3740                                        param->compbuf, len);
3741             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3742                 error_report("decompress data failed");
3743                 qemu_file_set_error(decomp_file, ret);
3744             }
3745
3746             qemu_mutex_lock(&decomp_done_lock);
3747             param->done = true;
3748             qemu_cond_signal(&decomp_done_cond);
3749             qemu_mutex_unlock(&decomp_done_lock);
3750
3751             qemu_mutex_lock(&param->mutex);
3752         } else {
3753             qemu_cond_wait(&param->cond, &param->mutex);
3754         }
3755     }
3756     qemu_mutex_unlock(&param->mutex);
3757
3758     return NULL;
3759 }
3760
3761 static int wait_for_decompress_done(void)
3762 {
3763     int idx, thread_count;
3764
3765     if (!migrate_use_compression()) {
3766         return 0;
3767     }
3768
3769     thread_count = migrate_decompress_threads();
3770     qemu_mutex_lock(&decomp_done_lock);
3771     for (idx = 0; idx < thread_count; idx++) {
3772         while (!decomp_param[idx].done) {
3773             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3774         }
3775     }
3776     qemu_mutex_unlock(&decomp_done_lock);
3777     return qemu_file_get_error(decomp_file);
3778 }
3779
3780 static void compress_threads_load_cleanup(void)
3781 {
3782     int i, thread_count;
3783
3784     if (!migrate_use_compression()) {
3785         return;
3786     }
3787     thread_count = migrate_decompress_threads();
3788     for (i = 0; i < thread_count; i++) {
3789         /*
3790          * we use it as a indicator which shows if the thread is
3791          * properly init'd or not
3792          */
3793         if (!decomp_param[i].compbuf) {
3794             break;
3795         }
3796
3797         qemu_mutex_lock(&decomp_param[i].mutex);
3798         decomp_param[i].quit = true;
3799         qemu_cond_signal(&decomp_param[i].cond);
3800         qemu_mutex_unlock(&decomp_param[i].mutex);
3801     }
3802     for (i = 0; i < thread_count; i++) {
3803         if (!decomp_param[i].compbuf) {
3804             break;
3805         }
3806
3807         qemu_thread_join(decompress_threads + i);
3808         qemu_mutex_destroy(&decomp_param[i].mutex);
3809         qemu_cond_destroy(&decomp_param[i].cond);
3810         inflateEnd(&decomp_param[i].stream);
3811         g_free(decomp_param[i].compbuf);
3812         decomp_param[i].compbuf = NULL;
3813     }
3814     g_free(decompress_threads);
3815     g_free(decomp_param);
3816     decompress_threads = NULL;
3817     decomp_param = NULL;
3818     decomp_file = NULL;
3819 }
3820
3821 static int compress_threads_load_setup(QEMUFile *f)
3822 {
3823     int i, thread_count;
3824
3825     if (!migrate_use_compression()) {
3826         return 0;
3827     }
3828
3829     thread_count = migrate_decompress_threads();
3830     decompress_threads = g_new0(QemuThread, thread_count);
3831     decomp_param = g_new0(DecompressParam, thread_count);
3832     qemu_mutex_init(&decomp_done_lock);
3833     qemu_cond_init(&decomp_done_cond);
3834     decomp_file = f;
3835     for (i = 0; i < thread_count; i++) {
3836         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3837             goto exit;
3838         }
3839
3840         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3841         qemu_mutex_init(&decomp_param[i].mutex);
3842         qemu_cond_init(&decomp_param[i].cond);
3843         decomp_param[i].done = true;
3844         decomp_param[i].quit = false;
3845         qemu_thread_create(decompress_threads + i, "decompress",
3846                            do_data_decompress, decomp_param + i,
3847                            QEMU_THREAD_JOINABLE);
3848     }
3849     return 0;
3850 exit:
3851     compress_threads_load_cleanup();
3852     return -1;
3853 }
3854
3855 static void decompress_data_with_multi_threads(QEMUFile *f,
3856                                                void *host, int len)
3857 {
3858     int idx, thread_count;
3859
3860     thread_count = migrate_decompress_threads();
3861     qemu_mutex_lock(&decomp_done_lock);
3862     while (true) {
3863         for (idx = 0; idx < thread_count; idx++) {
3864             if (decomp_param[idx].done) {
3865                 decomp_param[idx].done = false;
3866                 qemu_mutex_lock(&decomp_param[idx].mutex);
3867                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3868                 decomp_param[idx].des = host;
3869                 decomp_param[idx].len = len;
3870                 qemu_cond_signal(&decomp_param[idx].cond);
3871                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3872                 break;
3873             }
3874         }
3875         if (idx < thread_count) {
3876             break;
3877         } else {
3878             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3879         }
3880     }
3881     qemu_mutex_unlock(&decomp_done_lock);
3882 }
3883
3884 /*
3885  * colo cache: this is for secondary VM, we cache the whole
3886  * memory of the secondary VM, it is need to hold the global lock
3887  * to call this helper.
3888  */
3889 int colo_init_ram_cache(void)
3890 {
3891     RAMBlock *block;
3892
3893     WITH_RCU_READ_LOCK_GUARD() {
3894         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3895             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3896                                                     NULL,
3897                                                     false);
3898             if (!block->colo_cache) {
3899                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3900                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3901                              block->used_length);
3902                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3903                     if (block->colo_cache) {
3904                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3905                         block->colo_cache = NULL;
3906                     }
3907                 }
3908                 return -errno;
3909             }
3910             memcpy(block->colo_cache, block->host, block->used_length);
3911         }
3912     }
3913
3914     /*
3915     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3916     * with to decide which page in cache should be flushed into SVM's RAM. Here
3917     * we use the same name 'ram_bitmap' as for migration.
3918     */
3919     if (ram_bytes_total()) {
3920         RAMBlock *block;
3921
3922         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3923             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3924
3925             block->bmap = bitmap_new(pages);
3926             bitmap_set(block->bmap, 0, pages);
3927         }
3928     }
3929     ram_state = g_new0(RAMState, 1);
3930     ram_state->migration_dirty_pages = 0;
3931     qemu_mutex_init(&ram_state->bitmap_mutex);
3932     memory_global_dirty_log_start();
3933
3934     return 0;
3935 }
3936
3937 /* It is need to hold the global lock to call this helper */
3938 void colo_release_ram_cache(void)
3939 {
3940     RAMBlock *block;
3941
3942     memory_global_dirty_log_stop();
3943     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3944         g_free(block->bmap);
3945         block->bmap = NULL;
3946     }
3947
3948     WITH_RCU_READ_LOCK_GUARD() {
3949         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3950             if (block->colo_cache) {
3951                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3952                 block->colo_cache = NULL;
3953             }
3954         }
3955     }
3956     qemu_mutex_destroy(&ram_state->bitmap_mutex);
3957     g_free(ram_state);
3958     ram_state = NULL;
3959 }
3960
3961 /**
3962  * ram_load_setup: Setup RAM for migration incoming side
3963  *
3964  * Returns zero to indicate success and negative for error
3965  *
3966  * @f: QEMUFile where to receive the data
3967  * @opaque: RAMState pointer
3968  */
3969 static int ram_load_setup(QEMUFile *f, void *opaque)
3970 {
3971     if (compress_threads_load_setup(f)) {
3972         return -1;
3973     }
3974
3975     xbzrle_load_setup();
3976     ramblock_recv_map_init();
3977
3978     return 0;
3979 }
3980
3981 static int ram_load_cleanup(void *opaque)
3982 {
3983     RAMBlock *rb;
3984
3985     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3986         qemu_ram_block_writeback(rb);
3987     }
3988
3989     xbzrle_load_cleanup();
3990     compress_threads_load_cleanup();
3991
3992     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3993         g_free(rb->receivedmap);
3994         rb->receivedmap = NULL;
3995     }
3996
3997     return 0;
3998 }
3999
4000 /**
4001  * ram_postcopy_incoming_init: allocate postcopy data structures
4002  *
4003  * Returns 0 for success and negative if there was one error
4004  *
4005  * @mis: current migration incoming state
4006  *
4007  * Allocate data structures etc needed by incoming migration with
4008  * postcopy-ram. postcopy-ram's similarly names
4009  * postcopy_ram_incoming_init does the work.
4010  */
4011 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4012 {
4013     return postcopy_ram_incoming_init(mis);
4014 }
4015
4016 /**
4017  * ram_load_postcopy: load a page in postcopy case
4018  *
4019  * Returns 0 for success or -errno in case of error
4020  *
4021  * Called in postcopy mode by ram_load().
4022  * rcu_read_lock is taken prior to this being called.
4023  *
4024  * @f: QEMUFile where to send the data
4025  */
4026 static int ram_load_postcopy(QEMUFile *f)
4027 {
4028     int flags = 0, ret = 0;
4029     bool place_needed = false;
4030     bool matches_target_page_size = false;
4031     MigrationIncomingState *mis = migration_incoming_get_current();
4032     /* Temporary page that is later 'placed' */
4033     void *postcopy_host_page = mis->postcopy_tmp_page;
4034     void *last_host = NULL;
4035     bool all_zero = false;
4036
4037     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4038         ram_addr_t addr;
4039         void *host = NULL;
4040         void *page_buffer = NULL;
4041         void *place_source = NULL;
4042         RAMBlock *block = NULL;
4043         uint8_t ch;
4044
4045         addr = qemu_get_be64(f);
4046
4047         /*
4048          * If qemu file error, we should stop here, and then "addr"
4049          * may be invalid
4050          */
4051         ret = qemu_file_get_error(f);
4052         if (ret) {
4053             break;
4054         }
4055
4056         flags = addr & ~TARGET_PAGE_MASK;
4057         addr &= TARGET_PAGE_MASK;
4058
4059         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
4060         place_needed = false;
4061         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
4062             block = ram_block_from_stream(f, flags);
4063
4064             host = host_from_ram_block_offset(block, addr);
4065             if (!host) {
4066                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4067                 ret = -EINVAL;
4068                 break;
4069             }
4070             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4071             /*
4072              * Postcopy requires that we place whole host pages atomically;
4073              * these may be huge pages for RAMBlocks that are backed by
4074              * hugetlbfs.
4075              * To make it atomic, the data is read into a temporary page
4076              * that's moved into place later.
4077              * The migration protocol uses,  possibly smaller, target-pages
4078              * however the source ensures it always sends all the components
4079              * of a host page in order.
4080              */
4081             page_buffer = postcopy_host_page +
4082                           ((uintptr_t)host & (block->page_size - 1));
4083             /* If all TP are zero then we can optimise the place */
4084             if (!((uintptr_t)host & (block->page_size - 1))) {
4085                 all_zero = true;
4086             } else {
4087                 /* not the 1st TP within the HP */
4088                 if (host != (last_host + TARGET_PAGE_SIZE)) {
4089                     error_report("Non-sequential target page %p/%p",
4090                                   host, last_host);
4091                     ret = -EINVAL;
4092                     break;
4093                 }
4094             }
4095
4096
4097             /*
4098              * If it's the last part of a host page then we place the host
4099              * page
4100              */
4101             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
4102                                      (block->page_size - 1)) == 0;
4103             place_source = postcopy_host_page;
4104         }
4105         last_host = host;
4106
4107         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4108         case RAM_SAVE_FLAG_ZERO:
4109             ch = qemu_get_byte(f);
4110             memset(page_buffer, ch, TARGET_PAGE_SIZE);
4111             if (ch) {
4112                 all_zero = false;
4113             }
4114             break;
4115
4116         case RAM_SAVE_FLAG_PAGE:
4117             all_zero = false;
4118             if (!matches_target_page_size) {
4119                 /* For huge pages, we always use temporary buffer */
4120                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4121             } else {
4122                 /*
4123                  * For small pages that matches target page size, we
4124                  * avoid the qemu_file copy.  Instead we directly use
4125                  * the buffer of QEMUFile to place the page.  Note: we
4126                  * cannot do any QEMUFile operation before using that
4127                  * buffer to make sure the buffer is valid when
4128                  * placing the page.
4129                  */
4130                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4131                                          TARGET_PAGE_SIZE);
4132             }
4133             break;
4134         case RAM_SAVE_FLAG_EOS:
4135             /* normal exit */
4136             multifd_recv_sync_main();
4137             break;
4138         default:
4139             error_report("Unknown combination of migration flags: %#x"
4140                          " (postcopy mode)", flags);
4141             ret = -EINVAL;
4142             break;
4143         }
4144
4145         /* Detect for any possible file errors */
4146         if (!ret && qemu_file_get_error(f)) {
4147             ret = qemu_file_get_error(f);
4148         }
4149
4150         if (!ret && place_needed) {
4151             /* This gets called at the last target page in the host page */
4152             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
4153
4154             if (all_zero) {
4155                 ret = postcopy_place_page_zero(mis, place_dest,
4156                                                block);
4157             } else {
4158                 ret = postcopy_place_page(mis, place_dest,
4159                                           place_source, block);
4160             }
4161         }
4162     }
4163
4164     return ret;
4165 }
4166
4167 static bool postcopy_is_advised(void)
4168 {
4169     PostcopyState ps = postcopy_state_get();
4170     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4171 }
4172
4173 static bool postcopy_is_running(void)
4174 {
4175     PostcopyState ps = postcopy_state_get();
4176     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4177 }
4178
4179 /*
4180  * Flush content of RAM cache into SVM's memory.
4181  * Only flush the pages that be dirtied by PVM or SVM or both.
4182  */
4183 static void colo_flush_ram_cache(void)
4184 {
4185     RAMBlock *block = NULL;
4186     void *dst_host;
4187     void *src_host;
4188     unsigned long offset = 0;
4189
4190     memory_global_dirty_log_sync();
4191     WITH_RCU_READ_LOCK_GUARD() {
4192         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4193             ramblock_sync_dirty_bitmap(ram_state, block);
4194         }
4195     }
4196
4197     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4198     WITH_RCU_READ_LOCK_GUARD() {
4199         block = QLIST_FIRST_RCU(&ram_list.blocks);
4200
4201         while (block) {
4202             offset = migration_bitmap_find_dirty(ram_state, block, offset);
4203
4204             if (offset << TARGET_PAGE_BITS >= block->used_length) {
4205                 offset = 0;
4206                 block = QLIST_NEXT_RCU(block, next);
4207             } else {
4208                 migration_bitmap_clear_dirty(ram_state, block, offset);
4209                 dst_host = block->host + (offset << TARGET_PAGE_BITS);
4210                 src_host = block->colo_cache + (offset << TARGET_PAGE_BITS);
4211                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
4212             }
4213         }
4214     }
4215     trace_colo_flush_ram_cache_end();
4216 }
4217
4218 /**
4219  * ram_load_precopy: load pages in precopy case
4220  *
4221  * Returns 0 for success or -errno in case of error
4222  *
4223  * Called in precopy mode by ram_load().
4224  * rcu_read_lock is taken prior to this being called.
4225  *
4226  * @f: QEMUFile where to send the data
4227  */
4228 static int ram_load_precopy(QEMUFile *f)
4229 {
4230     int flags = 0, ret = 0, invalid_flags = 0, len = 0;
4231     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4232     bool postcopy_advised = postcopy_is_advised();
4233     if (!migrate_use_compression()) {
4234         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4235     }
4236
4237     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4238         ram_addr_t addr, total_ram_bytes;
4239         void *host = NULL;
4240         uint8_t ch;
4241
4242         addr = qemu_get_be64(f);
4243         flags = addr & ~TARGET_PAGE_MASK;
4244         addr &= TARGET_PAGE_MASK;
4245
4246         if (flags & invalid_flags) {
4247             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4248                 error_report("Received an unexpected compressed page");
4249             }
4250
4251             ret = -EINVAL;
4252             break;
4253         }
4254
4255         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4256                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4257             RAMBlock *block = ram_block_from_stream(f, flags);
4258
4259             /*
4260              * After going into COLO, we should load the Page into colo_cache.
4261              */
4262             if (migration_incoming_in_colo_state()) {
4263                 host = colo_cache_from_block_offset(block, addr);
4264             } else {
4265                 host = host_from_ram_block_offset(block, addr);
4266             }
4267             if (!host) {
4268                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4269                 ret = -EINVAL;
4270                 break;
4271             }
4272
4273             if (!migration_incoming_in_colo_state()) {
4274                 ramblock_recv_bitmap_set(block, host);
4275             }
4276
4277             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4278         }
4279
4280         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4281         case RAM_SAVE_FLAG_MEM_SIZE:
4282             /* Synchronize RAM block list */
4283             total_ram_bytes = addr;
4284             while (!ret && total_ram_bytes) {
4285                 RAMBlock *block;
4286                 char id[256];
4287                 ram_addr_t length;
4288
4289                 len = qemu_get_byte(f);
4290                 qemu_get_buffer(f, (uint8_t *)id, len);
4291                 id[len] = 0;
4292                 length = qemu_get_be64(f);
4293
4294                 block = qemu_ram_block_by_name(id);
4295                 if (block && !qemu_ram_is_migratable(block)) {
4296                     error_report("block %s should not be migrated !", id);
4297                     ret = -EINVAL;
4298                 } else if (block) {
4299                     if (length != block->used_length) {
4300                         Error *local_err = NULL;
4301
4302                         ret = qemu_ram_resize(block, length,
4303                                               &local_err);
4304                         if (local_err) {
4305                             error_report_err(local_err);
4306                         }
4307                     }
4308                     /* For postcopy we need to check hugepage sizes match */
4309                     if (postcopy_advised &&
4310                         block->page_size != qemu_host_page_size) {
4311                         uint64_t remote_page_size = qemu_get_be64(f);
4312                         if (remote_page_size != block->page_size) {
4313                             error_report("Mismatched RAM page size %s "
4314                                          "(local) %zd != %" PRId64,
4315                                          id, block->page_size,
4316                                          remote_page_size);
4317                             ret = -EINVAL;
4318                         }
4319                     }
4320                     if (migrate_ignore_shared()) {
4321                         hwaddr addr = qemu_get_be64(f);
4322                         if (ramblock_is_ignored(block) &&
4323                             block->mr->addr != addr) {
4324                             error_report("Mismatched GPAs for block %s "
4325                                          "%" PRId64 "!= %" PRId64,
4326                                          id, (uint64_t)addr,
4327                                          (uint64_t)block->mr->addr);
4328                             ret = -EINVAL;
4329                         }
4330                     }
4331                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4332                                           block->idstr);
4333                 } else {
4334                     error_report("Unknown ramblock \"%s\", cannot "
4335                                  "accept migration", id);
4336                     ret = -EINVAL;
4337                 }
4338
4339                 total_ram_bytes -= length;
4340             }
4341             break;
4342
4343         case RAM_SAVE_FLAG_ZERO:
4344             ch = qemu_get_byte(f);
4345             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4346             break;
4347
4348         case RAM_SAVE_FLAG_PAGE:
4349             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4350             break;
4351
4352         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4353             len = qemu_get_be32(f);
4354             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4355                 error_report("Invalid compressed data length: %d", len);
4356                 ret = -EINVAL;
4357                 break;
4358             }
4359             decompress_data_with_multi_threads(f, host, len);
4360             break;
4361
4362         case RAM_SAVE_FLAG_XBZRLE:
4363             if (load_xbzrle(f, addr, host) < 0) {
4364                 error_report("Failed to decompress XBZRLE page at "
4365                              RAM_ADDR_FMT, addr);
4366                 ret = -EINVAL;
4367                 break;
4368             }
4369             break;
4370         case RAM_SAVE_FLAG_EOS:
4371             /* normal exit */
4372             multifd_recv_sync_main();
4373             break;
4374         default:
4375             if (flags & RAM_SAVE_FLAG_HOOK) {
4376                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4377             } else {
4378                 error_report("Unknown combination of migration flags: %#x",
4379                              flags);
4380                 ret = -EINVAL;
4381             }
4382         }
4383         if (!ret) {
4384             ret = qemu_file_get_error(f);
4385         }
4386     }
4387
4388     return ret;
4389 }
4390
4391 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4392 {
4393     int ret = 0;
4394     static uint64_t seq_iter;
4395     /*
4396      * If system is running in postcopy mode, page inserts to host memory must
4397      * be atomic
4398      */
4399     bool postcopy_running = postcopy_is_running();
4400
4401     seq_iter++;
4402
4403     if (version_id != 4) {
4404         return -EINVAL;
4405     }
4406
4407     /*
4408      * This RCU critical section can be very long running.
4409      * When RCU reclaims in the code start to become numerous,
4410      * it will be necessary to reduce the granularity of this
4411      * critical section.
4412      */
4413     WITH_RCU_READ_LOCK_GUARD() {
4414         if (postcopy_running) {
4415             ret = ram_load_postcopy(f);
4416         } else {
4417             ret = ram_load_precopy(f);
4418         }
4419
4420         ret |= wait_for_decompress_done();
4421     }
4422     trace_ram_load_complete(ret, seq_iter);
4423
4424     if (!ret  && migration_incoming_in_colo_state()) {
4425         colo_flush_ram_cache();
4426     }
4427     return ret;
4428 }
4429
4430 static bool ram_has_postcopy(void *opaque)
4431 {
4432     RAMBlock *rb;
4433     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4434         if (ramblock_is_pmem(rb)) {
4435             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4436                          "is not supported now!", rb->idstr, rb->host);
4437             return false;
4438         }
4439     }
4440
4441     return migrate_postcopy_ram();
4442 }
4443
4444 /* Sync all the dirty bitmap with destination VM.  */
4445 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4446 {
4447     RAMBlock *block;
4448     QEMUFile *file = s->to_dst_file;
4449     int ramblock_count = 0;
4450
4451     trace_ram_dirty_bitmap_sync_start();
4452
4453     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4454         qemu_savevm_send_recv_bitmap(file, block->idstr);
4455         trace_ram_dirty_bitmap_request(block->idstr);
4456         ramblock_count++;
4457     }
4458
4459     trace_ram_dirty_bitmap_sync_wait();
4460
4461     /* Wait until all the ramblocks' dirty bitmap synced */
4462     while (ramblock_count--) {
4463         qemu_sem_wait(&s->rp_state.rp_sem);
4464     }
4465
4466     trace_ram_dirty_bitmap_sync_complete();
4467
4468     return 0;
4469 }
4470
4471 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4472 {
4473     qemu_sem_post(&s->rp_state.rp_sem);
4474 }
4475
4476 /*
4477  * Read the received bitmap, revert it as the initial dirty bitmap.
4478  * This is only used when the postcopy migration is paused but wants
4479  * to resume from a middle point.
4480  */
4481 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4482 {
4483     int ret = -EINVAL;
4484     QEMUFile *file = s->rp_state.from_dst_file;
4485     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4486     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4487     uint64_t size, end_mark;
4488
4489     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4490
4491     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4492         error_report("%s: incorrect state %s", __func__,
4493                      MigrationStatus_str(s->state));
4494         return -EINVAL;
4495     }
4496
4497     /*
4498      * Note: see comments in ramblock_recv_bitmap_send() on why we
4499      * need the endianess convertion, and the paddings.
4500      */
4501     local_size = ROUND_UP(local_size, 8);
4502
4503     /* Add paddings */
4504     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4505
4506     size = qemu_get_be64(file);
4507
4508     /* The size of the bitmap should match with our ramblock */
4509     if (size != local_size) {
4510         error_report("%s: ramblock '%s' bitmap size mismatch "
4511                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4512                      block->idstr, size, local_size);
4513         ret = -EINVAL;
4514         goto out;
4515     }
4516
4517     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4518     end_mark = qemu_get_be64(file);
4519
4520     ret = qemu_file_get_error(file);
4521     if (ret || size != local_size) {
4522         error_report("%s: read bitmap failed for ramblock '%s': %d"
4523                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4524                      __func__, block->idstr, ret, local_size, size);
4525         ret = -EIO;
4526         goto out;
4527     }
4528
4529     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4530         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
4531                      __func__, block->idstr, end_mark);
4532         ret = -EINVAL;
4533         goto out;
4534     }
4535
4536     /*
4537      * Endianess convertion. We are during postcopy (though paused).
4538      * The dirty bitmap won't change. We can directly modify it.
4539      */
4540     bitmap_from_le(block->bmap, le_bitmap, nbits);
4541
4542     /*
4543      * What we received is "received bitmap". Revert it as the initial
4544      * dirty bitmap for this ramblock.
4545      */
4546     bitmap_complement(block->bmap, block->bmap, nbits);
4547
4548     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4549
4550     /*
4551      * We succeeded to sync bitmap for current ramblock. If this is
4552      * the last one to sync, we need to notify the main send thread.
4553      */
4554     ram_dirty_bitmap_reload_notify(s);
4555
4556     ret = 0;
4557 out:
4558     g_free(le_bitmap);
4559     return ret;
4560 }
4561
4562 static int ram_resume_prepare(MigrationState *s, void *opaque)
4563 {
4564     RAMState *rs = *(RAMState **)opaque;
4565     int ret;
4566
4567     ret = ram_dirty_bitmap_sync_all(s, rs);
4568     if (ret) {
4569         return ret;
4570     }
4571
4572     ram_state_resume_prepare(rs, s->to_dst_file);
4573
4574     return 0;
4575 }
4576
4577 static SaveVMHandlers savevm_ram_handlers = {
4578     .save_setup = ram_save_setup,
4579     .save_live_iterate = ram_save_iterate,
4580     .save_live_complete_postcopy = ram_save_complete,
4581     .save_live_complete_precopy = ram_save_complete,
4582     .has_postcopy = ram_has_postcopy,
4583     .save_live_pending = ram_save_pending,
4584     .load_state = ram_load,
4585     .save_cleanup = ram_save_cleanup,
4586     .load_setup = ram_load_setup,
4587     .load_cleanup = ram_load_cleanup,
4588     .resume_prepare = ram_resume_prepare,
4589 };
4590
4591 void ram_mig_init(void)
4592 {
4593     qemu_mutex_init(&XBZRLE.lock);
4594     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4595 }