*/
#include "qemu/osdep.h"
-#include "cpu.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "block.h"
-#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
+#include "sysemu/runstate.h"
+
+#if defined(__linux__)
+#include "qemu/userfaultfd.h"
+#endif /* defined(__linux__) */
/***********************************************************/
/* ram save/restore */
/**
* xbzrle_cache_resize: resize the xbzrle cache
*
- * This function is called from qmp_migrate_set_cache_size in main
+ * This function is called from migrate_params_apply in the main
* thread, possibly while a migration is in progress. A running
* migration may be using the cache and might finish during this call,
* hence changes to the cache are protected by XBZRLE.lock().
* @new_size: new cache size
* @errp: set *errp if the check failed, with reason
*/
-int xbzrle_cache_resize(int64_t new_size, Error **errp)
+int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
PageCache *new_cache;
int64_t ret = 0;
return -1;
}
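As a usage sketch (hypothetical caller shape, not this patch's code; the in-tree call site is migrate_params_apply, per the doc comment above), a caller resizes the cache and propagates failure through errp:

    Error *local_err = NULL;

    if (xbzrle_cache_resize(params->xbzrle_cache_size, &local_err) < 0) {
        /* Resize failed; hand the reason back to our own caller. */
        error_propagate(errp, local_err);
    }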
- nbits = block->used_length >> TARGET_PAGE_BITS;
+ nbits = block->postcopy_length >> TARGET_PAGE_BITS;
/*
* Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
struct RAMState {
/* QEMUFile used for this migration */
QEMUFile *f;
+ /* UFFD file descriptor, used in 'write-tracking' migration */
+ int uffdio_fd;
/* Last block that we have visited searching for dirty pages */
RAMBlock *last_seen_block;
/* Last block from where we have sent data */
RAMBlock *last_sent_block;
/* Last dirty target page we have sent */
ram_addr_t last_page;
/* last ram version we have seen */
uint32_t last_version;
- /* We are in the first round */
- bool ram_bulk_stage;
- /* The free page optimization is enabled */
- bool fpo_enabled;
/* How many times we have dirty too many pages */
int dirty_rate_high_cnt;
/* these variables are used for bitmap sync */
uint64_t xbzrle_pages_prev;
/* Amount of xbzrle encoded bytes since the beginning of the period */
uint64_t xbzrle_bytes_prev;
+ /* Start using XBZRLE (e.g., after the first round). */
+ bool xbzrle_enabled;
/* compression statistics since the beginning of the period */
/* number of times no free thread was available to compress data */
return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}
-void precopy_enable_free_page_optimization(void)
-{
- if (!ram_state) {
- return;
- }
-
- ram_state->fpo_enabled = true;
-}
-
uint64_t ram_bytes_remaining(void)
{
return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                   0;
}
/* comp_param[i].file is just used as a dummy buffer to save data,
* set its ops to empty.
*/
- comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
+ comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
comp_param[i].done = true;
comp_param[i].quit = false;
qemu_mutex_init(&comp_param[i].mutex);
}
/**
- * mig_throttle_guest_down: throotle down the guest
+ * mig_throttle_guest_down: throttle down the guest
*
* Reduce amount of guest cpu execution to hopefully slow down memory
* writes. If guest dirty memory rate is reduced below the rate at
*/
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
- if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
+ if (!rs->xbzrle_enabled) {
return;
}
{
unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
unsigned long *bitmap = rb->bmap;
- unsigned long next;
if (ramblock_is_ignored(rb)) {
return size;
}
+ return find_next_bit(bitmap, size, start);
+}
+
+static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
+ RAMBlock *rb,
+ unsigned long page)
+{
+ uint8_t shift;
+ hwaddr size, start;
+
+ if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
+ return;
+ }
+
+ shift = rb->clear_bmap_shift;
/*
- * When the free page optimization is enabled, we need to check the bitmap
- * to send the non-free pages rather than all the pages in the bulk stage.
+ * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It also
+ * makes things easier: with shift >= 6 the start address of each
+ * small chunk is 64-page aligned, so its slice of the bitmap is
+ * always aligned to an unsigned long. We could probably lift this
+ * restriction, but we keep it for now.
*/
- if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
- next = start + 1;
- } else {
- next = find_next_bit(bitmap, size, start);
- }
+ assert(shift >= 6);
+
+ size = 1ULL << (TARGET_PAGE_BITS + shift);
+ start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
+ trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
+ memory_region_clear_dirty_bitmap(rb->mr, start, size);
+}
- return next;
+static void
+migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
+ RAMBlock *rb,
+ unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
+ unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
+ unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
+
+ /*
+ * Clear pages from start to start + npages - 1, so the end boundary is
+ * exclusive.
+ */
+ for (i = chunk_start; i < chunk_end; i += chunk_pages) {
+ migration_clear_memory_region_dirty_bitmap(rs, rb, i);
+ }
}
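A worked example of the two helpers above, under assumed values (assuming TARGET_PAGE_BITS = 12; the shift values are illustrative):

    /* Per-chunk clearing with clear_bmap_shift = 18:
     *   size  = 1ULL << (12 + 18) = 1 GiB per chunk
     *   start = (page << 12) & -size, i.e. rounded down to the chunk base
     *
     * Range clearing with clear_bmap_shift = 6 (64-page chunks),
     * start = 100, npages = 200:
     *   chunk_start = QEMU_ALIGN_DOWN(100, 64)     = 64
     *   chunk_end   = QEMU_ALIGN_UP(100 + 200, 64) = 320
     * so the chunks beginning at pages 64, 128, 192 and 256 are cleared,
     * covering every page in [100, 300).
     */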
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
{
bool ret;
- qemu_mutex_lock(&rs->bitmap_mutex);
-
/*
* Clear dirty bitmap if needed. This _must_ be called before we
* send any of the page in the chunk because we need to make sure
* we can capture further page content changes when we sync dirty
* log the next time. So as long as we are going to send any of
* the page in the chunk we clear the remote dirty bitmap for all.
* Clearing it earlier won't be a problem, but too late will.
*/
- if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
- uint8_t shift = rb->clear_bmap_shift;
- hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
- hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
-
- /*
- * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
- * can make things easier sometimes since then start address
- * of the small chunk will always be 64 pages aligned so the
- * bitmap will always be aligned to unsigned long. We should
- * even be able to remove this restriction but I'm simply
- * keeping it.
- */
- assert(shift >= 6);
- trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
- memory_region_clear_dirty_bitmap(rb->mr, start, size);
- }
+ migration_clear_memory_region_dirty_bitmap(rs, rb, page);
ret = test_and_clear_bit(page, rb->bmap);
-
if (ret) {
rs->migration_dirty_pages--;
}
- qemu_mutex_unlock(&rs->bitmap_mutex);
return ret;
}
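Since the qemu_mutex_lock/unlock pair is dropped here, the function now assumes the caller serializes access to the bitmap; a minimal sketch of the caller-side pattern (the real lock is taken in ram_save_iterate and qemu_guest_free_page_hint, as later hunks show):

    qemu_mutex_lock(&rs->bitmap_mutex);
    if (migration_bitmap_clear_dirty(rs, block, page)) {
        /* Page was dirty: send it while the bitmap is still protected. */
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);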
trace_ram_save_page(block->idstr, (uint64_t)offset, p);
XBZRLE_cache_lock();
- if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
- migrate_use_xbzrle()) {
+ if (rs->xbzrle_enabled && !migration_in_postcopy()) {
pages = save_xbzrle_page(rs, &p, current_addr, block,
offset, last_stage);
if (!last_stage) {
*again = false;
return false;
}
- if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
- >= pss->block->used_length) {
+ if (!offset_in_ramblock(pss->block,
+ ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
/* Didn't find anything in this RAM Block */
pss->page = 0;
pss->block = QLIST_NEXT_RCU(pss->block, next);
pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
/* Flag that we've looped */
pss->complete_round = true;
- rs->ram_bulk_stage = false;
+ /* After the first round, enable XBZRLE. */
+ if (migrate_use_xbzrle()) {
+ rs->xbzrle_enabled = true;
+ }
}
/* Didn't find anything this time, but try again on the new block */
*again = true;
return block;
}
+#if defined(__linux__)
+/**
+ * poll_fault_page: try to get next UFFD write fault page and, if pending fault
+ * is found, return RAM block pointer and page offset
+ *
+ * Returns pointer to the RAMBlock containing faulting page,
+ * NULL if no write faults are pending
+ *
+ * @rs: current RAM state
+ * @offset: page offset from the beginning of the block
+ */
+static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
+{
+ struct uffd_msg uffd_msg;
+ void *page_address;
+ RAMBlock *block;
+ int res;
+
+ if (!migrate_background_snapshot()) {
+ return NULL;
+ }
+
+ res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
+ if (res <= 0) {
+ return NULL;
+ }
+
+ page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
+ block = qemu_ram_block_from_host(page_address, false, offset);
+ assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
+ return block;
+}
+
+/**
+ * ram_save_release_protection: release UFFD write protection after
+ * a range of pages has been saved
+ *
+ * @rs: current RAM state
+ * @pss: page-search-status structure
+ * @start_page: index of the first page in the range relative to pss->block
+ *
+ * Returns 0 on success, negative value in case of an error
+*/
+static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
+ unsigned long start_page)
+{
+ int res = 0;
+
+ /* Check if page is from UFFD-managed region. */
+ if (pss->block->flags & RAM_UF_WRITEPROTECT) {
+ void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
+ uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
+
+ /* Flush async buffers before un-protect. */
+ qemu_fflush(rs->f);
+ /* Un-protect memory range. */
+ res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
+ false, false);
+ }
+
+ return res;
+}
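Putting the two UFFD helpers together, the background-snapshot flow they assume looks roughly like this (illustrative glue only; the real wiring is in get_queued_page and ram_save_host_page, shown in later hunks):

    unsigned long start_page = pss->page;  /* first page of the saved run */
    ram_addr_t offset;
    RAMBlock *block;

    /* A vCPU write to a protected page parks that vCPU until we act. */
    block = poll_fault_page(rs, &offset);
    if (block) {
        /* ... save the faulting range with the usual page loop ... */

        /* Drop write protection so the parked vCPU can resume. */
        ram_save_release_protection(rs, pss, start_page);
    }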
+
+/* ram_write_tracking_available: check if kernel supports required UFFD features
+ *
+ * Returns true if supported, false otherwise
+ */
+bool ram_write_tracking_available(void)
+{
+ uint64_t uffd_features;
+ int res;
+
+ res = uffd_query_features(&uffd_features);
+ return (res == 0 &&
+ (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
+}
+
+/* ram_write_tracking_compatible: check if guest configuration is
+ * compatible with 'write-tracking'
+ *
+ * Returns true if compatible, false otherwise
+ */
+bool ram_write_tracking_compatible(void)
+{
+ const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
+ int uffd_fd;
+ RAMBlock *block;
+ bool ret = false;
+
+ /* Open UFFD file descriptor */
+ uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
+ if (uffd_fd < 0) {
+ return false;
+ }
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(block) {
+ uint64_t uffd_ioctls;
+
+ /* Nothing to do with read-only and MMIO-writable regions */
+ if (block->mr->readonly || block->mr->rom_device) {
+ continue;
+ }
+ /* Try to register block memory via UFFD-IO to track writes */
+ if (uffd_register_memory(uffd_fd, block->host, block->max_length,
+ UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
+ goto out;
+ }
+ if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
+ goto out;
+ }
+ }
+ ret = true;
+
+out:
+ uffd_close_fd(uffd_fd);
+ return ret;
+}
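A hedged sketch of how the two probes would gate the capability (hypothetical call site; the actual check lives in the migration capability code, not in this patch):

    if (!ram_write_tracking_available()) {
        error_setg(errp, "Background snapshot is not supported by the host kernel");
        return false;
    }
    if (!ram_write_tracking_compatible()) {
        error_setg(errp, "Guest memory configuration is incompatible with write tracking");
        return false;
    }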
+
+/*
+ * ram_block_populate_pages: populate memory in the RAM block by reading
+ * a byte from the beginning of each page.
+ *
+ * Since it's solely used for userfault_fd WP feature, here we just
+ * hardcode page size to qemu_real_host_page_size.
+ *
+ * @block: RAM block to populate
+ */
+static void ram_block_populate_pages(RAMBlock *block)
+{
+ char *ptr = (char *) block->host;
+
+ for (ram_addr_t offset = 0; offset < block->used_length;
+ offset += qemu_real_host_page_size) {
+ char tmp = *(ptr + offset);
+
+ /* Don't optimize the read out */
+ asm volatile("" : "+r" (tmp));
+ }
+}
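For illustration, the same touch-every-page trick as a standalone helper (a minimal sketch assuming GCC/Clang extended asm; qemu_real_host_page_size plays the role of page_size above):

    #include <stddef.h>

    static void touch_pages(char *base, size_t length, size_t page_size)
    {
        for (size_t offset = 0; offset < length; offset += page_size) {
            char tmp = base[offset];

            /* Compiler barrier: "+r" makes tmp live, so the load cannot
             * be optimized away even though tmp is otherwise unused. */
            asm volatile("" : "+r" (tmp));
        }
    }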
+
+/*
+ * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
+ */
+void ram_write_tracking_prepare(void)
+{
+ RAMBlock *block;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(block) {
+ /* Nothing to do with read-only and MMIO-writable regions */
+ if (block->mr->readonly || block->mr->rom_device) {
+ continue;
+ }
+
+ /*
+ * Populate pages of the RAM block before enabling userfault_fd
+ * write protection.
+ *
+ * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
+ * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
+ * pages with pte_none() entries in page table.
+ */
+ ram_block_populate_pages(block);
+ }
+}
+
+/*
+ * ram_write_tracking_start: start UFFD-WP memory tracking
+ *
+ * Returns 0 for success or negative value in case of error
+ */
+int ram_write_tracking_start(void)
+{
+ int uffd_fd;
+ RAMState *rs = ram_state;
+ RAMBlock *block;
+
+ /* Open UFFD file descriptor */
+ uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
+ if (uffd_fd < 0) {
+ return uffd_fd;
+ }
+ rs->uffdio_fd = uffd_fd;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(block) {
+ /* Nothing to do with read-only and MMIO-writable regions */
+ if (block->mr->readonly || block->mr->rom_device) {
+ continue;
+ }
+
+ /* Register block memory with UFFD to track writes */
+ if (uffd_register_memory(rs->uffdio_fd, block->host,
+ block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
+ goto fail;
+ }
+ /* Apply UFFD write protection to the block memory range */
+ if (uffd_change_protection(rs->uffdio_fd, block->host,
+ block->max_length, true, false)) {
+ goto fail;
+ }
+ block->flags |= RAM_UF_WRITEPROTECT;
+ memory_region_ref(block->mr);
+
+ trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
+ block->host, block->max_length);
+ }
+
+ return 0;
+
+fail:
+ error_report("ram_write_tracking_start() failed: restoring initial memory state");
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(block) {
+ if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
+ continue;
+ }
+ /*
+ * In case some memory block failed to be write-protected
+ * remove protection and unregister all succeeded RAM blocks
+ */
+ uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
+ false, false);
+ uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
+ /* Cleanup flags and remove reference */
+ block->flags &= ~RAM_UF_WRITEPROTECT;
+ memory_region_unref(block->mr);
+ }
+
+ uffd_close_fd(uffd_fd);
+ rs->uffdio_fd = -1;
+ return -1;
+}
+
+/**
+ * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
+ */
+void ram_write_tracking_stop(void)
+{
+ RAMState *rs = ram_state;
+ RAMBlock *block;
+
+ RCU_READ_LOCK_GUARD();
+
+ RAMBLOCK_FOREACH_NOT_IGNORED(block) {
+ if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
+ continue;
+ }
+ /* Remove protection and unregister all affected RAM blocks */
+ uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
+ false, false);
+ uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
+
+ trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
+ block->host, block->max_length);
+
+ /* Cleanup flags and remove reference */
+ block->flags &= ~RAM_UF_WRITEPROTECT;
+ memory_region_unref(block->mr);
+ }
+
+ /* Finally close UFFD file descriptor */
+ uffd_close_fd(rs->uffdio_fd);
+ rs->uffdio_fd = -1;
+}
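Taken together, the expected life cycle of these hooks (inferred from the prepare/start/stop split; the real call sites are in the migration core, outside this patch):

    if (ram_write_tracking_available() && ram_write_tracking_compatible()) {
        /* Fault in every page so UFFDIO_WRITEPROTECT can latch onto it. */
        ram_write_tracking_prepare();
        if (ram_write_tracking_start() == 0) {
            /* ... save RAM; vCPU writes fault and are drained by
             *     poll_fault_page() during the save loop ... */
            ram_write_tracking_stop();
        }
    }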
+
+#else
+/* No target OS support, stubs just fail or ignore */
+
+static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
+{
+ (void) rs;
+ (void) offset;
+
+ return NULL;
+}
+
+static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
+ unsigned long start_page)
+{
+ (void) rs;
+ (void) pss;
+ (void) start_page;
+
+ return 0;
+}
+
+bool ram_write_tracking_available(void)
+{
+ return false;
+}
+
+bool ram_write_tracking_compatible(void)
+{
+ assert(0);
+ return false;
+}
+
+int ram_write_tracking_start(void)
+{
+ assert(0);
+ return -1;
+}
+
+void ram_write_tracking_stop(void)
+{
+ assert(0);
+}
+#endif /* defined(__linux__) */
+
/**
* get_queued_page: unqueue a page from the postcopy requests
*
} while (block && !dirty);
- if (block) {
+ if (!block) {
/*
- * As soon as we start servicing pages out of order, then we have
- * to kill the bulk stage, since the bulk stage assumes
- * in (migration_bitmap_find_and_reset_dirty) that every page is
- * dirty, that's no longer true.
+ * Poll write faults too if background snapshot is enabled; that's
+ * when vCPUs may be blocked on write-protected pages.
*/
- rs->ram_bulk_stage = false;
+ block = poll_fault_page(rs, &offset);
+ }
+ if (block) {
/*
* We want the background search to continue from the queued page
* since the guest is likely to want other pages near to the page
rs->last_req_rb = ramblock;
}
trace_ram_save_queue_pages(ramblock->idstr, start, len);
- if (start + len > ramblock->used_length) {
+ if (!offset_in_ramblock(ramblock, start + len - 1)) {
error_report("%s request overrun start=" RAM_ADDR_FMT " len="
RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
__func__, start, len, ramblock->used_length);
}
/*
- * If xbzrle is on, stop using the data compression after first
- * round of migration even if compression is enabled. In theory,
- * xbzrle can do better than compression.
+ * If xbzrle is enabled (e.g., after first round of migration), stop
+ * using the data compression. In theory, xbzrle can do better than
+ * compression.
*/
- if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
- return true;
+ if (rs->xbzrle_enabled) {
+ return false;
}
- return false;
+ return true;
}
/*
int tmppages, pages = 0;
size_t pagesize_bits =
qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
+ unsigned long hostpage_boundary =
+ QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
+ unsigned long start_page = pss->page;
+ int res;
if (ramblock_is_ignored(pss->block)) {
error_report("block %s should not be migrated !", pss->block->idstr);
do {
/* Check if the page is dirty and, if so, send it */
- if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
- pss->page++;
- continue;
- }
+ if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
+ tmppages = ram_save_target_page(rs, pss, last_stage);
+ if (tmppages < 0) {
+ return tmppages;
+ }
- tmppages = ram_save_target_page(rs, pss, last_stage);
- if (tmppages < 0) {
- return tmppages;
+ pages += tmppages;
+ /*
+ * Allow rate limiting to happen in the middle of huge pages if
+ * something is sent in the current iteration.
+ */
+ if (pagesize_bits > 1 && tmppages > 0) {
+ migration_rate_limit();
+ }
}
-
- pages += tmppages;
- pss->page++;
- /* Allow rate limiting to happen in the middle of huge pages */
- migration_rate_limit();
- } while ((pss->page & (pagesize_bits - 1)) &&
+ pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
+ } while ((pss->page < hostpage_boundary) &&
offset_in_ramblock(pss->block,
((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
+ /* The offset we leave with is the min boundary of host page and block */
+ pss->page = MIN(pss->page, hostpage_boundary) - 1;
- /* The offset we leave with is the last one we looked at */
- pss->page--;
- return pages;
+ res = ram_save_release_protection(rs, pss, start_page);
+ return (res < 0 ? res : pages);
}
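A worked example of the new loop bound (hypothetical numbers): with 2 MiB huge pages and 4 KiB target pages, pagesize_bits = 512; if the search enters the host page at pss->page = 1000, then:

    /* hostpage_boundary = QEMU_ALIGN_UP(1000 + 1, 512) = 1024, so the loop
     * only walks pages in [1000, 1024), hopping between dirty pages via
     * migration_bitmap_find_dirty() instead of testing each page in turn,
     * and leaves with pss->page = MIN(pss->page, 1024) - 1.
     */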
/**
RAMState **rsp = opaque;
RAMBlock *block;
- /* caller have hold iothread lock or is in a bh, so there is
- * no writing race against the migration bitmap
- */
- memory_global_dirty_log_stop();
+ /* We don't use dirty log with background snapshots */
+ if (!migrate_background_snapshot()) {
+ /* the caller holds the iothread lock or is in a bottom half, so
+ * there is no write race against the migration bitmap
+ */
+ memory_global_dirty_log_stop();
+ }
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
g_free(block->clear_bmap);
rs->last_sent_block = NULL;
rs->last_page = 0;
rs->last_version = ram_list.version;
- rs->ram_bulk_stage = true;
- rs->fpo_enabled = false;
+ rs->xbzrle_enabled = false;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
WITH_RCU_READ_LOCK_GUARD() {
ram_list_init_bitmaps();
- memory_global_dirty_log_start();
- migration_bitmap_sync_precopy(rs);
+ /* We don't use dirty log with background snapshots */
+ if (!migrate_background_snapshot()) {
+ memory_global_dirty_log_start();
+ migration_bitmap_sync_precopy(rs);
+ }
}
qemu_mutex_unlock_ramlist();
qemu_mutex_unlock_iothread();
/* This may not be aligned with current bitmaps. Recalculate. */
rs->migration_dirty_pages = pages;
- rs->last_seen_block = NULL;
- rs->last_sent_block = NULL;
- rs->last_page = 0;
- rs->last_version = ram_list.version;
- /*
- * Disable the bulk stage, otherwise we'll resend the whole RAM no
- * matter what we have sent.
- */
- rs->ram_bulk_stage = false;
+ ram_state_reset(rs);
/* Update RAMState cache of output QEMUFile */
rs->f = out;
npages = used_len >> TARGET_PAGE_BITS;
qemu_mutex_lock(&ram_state->bitmap_mutex);
+ /*
+ * The skipped free pages are equivalent to being sent from clear_bmap's
+ * perspective, so clear the bits from the memory region bitmap which
+ * are initially set. Otherwise those skipped pages will be sent in
+ * the next round after syncing from the memory region bitmap.
+ */
+ migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
+ start, npages);
ram_state->migration_dirty_pages -=
bitmap_count_one_with_offset(block->bmap, start, npages);
bitmap_clear(block->bmap, start, npages);
goto out;
}
+ /*
+ * We'll hold this lock for a while, but that's okay for two reasons.
+ * Firstly, the only other thread that may take it is the one calling
+ * qemu_guest_free_page_hint(), which should be rare; secondly, see
+ * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below,
+ * which guarantees that we release the lock on a regular basis.
+ */
+ qemu_mutex_lock(&rs->bitmap_mutex);
WITH_RCU_READ_LOCK_GUARD() {
if (ram_list.version != rs->last_version) {
ram_state_reset(rs);
i++;
}
}
+ qemu_mutex_unlock(&rs->bitmap_mutex);
/*
* Must occur before EOS (or any QEMUFile operation)
return block->host + offset;
}
+static void *host_page_from_ram_block_offset(RAMBlock *block,
+ ram_addr_t offset)
+{
+ /* Note: Explicitly no check against offset_in_ramblock(). */
+ return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
+ block->page_size);
+}
+
+static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
+ ram_addr_t offset)
+{
+ return ((uintptr_t)block->host + offset) & (block->page_size - 1);
+}
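A quick numeric check of the two helpers (assumed values; this relies on block->host being aligned to block->page_size, which holds for the mappings QEMU creates):

    /* With block->host = 0x7f4a40000000 and block->page_size = 0x200000
     * (2 MiB), an offset of 0x345678 gives:
     *   host_page_from_ram_block_offset()        -> 0x7f4a40200000
     *   host_page_offset_from_ram_block_offset() -> 0x145678
     * i.e. the containing host page and the remainder within it.
     */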
+
static inline void *colo_cache_from_block_offset(RAMBlock *block,
ram_addr_t offset, bool record_bitmap)
{
int idx, thread_count;
thread_count = migrate_decompress_threads();
- qemu_mutex_lock(&decomp_done_lock);
+ QEMU_LOCK_GUARD(&decomp_done_lock);
while (true) {
for (idx = 0; idx < thread_count; idx++) {
if (decomp_param[idx].done) {
qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
}
}
- qemu_mutex_unlock(&decomp_done_lock);
+}
+
+static void colo_init_ram_state(void)
+{
+ ram_state_init(&ram_state);
}
/*
WITH_RCU_READ_LOCK_GUARD() {
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
block->colo_cache = qemu_anon_ram_alloc(block->used_length,
- NULL,
- false);
+ NULL, false, false);
if (!block->colo_cache) {
error_report("%s: Can't alloc memory for COLO cache of block %s,"
"size 0x" RAM_ADDR_FMT, __func__, block->idstr,
}
}
- ram_state_init(&ram_state);
+ colo_init_ram_state();
return 0;
}
MigrationIncomingState *mis = migration_incoming_get_current();
/* Temporary page that is later 'placed' */
void *postcopy_host_page = mis->postcopy_tmp_page;
- void *this_host = NULL;
+ void *host_page = NULL;
bool all_zero = true;
int target_pages = 0;
while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr;
- void *host = NULL;
void *page_buffer = NULL;
void *place_source = NULL;
RAMBlock *block = NULL;
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE)) {
block = ram_block_from_stream(f, flags);
+ if (!block) {
+ ret = -EINVAL;
+ break;
+ }
- host = host_from_ram_block_offset(block, addr);
- if (!host) {
+ /*
+ * Relying on used_length is racy and can result in false positives.
+ * We might place pages beyond used_length in case RAM was shrunk
+ * while in postcopy, which is fine - trying to place via
+ * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
+ */
+ if (!block->host || addr >= block->postcopy_length) {
error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
ret = -EINVAL;
break;
* of a host page in one chunk.
*/
page_buffer = postcopy_host_page +
- ((uintptr_t)host & (block->page_size - 1));
+ host_page_offset_from_ram_block_offset(block, addr);
+ /* If all target pages are zero then we can optimise the placement */
if (target_pages == 1) {
- this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
- block->page_size);
- } else {
+ host_page = host_page_from_ram_block_offset(block, addr);
+ } else if (host_page != host_page_from_ram_block_offset(block,
+ addr)) {
/* not the 1st TP within the HP */
- if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
- (uintptr_t)this_host) {
- error_report("Non-same host page %p/%p",
- host, this_host);
- ret = -EINVAL;
- break;
- }
+ error_report("Non-same host page %p/%p", host_page,
+ host_page_from_ram_block_offset(block, addr));
+ ret = -EINVAL;
+ break;
}
/*
}
if (!ret && place_needed) {
- /* This gets called at the last target page in the host page */
- void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
- block->page_size);
-
if (all_zero) {
- ret = postcopy_place_page_zero(mis, place_dest,
- block);
+ ret = postcopy_place_page_zero(mis, host_page, block);
} else {
- ret = postcopy_place_page(mis, place_dest,
- place_source, block);
+ ret = postcopy_place_page(mis, host_page, place_source,
+ block);
}
place_needed = false;
target_pages = 0;
unsigned long offset = 0;
memory_global_dirty_log_sync();
+ qemu_mutex_lock(&ram_state->bitmap_mutex);
WITH_RCU_READ_LOCK_GUARD() {
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
ramblock_sync_dirty_bitmap(ram_state, block);
while (block) {
offset = migration_bitmap_find_dirty(ram_state, block, offset);
- if (((ram_addr_t)offset) << TARGET_PAGE_BITS
- >= block->used_length) {
+ if (!offset_in_ramblock(block,
+ ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
offset = 0;
block = QLIST_NEXT_RCU(block, next);
} else {
}
}
trace_colo_flush_ram_cache_end();
+ qemu_mutex_unlock(&ram_state->bitmap_mutex);
}
/**
}
}
/* For postcopy we need to check hugepage sizes match */
- if (postcopy_advised &&
+ if (postcopy_advised && migrate_postcopy_ram() &&
block->page_size != qemu_host_page_size) {
uint64_t remote_page_size = qemu_get_be64(f);
if (remote_page_size != block->page_size) {
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
int ret = -EINVAL;
+ /* from_dst_file is always valid because we're within rp_thread */
QEMUFile *file = s->rp_state.from_dst_file;
unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
uint64_t local_size = DIV_ROUND_UP(nbits, 8);
}
if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
- error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
+ error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
__func__, block->idstr, end_mark);
ret = -EINVAL;
goto out;
.resume_prepare = ram_resume_prepare,
};
+static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
+ size_t old_size, size_t new_size)
+{
+ PostcopyState ps = postcopy_state_get();
+ ram_addr_t offset;
+ RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
+ Error *err = NULL;
+
+ if (ramblock_is_ignored(rb)) {
+ return;
+ }
+
+ if (!migration_is_idle()) {
+ /*
+ * Precopy code on the source cannot deal with the size of RAM blocks
+ * changing at random points in time - especially after sending the
+ * RAM block sizes in the migration stream, they must no longer change.
+ * Abort and indicate a proper reason.
+ */
+ error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
+ migrate_set_error(migrate_get_current(), err);
+ error_free(err);
+ migration_cancel();
+ }
+
+ switch (ps) {
+ case POSTCOPY_INCOMING_ADVISE:
+ /*
+ * Update what ram_postcopy_incoming_init()->init_range() does at the
+ * time postcopy was advised. Syncing RAM blocks with the source will
+ * result in RAM resizes.
+ */
+ if (old_size < new_size) {
+ if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
+ error_report("RAM block '%s' discard of resized RAM failed",
+ rb->idstr);
+ }
+ }
+ rb->postcopy_length = new_size;
+ break;
+ case POSTCOPY_INCOMING_NONE:
+ case POSTCOPY_INCOMING_RUNNING:
+ case POSTCOPY_INCOMING_END:
+ /*
+ * Once our guest is running, postcopy no longer cares about
+ * resizes. When growing, the new memory was not available on the
+ * source, so no handler is needed.
+ */
+ break;
+ default:
+ error_report("RAM block '%s' resized during postcopy state: %d",
+ rb->idstr, ps);
+ exit(-1);
+ }
+}
+
+static RAMBlockNotifier ram_mig_ram_notifier = {
+ .ram_block_resized = ram_mig_ram_block_resized,
+};
+
void ram_mig_init(void)
{
qemu_mutex_init(&XBZRLE.lock);
register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
+ ram_block_notifier_add(&ram_mig_ram_notifier);
}