QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
+typedef struct {
+ /*
+ * Cached ramblock/offset values if preempted. They're only meaningful if
+ * preempted==true below.
+ */
+ RAMBlock *ram_block;
+ unsigned long ram_page;
+ /*
+ * Whether a postcopy preemption just happened. Will be reset after
+ * precopy recovered to background migration.
+ */
+ bool preempted;
+} PostcopyPreemptState;
+
/* State of RAM for migration */
struct RAMState {
/* QEMUFile used for this migration */
/* Queue of outstanding page requests from the destination */
QemuMutex src_page_req_mutex;
QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
+
+ /* Postcopy preemption informations */
+ PostcopyPreemptState postcopy_preempt_state;
+ /*
+ * Current channel we're using on src VM. Only valid if postcopy-preempt
+ * is enabled.
+ */
+ unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;
static NotifierWithReturnList precopy_notifier_list;
+static void postcopy_preempt_reset(RAMState *rs)
+{
+ memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
+}
+
/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
ram_counters.transferred += bytes;
}
+void dirty_sync_missed_zero_copy(void)
+{
+ ram_counters.dirty_sync_missed_zero_copy++;
+}
+
/* used by the search for pages to send */
struct PageSearchStatus {
/* Current block being searched */
unsigned long page;
/* Set once we wrap around */
bool complete_round;
- /* Whether current page is explicitly requested by postcopy */
+ /*
+ * [POSTCOPY-ONLY] Whether current page is explicitly requested by
+ * postcopy. When set, the request is "urgent" because the dest QEMU
+ * threads are waiting for us.
+ */
bool postcopy_requested;
+ /*
+ * [POSTCOPY-ONLY] The target channel to use to send current page.
+ *
+ * Note: This may _not_ match with the value in postcopy_requested
+ * above. Let's imagine the case where the postcopy request is exactly
+ * the page that we're sending in progress during precopy. In this case
+ * we'll have postcopy_requested set to true but the target channel
+ * will be the precopy channel (so that we don't split brain on that
+ * specific page since the precopy channel already contains partial of
+ * that page data).
+ *
+ * Besides that specific use case, postcopy_target_channel should
+ * always be equal to postcopy_requested, because by default we send
+ * postcopy pages via postcopy preempt channel.
+ */
+ bool postcopy_target_channel;
};
typedef struct PageSearchStatus PageSearchStatus;
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
ram_addr_t offset, uint8_t *source_buf);
+static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
+ bool postcopy_requested);
+
static void *do_data_compress(void *opaque)
{
CompressParam *param = opaque;
*/
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
- /* This is not a postcopy requested page */
+ /*
+ * This is not a postcopy requested page, mark it "not urgent", and use
+ * precopy channel to send it.
+ */
pss->postcopy_requested = false;
+ pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
if (pss->complete_round && pss->block == rs->last_seen_block &&
{
struct RAMSrcPageRequest *entry;
RAMBlock *block = NULL;
- size_t page_size;
if (!postcopy_has_request(rs)) {
return NULL;
entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
block = entry->rb;
*offset = entry->offset;
- page_size = qemu_ram_pagesize(block);
- /* Each page request should only be multiple page size of the ramblock */
- assert((entry->len % page_size) == 0);
- if (entry->len > page_size) {
- entry->len -= page_size;
- entry->offset += page_size;
+ if (entry->len > TARGET_PAGE_SIZE) {
+ entry->len -= TARGET_PAGE_SIZE;
+ entry->offset += TARGET_PAGE_SIZE;
} else {
memory_region_unref(block->mr);
QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
migration_consume_urgent_request();
}
- trace_unqueue_page(block->idstr, *offset,
- test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
-
return block;
}
}
#endif /* defined(__linux__) */
+/*
+ * Check whether two addr/offset of the ramblock falls onto the same host huge
+ * page. Returns true if so, false otherwise.
+ */
+static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
+ uint64_t addr2)
+{
+ size_t page_size = qemu_ram_pagesize(rb);
+
+ addr1 = ROUND_DOWN(addr1, page_size);
+ addr2 = ROUND_DOWN(addr2, page_size);
+
+ return addr1 == addr2;
+}
+
+/*
+ * Whether a previous preempted precopy huge page contains current requested
+ * page? Returns true if so, false otherwise.
+ *
+ * This should really happen very rarely, because it means when we were sending
+ * during background migration for postcopy we're sending exactly the page that
+ * some vcpu got faulted on on dest node. When it happens, we probably don't
+ * need to do much but drop the request, because we know right after we restore
+ * the precopy stream it'll be serviced. It'll slightly affect the order of
+ * postcopy requests to be serviced (e.g. it'll be the same as we move current
+ * request to the end of the queue) but it shouldn't be a big deal. The most
+ * imporant thing is we can _never_ try to send a partial-sent huge page on the
+ * POSTCOPY channel again, otherwise that huge page will got "split brain" on
+ * two channels (PRECOPY, POSTCOPY).
+ */
+static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
+ ram_addr_t offset)
+{
+ PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+ /* No preemption at all? */
+ if (!state->preempted) {
+ return false;
+ }
+
+ /* Not even the same ramblock? */
+ if (state->ram_block != block) {
+ return false;
+ }
+
+ return offset_on_same_huge_page(block, offset,
+ state->ram_page << TARGET_PAGE_BITS);
+}
+
/**
* get_queued_page: unqueue a page from the postcopy requests
*
{
RAMBlock *block;
ram_addr_t offset;
+ bool dirty;
- block = unqueue_page(rs, &offset);
+ do {
+ block = unqueue_page(rs, &offset);
+ /*
+ * We're sending this page, and since it's postcopy nothing else
+ * will dirty it, and we must make sure it doesn't get sent again
+ * even if this queue request was received after the background
+ * search already sent it.
+ */
+ if (block) {
+ unsigned long page;
- if (!block) {
+ page = offset >> TARGET_PAGE_BITS;
+ dirty = test_bit(page, block->bmap);
+ if (!dirty) {
+ trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
+ page);
+ } else {
+ trace_get_queued_page(block->idstr, (uint64_t)offset, page);
+ }
+ }
+
+ } while (block && !dirty);
+
+ if (block) {
+ /* See comment above postcopy_preempted_contains() */
+ if (postcopy_preempted_contains(rs, block, offset)) {
+ trace_postcopy_preempt_hit(block->idstr, offset);
+ /*
+ * If what we preempted previously was exactly what we're
+ * requesting right now, restore the preempted precopy
+ * immediately, boosting its priority as it's requested by
+ * postcopy.
+ */
+ postcopy_preempt_restore(rs, pss, true);
+ return true;
+ }
+ } else {
/*
* Poll write faults too if background snapshot is enabled; that's
* when we have vcpus got blocked by the write protected pages.
* really rare.
*/
pss->complete_round = false;
+ /* Mark it an urgent request, meanwhile using POSTCOPY channel */
pss->postcopy_requested = true;
+ pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
}
return !!block;
return ram_save_page(rs, pss);
}
+static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+ MigrationState *ms = migrate_get_current();
+
+ /* Not enabled eager preempt? Then never do that. */
+ if (!migrate_postcopy_preempt()) {
+ return false;
+ }
+
+ /* If the user explicitly disabled breaking of huge page, skip */
+ if (!ms->postcopy_preempt_break_huge) {
+ return false;
+ }
+
+ /* If the ramblock we're sending is a small page? Never bother. */
+ if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
+ return false;
+ }
+
+ /* Not in postcopy at all? */
+ if (!migration_in_postcopy()) {
+ return false;
+ }
+
+ /*
+ * If we're already handling a postcopy request, don't preempt as this page
+ * has got the same high priority.
+ */
+ if (pss->postcopy_requested) {
+ return false;
+ }
+
+ /* If there's postcopy requests, then check it up! */
+ return postcopy_has_request(rs);
+}
+
+/* Returns true if we preempted precopy, false otherwise */
+static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+ PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
+
+ trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
+
+ /*
+ * Time to preempt precopy. Cache current PSS into preempt state, so that
+ * after handling the postcopy pages we can recover to it. We need to do
+ * so because the dest VM will have partial of the precopy huge page kept
+ * over in its tmp huge page caches; better move on with it when we can.
+ */
+ p_state->ram_block = pss->block;
+ p_state->ram_page = pss->page;
+ p_state->preempted = true;
+}
+
+/* Whether we're preempted by a postcopy request during sending a huge page */
+static bool postcopy_preempt_triggered(RAMState *rs)
+{
+ return rs->postcopy_preempt_state.preempted;
+}
+
+static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
+ bool postcopy_requested)
+{
+ PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+ assert(state->preempted);
+
+ pss->block = state->ram_block;
+ pss->page = state->ram_page;
+
+ /* Whether this is a postcopy request? */
+ pss->postcopy_requested = postcopy_requested;
+ /*
+ * When restoring a preempted page, the old data resides in PRECOPY
+ * slow channel, even if postcopy_requested is set. So always use
+ * PRECOPY channel here.
+ */
+ pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
+
+ trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
+
+ /* Reset preempt state, most importantly, set preempted==false */
+ postcopy_preempt_reset(rs);
+}
+
+static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
+{
+ MigrationState *s = migrate_get_current();
+ unsigned int channel = pss->postcopy_target_channel;
+ QEMUFile *next;
+
+ if (channel != rs->postcopy_channel) {
+ if (channel == RAM_CHANNEL_PRECOPY) {
+ next = s->to_dst_file;
+ } else {
+ next = s->postcopy_qemufile_src;
+ }
+ /* Update and cache the current channel */
+ rs->f = next;
+ rs->postcopy_channel = channel;
+
+ /*
+ * If channel switched, reset last_sent_block since the old sent block
+ * may not be on the same channel.
+ */
+ rs->last_sent_block = NULL;
+
+ trace_postcopy_preempt_switch_channel(channel);
+ }
+
+ trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
+}
+
+/* We need to make sure rs->f always points to the default channel elsewhere */
+static void postcopy_preempt_reset_channel(RAMState *rs)
+{
+ if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+ rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
+ rs->f = migrate_get_current()->to_dst_file;
+ trace_postcopy_preempt_reset_channel();
+ }
+}
+
/**
* ram_save_host_page: save a whole host page
*
return 0;
}
+ if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+ postcopy_preempt_choose_channel(rs, pss);
+ }
+
do {
+ if (postcopy_needs_preempt(rs, pss)) {
+ postcopy_do_preempt(rs, pss);
+ break;
+ }
+
/* Check the pages is dirty and if it is send it */
if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
tmppages = ram_save_target_page(rs, pss);
/* The offset we leave with is the min boundary of host page and block */
pss->page = MIN(pss->page, hostpage_boundary);
+ /*
+ * When with postcopy preempt mode, flush the data as soon as possible for
+ * postcopy requests, because we've already sent a whole huge page, so the
+ * dst node should already have enough resource to atomically filling in
+ * the current missing page.
+ *
+ * More importantly, when using separate postcopy channel, we must do
+ * explicit flush or it won't flush until the buffer is full.
+ */
+ if (migrate_postcopy_preempt() && pss->postcopy_requested) {
+ qemu_fflush(rs->f);
+ }
+
res = ram_save_release_protection(rs, pss, start_page);
return (res < 0 ? res : pages);
}
found = get_queued_page(rs, &pss);
if (!found) {
- /* priority queue empty, so just search for something dirty */
- found = find_dirty_block(rs, &pss, &again);
+ /*
+ * Recover previous precopy ramblock/offset if postcopy has
+ * preempted precopy. Otherwise find the next dirty bit.
+ */
+ if (postcopy_preempt_triggered(rs)) {
+ postcopy_preempt_restore(rs, &pss, false);
+ found = true;
+ } else {
+ /* priority queue empty, so just search for something dirty */
+ found = find_dirty_block(rs, &pss, &again);
+ }
}
if (found) {
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_enabled = false;
+ postcopy_preempt_reset(rs);
+ rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
}
qemu_mutex_unlock(&rs->bitmap_mutex);
+ postcopy_preempt_reset_channel(rs);
+
/*
* Must occur before EOS (or any QEMUFile operation)
* because of RDMA protocol.
return ret;
}
+ postcopy_preempt_reset_channel(rs);
+
ret = multifd_send_sync_main(rs->f);
if (ret < 0) {
return ret;
* @mis: the migration incoming state pointer
* @f: QEMUFile where to read the data from
* @flags: Page flags (mostly to see if it's a continuation of previous block)
+ * @channel: the channel we're using
*/
static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
- QEMUFile *f, int flags)
+ QEMUFile *f, int flags,
+ int channel)
{
- RAMBlock *block = mis->last_recv_block;
+ RAMBlock *block = mis->last_recv_block[channel];
char id[256];
uint8_t len;
return NULL;
}
- mis->last_recv_block = block;
+ mis->last_recv_block[channel] = block;
return block;
}
* rcu_read_lock is taken prior to this being called.
*
* @f: QEMUFile where to send the data
+ * @channel: the channel to use for loading
*/
-int ram_load_postcopy(QEMUFile *f)
+int ram_load_postcopy(QEMUFile *f, int channel)
{
int flags = 0, ret = 0;
bool place_needed = false;
bool matches_target_page_size = false;
MigrationIncomingState *mis = migration_incoming_get_current();
- /* Currently we only use channel 0. TODO: use all the channels */
- PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0];
+ PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr;
flags = addr & ~TARGET_PAGE_MASK;
addr &= TARGET_PAGE_MASK;
- trace_ram_load_postcopy_loop((uint64_t)addr, flags);
+ trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE)) {
- block = ram_block_from_stream(mis, f, flags);
+ block = ram_block_from_stream(mis, f, flags, channel);
if (!block) {
ret = -EINVAL;
break;
} else if (tmp_page->host_addr !=
host_page_from_ram_block_offset(block, addr)) {
/* not the 1st TP within the HP */
- error_report("Non-same host page detected. "
+ error_report("Non-same host page detected on channel %d: "
"Target host page %p, received host page %p "
"(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
- tmp_page->host_addr,
+ channel, tmp_page->host_addr,
host_page_from_ram_block_offset(block, addr),
block->idstr, addr, tmp_page->target_pages);
ret = -EINVAL;
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
- RAMBlock *block = ram_block_from_stream(mis, f, flags);
+ RAMBlock *block = ram_block_from_stream(mis, f, flags,
+ RAM_CHANNEL_PRECOPY);
host = host_from_ram_block_offset(block, addr);
/*
*/
WITH_RCU_READ_LOCK_GUARD() {
if (postcopy_running) {
- ret = ram_load_postcopy(f);
+ /*
+ * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
+ * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
+ * service fast page faults.
+ */
+ ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
} else {
ret = ram_load_precopy(f);
}
return 0;
}
+void postcopy_preempt_shutdown_file(MigrationState *s)
+{
+ qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
+ qemu_fflush(s->postcopy_qemufile_src);
+}
+
static SaveVMHandlers savevm_ram_handlers = {
.save_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,