X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=migration%2Fpostcopy-ram.c;h=a793bad477c9ee39de84056eb793ad14af01bef7;hb=2d49bacda00876736c746ce1fcea006a128bef6b;hp=7814da5b4bf5e8b224b78f0d68d55911615cbbae;hpb=c1d5b9add7b04661bedef9a3379a8b82547b53db;p=mirror_qemu.git

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 7814da5b4b..a793bad477 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -23,10 +23,14 @@
 #include "savevm.h"
 #include "postcopy-ram.h"
 #include "ram.h"
+#include "qapi/error.h"
+#include "qemu/notify.h"
+#include "qemu/rcu.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/balloon.h"
 #include "qemu/error-report.h"
 #include "trace.h"
+#include "hw/boards.h"
 
 /* Arbitrary limit on size of each discard command,
  * keeps them around ~200 bytes
@@ -45,6 +49,33 @@ struct PostcopyDiscardState {
     unsigned int nsentcmds;
 };
 
+static NotifierWithReturnList postcopy_notifier_list;
+
+void postcopy_infrastructure_init(void)
+{
+    notifier_with_return_list_init(&postcopy_notifier_list);
+}
+
+void postcopy_add_notifier(NotifierWithReturn *nn)
+{
+    notifier_with_return_list_add(&postcopy_notifier_list, nn);
+}
+
+void postcopy_remove_notifier(NotifierWithReturn *n)
+{
+    notifier_with_return_remove(n);
+}
+
+int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
+{
+    struct PostcopyNotifyData pnd;
+    pnd.reason = reason;
+    pnd.errp = errp;
+
+    return notifier_with_return_list_notify(&postcopy_notifier_list,
+                                            &pnd);
+}
+
 /* Postcopy needs to detect accesses to pages that haven't yet been copied
  * across, and efficiently map new pages in, the techniques for doing this
  * are target OS specific.
@@ -63,16 +94,17 @@ struct PostcopyDiscardState {
 typedef struct PostcopyBlocktimeContext {
     /* time when page fault initiated per vCPU */
-    int64_t *page_fault_vcpu_time;
+    uint32_t *page_fault_vcpu_time;
     /* page address per vCPU */
     uintptr_t *vcpu_addr;
-    int64_t total_blocktime;
+    uint32_t total_blocktime;
     /* blocktime per vCPU */
-    int64_t *vcpu_blocktime;
+    uint32_t *vcpu_blocktime;
     /* point in time when last page fault was initiated */
-    int64_t last_begin;
+    uint32_t last_begin;
     /* number of vCPU are suspended */
     int smp_cpus_down;
+    uint64_t start_time;
 
     /*
      * Handler for exit event, necessary for
@@ -98,23 +130,27 @@ static void migration_exit_cb(Notifier *n, void *data)
 
 static struct PostcopyBlocktimeContext *blocktime_context_new(void)
 {
+    MachineState *ms = MACHINE(qdev_get_machine());
+    unsigned int smp_cpus = ms->smp.cpus;
     PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
-    ctx->page_fault_vcpu_time = g_new0(int64_t, smp_cpus);
+    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
     ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
-    ctx->vcpu_blocktime = g_new0(int64_t, smp_cpus);
+    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
     ctx->exit_notifier.notify = migration_exit_cb;
+    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     qemu_add_exit_notifier(&ctx->exit_notifier);
     return ctx;
 }
 
-static int64List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
+static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
 {
-    int64List *list = NULL, *entry = NULL;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    uint32List *list = NULL, *entry = NULL;
     int i;
 
-    for (i = smp_cpus - 1; i >= 0; i--) {
-        entry = g_new0(int64List, 1);
+    for (i = ms->smp.cpus - 1; i >= 0; i--) {
+        entry = g_new0(uint32List, 1);
         entry->value = ctx->vcpu_blocktime[i];
         entry->next = list;
         list = entry;
@@ -145,7 +181,7 @@ void fill_destination_postcopy_migration_info(MigrationInfo *info)
     info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
 }
 
-static uint64_t get_postcopy_total_blocktime(void)
+static uint32_t get_postcopy_total_blocktime(void)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
     PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
@@ -288,18 +324,12 @@ static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
 
 /* Callback from postcopy_ram_supported_by_host block iterator.
  */
-static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
-                                      ram_addr_t offset, ram_addr_t length, void *opaque)
+static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
 {
-    RAMBlock *rb = qemu_ram_block_by_name(block_name);
+    const char *block_name = qemu_ram_get_idstr(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
     size_t pagesize = qemu_ram_pagesize(rb);
 
-    if (qemu_ram_is_shared(rb)) {
-        error_report("Postcopy on shared RAM (%s) is not yet supported",
-                     block_name);
-        return 1;
-    }
-
     if (length % pagesize) {
         error_report("Postcopy requires RAM blocks to be a page size multiple,"
                      " block %s is 0x" RAM_ADDR_FMT " bytes with a "
@@ -323,6 +353,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
     struct uffdio_register reg_struct;
     struct uffdio_range range_struct;
     uint64_t feature_mask;
+    Error *local_err = NULL;
 
     if (qemu_target_page_size() > pagesize) {
         error_report("Target page size bigger than host page size");
@@ -336,13 +367,19 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
         goto out;
     }
 
+    /* Give devices a chance to object */
+    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
+        error_report_err(local_err);
+        goto out;
+    }
+
     /* Version and features check */
     if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }
 
    /* We don't support postcopy with shared RAM yet */
-    if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
+    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }
 
@@ -411,9 +448,12 @@ out:
  * must be done right at the start prior to pre-copy.
  * opaque should be the MIS.
  */
-static int init_range(const char *block_name, void *host_addr,
-                      ram_addr_t offset, ram_addr_t length, void *opaque)
+static int init_range(RAMBlock *rb, void *opaque)
 {
+    const char *block_name = qemu_ram_get_idstr(rb);
+    void *host_addr = qemu_ram_get_host_addr(rb);
+    ram_addr_t offset = qemu_ram_get_offset(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
     trace_postcopy_init_range(block_name, host_addr, offset, length);
 
     /*
@@ -433,9 +473,12 @@ static int init_range(const char *block_name, void *host_addr,
  * At the end of migration, undo the effects of init_range
  * opaque should be the MIS.
  */
-static int cleanup_range(const char *block_name, void *host_addr,
-                         ram_addr_t offset, ram_addr_t length, void *opaque)
+static int cleanup_range(RAMBlock *rb, void *opaque)
 {
+    const char *block_name = qemu_ram_get_idstr(rb);
+    void *host_addr = qemu_ram_get_host_addr(rb);
+    ram_addr_t offset = qemu_ram_get_offset(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
     MigrationIncomingState *mis = opaque;
     struct uffdio_range range_struct;
     trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
@@ -468,15 +511,29 @@ static int cleanup_range(const char *block_name, void *host_addr,
  * postcopy later; must be called prior to any precopy.
  * called from arch_init's similarly named ram_postcopy_incoming_init
  */
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+int postcopy_ram_incoming_init(MigrationIncomingState *mis)
 {
-    if (qemu_ram_foreach_block(init_range, NULL)) {
+    if (foreach_not_ignored_block(init_range, NULL)) {
         return -1;
     }
 
     return 0;
 }
 
+/*
+ * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage,
+ * last caller wins.
+ */
+static void postcopy_balloon_inhibit(bool state)
+{
+    static bool cur_state = false;
+
+    if (state != cur_state) {
+        qemu_balloon_inhibit(state);
+        cur_state = state;
+    }
+}
+
 /*
  * At the end of a migration where postcopy_ram_incoming_init was called.
  */
@@ -485,31 +542,30 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 {
     trace_postcopy_ram_incoming_cleanup_entry();
 
     if (mis->have_fault_thread) {
-        uint64_t tmp64;
+        Error *local_err = NULL;
+
+        /* Let the fault thread quit */
+        atomic_set(&mis->fault_thread_quit, 1);
+        postcopy_fault_thread_notify(mis);
+        trace_postcopy_ram_incoming_cleanup_join();
+        qemu_thread_join(&mis->fault_thread);
 
-        if (qemu_ram_foreach_block(cleanup_range, mis)) {
+        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
+            error_report_err(local_err);
             return -1;
         }
-
-        /*
-         * Tell the fault_thread to exit, it's an eventfd that should
-         * currently be at 0, we're going to increment it to 1
-         */
-        tmp64 = 1;
-        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
-            trace_postcopy_ram_incoming_cleanup_join();
-            qemu_thread_join(&mis->fault_thread);
-        } else {
-            /* Not much we can do here, but may as well report it */
-            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
-                         strerror(errno));
+
+        if (foreach_not_ignored_block(cleanup_range, mis)) {
+            return -1;
         }
+
         trace_postcopy_ram_incoming_cleanup_closeuf();
         close(mis->userfault_fd);
-        close(mis->userfault_quit_fd);
+        close(mis->userfault_event_fd);
         mis->have_fault_thread = false;
     }
 
-    qemu_balloon_inhibit(false);
+    postcopy_balloon_inhibit(false);
 
     if (enable_mlock) {
         if (os_mlock() < 0) {
@@ -521,8 +577,6 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
         }
     }
 
-    postcopy_state_set(POSTCOPY_INCOMING_END);
-
     if (mis->postcopy_tmp_page) {
         munmap(mis->postcopy_tmp_page, mis->largest_page_size);
         mis->postcopy_tmp_page = NULL;
@@ -541,9 +595,12 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 /*
  * Disable huge pages on an area
  */
-static int nhp_range(const char *block_name, void *host_addr,
-                     ram_addr_t offset, ram_addr_t length, void *opaque)
+static int nhp_range(RAMBlock *rb, void *opaque)
 {
+    const char *block_name = qemu_ram_get_idstr(rb);
+    void *host_addr = qemu_ram_get_host_addr(rb);
+    ram_addr_t offset = qemu_ram_get_offset(rb);
+    ram_addr_t length = qemu_ram_get_used_length(rb);
     trace_postcopy_nhp_range(block_name, host_addr, offset, length);
 
     /*
@@ -563,7 +620,7 @@ static int nhp_range(const char *block_name, void *host_addr,
  */
 int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
 {
-    if (qemu_ram_foreach_block(nhp_range, mis)) {
+    if (foreach_not_ignored_block(nhp_range, mis)) {
         return -1;
     }
 
@@ -574,22 +631,20 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
 
 /*
  * Mark the given area of RAM as requiring notification to unwritten areas
- * Used as a callback on qemu_ram_foreach_block.
+ * Used as a callback on foreach_not_ignored_block.
  * host_addr: Base of area to mark
  * offset: Offset in the whole ram arena
  * length: Length of the section
  * opaque: MigrationIncomingState pointer
  * Returns 0 on success
  */
-static int ram_block_enable_notify(const char *block_name, void *host_addr,
-                                   ram_addr_t offset, ram_addr_t length,
-                                   void *opaque)
+static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
 {
     MigrationIncomingState *mis = opaque;
     struct uffdio_register reg_struct;
 
-    reg_struct.range.start = (uintptr_t)host_addr;
-    reg_struct.range.len = length;
+    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
+    reg_struct.range.len = qemu_ram_get_used_length(rb);
     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 
     /* Now tell our userfault_fd that it's responsible for this area */
@@ -601,7 +656,59 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
         error_report("%s userfault: Region doesn't support COPY", __func__);
         return -1;
     }
+    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
+        qemu_ram_set_uf_zeroable(rb);
+    }
+
+    return 0;
+}
+
+int postcopy_wake_shared(struct PostCopyFD *pcfd,
+                         uint64_t client_addr,
+                         RAMBlock *rb)
+{
+    size_t pagesize = qemu_ram_pagesize(rb);
+    struct uffdio_range range;
+    int ret;
+    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
+    range.start = client_addr & ~(pagesize - 1);
+    range.len = pagesize;
+    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
+    if (ret) {
+        error_report("%s: Failed to wake: %zx in %s (%s)",
+                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
+                     strerror(errno));
+    }
+    return ret;
+}
+/*
+ * Callback from shared fault handlers to ask for a page,
+ * the page must be specified by a RAMBlock and an offset in that rb
+ * Note: Only for use by shared fault handlers (in fault thread)
+ */
+int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
+                                 uint64_t client_addr, uint64_t rb_offset)
+{
+    size_t pagesize = qemu_ram_pagesize(rb);
+    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
+                                       rb_offset);
+    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
+        trace_postcopy_request_shared_page_present(pcfd->idstr,
+                                        qemu_ram_get_idstr(rb), rb_offset);
+        return postcopy_wake_shared(pcfd, client_addr, rb);
+    }
+    if (rb != mis->last_rb) {
+        mis->last_rb = rb;
+        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+                                  aligned_rbo, pagesize);
+    } else {
+        /* Save some space */
+        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
+    }
 
     return 0;
 }
 
@@ -619,6 +726,13 @@ static int get_mem_fault_cpu_index(uint32_t pid)
     return -1;
 }
 
+static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
+{
+    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
+                                    dc->start_time;
+    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
+}
+
 /*
  * This function is being called when pagefault occurs. It
  * tracks down vCPU blocking time.
@@ -633,7 +747,7 @@ static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
     int cpu, already_received;
     MigrationIncomingState *mis = migration_incoming_get_current();
     PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
-    int64_t now_ms;
+    uint32_t low_time_offset;
 
     if (!dc || ptid == 0) {
         return;
@@ -643,22 +757,24 @@ static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
         return;
     }
 
-    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    low_time_offset = get_low_time_offset(dc);
     if (dc->vcpu_addr[cpu] == 0) {
         atomic_inc(&dc->smp_cpus_down);
     }
 
-    atomic_xchg__nocheck(&dc->last_begin, now_ms);
-    atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);
-    atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);
+    atomic_xchg(&dc->last_begin, low_time_offset);
+    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
+    atomic_xchg(&dc->vcpu_addr[cpu], addr);
 
-    /* check it here, not at the begining of the function,
-     * due to, check could accur early than bitmap_set in
-     * qemu_ufd_copy_ioctl */
+    /*
+     * check it here, not at the beginning of the function,
+     * because the check could occur earlier than bitmap_set in
+     * qemu_ufd_copy_ioctl
+     */
     already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
     if (already_received) {
-        atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);
-        atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);
+        atomic_xchg(&dc->vcpu_addr[cpu], 0);
+        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
         atomic_dec(&dc->smp_cpus_down);
     }
     trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
@@ -696,31 +812,31 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
 {
     MigrationIncomingState *mis = migration_incoming_get_current();
     PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
+    MachineState *ms = MACHINE(qdev_get_machine());
+    unsigned int smp_cpus = ms->smp.cpus;
     int i, affected_cpu = 0;
-    int64_t now_ms;
     bool vcpu_total_blocktime = false;
-    int64_t read_vcpu_time;
+    uint32_t read_vcpu_time, low_time_offset;
 
     if (!dc) {
         return;
     }
 
-    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-
+    low_time_offset = get_low_time_offset(dc);
     /* lookup cpu, to clear it,
      * that algorithm looks straighforward, but it's not
     * optimal, more optimal algorithm is keeping tree or hash
     * where key is address value is a list of  */
    for (i = 0; i < smp_cpus; i++) {
-        uint64_t vcpu_blocktime = 0;
+        uint32_t vcpu_blocktime = 0;
 
         read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
         if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
             read_vcpu_time == 0) {
             continue;
         }
-        atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);
-        vcpu_blocktime = now_ms - read_vcpu_time;
+        atomic_xchg(&dc->vcpu_addr[i], 0);
+        vcpu_blocktime = low_time_offset - read_vcpu_time;
         affected_cpu += 1;
         /* we need to know is that mark_postcopy_end was due to
          * faulted page, another possible case it's prefetched
@@ -735,12 +851,24 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
     atomic_sub(&dc->smp_cpus_down, affected_cpu);
 
     if (vcpu_total_blocktime) {
-        dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0);
+        dc->total_blocktime += low_time_offset - atomic_fetch_add(
+                &dc->last_begin, 0);
     }
     trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                       affected_cpu);
 }
 
+static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
+{
+    trace_postcopy_pause_fault_thread();
+
+    qemu_sem_wait(&mis->postcopy_pause_sem_fault);
+
+    trace_postcopy_pause_fault_thread_continued();
+
+    return true;
+}
+
 /*
  * Handle faults detected by the USERFAULT markings
  */
@@ -749,95 +877,220 @@ static void *postcopy_ram_fault_thread(void *opaque)
 {
     MigrationIncomingState *mis = opaque;
     struct uffd_msg msg;
     int ret;
+    size_t index;
     RAMBlock *rb = NULL;
-    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
 
     trace_postcopy_ram_fault_thread_entry();
+    rcu_register_thread();
+    mis->last_rb = NULL; /* last RAMBlock we sent part of */
     qemu_sem_post(&mis->fault_thread_sem);
 
+    struct pollfd *pfd;
+    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
+
+    pfd = g_new0(struct pollfd, pfd_len);
+
+    pfd[0].fd = mis->userfault_fd;
+    pfd[0].events = POLLIN;
+    pfd[1].fd = mis->userfault_event_fd;
+    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
+    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
+        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
+                                                 struct PostCopyFD, index);
+        pfd[2 + index].fd = pcfd->fd;
+        pfd[2 + index].events = POLLIN;
+        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
+                                                  pcfd->fd);
+    }
+
     while (true) {
         ram_addr_t rb_offset;
-        struct pollfd pfd[2];
+        int poll_result;
 
         /*
          * We're mainly waiting for the kernel to give us a faulting HVA,
          * however we can be told to quit via userfault_quit_fd which is
          * an eventfd
         */
-        pfd[0].fd = mis->userfault_fd;
-        pfd[0].events = POLLIN;
-        pfd[0].revents = 0;
-        pfd[1].fd = mis->userfault_quit_fd;
-        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
-        pfd[1].revents = 0;
-
-        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+
+        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
+        if (poll_result == -1) {
             error_report("%s: userfault poll: %s", __func__, strerror(errno));
             break;
         }
 
-        if (pfd[1].revents) {
-            trace_postcopy_ram_fault_thread_quit();
-            break;
+        if (!mis->to_src_file) {
+            /*
+             * Possibly someone tells us that the return path is
+             * broken already using the event. We should hold until
+             * the channel is rebuilt.
+             */
+            if (postcopy_pause_fault_thread(mis)) {
+                mis->last_rb = NULL;
+                /* Continue to read the userfaultfd */
+            } else {
+                error_report("%s: paused but don't allow to continue",
+                             __func__);
+                break;
+            }
         }
 
-        ret = read(mis->userfault_fd, &msg, sizeof(msg));
-        if (ret != sizeof(msg)) {
-            if (errno == EAGAIN) {
-                /*
-                 * if a wake up happens on the other thread just after
-                 * the poll, there is nothing to read.
-                 */
-                continue;
+        if (pfd[1].revents) {
+            uint64_t tmp64 = 0;
+
+            /* Consume the signal */
+            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
+                /* Nothing obviously nicer than posting this error. */
+                error_report("%s: read() failed", __func__);
             }
-            if (ret < 0) {
-                error_report("%s: Failed to read full userfault message: %s",
-                             __func__, strerror(errno));
+
+            if (atomic_read(&mis->fault_thread_quit)) {
+                trace_postcopy_ram_fault_thread_quit();
                 break;
-            } else {
-                error_report("%s: Read %d bytes from userfaultfd expected %zd",
-                             __func__, ret, sizeof(msg));
-                break; /* Lost alignment, don't know what we'd read next */
             }
         }
-        if (msg.event != UFFD_EVENT_PAGEFAULT) {
-            error_report("%s: Read unexpected event %ud from userfaultfd",
-                         __func__, msg.event);
-            continue; /* It's not a page fault, shouldn't happen */
-        }
 
-        rb = qemu_ram_block_from_host(
-                 (void *)(uintptr_t)msg.arg.pagefault.address,
-                 true, &rb_offset);
-        if (!rb) {
-            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
-                         PRIx64, (uint64_t)msg.arg.pagefault.address);
-            break;
-        }
-
-        rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
-        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+        if (pfd[0].revents) {
+            poll_result--;
+            ret = read(mis->userfault_fd, &msg, sizeof(msg));
+            if (ret != sizeof(msg)) {
+                if (errno == EAGAIN) {
+                    /*
+                     * if a wake up happens on the other thread just after
+                     * the poll, there is nothing to read.
+                     */
+                    continue;
+                }
+                if (ret < 0) {
+                    error_report("%s: Failed to read full userfault "
+                                 "message: %s",
+                                 __func__, strerror(errno));
+                    break;
+                } else {
+                    error_report("%s: Read %d bytes from userfaultfd "
+                                 "expected %zd",
+                                 __func__, ret, sizeof(msg));
+                    break; /* Lost alignment, don't know what we'd read next */
+                }
+            }
+            if (msg.event != UFFD_EVENT_PAGEFAULT) {
+                error_report("%s: Read unexpected event %ud from userfaultfd",
+                             __func__, msg.event);
+                continue; /* It's not a page fault, shouldn't happen */
+            }
+
+            rb = qemu_ram_block_from_host(
+                     (void *)(uintptr_t)msg.arg.pagefault.address,
+                     true, &rb_offset);
+            if (!rb) {
+                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+                             PRIx64, (uint64_t)msg.arg.pagefault.address);
+                break;
+            }
+
+            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
+            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                 qemu_ram_get_idstr(rb),
                                                 rb_offset,
                                                 msg.arg.pagefault.feat.ptid);
+            mark_postcopy_blocktime_begin(
+                    (uintptr_t)(msg.arg.pagefault.address),
+                    msg.arg.pagefault.feat.ptid, rb);
 
-        mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address),
-                                      msg.arg.pagefault.feat.ptid, rb);
-
-        /*
-         * Send the request to the source - we want to request one
-         * of our host page sizes (which is >= TPS)
-         */
-        if (rb != last_rb) {
-            last_rb = rb;
-            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
-                                      rb_offset, qemu_ram_pagesize(rb));
-        } else {
-            /* Save some space */
-            migrate_send_rp_req_pages(mis, NULL,
-                                      rb_offset, qemu_ram_pagesize(rb));
+retry:
+            /*
+             * Send the request to the source - we want to request one
+             * of our host page sizes (which is >= TPS)
+             */
+            if (rb != mis->last_rb) {
+                mis->last_rb = rb;
+                ret = migrate_send_rp_req_pages(mis,
+                                                qemu_ram_get_idstr(rb),
+                                                rb_offset,
+                                                qemu_ram_pagesize(rb));
+            } else {
+                /* Save some space */
+                ret = migrate_send_rp_req_pages(mis,
+                                                NULL,
+                                                rb_offset,
+                                                qemu_ram_pagesize(rb));
+            }
+
+            if (ret) {
+                /* May be network failure, try to wait for recovery */
+                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
+                    /* We got reconnected somehow, try to continue */
+                    mis->last_rb = NULL;
+                    goto retry;
+                } else {
+                    /* This is an unavoidable fault */
+                    error_report("%s: migrate_send_rp_req_pages() get %d",
+                                 __func__, ret);
+                    break;
+                }
+            }
         }
+
+        /* Now handle any requests from external processes on shared memory */
+        /* TODO: May need to handle devices deregistering during postcopy */
+        for (index = 2; index < pfd_len && poll_result; index++) {
+            if (pfd[index].revents) {
+                struct PostCopyFD *pcfd =
+                    &g_array_index(mis->postcopy_remote_fds,
+                                   struct PostCopyFD, index - 2);
+
+                poll_result--;
+                if (pfd[index].revents & POLLERR) {
+                    error_report("%s: POLLERR on poll %zd fd=%d",
+                                 __func__, index, pcfd->fd);
+                    pfd[index].events = 0;
+                    continue;
+                }
+
+                ret = read(pcfd->fd, &msg, sizeof(msg));
+                if (ret != sizeof(msg)) {
+                    if (errno == EAGAIN) {
+                        /*
+                         * if a wake up happens on the other thread just after
+                         * the poll, there is nothing to read.
+                         */
+                        continue;
+                    }
+                    if (ret < 0) {
+                        error_report("%s: Failed to read full userfault "
+                                     "message: %s (shared) revents=%d",
+                                     __func__, strerror(errno),
+                                     pfd[index].revents);
+                        /* TODO: Could just disable this sharer */
+                        break;
+                    } else {
+                        error_report("%s: Read %d bytes from userfaultfd "
+                                     "expected %zd (shared)",
+                                     __func__, ret, sizeof(msg));
+                        /* TODO: Could just disable this sharer */
+                        break; /* Lost alignment, don't know what we'd read next */
+                    }
+                }
+                if (msg.event != UFFD_EVENT_PAGEFAULT) {
+                    error_report("%s: Read unexpected event %ud "
+                                 "from userfaultfd (shared)",
+                                 __func__, msg.event);
+                    continue; /* It's not a page fault, shouldn't happen */
+                }
+                /* Call the device handler registered with us */
+                ret = pcfd->handler(pcfd, &msg);
+                if (ret) {
+                    error_report("%s: Failed to resolve shared fault on %zd/%s",
+                                 __func__, index, pcfd->idstr);
+                    /* TODO: Fail? Disable this sharer? */
+                }
+            }
+        }
     }
+    rcu_unregister_thread();
     trace_postcopy_ram_fault_thread_exit();
+    g_free(pfd);
     return NULL;
 }
 
@@ -860,9 +1113,9 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis)
     }
 
     /* Now an eventfd we use to tell the fault-thread to quit */
-    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
-    if (mis->userfault_quit_fd == -1) {
-        error_report("%s: Opening userfault_quit_fd: %s", __func__,
+    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
+    if (mis->userfault_event_fd == -1) {
+        error_report("%s: Opening userfault_event_fd: %s", __func__,
                      strerror(errno));
         close(mis->userfault_fd);
         return -1;
@@ -876,15 +1129,42 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis)
     mis->have_fault_thread = true;
 
     /* Mark so that we get notified of accesses to unwritten areas */
-    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
+        error_report("ram_block_enable_notify failed");
         return -1;
     }
 
+    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
+                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                                  MAP_ANONYMOUS, -1, 0);
+    if (mis->postcopy_tmp_page == MAP_FAILED) {
+        mis->postcopy_tmp_page = NULL;
+        error_report("%s: Failed to map postcopy_tmp_page %s",
+                     __func__, strerror(errno));
+        return -1;
+    }
+
+    /*
+     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
+     */
+    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
+                                       PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_ANONYMOUS,
+                                       -1, 0);
+    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
+        int e = errno;
+        mis->postcopy_tmp_zero_page = NULL;
+        error_report("%s: Failed to map large zero page %s",
+                     __func__, strerror(e));
+        return -e;
+    }
+    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
+
     /*
      * Ballooning can mark pages as absent while we're postcopying
      * that would cause false userfaults.
      */
-    qemu_balloon_inhibit(true);
+    postcopy_balloon_inhibit(true);
 
     trace_postcopy_ram_enable_notify();
 
@@ -918,6 +1198,22 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
     return ret;
 }
 
+int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
+{
+    int i;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    GArray *pcrfds = mis->postcopy_remote_fds;
+
+    for (i = 0; i < pcrfds->len; i++) {
+        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
+        int ret = cur->waker(cur, rb, offset);
+        if (ret) {
+            return ret;
+        }
+    }
+    return 0;
+}
+
 /*
  * Place a host page (from) at (host) atomically
  * returns 0 on success
@@ -941,7 +1237,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
     }
 
     trace_postcopy_place_page(host);
-    return 0;
+    return postcopy_notify_shared_wake(rb,
+                                       qemu_ram_block_host_offset(rb, host));
 }
 
 /*
@@ -951,62 +1248,26 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
 int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                              RAMBlock *rb)
 {
+    size_t pagesize = qemu_ram_pagesize(rb);
     trace_postcopy_place_page_zero(host);
-    if (qemu_ram_pagesize(rb) == getpagesize()) {
-        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, getpagesize(),
-                                rb)) {
+
+    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
+     * but it's not available for everything (e.g. hugetlbpages)
+     */
+    if (qemu_ram_is_uf_zeroable(rb)) {
+        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
             int e = errno;
             error_report("%s: %s zero host: %p",
                          __func__, strerror(e), host);
 
             return -e;
         }
+        return postcopy_notify_shared_wake(rb,
+                                           qemu_ram_block_host_offset(rb,
+                                                                      host));
     } else {
-        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
-        if (!mis->postcopy_tmp_zero_page) {
-            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
-                                               PROT_READ | PROT_WRITE,
-                                               MAP_PRIVATE | MAP_ANONYMOUS,
-                                               -1, 0);
-            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
-                int e = errno;
-                mis->postcopy_tmp_zero_page = NULL;
-                error_report("%s: %s mapping large zero page",
-                             __func__, strerror(e));
-                return -e;
-            }
-            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
-        }
-        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
-                                   rb);
+        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
     }
-
-    return 0;
 }
-
-/*
- * Returns a target page of memory that can be mapped at a later point in time
- * using postcopy_place_page
- * The same address is used repeatedly, postcopy_place_page just takes the
- * backing page away.
- * Returns: Pointer to allocated page
- *
- */
-void *postcopy_get_tmp_page(MigrationIncomingState *mis)
-{
-    if (!mis->postcopy_tmp_page) {
-        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
-                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
-                                      MAP_ANONYMOUS, -1, 0);
-        if (mis->postcopy_tmp_page == MAP_FAILED) {
-            mis->postcopy_tmp_page = NULL;
-            error_report("%s: %s", __func__, strerror(errno));
-            return NULL;
-        }
-    }
-
-    return mis->postcopy_tmp_page;
-}
 
 #else
 /* No target OS support, stubs just fail */
@@ -1021,7 +1282,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
     return false;
 }
 
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+int postcopy_ram_incoming_init(MigrationIncomingState *mis)
 {
     error_report("postcopy_ram_incoming_init: No OS support");
     return -1;
@@ -1039,6 +1300,13 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
     return -1;
 }
 
+int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
+                                 uint64_t client_addr, uint64_t rb_offset)
+{
+    assert(0);
+    return -1;
+}
+
 int postcopy_ram_enable_notify(MigrationIncomingState *mis)
 {
     assert(0);
@@ -1059,37 +1327,47 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
     return -1;
 }
 
-void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+int postcopy_wake_shared(struct PostCopyFD *pcfd,
+                         uint64_t client_addr,
+                         RAMBlock *rb)
 {
     assert(0);
-    return NULL;
+    return -1;
 }
-
 #endif
 
 /* ------------------------------------------------------------------------- */
 
+void postcopy_fault_thread_notify(MigrationIncomingState *mis)
+{
+    uint64_t tmp64 = 1;
+
+    /*
+     * Wakeup the fault_thread. It's an eventfd that should currently
+     * be at 0, we're going to increment it to 1
+     */
+    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
+        /* Not much we can do here, but may as well report it */
+        error_report("%s: incrementing failed: %s", __func__,
+                     strerror(errno));
+    }
+}
+
 /**
  * postcopy_discard_send_init: Called at the start of each RAMBlock before
  * asking to discard individual ranges.
  *
  * @ms: The current migration state.
- * @offset: the bitmap offset of the named RAMBlock in the migration
- *     bitmap.
+ * @offset: the bitmap offset of the named RAMBlock in the migration bitmap.
 * @name: RAMBlock that discards will operate on.
- *
- * returns: a new PDS.
 */
-PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
-                                                 const char *name)
+static PostcopyDiscardState pds = {0};
+void postcopy_discard_send_init(MigrationState *ms, const char *name)
 {
-    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
-
-    if (res) {
-        res->ramblock_name = name;
-    }
-
-    return res;
+    pds.ramblock_name = name;
+    pds.cur_entry = 0;
+    pds.nsentwords = 0;
+    pds.nsentcmds = 0;
 }
 
 /**
@@ -1098,30 +1376,29 @@ PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
 * be sent later.
 *
 * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
  * @start,@length: a range of pages in the migration bitmap in the
  *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
  */
-void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
-                                 unsigned long start, unsigned long length)
+void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
+                                 unsigned long length)
 {
     size_t tp_size = qemu_target_page_size();
     /* Convert to byte offsets within the RAM block */
-    pds->start_list[pds->cur_entry] = start * tp_size;
-    pds->length_list[pds->cur_entry] = length * tp_size;
-    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
-    pds->cur_entry++;
-    pds->nsentwords++;
+    pds.start_list[pds.cur_entry] = start * tp_size;
+    pds.length_list[pds.cur_entry] = length * tp_size;
+    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
+    pds.cur_entry++;
+    pds.nsentwords++;
 
-    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
         /* Full set, ship it! */
         qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
-                                              pds->ramblock_name,
-                                              pds->cur_entry,
-                                              pds->start_list,
-                                              pds->length_list);
-        pds->nsentcmds++;
-        pds->cur_entry = 0;
+                                              pds.ramblock_name,
+                                              pds.cur_entry,
+                                              pds.start_list,
+                                              pds.length_list);
+        pds.nsentcmds++;
+        pds.cur_entry = 0;
     }
 }
 
@@ -1130,24 +1407,21 @@ void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
  * bitmap code. Sends any outstanding discard messages, frees the PDS
  *
  * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
  */
-void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+void postcopy_discard_send_finish(MigrationState *ms)
 {
     /* Anything unsent? */
-    if (pds->cur_entry) {
+    if (pds.cur_entry) {
         qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
-                                              pds->ramblock_name,
-                                              pds->cur_entry,
-                                              pds->start_list,
-                                              pds->length_list);
-        pds->nsentcmds++;
+                                              pds.ramblock_name,
+                                              pds.cur_entry,
+                                              pds.start_list,
+                                              pds.length_list);
+        pds.nsentcmds++;
     }
 
-    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
-                                       pds->nsentcmds);
-
-    g_free(pds);
+    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
+                                       pds.nsentcmds);
 }
 
 /*
@@ -1167,3 +1441,31 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
 {
     return atomic_xchg(&incoming_postcopy_state, new_state);
 }
+
+/* Register a handler for external shared memory postcopy
+ * called on the destination.
+ */
+void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
+                                                  *pcfd);
+}
+
+/* Unregister a handler for external shared memory postcopy
+ */
+void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
+{
+    guint i;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    GArray *pcrfds = mis->postcopy_remote_fds;
+
+    for (i = 0; i < pcrfds->len; i++) {
+        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
+        if (cur->fd == pcfd->fd) {
+            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
+            return;
+        }
+    }
+}
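
The diff above bundles several distinct mechanisms. The standalone C sketches that follow illustrate them outside of QEMU; every type, helper, and constant in them that does not appear in the diff is invented for illustration, and they are compile-and-run approximations, not QEMU code.

First, the notifier chain behind postcopy_notify(): interested parties register a callback, and the POSTCOPY_NOTIFY_PROBE notification in postcopy_ram_supported_by_host() lets any of them veto postcopy by returning nonzero. A minimal standalone analogue of the notifier-with-return pattern (QEMU's real API lives in qemu/notify.h; the names below are mine):

#include <stdio.h>

typedef struct VetoNotifier VetoNotifier;
struct VetoNotifier {
    int (*notify)(VetoNotifier *n, void *data); /* nonzero == veto */
    VetoNotifier *next;
};

typedef struct { VetoNotifier *head; } VetoList;

static void veto_list_add(VetoList *l, VetoNotifier *n)
{
    n->next = l->head;
    l->head = n;
}

/* Stop at the first notifier that objects, as
 * notifier_with_return_list_notify() does */
static int veto_list_notify(VetoList *l, void *data)
{
    for (VetoNotifier *n = l->head; n; n = n->next) {
        int ret = n->notify(n, data);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

static int deny_probe(VetoNotifier *n, void *data)
{
    fprintf(stderr, "device objects to postcopy\n");
    return -1;
}

int main(void)
{
    VetoList list = { NULL };
    VetoNotifier dev = { deny_probe, NULL };
    veto_list_add(&list, &dev);
    return veto_list_notify(&list, NULL) ? 1 : 0;
}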
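At the core of the file is the userfaultfd flow that ram_block_enable_notify() and postcopy_place_page() drive: open the fd, negotiate UFFDIO_API, register a range with UFFDIO_REGISTER_MODE_MISSING, read a uffd_msg describing a fault, and resolve it with UFFDIO_COPY. A self-contained demo of that round trip, assuming Linux with userfaultfd enabled (error handling trimmed; build with -lpthread):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static void *toucher(void *addr)
{
    /* First read faults and blocks until UFFDIO_COPY resolves it */
    return (void *)(uintptr_t)*(volatile char *)addr;
}

int main(void)
{
    long psize = sysconf(_SC_PAGESIZE);
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
    struct uffdio_api api = { .api = UFFD_API };

    if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api)) {
        perror("userfaultfd");
        return 1;
    }

    char *area = mmap(NULL, psize, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t)area, .len = psize },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    if (area == MAP_FAILED || ioctl(uffd, UFFDIO_REGISTER, &reg)) {
        perror("UFFDIO_REGISTER");
        return 1;
    }

    pthread_t t;
    pthread_create(&t, NULL, toucher, area);

    /* What postcopy_ram_fault_thread() gets from read(): a uffd_msg */
    struct uffd_msg msg;
    if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
        msg.event != UFFD_EVENT_PAGEFAULT) {
        return 1;
    }

    /* Atomically place content and wake the faulter, which is what
     * postcopy_place_page() does via qemu_ufd_copy_ioctl() */
    char *page = mmap(NULL, psize, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    memset(page, 0x5a, psize);
    struct uffdio_copy copy = {
        .dst = msg.arg.pagefault.address & ~(uint64_t)(psize - 1),
        .src = (uintptr_t)page,
        .len = psize,
    };
    if (ioctl(uffd, UFFDIO_COPY, &copy)) {
        perror("UFFDIO_COPY");
        return 1;
    }
    pthread_join(t, NULL);
    printf("faulted page now reads 0x%x\n", area[0]);
    return 0;
}

The shared-memory path in the patch adds one more ioctl to this vocabulary: when the requested page turns out to be already received, postcopy_wake_shared() issues UFFDIO_WAKE on the page-aligned range instead of re-requesting it from the source.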
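ram_block_enable_notify() now records whether the registered range supports _UFFDIO_ZEROPAGE (qemu_ram_set_uf_zeroable()), and postcopy_place_page_zero() branches on it: zero the page directly, or fall back to UFFDIO_COPY from the pre-zeroed postcopy_tmp_zero_page that postcopy_ram_enable_notify() maps up front, since hugetlb ranges typically lack UFFDIO_ZEROPAGE. A sketch of that decision; place_zero() is a hypothetical helper, and supported_ioctls stands for the uffdio_register.ioctls mask the kernel filled in:

#include <linux/userfaultfd.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

int place_zero(int uffd, uint64_t dst, uint64_t pagesize,
               uint64_t supported_ioctls, void *zero_buf)
{
    if (supported_ioctls & ((uint64_t)1 << _UFFDIO_ZEROPAGE)) {
        /* Kernel can zero-fill the page directly */
        struct uffdio_zeropage zp = {
            .range = { .start = dst, .len = pagesize },
        };
        return ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
    }
    /* Otherwise copy from a pre-zeroed buffer, the role of
     * postcopy_tmp_zero_page in the patch */
    struct uffdio_copy copy = {
        .dst = dst, .src = (uintptr_t)zero_buf, .len = pagesize,
    };
    return ioctl(uffd, UFFDIO_COPY, &copy);
}

int main(void)
{
    /* Which path would each kind of range take? */
    uint64_t anon = (uint64_t)1 << _UFFDIO_ZEROPAGE, huge = 0;
    printf("anon page: %s\n", anon ? "UFFDIO_ZEROPAGE" : "copy zeros");
    printf("huge page: %s\n", huge ? "UFFDIO_ZEROPAGE" : "copy zeros");
    return 0;
}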
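The userfault_quit_fd to userfault_event_fd rename reflects a protocol change: the eventfd no longer means only "quit". postcopy_fault_thread_notify() writes an 8-byte 1 to wake the fault thread, which consumes the counter and then checks the atomic fault_thread_quit flag to distinguish shutdown from a "return path broken, pause and retry" wakeup. A compressed sketch of that wake/consume/decide pattern:

#include <poll.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

static atomic_int quit_flag;

static void notify(int efd)            /* postcopy_fault_thread_notify() */
{
    uint64_t one = 1;
    if (write(efd, &one, sizeof(one)) != sizeof(one)) {
        perror("eventfd write");
    }
}

int main(void)
{
    int efd = eventfd(0, EFD_CLOEXEC);

    atomic_store(&quit_flag, 1);       /* what the cleanup path sets ... */
    notify(efd);                       /* ... before waking the thread */

    struct pollfd pfd = { .fd = efd, .events = POLLIN };
    while (poll(&pfd, 1, -1) > 0) {
        uint64_t val;
        if (read(efd, &val, sizeof(val)) != sizeof(val)) { /* consume */
            break;
        }
        if (atomic_load(&quit_flag)) { /* quit vs. plain re-poll wakeup */
            puts("fault thread: quitting");
            break;
        }
    }
    close(efd);
    return 0;
}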
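The blocktime fields shrink from int64_t to uint32_t by storing milliseconds relative to the context's start_time instead of absolute QEMU_CLOCK_REALTIME values; get_low_time_offset() also clamps the offset to at least 1 because 0 is reserved to mean "no fault in flight" for that vCPU. A sketch of the arithmetic under those assumptions:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static int64_t now_ms(void)            /* stand-in for qemu_clock_get_ms() */
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (int64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

/* Same shape as get_low_time_offset(): relative, truncated, never 0 */
static uint32_t low_time_offset(int64_t start_time)
{
    int64_t off = now_ms() - start_time;
    return off < 1 ? 1 : (uint32_t)(off & UINT32_MAX);
}

int main(void)
{
    int64_t start = now_ms();          /* what blocktime_context_new() records */
    uint32_t begin = low_time_offset(start);  /* fault begins */
    uint32_t end = low_time_offset(start);    /* page arrives */
    printf("blocked for %u ms\n", end - begin);
    return 0;
}

The 32-bit offset wraps after about 49.7 days (2^32 ms), which is acceptable for the lifetime of a single migration.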
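postcopy_register_shared_ufd() and postcopy_unregister_shared_ufd() keep struct PostCopyFD entries by value in a GArray, and the fault thread sizes its pollfd array as 2 + postcopy_remote_fds->len so external sharers are polled alongside the core fds. A sketch of that registry using the same GLib calls (the struct is abbreviated; the real one also carries handler and waker callbacks plus an opaque data pointer):

#include <glib.h>

typedef struct PostCopyFD PostCopyFD;
struct PostCopyFD {
    int fd;
    const char *idstr;
    /* handler/waker callbacks elided for brevity */
};

int main(void)
{
    GArray *fds = g_array_new(FALSE, TRUE, sizeof(PostCopyFD));

    PostCopyFD dev = { .fd = 42, .idstr = "shared-dev-0" };
    g_array_append_val(fds, dev);          /* postcopy_register_shared_ufd */

    for (guint i = 0; i < fds->len; i++) { /* what the poll setup iterates */
        PostCopyFD *cur = &g_array_index(fds, PostCopyFD, i);
        g_print("would poll fd=%d (%s)\n", cur->fd, cur->idstr);
    }

    for (guint i = 0; i < fds->len; i++) { /* postcopy_unregister_shared_ufd:
                                            * keyed on the fd */
        if (g_array_index(fds, PostCopyFD, i).fd == dev.fd) {
            g_array_remove_index(fds, i);
            break;
        }
    }
    g_array_free(fds, TRUE);
    return 0;
}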
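On the source side, the PDS conversion makes postcopy_discard_send_init(), _send_range(), and _send_finish() operate on a single static PostcopyDiscardState instead of a heap allocation per RAMBlock — workable because discards for only one block are ever in flight at a time. The batching itself is unchanged: accumulate page ranges, convert them to byte offsets, and ship one command when the buffer fills or at finish. A sketch with a stubbed-out send (the constant's value here is illustrative, not taken from the diff):

#include <stdio.h>

#define MAX_DISCARDS_PER_COMMAND 12

static struct {
    const char *name;
    unsigned int cur;
    unsigned long starts[MAX_DISCARDS_PER_COMMAND];
    unsigned long lens[MAX_DISCARDS_PER_COMMAND];
} pds;

static void send_discard_command(void) /* stands in for the savevm call */
{
    printf("discard cmd: %s, %u ranges\n", pds.name, pds.cur);
    pds.cur = 0;
}

static void discard_range(unsigned long start, unsigned long len,
                          unsigned long page_size)
{
    pds.starts[pds.cur] = start * page_size; /* pages -> byte offsets */
    pds.lens[pds.cur] = len * page_size;
    if (++pds.cur == MAX_DISCARDS_PER_COMMAND) {
        send_discard_command();              /* full set, ship it */
    }
}

static void discard_finish(void)
{
    if (pds.cur) {
        send_discard_command();              /* anything unsent */
    }
}

int main(void)
{
    pds.name = "pc.ram";
    for (unsigned long p = 0; p < 30; p += 2) {
        discard_range(p, 1, 4096);           /* every other page */
    }
    discard_finish();
    return 0;
}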