KVM: x86/speculation: Disable Fill buffer clear within guests
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63f9cb33cc19defe9993110bd986dacb0945879f..62298046ea34ad5ddd263d2ec88c47c2b7ea0a60 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,7 +68,9 @@
 #include <asm/mce.h>
 #include <asm/pkru.h>
 #include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
@@ -293,8 +295,6 @@ u64 __read_mostly host_xcr0;
 u64 __read_mostly supported_xcr0;
 EXPORT_SYMBOL_GPL(supported_xcr0);
 
-static struct kmem_cache *x86_fpu_cache;
-
 static struct kmem_cache *x86_emulator_cache;
 
 /*
@@ -848,6 +848,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+       kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
        vcpu->arch.pdptrs_from_userspace = false;
 
 out:
@@ -1091,6 +1092,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
        unsigned long roots_to_free = 0;
        int i;
 
+       /*
+        * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+        * this is reachable when running EPT=1 and unrestricted_guest=0, and
+        * also via the emulator.  KVM's TDP page tables are not in the scope of
+        * the invalidation, but the guest's TLB entries need to be flushed as
+        * the CPU may have cached entries in its TLB for the target PCID.
+        */
+       if (unlikely(tdp_enabled)) {
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+               return;
+       }
+
        /*
         * If neither the current CR3 nor any of the prev_roots use the given
         * PCID, then nothing needs to be done here because a resync will
@@ -1311,7 +1324,7 @@ static const u32 msrs_to_save_all[] = {
        MSR_IA32_UMWAIT_CONTROL,
 
        MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
-       MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+       MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
        MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
        MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
        MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -1497,6 +1510,9 @@ static u64 kvm_get_arch_capabilities(void)
                 */
        }
 
+       /* Guests don't need to know "Fill buffer clear control" exists */
+       data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
        return data;
 }
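
This hunk is the one the patch subject refers to: ARCH_CAP_FB_CLEAR_CTRL is masked out of the capabilities KVM reports, so guests never learn that fill buffer clearing is controllable. A small sketch of the guest-visible effect (illustration only, not part of the patch; it reuses kvm_get_arch_capabilities() and ARCH_CAP_FB_CLEAR_CTRL from this file):

        /*
         * vcpu->arch.arch_capabilities is normally seeded from
         * kvm_get_arch_capabilities(), so a guest RDMSR of
         * MSR_IA32_ARCH_CAPABILITIES never reports the control bit, and the
         * guest has no architectural reason to try setting FB_CLEAR_DIS in
         * MSR_IA32_MCU_OPT_CTRL.
         */
        WARN_ON_ONCE(kvm_get_arch_capabilities() & ARCH_CAP_FB_CLEAR_CTRL);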
 
@@ -1592,8 +1608,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                return r;
        }
 
-       /* Update reserved bits */
-       if ((efer ^ old_efer) & EFER_NX)
+       if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
                kvm_mmu_reset_context(vcpu);
 
        return 0;
@@ -2542,7 +2557,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-       spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
        if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
        } else if (!already_matched) {
@@ -2550,7 +2565,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        }
 
        kvm_track_tsc_matching(vcpu);
-       spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
 }
 
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2780,9 +2795,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        kvm_make_mclock_inprogress_request(kvm);
 
        /* no guest entries from this point */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        pvclock_update_vm_gtod_copy(kvm);
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2800,15 +2815,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        unsigned long flags;
        u64 ret;
 
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
                return get_kvmclock_base_ns() + ka->kvmclock_offset;
        }
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
        get_cpu();
@@ -2902,13 +2917,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * If the host uses TSC clock, then passthrough TSC as stable
         * to the guest.
         */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        use_master_clock = ka->use_master_clock;
        if (use_master_clock) {
                host_tsc = ka->master_cycle_now;
                kernel_ns = ka->master_kernel_ns;
        }
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -3193,10 +3208,36 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_tlb_flush_guest)(vcpu);
 }
 
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context.  In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior to nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+               kvm_vcpu_flush_tlb_current(vcpu);
+
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+               kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_map map;
-       struct kvm_steal_time *st;
+       struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+       struct kvm_steal_time __user *st;
+       struct kvm_memslots *slots;
+       u64 steal;
+       u32 version;
 
        if (kvm_xen_msr_enabled(vcpu->kvm)) {
                kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3247,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
 
-       /* -EAGAIN is returned in atomic context so we can just return. */
-       if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-                       &map, &vcpu->arch.st.cache, false))
+       if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
                return;
 
-       st = map.hva +
-               offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+       slots = kvm_memslots(vcpu->kvm);
+
+       if (unlikely(slots->generation != ghc->generation ||
+                    kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+               gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+               /* We rely on the fact that it fits in a single page. */
+               BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+               if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+                   kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+                       return;
+       }
 
+       st = (struct kvm_steal_time __user *)ghc->hva;
        /*
         * Doing a TLB flush here, on the guest's behalf, can avoid
         * expensive IPIs.
         */
        if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
-               u8 st_preempted = xchg(&st->preempted, 0);
+               u8 st_preempted = 0;
+               int err = -EFAULT;
+
+               if (!user_access_begin(st, sizeof(*st)))
+                       return;
+
+               asm volatile("1: xchgb %0, %2\n"
+                            "xor %1, %1\n"
+                            "2:\n"
+                            _ASM_EXTABLE_UA(1b, 2b)
+                            : "+q" (st_preempted),
+                              "+&r" (err),
+                              "+m" (st->preempted));
+               if (err)
+                       goto out;
+
+               user_access_end();
+
+               vcpu->arch.st.preempted = 0;
 
                trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
                                       st_preempted & KVM_VCPU_FLUSH_TLB);
                if (st_preempted & KVM_VCPU_FLUSH_TLB)
                        kvm_vcpu_flush_tlb_guest(vcpu);
+
+               if (!user_access_begin(st, sizeof(*st)))
+                       goto dirty;
        } else {
-               st->preempted = 0;
-       }
+               if (!user_access_begin(st, sizeof(*st)))
+                       return;
 
-       vcpu->arch.st.preempted = 0;
+               unsafe_put_user(0, &st->preempted, out);
+               vcpu->arch.st.preempted = 0;
+       }
 
-       if (st->version & 1)
-               st->version += 1;  /* first time write, random junk */
+       unsafe_get_user(version, &st->version, out);
+       if (version & 1)
+               version += 1;  /* first time write, random junk */
 
-       st->version += 1;
+       version += 1;
+       unsafe_put_user(version, &st->version, out);
 
        smp_wmb();
 
-       st->steal += current->sched_info.run_delay -
+       unsafe_get_user(steal, &st->steal, out);
+       steal += current->sched_info.run_delay -
                vcpu->arch.st.last_steal;
        vcpu->arch.st.last_steal = current->sched_info.run_delay;
+       unsafe_put_user(steal, &st->steal, out);
 
-       smp_wmb();
+       version += 1;
+       unsafe_put_user(version, &st->version, out);
 
-       st->version += 1;
-
-       kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+       user_access_end();
+ dirty:
+       mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
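
The rewritten record_steal_time() above stops mapping the steal-time page into the kernel and instead writes it through the user address held in the gfn_to_hva cache. A minimal, self-contained sketch of the guarded-access pattern it relies on (the update_version() helper below is hypothetical, shown only to illustrate the idiom):

        #include <linux/uaccess.h>      /* user_access_begin(), unsafe_put_user() */
        #include <asm/kvm_para.h>       /* struct kvm_steal_time */

        /*
         * user_access_begin() enables userspace access (STAC on x86) and must be
         * paired with user_access_end(); unsafe_get_user()/unsafe_put_user() jump
         * to the supplied label on a fault instead of returning an error.  The
         * open-coded xchg with an exception-table entry in the real function is
         * needed only because there is no unsafe_xchg_user() equivalent.
         */
        static int update_version(struct kvm_steal_time __user *st)
        {
                u32 version;

                if (!user_access_begin(st, sizeof(*st)))
                        return -EFAULT;

                unsafe_get_user(version, &st->version, err);
                unsafe_put_user(version + 1, &st->version, err);
                user_access_end();
                return 0;

        err:
                user_access_end();
                return -EFAULT;
        }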
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3282,7 +3362,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
                if (!msr_info->host_initiated)
                        return 1;
-               if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+               if (kvm_get_msr_feature(&msr_ent))
                        return 1;
                if (data & ~msr_ent.data)
                        return 1;
@@ -3376,6 +3456,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
+               kvm_update_cpuid_runtime(vcpu);
                break;
        case MSR_SMI_COUNT:
                if (!msr_info->host_initiated)
@@ -4285,8 +4366,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_map map;
-       struct kvm_steal_time *st;
+       struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+       struct kvm_steal_time __user *st;
+       struct kvm_memslots *slots;
+       static const u8 preempted = KVM_VCPU_PREEMPTED;
 
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
@@ -4294,16 +4377,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        if (vcpu->arch.st.preempted)
                return;
 
-       if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-                       &vcpu->arch.st.cache, true))
+       /* This happens on process exit */
+       if (unlikely(current->mm != vcpu->kvm->mm))
+               return;
+
+       slots = kvm_memslots(vcpu->kvm);
+
+       if (unlikely(slots->generation != ghc->generation ||
+                    kvm_is_error_hva(ghc->hva) || !ghc->memslot))
                return;
 
-       st = map.hva +
-               offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+       st = (struct kvm_steal_time __user *)ghc->hva;
+       BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
 
-       st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+       if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+               vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
 
-       kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+       mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
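
kvm_steal_time_set_preempted() now uses copy_to_user_nofault() rather than a kernel mapping. A condensed sketch of the calling context that motivates the non-faulting variant (the chain below is simplified; kvm_sched_out() is the preempt-notifier callback in generic KVM code):

        /*
         *   kvm_sched_out()                    <- preempt notifier, cannot sleep
         *     kvm_arch_vcpu_put()
         *       kvm_steal_time_set_preempted()
         *         copy_to_user_nofault()       <- fails instead of faulting in
         *                                         the page; the flag update is
         *                                         simply skipped in that case
         */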
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -4331,8 +4421,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       if (vcpu->arch.apicv_active)
-               static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+       static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
        return kvm_apic_get_state(vcpu, s);
 }
@@ -4642,8 +4731,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+                       kvm_x86_ops.nested_ops->leave_nested(vcpu);
                        kvm_smm_changed(vcpu, events->smi.smm);
+               }
 
                vcpu->arch.smi_pending = events->smi.pending;
 
@@ -4700,144 +4791,27 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
-
-static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
-{
-       struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
-       u64 xstate_bv = xsave->header.xfeatures;
-       u64 valid;
-
-       /*
-        * Copy legacy XSAVE area, to avoid complications with CPUID
-        * leaves 0 and 1 in the loop below.
-        */
-       memcpy(dest, xsave, XSAVE_HDR_OFFSET);
-
-       /* Set XSTATE_BV */
-       xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
-       *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
-
-       /*
-        * Copy each region from the possibly compacted offset to the
-        * non-compacted offset.
-        */
-       valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
-       while (valid) {
-               u32 size, offset, ecx, edx;
-               u64 xfeature_mask = valid & -valid;
-               int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *src;
-
-               cpuid_count(XSTATE_CPUID, xfeature_nr,
-                           &size, &offset, &ecx, &edx);
-
-               if (xfeature_nr == XFEATURE_PKRU) {
-                       memcpy(dest + offset, &vcpu->arch.pkru,
-                              sizeof(vcpu->arch.pkru));
-               } else {
-                       src = get_xsave_addr(xsave, xfeature_nr);
-                       if (src)
-                               memcpy(dest + offset, src, size);
-               }
-
-               valid -= xfeature_mask;
-       }
-}
-
-static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
-{
-       struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
-       u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
-       u64 valid;
-
-       /*
-        * Copy legacy XSAVE area, to avoid complications with CPUID
-        * leaves 0 and 1 in the loop below.
-        */
-       memcpy(xsave, src, XSAVE_HDR_OFFSET);
-
-       /* Set XSTATE_BV and possibly XCOMP_BV.  */
-       xsave->header.xfeatures = xstate_bv;
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
-       /*
-        * Copy each region from the non-compacted offset to the
-        * possibly compacted offset.
-        */
-       valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
-       while (valid) {
-               u32 size, offset, ecx, edx;
-               u64 xfeature_mask = valid & -valid;
-               int xfeature_nr = fls64(xfeature_mask) - 1;
-
-               cpuid_count(XSTATE_CPUID, xfeature_nr,
-                           &size, &offset, &ecx, &edx);
-
-               if (xfeature_nr == XFEATURE_PKRU) {
-                       memcpy(&vcpu->arch.pkru, src + offset,
-                              sizeof(vcpu->arch.pkru));
-               } else {
-                       void *dest = get_xsave_addr(xsave, xfeature_nr);
-
-                       if (dest)
-                               memcpy(dest, src + offset, size);
-               }
-
-               valid -= xfeature_mask;
-       }
-}
-
 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
                                         struct kvm_xsave *guest_xsave)
 {
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return;
 
-       if (boot_cpu_has(X86_FEATURE_XSAVE)) {
-               memset(guest_xsave, 0, sizeof(struct kvm_xsave));
-               fill_xsave((u8 *) guest_xsave->region, vcpu);
-       } else {
-               memcpy(guest_xsave->region,
-                       &vcpu->arch.guest_fpu->state.fxsave,
-                       sizeof(struct fxregs_state));
-               *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
-                       XFEATURE_MASK_FPSSE;
-       }
+       fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+                                      guest_xsave->region,
+                                      sizeof(guest_xsave->region),
+                                      vcpu->arch.pkru);
 }
 
-#define XSAVE_MXCSR_OFFSET 24
-
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                                        struct kvm_xsave *guest_xsave)
 {
-       u64 xstate_bv;
-       u32 mxcsr;
-
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return 0;
 
-       xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
-       mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
-
-       if (boot_cpu_has(X86_FEATURE_XSAVE)) {
-               /*
-                * Here we allow setting states that are not present in
-                * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
-                * with old userspace.
-                */
-               if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
-                       return -EINVAL;
-               load_xsave(vcpu, (u8 *)guest_xsave->region);
-       } else {
-               if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
-                       mxcsr & ~mxcsr_feature_mask)
-                       return -EINVAL;
-               memcpy(&vcpu->arch.guest_fpu->state.fxsave,
-                       guest_xsave->region, sizeof(struct fxregs_state));
-       }
-       return 0;
+       return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+                                             guest_xsave->region,
+                                             supported_xcr0, &vcpu->arch.pkru);
 }
 
 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -6100,13 +6074,13 @@ set_pit2_out:
                 * is slightly ahead) here we risk going negative on unsigned
                 * 'system_time' when 'user_ns.clock' is very small.
                 */
-               spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
                if (kvm->arch.use_master_clock)
                        now_ns = ka->master_kernel_ns;
                else
                        now_ns = get_kvmclock_base_ns();
                ka->kvmclock_offset = user_ns.clock - now_ns;
-               spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
 
                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
                break;
@@ -6948,7 +6922,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
                           unsigned short port, void *val, unsigned int count)
 {
        if (vcpu->arch.pio.count) {
-               /* Complete previous iteration.  */
+               /*
+                * Complete a previous iteration that required userspace I/O.
+                * Note, @count isn't guaranteed to match pio.count as userspace
+                * can modify ECX before rerunning the vCPU.  Ignore any such
+                * shenanigans as KVM doesn't support modifying the rep count,
+                * and the emulator ensures @count doesn't overflow the buffer.
+                */
        } else {
                int r = __emulator_pio_in(vcpu, size, port, count);
                if (!r)
@@ -6957,7 +6937,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
                /* Results already available, fall through.  */
        }
 
-       WARN_ON(count != vcpu->arch.pio.count);
        complete_emulator_pio_in(vcpu, val);
        return 1;
 }
@@ -7300,6 +7279,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
        return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
 }
 
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+       return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
 {
        return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -7382,6 +7366,7 @@ static const struct x86_emulate_ops emulate_ops = {
        .guest_has_long_mode = emulator_guest_has_long_mode,
        .guest_has_movbe     = emulator_guest_has_movbe,
        .guest_has_fxsr      = emulator_guest_has_fxsr,
+       .guest_has_rdpid     = emulator_guest_has_rdpid,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
        .exiting_smm         = emulator_exiting_smm,
@@ -7905,7 +7890,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         * updating interruptibility state and injecting single-step #DBs.
         */
        if (emulation_type & EMULTYPE_SKIP) {
-               kvm_rip_write(vcpu, ctxt->_eip);
+               if (ctxt->mode != X86EMUL_MODE_PROT64)
+                       ctxt->eip = (u32)ctxt->_eip;
+               else
+                       ctxt->eip = ctxt->_eip;
+
+               kvm_rip_write(vcpu, ctxt->eip);
                if (ctxt->eflags & X86_EFLAGS_RF)
                        kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
                return 1;
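
With EMULTYPE_SKIP the emulator only computes the next instruction pointer, so the truncation above keeps RIP within 32 bits for non-64-bit guests. A worked example (values chosen purely for illustration):

        /*
         * A two-byte instruction ending exactly at the 4 GiB boundary in
         * 32-bit mode:
         */
        u64 next    = 0xfffffffeULL + 2;        /* _eip advances to 0x100000000 */
        u32 wrapped = (u32)next;                /* written back as 0x00000000   */
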
@@ -7969,6 +7959,9 @@ restart:
                        writeback = false;
                r = 0;
                vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+       } else if (vcpu->arch.complete_userspace_io) {
+               writeback = false;
+               r = 0;
        } else if (r == EMULATION_RESTART)
                goto restart;
        else
@@ -8156,9 +8149,9 @@ static void kvm_hyperv_tsc_notifier(void)
        list_for_each_entry(kvm, &vm_list, vm_list) {
                struct kvm_arch *ka = &kvm->arch;
 
-               spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
                pvclock_update_vm_gtod_copy(kvm);
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -8340,7 +8333,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .is_in_guest            = kvm_is_in_guest,
        .is_user_mode           = kvm_is_user_mode,
        .get_guest_ip           = kvm_get_guest_ip,
-       .handle_intel_pt_intr   = kvm_handle_intel_pt_intr,
+       .handle_intel_pt_intr   = NULL,
 };
 
 #ifdef CONFIG_X86_64
@@ -8417,7 +8410,7 @@ int kvm_arch_init(void *opaque)
                goto out;
        }
        if (ops->disabled_by_bios()) {
-               pr_err_ratelimited("kvm: disabled by bios\n");
+               pr_warn_ratelimited("kvm: disabled by bios\n");
                r = -EOPNOTSUPP;
                goto out;
        }
@@ -8434,18 +8427,11 @@ int kvm_arch_init(void *opaque)
        }
 
        r = -ENOMEM;
-       x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
-                                         __alignof__(struct fpu), SLAB_ACCOUNT,
-                                         NULL);
-       if (!x86_fpu_cache) {
-               printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
-               goto out;
-       }
 
        x86_emulator_cache = kvm_alloc_emulator_cache();
        if (!x86_emulator_cache) {
                pr_err("kvm: failed to allocate cache for x86 emulator\n");
-               goto out_free_x86_fpu_cache;
+               goto out;
        }
 
        user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
@@ -8455,14 +8441,12 @@ int kvm_arch_init(void *opaque)
        }
        kvm_nr_uret_msrs = 0;
 
-       r = kvm_mmu_module_init();
+       r = kvm_mmu_vendor_module_init();
        if (r)
                goto out_free_percpu;
 
        kvm_timer_init();
 
-       perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
                supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@ -8483,8 +8467,6 @@ out_free_percpu:
        free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
-out_free_x86_fpu_cache:
-       kmem_cache_destroy(x86_fpu_cache);
 out:
        return r;
 }
@@ -8496,7 +8478,6 @@ void kvm_arch_exit(void)
                clear_hv_tscchange_cb();
 #endif
        kvm_lapic_exit();
-       perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
 
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -8508,10 +8489,9 @@ void kvm_arch_exit(void)
        cancel_work_sync(&pvclock_gtod_work);
 #endif
        kvm_x86_ops.hardware_enable = NULL;
-       kvm_mmu_module_exit();
+       kvm_mmu_vendor_module_exit();
        free_percpu(user_return_msrs);
        kmem_cache_destroy(x86_emulator_cache);
-       kmem_cache_destroy(x86_fpu_cache);
 #ifdef CONFIG_KVM_XEN
        static_key_deferred_flush(&kvm_xen_enabled);
        WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
@@ -8567,6 +8547,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
        if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
                return -KVM_EOPNOTSUPP;
 
+       /*
+        * When the TSC is in permanent catchup mode, guests won't be able to
+        * use the pvclock_read_retry loop to get a consistent view of pvclock.
+        */
+       if (vcpu->arch.tsc_always_catchup)
+               return -KVM_EOPNOTSUPP;
+
        if (!kvm_get_walltime_and_clockread(&ts, &cycle))
                return -KVM_EOPNOTSUPP;
 
@@ -8686,7 +8673,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 
        trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
-       op_64_bit = is_64_bit_mode(vcpu);
+       op_64_bit = is_64_bit_hypercall(vcpu);
        if (!op_64_bit) {
                nr &= 0xFFFFFFFF;
                a0 &= 0xFFFFFFFF;
@@ -8790,19 +8777,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *kvm_run = vcpu->run;
 
-       /*
-        * if_flag is obsolete and useless, so do not bother
-        * setting it for SEV-ES guests.  Userspace can just
-        * use kvm_run->ready_for_interrupt_injection.
-        */
-       kvm_run->if_flag = !vcpu->arch.guest_state_protected
-               && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+       kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+       /*
+        * The call to kvm_vcpu_ready_for_interrupt_injection() may end up in
+        * kvm_xen_has_interrupt() which may require the srcu lock to be
+        * held, to protect against changes in the vcpu_info address.
+        */
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_run->ready_for_interrupt_injection =
                pic_in_kernel(vcpu->kvm) ||
                kvm_vcpu_ready_for_interrupt_injection(vcpu);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
        if (is_smm(vcpu))
                kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -9351,8 +9339,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
        if (irqchip_split(vcpu->kvm))
                kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
        else {
-               if (vcpu->arch.apicv_active)
-                       static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+               static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
                if (ioapic_in_kernel(vcpu->kvm))
                        kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
        }
@@ -9370,12 +9357,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
                return;
 
-       if (to_hv_vcpu(vcpu))
+       if (to_hv_vcpu(vcpu)) {
                bitmap_or((ulong *)eoi_exit_bitmap,
                          vcpu->arch.ioapic_handled_vectors,
                          to_hv_synic(vcpu)->vec_bitmap, 256);
+               static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+               return;
+       }
 
-       static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+       static_call(kvm_x86_load_eoi_exitmap)(
+               vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
 }
 
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -9467,10 +9458,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        /* Flushing all ASIDs flushes the current ASID... */
                        kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                }
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
-                       kvm_vcpu_flush_tlb_current(vcpu);
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
-                       kvm_vcpu_flush_tlb_guest(vcpu);
+               kvm_service_local_tlb_flush_requests(vcpu);
 
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9621,10 +9609,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        /*
         * This handles the case where a posted interrupt was
-        * notified with kvm_vcpu_kick.
+        * notified with kvm_vcpu_kick.  Assigned devices can
+        * use the POSTED_INTR_VECTOR even if APICv is disabled,
+        * so do it even if APICv is disabled on this vCPU.
         */
-       if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-               static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+       if (kvm_lapic_enabled(vcpu))
+               static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
        if (kvm_vcpu_exit_request(vcpu)) {
                vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9660,8 +9650,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
                        break;
 
-               if (vcpu->arch.apicv_active)
-                       static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+               if (kvm_lapic_enabled(vcpu))
+                       static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
                if (unlikely(kvm_vcpu_exit_request(vcpu))) {
                        exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
@@ -9930,58 +9920,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
-       /*
-        * If the target FPU state is not resident in the CPU registers, just
-        * memcpy() from current, else save CPU state directly to the target.
-        */
-       if (test_thread_flag(TIF_NEED_FPU_LOAD))
-               memcpy(&fpu->state, &current->thread.fpu.state,
-                      fpu_kernel_xstate_size);
-       else
-               save_fpregs_to_fpstate(fpu);
-}
-
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       fpregs_lock();
-
-       kvm_save_current_fpu(vcpu->arch.user_fpu);
-
        /*
-        * Guests with protected state can't have it set by the hypervisor,
-        * so skip trying to set it.
+        * Exclude PKRU from restore as restored separately in
+        * kvm_x86_ops.run().
         */
-       if (vcpu->arch.guest_fpu)
-               /* PKRU is separately restored in kvm_x86_ops.run. */
-               __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
-                                       ~XFEATURE_MASK_PKRU);
-
-       fpregs_mark_activate();
-       fpregs_unlock();
-
+       fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
        trace_kvm_fpu(1);
 }
 
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       fpregs_lock();
-
-       /*
-        * Guests with protected state can't have it read by the hypervisor,
-        * so skip trying to save it.
-        */
-       if (vcpu->arch.guest_fpu)
-               kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
-       restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
-
-       fpregs_mark_activate();
-       fpregs_unlock();
-
+       fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
        ++vcpu->stat.fpu_reload;
        trace_kvm_fpu(0);
 }
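
Both helpers are now thin wrappers around the core kernel's fpu_swap_kvm_fpstate(), whose second argument selects the direction of the swap. A condensed sketch of how the pair brackets a guest run (control flow simplified from vcpu_enter_guest(), shown only to illustrate the symmetry):

        kvm_load_guest_fpu(vcpu);       /* fpu_swap_kvm_fpstate(..., true):  user out, guest in */

        /* ... vendor run hook executes the guest; PKRU is handled there ... */

        kvm_put_guest_fpu(vcpu);        /* fpu_swap_kvm_fpstate(..., false): guest out, user in */
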
@@ -10562,12 +10515,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        struct fxregs_state *fxsave;
 
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return 0;
 
        vcpu_load(vcpu);
 
-       fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+       fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
        memcpy(fpu->fpr, fxsave->st_space, 128);
        fpu->fcw = fxsave->cwd;
        fpu->fsw = fxsave->swd;
@@ -10585,12 +10538,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        struct fxregs_state *fxsave;
 
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return 0;
 
        vcpu_load(vcpu);
 
-       fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+       fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
 
        memcpy(fxsave->st_space, fpu->fpr, 128);
        fxsave->cwd = fpu->fcw;
@@ -10643,14 +10596,6 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 static void fx_init(struct kvm_vcpu *vcpu)
 {
-       if (!vcpu->arch.guest_fpu)
-               return;
-
-       fpstate_init(&vcpu->arch.guest_fpu->state);
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
-                       host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
        /*
         * Ensure guest xcr0 is valid for loading
         */
@@ -10659,15 +10604,6 @@ static void fx_init(struct kvm_vcpu *vcpu)
        vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.guest_fpu) {
-               kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
-               vcpu->arch.guest_fpu = NULL;
-       }
-}
-EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
-
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
        if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -10724,19 +10660,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        if (!alloc_emulate_ctxt(vcpu))
                goto free_wbinvd_dirty_mask;
 
-       vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
-                                               GFP_KERNEL_ACCOUNT);
-       if (!vcpu->arch.user_fpu) {
-               pr_err("kvm: failed to allocate userspace's fpu\n");
+       if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
+               pr_err("kvm: failed to allocate vcpu's fpu\n");
                goto free_emulate_ctxt;
        }
 
-       vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
-                                                GFP_KERNEL_ACCOUNT);
-       if (!vcpu->arch.guest_fpu) {
-               pr_err("kvm: failed to allocate vcpu's fpu\n");
-               goto free_user_fpu;
-       }
        fx_init(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -10769,9 +10697,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        return 0;
 
 free_guest_fpu:
-       kvm_free_guest_fpu(vcpu);
-free_user_fpu:
-       kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+       fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
 free_emulate_ctxt:
        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 free_wbinvd_dirty_mask:
@@ -10809,19 +10735,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-       struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
        int idx;
 
-       kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
        kvmclock_reset(vcpu);
 
        static_call(kvm_x86_vcpu_free)(vcpu);
 
        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
-       kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
-       kvm_free_guest_fpu(vcpu);
+       fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
 
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
@@ -10873,8 +10795,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        kvm_async_pf_hash_reset(vcpu);
        vcpu->arch.apf.halted = false;
 
-       if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
-               void *mpx_state_buffer;
+       if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
+               struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
 
                /*
                 * To avoid have the INIT path from kvm_apic_has_events() that be
@@ -10882,14 +10804,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                 */
                if (init_event)
                        kvm_put_guest_fpu(vcpu);
-               mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
-                                       XFEATURE_BNDREGS);
-               if (mpx_state_buffer)
-                       memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
-               mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
-                                       XFEATURE_BNDCSR);
-               if (mpx_state_buffer)
-                       memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+
+               fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
+               fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
+
                if (init_event)
                        kvm_load_guest_fpu(vcpu);
        }
@@ -10900,7 +10818,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
                vcpu->arch.msr_misc_features_enables = 0;
 
-               vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+               __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+               __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
        }
 
        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
@@ -10919,8 +10838,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                eax = 0x600;
        kvm_rdx_write(vcpu, eax);
 
-       vcpu->arch.ia32_xss = 0;
-
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 
        kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
@@ -11096,6 +11013,10 @@ int kvm_arch_hardware_setup(void *opaque)
        memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
        kvm_ops_static_call_update();
 
+       if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
+               kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
+       perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
                supported_xss = 0;
 
@@ -11123,6 +11044,9 @@ int kvm_arch_hardware_setup(void *opaque)
 
 void kvm_arch_hardware_unsetup(void)
 {
+       perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+       kvm_guest_cbs.handle_intel_pt_intr = NULL;
+
        static_call(kvm_x86_hardware_unsetup)();
 }
 
@@ -11199,7 +11123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
        mutex_init(&kvm->arch.apic_map_lock);
-       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+       raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
        kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
        pvclock_update_vm_gtod_copy(kvm);
@@ -12385,39 +12309,78 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
 
-static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+                          unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
 {
-       memcpy(vcpu->arch.sev_pio_data, vcpu->arch.pio_data,
-              vcpu->arch.pio.count * vcpu->arch.pio.size);
-       vcpu->arch.pio.count = 0;
+       int size = vcpu->arch.pio.size;
+       int port = vcpu->arch.pio.port;
 
+       vcpu->arch.pio.count = 0;
+       if (vcpu->arch.sev_pio_count)
+               return kvm_sev_es_outs(vcpu, size, port);
        return 1;
 }
 
 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
-                          unsigned int port, unsigned int count)
+                          unsigned int port)
 {
-       int ret = emulator_pio_out(vcpu, size, port,
-                                  vcpu->arch.sev_pio_data, count);
+       for (;;) {
+               unsigned int count =
+                       min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+               int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+               /* memcpy done already by emulator_pio_out.  */
+               vcpu->arch.sev_pio_count -= count;
+               vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+               if (!ret)
+                       break;
 
-       if (ret) {
                /* Emulation done by the kernel.  */
-               return ret;
+               if (!vcpu->arch.sev_pio_count)
+                       return 1;
        }
 
-       vcpu->arch.pio.count = 0;
+       vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
        return 0;
 }
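
The loop above emits SEV-ES string output in chunks because the shared pio_data bounce buffer is a single page. A worked example of the chunking arithmetic, assuming 4 KiB pages (the numbers are illustrative):

        /*
         * OUTSW (size = 2) with vcpu->arch.sev_pio_count = 3000 elements:
         *
         *   pass 1: count = min(PAGE_SIZE / 2, 3000) = 2048
         *   pass 2: count = min(PAGE_SIZE / 2,  952) =  952
         *
         * If a chunk needs userspace I/O, emulator_pio_out() returns 0 and the
         * remaining elements are picked up by complete_sev_es_emulated_outs()
         * on the next KVM_RUN.
         */
        unsigned int first  = min_t(unsigned int, PAGE_SIZE / 2, 3000); /* 2048 */
        unsigned int second = min_t(unsigned int, PAGE_SIZE / 2,  952); /*  952 */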
 
 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
-                         unsigned int port, unsigned int count)
+                         unsigned int port);
+
+static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
 {
-       int ret = emulator_pio_in(vcpu, size, port,
-                                 vcpu->arch.sev_pio_data, count);
+       unsigned count = vcpu->arch.pio.count;
+       complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+       vcpu->arch.sev_pio_count -= count;
+       vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+}
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+       int size = vcpu->arch.pio.size;
+       int port = vcpu->arch.pio.port;
+
+       advance_sev_es_emulated_ins(vcpu);
+       if (vcpu->arch.sev_pio_count)
+               return kvm_sev_es_ins(vcpu, size, port);
+       return 1;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+                         unsigned int port)
+{
+       for (;;) {
+               unsigned int count =
+                       min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+               if (!__emulator_pio_in(vcpu, size, port, count))
+                       break;
 
-       if (ret) {
                /* Emulation done by the kernel.  */
-               return ret;
+               advance_sev_es_emulated_ins(vcpu);
+               if (!vcpu->arch.sev_pio_count)
+                       return 1;
        }
 
        vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
@@ -12429,8 +12392,9 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
                         int in)
 {
        vcpu->arch.sev_pio_data = data;
-       return in ? kvm_sev_es_ins(vcpu, size, port, count)
-                 : kvm_sev_es_outs(vcpu, size, port, count);
+       vcpu->arch.sev_pio_count = count;
+       return in ? kvm_sev_es_ins(vcpu, size, port)
+                 : kvm_sev_es_outs(vcpu, size, port);
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
 
@@ -12461,3 +12425,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+
+static int __init kvm_x86_init(void)
+{
+       kvm_mmu_x86_module_init();
+       return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+       /*
+        * If module_init() is implemented, module_exit() must also be
+        * implemented to allow module unload.
+        */
+}
+module_exit(kvm_x86_exit);