KVM: x86/speculation: Disable Fill buffer clear within guests
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63f9cb33cc19defe9993110bd986dacb0945879f..62298046ea34ad5ddd263d2ec88c47c2b7ea0a60 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,7 +68,9 @@
 #include <asm/mce.h>
 #include <asm/pkru.h>
 #include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
@@ -293,8 +295,6 @@ u64 __read_mostly host_xcr0;
 u64 __read_mostly supported_xcr0;
 EXPORT_SYMBOL_GPL(supported_xcr0);
 
-static struct kmem_cache *x86_fpu_cache;
-
 static struct kmem_cache *x86_emulator_cache;
 
 /*
@@ -848,6 +848,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+       kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
        vcpu->arch.pdptrs_from_userspace = false;
 
 out:
@@ -1091,6 +1092,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
        unsigned long roots_to_free = 0;
        int i;
 
+       /*
+        * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+        * this is reachable when running EPT=1 and unrestricted_guest=0, and
+        * also via the emulator.  KVM's TDP page tables are not in the scope of
+        * the invalidation, but the guest's TLB entries need to be flushed as
+        * the CPU may have cached entries in its TLB for the target PCID.
+        */
+       if (unlikely(tdp_enabled)) {
+               kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+               return;
+       }
+
        /*
         * If neither the current CR3 nor any of the prev_roots use the given
         * PCID, then nothing needs to be done here because a resync will
@@ -1311,7 +1324,7 @@ static const u32 msrs_to_save_all[] = {
        MSR_IA32_UMWAIT_CONTROL,
 
        MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
-       MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+       MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
        MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
        MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
        MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -1497,6 +1510,9 @@ static u64 kvm_get_arch_capabilities(void)
                 */
        }
 
+       /* Guests don't need to know "Fill buffer clear control" exists */
+       data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
        return data;
 }
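
This hunk is the one the patch subject refers to: ARCH_CAP_FB_CLEAR_CTRL is masked out of the capabilities KVM reports, so guests never learn that fill buffer clearing is controllable. A small sketch of the guest-visible effect (illustration only, not part of the patch; it reuses kvm_get_arch_capabilities() and ARCH_CAP_FB_CLEAR_CTRL from this file):

        /*
         * vcpu->arch.arch_capabilities is normally seeded from
         * kvm_get_arch_capabilities(), so a guest RDMSR of
         * MSR_IA32_ARCH_CAPABILITIES never reports the control bit, and the
         * guest has no architectural reason to try setting FB_CLEAR_DIS in
         * MSR_IA32_MCU_OPT_CTRL.
         */
        WARN_ON_ONCE(kvm_get_arch_capabilities() & ARCH_CAP_FB_CLEAR_CTRL);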
 
@@ -1592,8 +1608,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                return r;
        }
 
-       /* Update reserved bits */
-       if ((efer ^ old_efer) & EFER_NX)
+       if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
                kvm_mmu_reset_context(vcpu);
 
        return 0;
@@ -2542,7 +2557,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        kvm_vcpu_write_tsc_offset(vcpu, offset);
        raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-       spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
        if (!matched) {
                kvm->arch.nr_vcpus_matched_tsc = 0;
        } else if (!already_matched) {
@@ -2550,7 +2565,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
        }
 
        kvm_track_tsc_matching(vcpu);
-       spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
 }
 
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2780,9 +2795,9 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
        kvm_make_mclock_inprogress_request(kvm);
 
        /* no guest entries from this point */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        pvclock_update_vm_gtod_copy(kvm);
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2800,15 +2815,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
        unsigned long flags;
        u64 ret;
 
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
                return get_kvmclock_base_ns() + ka->kvmclock_offset;
        }
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* both __this_cpu_read() and rdtsc() should be on the same cpu */
        get_cpu();
@@ -2902,13 +2917,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         * If the host uses TSC clock, then passthrough TSC as stable
         * to the guest.
         */
-       spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        use_master_clock = ka->use_master_clock;
        if (use_master_clock) {
                host_tsc = ka->master_cycle_now;
                kernel_ns = ka->master_kernel_ns;
        }
-       spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+       raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
        /* Keep irq disabled to prevent changes to the clock */
        local_irq_save(flags);
@@ -3193,10 +3208,36 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
        static_call(kvm_x86_tlb_flush_guest)(vcpu);
 }
 
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+       ++vcpu->stat.tlb_flush;
+       static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context.  In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior to nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+               kvm_vcpu_flush_tlb_current(vcpu);
+
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+               kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_map map;
-       struct kvm_steal_time *st;
+       struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+       struct kvm_steal_time __user *st;
+       struct kvm_memslots *slots;
+       u64 steal;
+       u32 version;
 
        if (kvm_xen_msr_enabled(vcpu->kvm)) {
                kvm_xen_runstate_set_running(vcpu);
@@ -3206,47 +3247,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
 
-       /* -EAGAIN is returned in atomic context so we can just return. */
-       if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-                       &map, &vcpu->arch.st.cache, false))
+       if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
                return;
 
-       st = map.hva +
-               offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+       slots = kvm_memslots(vcpu->kvm);
+
+       if (unlikely(slots->generation != ghc->generation ||
+                    kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+               gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+
+               /* We rely on the fact that it fits in a single page. */
+               BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+               if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) ||
+                   kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+                       return;
+       }
 
+       st = (struct kvm_steal_time __user *)ghc->hva;
        /*
         * Doing a TLB flush here, on the guest's behalf, can avoid
         * expensive IPIs.
         */
        if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
-               u8 st_preempted = xchg(&st->preempted, 0);
+               u8 st_preempted = 0;
+               int err = -EFAULT;
+
+               if (!user_access_begin(st, sizeof(*st)))
+                       return;
+
+               asm volatile("1: xchgb %0, %2\n"
+                            "xor %1, %1\n"
+                            "2:\n"
+                            _ASM_EXTABLE_UA(1b, 2b)
+                            : "+q" (st_preempted),
+                              "+&r" (err),
+                              "+m" (st->preempted));
+               if (err)
+                       goto out;
+
+               user_access_end();
+
+               vcpu->arch.st.preempted = 0;
 
                trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
                                       st_preempted & KVM_VCPU_FLUSH_TLB);
                if (st_preempted & KVM_VCPU_FLUSH_TLB)
                        kvm_vcpu_flush_tlb_guest(vcpu);
+
+               if (!user_access_begin(st, sizeof(*st)))
+                       goto dirty;
        } else {
-               st->preempted = 0;
-       }
+               if (!user_access_begin(st, sizeof(*st)))
+                       return;
 
-       vcpu->arch.st.preempted = 0;
+               unsafe_put_user(0, &st->preempted, out);
+               vcpu->arch.st.preempted = 0;
+       }
 
-       if (st->version & 1)
-               st->version += 1;  /* first time write, random junk */
+       unsafe_get_user(version, &st->version, out);
+       if (version & 1)
+               version += 1;  /* first time write, random junk */
 
-       st->version += 1;
+       version += 1;
+       unsafe_put_user(version, &st->version, out);
 
        smp_wmb();
 
-       st->steal += current->sched_info.run_delay -
+       unsafe_get_user(steal, &st->steal, out);
+       steal += current->sched_info.run_delay -
                vcpu->arch.st.last_steal;
        vcpu->arch.st.last_steal = current->sched_info.run_delay;
+       unsafe_put_user(steal, &st->steal, out);
 
-       smp_wmb();
+       version += 1;
+       unsafe_put_user(version, &st->version, out);
 
-       st->version += 1;
-
-       kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+       user_access_end();
+ dirty:
+       mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
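
The rewritten record_steal_time() above stops mapping the steal-time page into the kernel and instead writes it through the user address held in the gfn_to_hva cache. A minimal, self-contained sketch of the guarded-access pattern it relies on (the update_version() helper below is hypothetical, shown only to illustrate the idiom):

        #include <linux/uaccess.h>      /* user_access_begin(), unsafe_put_user() */
        #include <asm/kvm_para.h>       /* struct kvm_steal_time */

        /*
         * user_access_begin() enables userspace access (STAC on x86) and must be
         * paired with user_access_end(); unsafe_get_user()/unsafe_put_user() jump
         * to the supplied label on a fault instead of returning an error.  The
         * open-coded xchg with an exception-table entry in the real function is
         * needed only because there is no unsafe_xchg_user() equivalent.
         */
        static int update_version(struct kvm_steal_time __user *st)
        {
                u32 version;

                if (!user_access_begin(st, sizeof(*st)))
                        return -EFAULT;

                unsafe_get_user(version, &st->version, err);
                unsafe_put_user(version + 1, &st->version, err);
                user_access_end();
                return 0;

        err:
                user_access_end();
                return -EFAULT;
        }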
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -3282,7 +3362,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
                if (!msr_info->host_initiated)
                        return 1;
-               if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+               if (kvm_get_msr_feature(&msr_ent))
                        return 1;
                if (data & ~msr_ent.data)
                        return 1;
@@ -3376,6 +3456,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (data & ~supported_xss)
                        return 1;
                vcpu->arch.ia32_xss = data;
+               kvm_update_cpuid_runtime(vcpu);
                break;
        case MSR_SMI_COUNT:
                if (!msr_info->host_initiated)
@@ -4285,8 +4366,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-       struct kvm_host_map map;
-       struct kvm_steal_time *st;
+       struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+       struct kvm_steal_time __user *st;
+       struct kvm_memslots *slots;
+       static const u8 preempted = KVM_VCPU_PREEMPTED;
 
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
@@ -4294,16 +4377,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        if (vcpu->arch.st.preempted)
                return;
 
-       if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-                       &vcpu->arch.st.cache, true))
+       /* This happens on process exit */
+       if (unlikely(current->mm != vcpu->kvm->mm))
+               return;
+
+       slots = kvm_memslots(vcpu->kvm);
+
+       if (unlikely(slots->generation != ghc->generation ||
+                    kvm_is_error_hva(ghc->hva) || !ghc->memslot))
                return;
 
-       st = map.hva +
-               offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+       st = (struct kvm_steal_time __user *)ghc->hva;
+       BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
 
-       st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+       if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+               vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
 
-       kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+       mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
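
kvm_steal_time_set_preempted() now uses copy_to_user_nofault() rather than a kernel mapping. A condensed sketch of the calling context that motivates the non-faulting variant (the chain below is simplified; kvm_sched_out() is the preempt-notifier callback in generic KVM code):

        /*
         *   kvm_sched_out()                    <- preempt notifier, cannot sleep
         *     kvm_arch_vcpu_put()
         *       kvm_steal_time_set_preempted()
         *         copy_to_user_nofault()       <- fails instead of faulting in
         *                                         the page; the flag update is
         *                                         simply skipped in that case
         */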
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -4331,8 +4421,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
-       if (vcpu->arch.apicv_active)
-               static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+       static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
        return kvm_apic_get_state(vcpu, s);
 }
@@ -4642,8 +4731,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
-               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+               if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+                       kvm_x86_ops.nested_ops->leave_nested(vcpu);
                        kvm_smm_changed(vcpu, events->smi.smm);
+               }
 
                vcpu->arch.smi_pending = events->smi.pending;
 
@@ -4700,144 +4791,27 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
-
-static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
-{
-       struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
-       u64 xstate_bv = xsave->header.xfeatures;
-       u64 valid;
-
-       /*
-        * Copy legacy XSAVE area, to avoid complications with CPUID
-        * leaves 0 and 1 in the loop below.
-        */
-       memcpy(dest, xsave, XSAVE_HDR_OFFSET);
-
-       /* Set XSTATE_BV */
-       xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
-       *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
-
-       /*
-        * Copy each region from the possibly compacted offset to the
-        * non-compacted offset.
-        */
-       valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
-       while (valid) {
-               u32 size, offset, ecx, edx;
-               u64 xfeature_mask = valid & -valid;
-               int xfeature_nr = fls64(xfeature_mask) - 1;
-               void *src;
-
-               cpuid_count(XSTATE_CPUID, xfeature_nr,
-                           &size, &offset, &ecx, &edx);
-
-               if (xfeature_nr == XFEATURE_PKRU) {
-                       memcpy(dest + offset, &vcpu->arch.pkru,
-                              sizeof(vcpu->arch.pkru));
-               } else {
-                       src = get_xsave_addr(xsave, xfeature_nr);
-                       if (src)
-                               memcpy(dest + offset, src, size);
-               }
-
-               valid -= xfeature_mask;
-       }
-}
-
-static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
-{
-       struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
-       u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
-       u64 valid;
-
-       /*
-        * Copy legacy XSAVE area, to avoid complications with CPUID
-        * leaves 0 and 1 in the loop below.
-        */
-       memcpy(xsave, src, XSAVE_HDR_OFFSET);
-
-       /* Set XSTATE_BV and possibly XCOMP_BV.  */
-       xsave->header.xfeatures = xstate_bv;
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
-       /*
-        * Copy each region from the non-compacted offset to the
-        * possibly compacted offset.
-        */
-       valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
-       while (valid) {
-               u32 size, offset, ecx, edx;
-               u64 xfeature_mask = valid & -valid;
-               int xfeature_nr = fls64(xfeature_mask) - 1;
-
-               cpuid_count(XSTATE_CPUID, xfeature_nr,
-                           &size, &offset, &ecx, &edx);
-
-               if (xfeature_nr == XFEATURE_PKRU) {
-                       memcpy(&vcpu->arch.pkru, src + offset,
-                              sizeof(vcpu->arch.pkru));
-               } else {
-                       void *dest = get_xsave_addr(xsave, xfeature_nr);
-
-                       if (dest)
-                               memcpy(dest, src + offset, size);
-               }
-
-               valid -= xfeature_mask;
-       }
-}
-
 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
                                         struct kvm_xsave *guest_xsave)
 {
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return;
 
-       if (boot_cpu_has(X86_FEATURE_XSAVE)) {
-               memset(guest_xsave, 0, sizeof(struct kvm_xsave));
-               fill_xsave((u8 *) guest_xsave->region, vcpu);
-       } else {
-               memcpy(guest_xsave->region,
-                       &vcpu->arch.guest_fpu->state.fxsave,
-                       sizeof(struct fxregs_state));
-               *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
-                       XFEATURE_MASK_FPSSE;
-       }
+       fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+                                      guest_xsave->region,
+                                      sizeof(guest_xsave->region),
+                                      vcpu->arch.pkru);
 }
 
-#define XSAVE_MXCSR_OFFSET 24
-
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                                        struct kvm_xsave *guest_xsave)
 {
-       u64 xstate_bv;
-       u32 mxcsr;
-
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return 0;
 
-       xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
-       mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
-
-       if (boot_cpu_has(X86_FEATURE_XSAVE)) {
-               /*
-                * Here we allow setting states that are not present in
-                * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
-                * with old userspace.
-                */
-               if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
-                       return -EINVAL;
-               load_xsave(vcpu, (u8 *)guest_xsave->region);
-       } else {
-               if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
-                       mxcsr & ~mxcsr_feature_mask)
-                       return -EINVAL;
-               memcpy(&vcpu->arch.guest_fpu->state.fxsave,
-                       guest_xsave->region, sizeof(struct fxregs_state));
-       }
-       return 0;
+       return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+                                             guest_xsave->region,
+                                             supported_xcr0, &vcpu->arch.pkru);
 }
 
 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -6100,13 +6074,13 @@ set_pit2_out:
                 * is slightly ahead) here we risk going negative on unsigned
                 * 'system_time' when 'user_ns.clock' is very small.
                 */
-               spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock);
                if (kvm->arch.use_master_clock)
                        now_ns = ka->master_kernel_ns;
                else
                        now_ns = get_kvmclock_base_ns();
                ka->kvmclock_offset = user_ns.clock - now_ns;
-               spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+               raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
 
                kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
                break;
@@ -6948,7 +6922,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
                           unsigned short port, void *val, unsigned int count)
 {
        if (vcpu->arch.pio.count) {
-               /* Complete previous iteration.  */
+               /*
+                * Complete a previous iteration that required userspace I/O.
+                * Note, @count isn't guaranteed to match pio.count as userspace
+                * can modify ECX before rerunning the vCPU.  Ignore any such
+                * shenanigans as KVM doesn't support modifying the rep count,
+                * and the emulator ensures @count doesn't overflow the buffer.
+                */
        } else {
                int r = __emulator_pio_in(vcpu, size, port, count);
                if (!r)
@@ -6957,7 +6937,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
                /* Results already available, fall through.  */
        }
 
-       WARN_ON(count != vcpu->arch.pio.count);
        complete_emulator_pio_in(vcpu, val);
        return 1;
 }
@@ -7300,6 +7279,11 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
        return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
 }
 
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+       return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
 {
        return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
@@ -7382,6 +7366,7 @@ static const struct x86_emulate_ops emulate_ops = {
        .guest_has_long_mode = emulator_guest_has_long_mode,
        .guest_has_movbe     = emulator_guest_has_movbe,
        .guest_has_fxsr      = emulator_guest_has_fxsr,
+       .guest_has_rdpid     = emulator_guest_has_rdpid,
        .set_nmi_mask        = emulator_set_nmi_mask,
        .get_hflags          = emulator_get_hflags,
        .exiting_smm         = emulator_exiting_smm,
@@ -7905,7 +7890,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
         * updating interruptibility state and injecting single-step #DBs.
         */
        if (emulation_type & EMULTYPE_SKIP) {
-               kvm_rip_write(vcpu, ctxt->_eip);
+               if (ctxt->mode != X86EMUL_MODE_PROT64)
+                       ctxt->eip = (u32)ctxt->_eip;
+               else
+                       ctxt->eip = ctxt->_eip;
+
+               kvm_rip_write(vcpu, ctxt->eip);
                if (ctxt->eflags & X86_EFLAGS_RF)
                        kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
                return 1;
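
With EMULTYPE_SKIP the emulator only computes the next instruction pointer, so the truncation above keeps RIP within 32 bits for non-64-bit guests. A worked example (values chosen purely for illustration):

        /*
         * A two-byte instruction ending exactly at the 4 GiB boundary in
         * 32-bit mode:
         */
        u64 next    = 0xfffffffeULL + 2;        /* _eip advances to 0x100000000 */
        u32 wrapped = (u32)next;                /* written back as 0x00000000   */
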
@@ -7969,6 +7959,9 @@ restart:
                        writeback = false;
                r = 0;
                vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+       } else if (vcpu->arch.complete_userspace_io) {
+               writeback = false;
+               r = 0;
        } else if (r == EMULATION_RESTART)
                goto restart;
        else
@@ -8156,9 +8149,9 @@ static void kvm_hyperv_tsc_notifier(void)
        list_for_each_entry(kvm, &vm_list, vm_list) {
                struct kvm_arch *ka = &kvm->arch;
 
-               spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
                pvclock_update_vm_gtod_copy(kvm);
-               spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+               raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
 
                kvm_for_each_vcpu(cpu, vcpu, kvm)
                        kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -8340,7 +8333,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .is_in_guest            = kvm_is_in_guest,
        .is_user_mode           = kvm_is_user_mode,
        .get_guest_ip           = kvm_get_guest_ip,
-       .handle_intel_pt_intr   = kvm_handle_intel_pt_intr,
+       .handle_intel_pt_intr   = NULL,
 };
 
 #ifdef CONFIG_X86_64
@@ -8417,7 +8410,7 @@ int kvm_arch_init(void *opaque)
                goto out;
        }
        if (ops->disabled_by_bios()) {
-               pr_err_ratelimited("kvm: disabled by bios\n");
+               pr_warn_ratelimited("kvm: disabled by bios\n");
                r = -EOPNOTSUPP;
                goto out;
        }
@@ -8434,18 +8427,11 @@ int kvm_arch_init(void *opaque)
        }
 
        r = -ENOMEM;
-       x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
-                                         __alignof__(struct fpu), SLAB_ACCOUNT,
-                                         NULL);
-       if (!x86_fpu_cache) {
-               printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
-               goto out;
-       }
 
        x86_emulator_cache = kvm_alloc_emulator_cache();
        if (!x86_emulator_cache) {
                pr_err("kvm: failed to allocate cache for x86 emulator\n");
-               goto out_free_x86_fpu_cache;
+               goto out;
        }
 
        user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
@@ -8455,14 +8441,12 @@ int kvm_arch_init(void *opaque)
        }
        kvm_nr_uret_msrs = 0;
 
-       r = kvm_mmu_module_init();
+       r = kvm_mmu_vendor_module_init();
        if (r)
                goto out_free_percpu;
 
        kvm_timer_init();
 
-       perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
        if (boot_cpu_has(X86_FEATURE_XSAVE)) {
                host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
                supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@ -8483,8 +8467,6 @@ out_free_percpu:
        free_percpu(user_return_msrs);
 out_free_x86_emulator_cache:
        kmem_cache_destroy(x86_emulator_cache);
-out_free_x86_fpu_cache:
-       kmem_cache_destroy(x86_fpu_cache);
 out:
        return r;
 }
@@ -8496,7 +8478,6 @@ void kvm_arch_exit(void)
                clear_hv_tscchange_cb();
 #endif
        kvm_lapic_exit();
-       perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
 
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -8508,10 +8489,9 @@ void kvm_arch_exit(void)
        cancel_work_sync(&pvclock_gtod_work);
 #endif
        kvm_x86_ops.hardware_enable = NULL;
-       kvm_mmu_module_exit();
+       kvm_mmu_vendor_module_exit();
        free_percpu(user_return_msrs);
        kmem_cache_destroy(x86_emulator_cache);
-       kmem_cache_destroy(x86_fpu_cache);
 #ifdef CONFIG_KVM_XEN
        static_key_deferred_flush(&kvm_xen_enabled);
        WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
@@ -8567,6 +8547,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
        if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
                return -KVM_EOPNOTSUPP;
 
+       /*
+        * When the TSC is in permanent catchup mode, guests won't be able to
+        * use the pvclock_read_retry loop to get a consistent view of pvclock.
+        */
+       if (vcpu->arch.tsc_always_catchup)
+               return -KVM_EOPNOTSUPP;
+
        if (!kvm_get_walltime_and_clockread(&ts, &cycle))
                return -KVM_EOPNOTSUPP;
 
@@ -8686,7 +8673,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 
        trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
-       op_64_bit = is_64_bit_mode(vcpu);
+       op_64_bit = is_64_bit_hypercall(vcpu);
        if (!op_64_bit) {
                nr &= 0xFFFFFFFF;
                a0 &= 0xFFFFFFFF;
@@ -8790,19 +8777,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *kvm_run = vcpu->run;
 
-       /*
-        * if_flag is obsolete and useless, so do not bother
-        * setting it for SEV-ES guests.  Userspace can just
-        * use kvm_run->ready_for_interrupt_injection.
-        */
-       kvm_run->if_flag = !vcpu->arch.guest_state_protected
-               && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+       kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
        kvm_run->cr8 = kvm_get_cr8(vcpu);
        kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+       /*
+        * The call to kvm_vcpu_ready_for_interrupt_injection() may end up in
+        * kvm_xen_has_interrupt() which may require the srcu lock to be
+        * held, to protect against changes in the vcpu_info address.
+        */
+       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_run->ready_for_interrupt_injection =
                pic_in_kernel(vcpu->kvm) ||
                kvm_vcpu_ready_for_interrupt_injection(vcpu);
+       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
        if (is_smm(vcpu))
                kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -9351,8 +9339,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
        if (irqchip_split(vcpu->kvm))
                kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
        else {
-               if (vcpu->arch.apicv_active)
-                       static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+               static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
                if (ioapic_in_kernel(vcpu->kvm))
                        kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
        }
@@ -9370,12 +9357,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
        if (!kvm_apic_hw_enabled(vcpu->arch.apic))
                return;
 
-       if (to_hv_vcpu(vcpu))
+       if (to_hv_vcpu(vcpu)) {
                bitmap_or((ulong *)eoi_exit_bitmap,
                          vcpu->arch.ioapic_handled_vectors,
                          to_hv_synic(vcpu)->vec_bitmap, 256);
+               static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+               return;
+       }
 
-       static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
+       static_call(kvm_x86_load_eoi_exitmap)(
+               vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
 }
 
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -9467,10 +9458,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        /* Flushing all ASIDs flushes the current ASID... */
                        kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
                }
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
-                       kvm_vcpu_flush_tlb_current(vcpu);
-               if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
-                       kvm_vcpu_flush_tlb_guest(vcpu);
+               kvm_service_local_tlb_flush_requests(vcpu);
 
                if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9621,10 +9609,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        /*
         * This handles the case where a posted interrupt was
-        * notified with kvm_vcpu_kick.
+        * notified with kvm_vcpu_kick.  Assigned devices can
+        * use the POSTED_INTR_VECTOR even if APICv is disabled,
+        * so do it even if APICv is disabled on this vCPU.
         */
-       if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-               static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+       if (kvm_lapic_enabled(vcpu))
+               static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
        if (kvm_vcpu_exit_request(vcpu)) {
                vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9660,8 +9650,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
                        break;
 
-               if (vcpu->arch.apicv_active)
-                       static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+               if (kvm_lapic_enabled(vcpu))
+                       static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
 
                if (unlikely(kvm_vcpu_exit_request(vcpu))) {
                        exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
@@ -9930,58 +9920,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
-       /*
-        * If the target FPU state is not resident in the CPU registers, just
-        * memcpy() from current, else save CPU state directly to the target.
-        */
-       if (test_thread_flag(TIF_NEED_FPU_LOAD))
-               memcpy(&fpu->state, &current->thread.fpu.state,
-                      fpu_kernel_xstate_size);
-       else
-               save_fpregs_to_fpstate(fpu);
-}
-
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       fpregs_lock();
-
-       kvm_save_current_fpu(vcpu->arch.user_fpu);
-
        /*
-        * Guests with protected state can't have it set by the hypervisor,
-        * so skip trying to set it.
+        * Exclude PKRU from restore as restored separately in
+        * kvm_x86_ops.run().
         */
-       if (vcpu->arch.guest_fpu)
-               /* PKRU is separately restored in kvm_x86_ops.run. */
-               __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
-                                       ~XFEATURE_MASK_PKRU);
-
-       fpregs_mark_activate();
-       fpregs_unlock();
-
+       fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
        trace_kvm_fpu(1);
 }
 
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       fpregs_lock();
-
-       /*
-        * Guests with protected state can't have it read by the hypervisor,
-        * so skip trying to save it.
-        */
-       if (vcpu->arch.guest_fpu)
-               kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
-       restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
-
-       fpregs_mark_activate();
-       fpregs_unlock();
-
+       fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
        ++vcpu->stat.fpu_reload;
        trace_kvm_fpu(0);
 }
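
Both helpers are now thin wrappers around the core kernel's fpu_swap_kvm_fpstate(), whose second argument selects the direction of the swap. A condensed sketch of how the pair brackets a guest run (control flow simplified from vcpu_enter_guest(), shown only to illustrate the symmetry):

        kvm_load_guest_fpu(vcpu);       /* fpu_swap_kvm_fpstate(..., true):  user out, guest in */

        /* ... vendor run hook executes the guest; PKRU is handled there ... */

        kvm_put_guest_fpu(vcpu);        /* fpu_swap_kvm_fpstate(..., false): guest out, user in */
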
@@ -10562,12 +10515,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        struct fxregs_state *fxsave;
 
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return 0;
 
        vcpu_load(vcpu);
 
-       fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+       fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
        memcpy(fpu->fpr, fxsave->st_space, 128);
        fpu->fcw = fxsave->cwd;
        fpu->fsw = fxsave->swd;
@@ -10585,12 +10538,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        struct fxregs_state *fxsave;
 
-       if (!vcpu->arch.guest_fpu)
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
                return 0;
 
        vcpu_load(vcpu);
 
-       fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+       fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
 
        memcpy(fxsave->st_space, fpu->fpr, 128);
        fxsave->cwd = fpu->fcw;
@@ -10643,14 +10596,6 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 static void fx_init(struct kvm_vcpu *vcpu)
 {
-       if (!vcpu->arch.guest_fpu)
-               return;
-
-       fpstate_init(&vcpu->arch.guest_fpu->state);
-       if (boot_cpu_has(X86_FEATURE_XSAVES))
-               vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
-                       host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
        /*
         * Ensure guest xcr0 is valid for loading
         */
@@ -10659,15 +10604,6 @@ static void fx_init(struct kvm_vcpu *vcpu)
        vcpu->arch.cr0 |= X86_CR0_ET;
 }
 
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.guest_fpu) {
-               kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
-               vcpu->arch.guest_fpu = NULL;
-       }
-}
-EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
-
 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 {
        if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -10724,19 +10660,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        if (!alloc_emulate_ctxt(vcpu))
                goto free_wbinvd_dirty_mask;
 
-       vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
-                                               GFP_KERNEL_ACCOUNT);
-       if (!vcpu->arch.user_fpu) {
-               pr_err("kvm: failed to allocate userspace's fpu\n");
+       if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
+               pr_err("kvm: failed to allocate vcpu's fpu\n");
                goto free_emulate_ctxt;
        }
 
-       vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
-                                                GFP_KERNEL_ACCOUNT);
-       if (!vcpu->arch.guest_fpu) {
-               pr_err("kvm: failed to allocate vcpu's fpu\n");
-               goto free_user_fpu;
-       }
        fx_init(vcpu);
 
        vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -10769,9 +10697,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
        return 0;
 
 free_guest_fpu:
-       kvm_free_guest_fpu(vcpu);
-free_user_fpu:
-       kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+       fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
 free_emulate_ctxt:
        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 free_wbinvd_dirty_mask:
@@ -10809,19 +10735,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-       struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
        int idx;
 
-       kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
        kvmclock_reset(vcpu);
 
        static_call(kvm_x86_vcpu_free)(vcpu);
 
        kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
-       kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
-       kvm_free_guest_fpu(vcpu);
+       fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
 
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
@@ -10873,8 +10795,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        kvm_async_pf_hash_reset(vcpu);
        vcpu->arch.apf.halted = false;
 
-       if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
-               void *mpx_state_buffer;
+       if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
+               struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
 
                /*
                 * To avoid have the INIT path from kvm_apic_has_events() that be
@@ -10882,14 +10804,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                 */
                if (init_event)
                        kvm_put_guest_fpu(vcpu);
-               mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
-                                       XFEATURE_BNDREGS);
-               if (mpx_state_buffer)
-                       memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
-               mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
-                                       XFEATURE_BNDCSR);
-               if (mpx_state_buffer)
-                       memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+
+               fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
+               fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
+
                if (init_event)
                        kvm_load_guest_fpu(vcpu);
        }
@@ -10900,7 +10818,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
                vcpu->arch.msr_misc_features_enables = 0;
 
-               vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+               __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+               __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
        }
 
        memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
@@ -10919,8 +10838,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                eax = 0x600;
        kvm_rdx_write(vcpu, eax);
 
-       vcpu->arch.ia32_xss = 0;
-
        static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
 
        kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
@@ -11096,6 +11013,10 @@ int kvm_arch_hardware_setup(void *opaque)
        memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
        kvm_ops_static_call_update();
 
+       if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
+               kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
+       perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
        if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
                supported_xss = 0;
 
@@ -11123,6 +11044,9 @@ int kvm_arch_hardware_setup(void *opaque)
 
 void kvm_arch_hardware_unsetup(void)
 {
+       perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+       kvm_guest_cbs.handle_intel_pt_intr = NULL;
+
        static_call(kvm_x86_hardware_unsetup)();
 }
 
@@ -11199,7 +11123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
        raw_spin_lock_init(&kvm->arch.tsc_write_lock);
        mutex_init(&kvm->arch.apic_map_lock);
-       spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+       raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
        kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
        pvclock_update_vm_gtod_copy(kvm);
@@ -12385,39 +12309,78 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
 
-static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+                          unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
 {
-       memcpy(vcpu->arch.sev_pio_data, vcpu->arch.pio_data,
-              vcpu->arch.pio.count * vcpu->arch.pio.size);
-       vcpu->arch.pio.count = 0;
+       int size = vcpu->arch.pio.size;
+       int port = vcpu->arch.pio.port;
 
+       vcpu->arch.pio.count = 0;
+       if (vcpu->arch.sev_pio_count)
+               return kvm_sev_es_outs(vcpu, size, port);
        return 1;
 }
 
 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
-                          unsigned int port, unsigned int count)
+                          unsigned int port)
 {
-       int ret = emulator_pio_out(vcpu, size, port,
-                                  vcpu->arch.sev_pio_data, count);
+       for (;;) {
+               unsigned int count =
+                       min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+               int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+               /* memcpy done already by emulator_pio_out.  */
+               vcpu->arch.sev_pio_count -= count;
+               vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+               if (!ret)
+                       break;
 
-       if (ret) {
                /* Emulation done by the kernel.  */
-               return ret;
+               if (!vcpu->arch.sev_pio_count)
+                       return 1;
        }
 
-       vcpu->arch.pio.count = 0;
+       vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
        return 0;
 }
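
The loop above emits SEV-ES string output in chunks because the shared pio_data bounce buffer is a single page. A worked example of the chunking arithmetic, assuming 4 KiB pages (the numbers are illustrative):

        /*
         * OUTSW (size = 2) with vcpu->arch.sev_pio_count = 3000 elements:
         *
         *   pass 1: count = min(PAGE_SIZE / 2, 3000) = 2048
         *   pass 2: count = min(PAGE_SIZE / 2,  952) =  952
         *
         * If a chunk needs userspace I/O, emulator_pio_out() returns 0 and the
         * remaining elements are picked up by complete_sev_es_emulated_outs()
         * on the next KVM_RUN.
         */
        unsigned int first  = min_t(unsigned int, PAGE_SIZE / 2, 3000); /* 2048 */
        unsigned int second = min_t(unsigned int, PAGE_SIZE / 2,  952); /*  952 */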
 
 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
-                         unsigned int port, unsigned int count)
+                         unsigned int port);
+
+static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
 {
-       int ret = emulator_pio_in(vcpu, size, port,
-                                 vcpu->arch.sev_pio_data, count);
+       unsigned count = vcpu->arch.pio.count;
+       complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+       vcpu->arch.sev_pio_count -= count;
+       vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+}
+
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+       int size = vcpu->arch.pio.size;
+       int port = vcpu->arch.pio.port;
+
+       advance_sev_es_emulated_ins(vcpu);
+       if (vcpu->arch.sev_pio_count)
+               return kvm_sev_es_ins(vcpu, size, port);
+       return 1;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+                         unsigned int port)
+{
+       for (;;) {
+               unsigned int count =
+                       min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+               if (!__emulator_pio_in(vcpu, size, port, count))
+                       break;
 
-       if (ret) {
                /* Emulation done by the kernel.  */
-               return ret;
+               advance_sev_es_emulated_ins(vcpu);
+               if (!vcpu->arch.sev_pio_count)
+                       return 1;
        }
 
        vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
@@ -12429,8 +12392,9 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
                         int in)
 {
        vcpu->arch.sev_pio_data = data;
-       return in ? kvm_sev_es_ins(vcpu, size, port, count)
-                 : kvm_sev_es_outs(vcpu, size, port, count);
+       vcpu->arch.sev_pio_count = count;
+       return in ? kvm_sev_es_ins(vcpu, size, port)
+                 : kvm_sev_es_outs(vcpu, size, port);
 }
 EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
 
@@ -12461,3 +12425,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+
+static int __init kvm_x86_init(void)
+{
+       kvm_mmu_x86_module_init();
+       return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+       /*
+        * If module_init() is implemented, module_exit() must also be
+        * implemented to allow module unload.
+        */
+}
+module_exit(kvm_x86_exit);