UBUNTU: SAUCE: [Fix] x86/speculation: Use ARCH_CAPABILITIES to skip L1D flush on...
arch/x86/kvm/vmx.c
index 561d8937fac558b33c1b513095e94359dada8c07..fdce59385cdf9d62609129ade6eed2c207f594ea 100644
@@ -51,7 +51,8 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
-#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
+#include <asm/spec-ctrl.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -193,6 +194,156 @@ module_param(ple_window_max, int, S_IRUGO);
 
 extern const ulong vmx_return;
 
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+       const char *option;
+       bool for_parse;
+} vmentry_l1d_param[] = {
+       [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
+       [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
+       [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
+       [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
+       [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+       [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+       struct page *page;
+       unsigned int i;
+
+       if (!enable_ept) {
+               l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+               return 0;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+               u64 msr;
+
+               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+               if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+                       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+                       return 0;
+               }
+       }
+
+       /* If set to auto use the default l1tf mitigation method */
+       if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+               switch (l1tf_mitigation) {
+               case L1TF_MITIGATION_OFF:
+                       l1tf = VMENTER_L1D_FLUSH_NEVER;
+                       break;
+               case L1TF_MITIGATION_FLUSH_NOWARN:
+               case L1TF_MITIGATION_FLUSH:
+               case L1TF_MITIGATION_FLUSH_NOSMT:
+                       l1tf = VMENTER_L1D_FLUSH_COND;
+                       break;
+               case L1TF_MITIGATION_FULL:
+               case L1TF_MITIGATION_FULL_FORCE:
+                       l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+                       break;
+               }
+       } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+               l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+       }
+
+       if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+           !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+               if (!page)
+                       return -ENOMEM;
+               vmx_l1d_flush_pages = page_address(page);
+
+               /*
+                * Initialize each page with a different pattern in
+                * order to protect against KSM in the nested
+                * virtualization case.
+                */
+               for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+                       memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+                              PAGE_SIZE);
+               }
+       }
+
+       l1tf_vmx_mitigation = l1tf;
+
+       if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+               static_branch_enable(&vmx_l1d_should_flush);
+       else
+               static_branch_disable(&vmx_l1d_should_flush);
+
+       if (l1tf == VMENTER_L1D_FLUSH_COND)
+               static_branch_enable(&vmx_l1d_flush_cond);
+       else
+               static_branch_disable(&vmx_l1d_flush_cond);
+       return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+       unsigned int i;
+
+       if (s) {
+               for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+                       if (vmentry_l1d_param[i].for_parse &&
+                           sysfs_streq(s, vmentry_l1d_param[i].option))
+                               return i;
+               }
+       }
+       return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+       int l1tf, ret;
+
+       l1tf = vmentry_l1d_flush_parse(s);
+       if (l1tf < 0)
+               return l1tf;
+
+       if (!boot_cpu_has(X86_BUG_L1TF))
+               return 0;
+
+       /*
+        * Has vmx_init() run already? If not then this is the pre init
+        * parameter parsing. In that case just store the value and let
+        * vmx_init() do the proper setup after enable_ept has been
+        * established.
+        */
+       if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+               vmentry_l1d_flush_param = l1tf;
+               return 0;
+       }
+
+       mutex_lock(&vmx_l1d_flush_mutex);
+       ret = vmx_setup_l1d_flush(l1tf);
+       mutex_unlock(&vmx_l1d_flush_mutex);
+       return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+       if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+               return sprintf(s, "???\n");
+
+       return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+       .set = vmentry_l1d_flush_set,
+       .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
 #define NR_AUTOLOAD_MSRS 8
 
 struct vmcs {
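
For orientation, here is a minimal, self-contained sketch (not part of the patch) of the module_param_cb() pattern the hunk above uses for vmentry_l1d_flush: a custom kernel_param_ops validates the string in its .set handler and reports the current state from .get. The demo_* names are invented for illustration; with 0644 permissions such a parameter can normally be read and changed at runtime under /sys/module/kvm_intel/parameters/.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/string.h>

static const char * const demo_modes[] = { "auto", "never", "cond", "always" };
static unsigned int demo_mode;	/* index into demo_modes[] */

static int demo_mode_set(const char *s, const struct kernel_param *kp)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(demo_modes); i++) {
		if (sysfs_streq(s, demo_modes[i])) {
			demo_mode = i;
			return 0;
		}
	}
	return -EINVAL;
}

static int demo_mode_get(char *s, const struct kernel_param *kp)
{
	return sprintf(s, "%s\n", demo_modes[demo_mode]);
}

static const struct kernel_param_ops demo_mode_ops = {
	.set = demo_mode_set,
	.get = demo_mode_get,
};
module_param_cb(demo_mode, &demo_mode_ops, NULL, 0644);
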
@@ -577,6 +728,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
                        (unsigned long *)&pi_desc->control);
 }
 
+struct vmx_msrs {
+       unsigned int            nr;
+       struct vmx_msr_entry    val[NR_AUTOLOAD_MSRS];
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
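
For context, a sketch (not from the patch) of the hardware-defined MSR-autoload entry that the new struct vmx_msrs wraps; the real definition is struct vmx_msr_entry in asm/vmx.h, and the _sketch suffix below marks this copy as illustrative. VM_ENTRY_MSR_LOAD_ADDR and VM_EXIT_MSR_LOAD_ADDR point at a physically contiguous array of such 16-byte entries, and the corresponding *_MSR_LOAD_COUNT fields say how many are valid, which is why pairing each array with its own count lets the guest and host lists be sized independently.

struct vmx_msr_entry_sketch {
	u32 index;	/* MSR number */
	u32 reserved;
	u64 value;	/* value written to the MSR on VM entry/exit */
};
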
@@ -610,9 +766,8 @@ struct vcpu_vmx {
        struct loaded_vmcs   *loaded_vmcs;
        bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
-               unsigned nr;
-               struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
-               struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+               struct vmx_msrs guest;
+               struct vmx_msrs host;
        } msr_autoload;
        struct {
                int           loaded;
@@ -1078,6 +1233,13 @@ static inline bool is_machine_check(u32 intr_info)
                (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 }
 
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+               == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
 static inline bool cpu_has_vmx_msr_bitmap(void)
 {
        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
@@ -1964,9 +2126,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
        vm_exit_controls_clearbit(vmx, exit);
 }
 
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+       unsigned int i;
+
+       for (i = 0; i < m->nr; ++i) {
+               if (m->val[i].index == msr)
+                       return i;
+       }
+       return -ENOENT;
+}
+
 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
 {
-       unsigned i;
+       int i;
        struct msr_autoload *m = &vmx->msr_autoload;
 
        switch (msr) {
@@ -1987,18 +2160,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
                }
                break;
        }
+       i = find_msr(&m->guest, msr);
+       if (i < 0)
+               goto skip_guest;
+       --m->guest.nr;
+       m->guest.val[i] = m->guest.val[m->guest.nr];
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
 
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
-
-       if (i == m->nr)
+skip_guest:
+       i = find_msr(&m->host, msr);
+       if (i < 0)
                return;
-       --m->nr;
-       m->guest[i] = m->guest[m->nr];
-       m->host[i] = m->host[m->nr];
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+       --m->host.nr;
+       m->host.val[i] = m->host.val[m->host.nr];
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
 }
 
 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -2013,9 +2189,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
 }
 
 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
-                                 u64 guest_val, u64 host_val)
+                                 u64 guest_val, u64 host_val, bool entry_only)
 {
-       unsigned i;
+       int i, j = 0;
        struct msr_autoload *m = &vmx->msr_autoload;
 
        switch (msr) {
@@ -2050,24 +2226,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
        }
 
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
+       i = find_msr(&m->guest, msr);
+       if (!entry_only)
+               j = find_msr(&m->host, msr);
 
-       if (i == NR_AUTOLOAD_MSRS) {
+       if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
                printk_once(KERN_WARNING "Not enough msr switch entries. "
                                "Can't add msr %x\n", msr);
                return;
-       } else if (i == m->nr) {
-               ++m->nr;
-               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
        }
+       if (i < 0) {
+               i = m->guest.nr++;
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+       }
+       m->guest.val[i].index = msr;
+       m->guest.val[i].value = guest_val;
 
-       m->guest[i].index = msr;
-       m->guest[i].value = guest_val;
-       m->host[i].index = msr;
-       m->host[i].value = host_val;
+       if (entry_only)
+               return;
+
+       if (j < 0) {
+               j = m->host.nr++;
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+       }
+       m->host.val[j].index = msr;
+       m->host.val[j].value = host_val;
 }
 
 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
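
The reworked autoload handling above boils down to a dense fixed-size array with a length, a linear find, and order-insensitive removal by moving the last element into the freed slot. A self-contained sketch of that pattern under those assumptions (the demo_* names are illustrative, not from the patch):

#include <linux/errno.h>
#include <linux/types.h>

struct demo_msr_entry {
	u32 index;
	u32 reserved;
	u64 value;
};

struct demo_msrs {
	unsigned int nr;
	struct demo_msr_entry val[8];	/* mirrors NR_AUTOLOAD_MSRS */
};

static int demo_find_msr(struct demo_msrs *m, unsigned int msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i)
		if (m->val[i].index == msr)
			return i;
	return -ENOENT;
}

static void demo_clear_msr(struct demo_msrs *m, unsigned int msr)
{
	int i = demo_find_msr(m, msr);

	if (i < 0)
		return;
	/* Swap-with-last delete: order is irrelevant, only density matters. */
	m->val[i] = m->val[--m->nr];
}
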
@@ -2111,7 +2294,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                        guest_efer &= ~EFER_LME;
                if (guest_efer != host_efer)
                        add_atomic_switch_msr(vmx, MSR_EFER,
-                                             guest_efer, host_efer);
+                                             guest_efer, host_efer, false);
                return false;
        } else {
                guest_efer &= ~ignore_bits;
@@ -2580,6 +2763,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
                return;
        }
 
+       WARN_ON_ONCE(vmx->emulation_required);
+
        if (kvm_exception_is_soft(nr)) {
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                             vmx->vcpu.arch.event_exit_inst_len);
@@ -2655,18 +2840,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 {
-       u64 host_tsc, tsc_offset;
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
-       host_tsc = rdtsc();
-       tsc_offset = vmcs_read64(TSC_OFFSET);
-       return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
+       if (is_guest_mode(vcpu) &&
+           (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+               return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+       return vcpu->arch.tsc_offset;
 }
 
 /*
@@ -3259,6 +3441,11 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
        return !(val & ~valid_bits);
 }
 
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+       return 1;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -3283,12 +3470,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 #endif
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_info);
-       case MSR_IA32_TSC:
-               msr_info->data = guest_read_tsc(vcpu);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                        return 1;
 
@@ -3404,17 +3587,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
-       case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr_info);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                        return 1;
 
                /* The STIBP bit doesn't fault even if it's not advertised */
-               if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+               if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
                        return 1;
 
                vmx->spec_ctrl = data;
@@ -3440,7 +3619,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_PRED_CMD:
                if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                        return 1;
 
@@ -3519,7 +3697,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                vcpu->arch.ia32_xss = data;
                if (vcpu->arch.ia32_xss != host_xss)
                        add_atomic_switch_msr(vmx, MSR_IA32_XSS,
-                               vcpu->arch.ia32_xss, host_xss);
+                               vcpu->arch.ia32_xss, host_xss, false);
                else
                        clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
                break;
@@ -4270,12 +4448,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
        __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
 }
 
-static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
-{
-       if (enable_ept)
-               vmx_flush_tlb(vcpu);
-}
-
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -5712,9 +5884,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
        if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -5734,8 +5906,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                ++vmx->nmsrs;
        }
 
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+       vmx->arch_capabilities = kvm_get_arch_capabilities();
 
        vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
@@ -5766,6 +5937,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmx->rmode.vm86_active = 0;
        vmx->spec_ctrl = 0;
 
+       vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(vcpu, 0);
 
@@ -6172,7 +6344,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
                        vcpu->arch.dr6 &= ~15;
                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
-                       if (!(dr6 & ~DR6_RESERVED)) /* icebp */
+                       if (is_icebp(intr_info))
                                skip_emulated_instruction(vcpu);
 
                        kvm_queue_exception(vcpu, DB_VECTOR);
@@ -6761,7 +6933,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        if (!is_guest_mode(vcpu) &&
            !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
-               return kvm_skip_emulated_instruction(vcpu);
+               /*
+                * Doing kvm_skip_emulated_instruction() depends on undefined
+                * behavior: Intel's manual doesn't mandate that
+                * VM_EXIT_INSTRUCTION_LEN is set in the VMCS when an EPT
+                * misconfig occurs. While real hardware was observed to set
+                * it, other hypervisors (namely Hyper-V) don't, so relying on
+                * it would advance RIP by some random value. Disable fast
+                * mmio when running nested and keep it for real hardware in
+                * the hope that VM_EXIT_INSTRUCTION_LEN will always be set
+                * correctly.
+                */
+               if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+                       return kvm_skip_emulated_instruction(vcpu);
+               else
+                       return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
+                                                      NULL, 0) == EMULATE_DONE;
        }
 
        ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -6815,12 +7001,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (err != EMULATE_DONE) {
-                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-                       vcpu->run->internal.ndata = 0;
-                       return 0;
-               }
+               if (err != EMULATE_DONE)
+                       goto emulation_error;
+
+               if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+                   vcpu->arch.exception.pending)
+                       goto emulation_error;
 
                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
@@ -6836,6 +7022,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
 out:
        return ret;
+
+emulation_error:
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 0;
+       return 0;
 }
 
 static int __grow_ple_window(int val)
@@ -7301,8 +7493,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
                        vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
                return 1;
 
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
-                               sizeof(*vmpointer), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -7383,6 +7574,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
+       /* CPL=0 must be checked manually. */
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
        if (vmx->nested.vmxon) {
                nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
                return kvm_skip_emulated_instruction(vcpu);
@@ -7442,6 +7639,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
  */
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 {
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
        if (!to_vmx(vcpu)->nested.vmxon) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;
@@ -7486,6 +7688,7 @@ static void free_nested(struct vcpu_vmx *vmx)
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
 
+       hrtimer_cancel(&vmx->nested.preemption_timer);
        vmx->nested.vmxon = false;
        vmx->nested.smm.vmxon = false;
        free_vpid(vmx->nested.vpid02);
@@ -7775,9 +7978,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, true, &gva))
                        return 1;
-               /* _system ok, as hardware has verified cpl=0 */
-               kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
-                            &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+               /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+               kvm_write_guest_virt_system(vcpu, gva, &field_value,
+                                           (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
 
        nested_vmx_succeed(vcpu);
@@ -7813,8 +8016,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, false, &gva))
                        return 1;
-               if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
-                          &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+               if (kvm_read_guest_virt(vcpu, gva, &field_value,
+                                       (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
                        kvm_inject_page_fault(vcpu, &e);
                        return 1;
                }
@@ -7918,10 +8121,10 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, exit_qualification,
                        vmx_instruction_info, true, &vmcs_gva))
                return 1;
-       /* ok to use *_system, as hardware has verified cpl=0 */
-       if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
-                                (void *)&to_vmx(vcpu)->nested.current_vmptr,
-                                sizeof(u64), &e)) {
+       /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+       if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
+                                       (void *)&to_vmx(vcpu)->nested.current_vmptr,
+                                       sizeof(u64), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -7968,8 +8171,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8033,8 +8235,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8962,6 +9163,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        }
 }
 
+/*
+ * Software-based L1D cache flush, used when the microcode that provides
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
+ * flushing it requires reading 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information, but as all relevant affected CPUs have a 32 KiB L1D cache
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+       int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+       /*
+        * This code is only executed when the flush mode is 'cond' or
+        * 'always'.
+        */
+       if (static_branch_likely(&vmx_l1d_flush_cond)) {
+               bool flush_l1d;
+
+               /*
+                * Clear the per-vcpu flush bit, it gets set again
+                * either from vcpu_run() or from one of the unsafe
+                * VMEXIT handlers.
+                */
+               flush_l1d = vcpu->arch.l1tf_flush_l1d;
+               vcpu->arch.l1tf_flush_l1d = false;
+
+               /*
+                * Clear the per-cpu flush bit, it gets set again from
+                * the interrupt handlers.
+                */
+               flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+               kvm_clear_cpu_l1tf_flush_l1d();
+
+               if (!flush_l1d)
+                       return;
+       }
+
+       vcpu->stat.l1d_flush++;
+
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               return;
+       }
+
+       asm volatile(
+               /* First ensure the pages are in the TLB */
+               "xorl   %%eax, %%eax\n"
+               ".Lpopulate_tlb:\n\t"
+               "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $4096, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lpopulate_tlb\n\t"
+               "xorl   %%eax, %%eax\n\t"
+               "cpuid\n\t"
+               /* Now fill the cache */
+               "xorl   %%eax, %%eax\n"
+               ".Lfill_cache:\n"
+               "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $64, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lfill_cache\n\t"
+               "lfence\n"
+               :: [flush_pages] "r" (vmx_l1d_flush_pages),
+                   [size] "r" (size)
+               : "eax", "ebx", "ecx", "edx");
+}
+
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
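
A C-level illustration (not part of the patch) of what the inline asm in vmx_l1d_flush() does when the MSR-based flush is unavailable: the first pass touches one byte per 4 KiB page so the TLB is populated, the real code then serializes with CPUID, and the second pass reads every 64-byte cache line to displace the L1D contents before finishing with LFENCE. The demo_ function name is invented for the sketch.

static void demo_l1d_flush_sw(const unsigned char *flush_pages, int size)
{
	volatile unsigned char sink;
	int offset;

	/* Pass 1: one access per 4 KiB page populates the TLB. */
	for (offset = 0; offset < size; offset += 4096)
		sink = flush_pages[offset];

	/* (The real asm serializes here with CPUID before filling the cache.) */

	/* Pass 2: one access per 64-byte line replaces the L1D contents. */
	for (offset = 0; offset < size; offset += 64)
		sink = flush_pages[offset];

	(void)sink;	/* only the reads matter, not the data */
}
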
@@ -9002,7 +9276,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
        } else {
                sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
                sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
@@ -9030,7 +9304,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
            !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
 }
 
@@ -9207,9 +9481,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
 
-static bool vmx_has_high_real_mode_segbase(void)
+static bool vmx_has_emulated_msr(int index)
 {
-       return enable_unrestricted_guest || emulate_invalid_guest_state;
+       switch (index) {
+       case MSR_IA32_SMBASE:
+               /*
+                * We cannot do SMM unless we can run the guest in big
+                * real mode.
+                */
+               return enable_unrestricted_guest || emulate_invalid_guest_state;
+       case MSR_AMD64_VIRT_SPEC_CTRL:
+               /* This is AMD only.  */
+               return false;
+       default:
+               return true;
+       }
 }
 
 static bool vmx_mpx_supported(void)
@@ -9353,7 +9639,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                        clear_atomic_switch_msr(vmx, msrs[i].msr);
                else
                        add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
-                                       msrs[i].host);
+                                       msrs[i].host, false);
 }
 
 static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -9442,10 +9728,13 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
         * is no need to worry about the conditional branch over the wrmsr
         * being speculatively taken.
         */
-       if (vmx->spec_ctrl)
-               wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+       x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
 
        vmx->__launched = vmx->loaded_vmcs->launched;
+
+       if (static_branch_unlikely(&vmx_l1d_should_flush))
+               vmx_l1d_flush(vcpu);
+
        asm(
                /* Store host registers */
                "push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9578,11 +9867,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
         * If the L02 MSR bitmap does not intercept the MSR, then we need to
         * save it.
         */
-       if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
-               rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+               vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
-       if (vmx->spec_ctrl)
-               wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+       x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
 
        /* Eliminate branch target predictions from guest mode */
        vmexit_fill_RSB();
@@ -9796,6 +10084,37 @@ free_vcpu:
        return ERR_PTR(err);
 }
 
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
+
+static int vmx_vm_init(struct kvm *kvm)
+{
+       if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+               switch (l1tf_mitigation) {
+               case L1TF_MITIGATION_OFF:
+               case L1TF_MITIGATION_FLUSH_NOWARN:
+                       /* 'I explicitly don't care' is set */
+                       break;
+               case L1TF_MITIGATION_FLUSH:
+               case L1TF_MITIGATION_FLUSH_NOSMT:
+               case L1TF_MITIGATION_FULL:
+                       /*
+                        * Warn upon starting the first VM in a potentially
+                        * insecure environment.
+                        */
+                       if (cpu_smt_control == CPU_SMT_ENABLED)
+                               pr_warn_once(L1TF_MSG_SMT);
+                       if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+                               pr_warn_once(L1TF_MSG_L1D);
+                       break;
+               case L1TF_MITIGATION_FULL_FORCE:
+                       /* Flush is enforced */
+                       break;
+               }
+       }
+       return 0;
+}
+
 static void __init vmx_check_processor_compat(void *rtn)
 {
        struct vmcs_config vmcs_conf;
@@ -10113,6 +10432,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                        kunmap(vmx->nested.pi_desc_page);
                        kvm_release_page_dirty(vmx->nested.pi_desc_page);
                        vmx->nested.pi_desc_page = NULL;
+                       vmx->nested.pi_desc = NULL;
+                       vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
                }
                page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
                if (is_error_page(page))
@@ -10277,6 +10598,16 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
        return true;
 }
 
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+           !page_address_valid(vcpu, vmcs12->apic_access_addr))
+               return -EINVAL;
+       else
+               return 0;
+}
+
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
                                           struct vmcs12 *vmcs12)
 {
@@ -10724,10 +11055,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * Set the MSR load/store lists to match L0's settings.
         */
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
 
        /*
         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -10804,11 +11135,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
            vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
                vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 
-       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-               vmcs_write64(TSC_OFFSET,
-                       vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-       else
-               vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+       vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
@@ -10856,7 +11184,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                }
        } else if (nested_cpu_has2(vmcs12,
                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
 
        /*
@@ -10920,6 +11248,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+       if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -11038,6 +11369,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        u32 msr_entry_idx;
        u32 exit_qual;
+       int r;
 
        enter_guest_mode(vcpu);
 
@@ -11047,26 +11379,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
        vmx_segment_cache_clear(vmx);
 
-       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                                        EXIT_REASON_INVALID_STATE, exit_qual);
-               return 1;
-       }
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+       r = EXIT_REASON_INVALID_STATE;
+       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+               goto fail;
 
        nested_get_vmcs12_pages(vcpu, vmcs12);
 
+       r = EXIT_REASON_MSR_LOAD_FAIL;
        msr_entry_idx = nested_vmx_load_msr(vcpu,
                                            vmcs12->vm_entry_msr_load_addr,
                                            vmcs12->vm_entry_msr_load_count);
-       if (msr_entry_idx) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-               return 1;
-       }
+       if (msr_entry_idx)
+               goto fail;
 
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -11075,6 +11402,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
         * the success flag) when L2 exits (see nested_vmx_vmexit()).
         */
        return 0;
+
+fail:
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+       leave_guest_mode(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+       nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+       return 1;
 }
 
 /*
@@ -11154,7 +11489,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (ret)
                return ret;
 
-       if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+       /* Hide L1D cache contents from the nested guest.  */
+       vmx->vcpu.arch.l1tf_flush_l1d = true;
+
+       /*
+        * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
+        * by event injection, halt vcpu.
+        */
+       if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
+           !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
                return kvm_vcpu_halt(vcpu);
 
        vmx->nested.nested_run_pending = 1;
@@ -11644,6 +11987,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        leave_guest_mode(vcpu);
 
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
        if (likely(!vmx->fail)) {
                if (exit_reason == -1)
                        sync_vmcs12(vcpu, vmcs12);
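
Taken together with the enter_vmx_non_root_mode() and vmx_read_l1_tsc_offset() changes earlier in this diff, the hunk above keeps vcpu->arch.tsc_offset equal to whatever offset the hardware is currently applying. A sketch of the invariant, assuming vmcs12 enables CPU_BASED_USE_TSC_OFFSETING (illustrative comment, not patch content):

/*
 *   guest TSC = host TSC + TSC_OFFSET                      (hardware)
 *   nested VM entry: vcpu->arch.tsc_offset += vmcs12->tsc_offset
 *                    (so it holds the combined L02 offset while L2 runs)
 *   nested VM exit:  vcpu->arch.tsc_offset -= vmcs12->tsc_offset
 *   vmx_read_l1_tsc_offset() recovers L1's offset at any time:
 *                    in guest mode: vcpu->arch.tsc_offset - vmcs12->tsc_offset
 *                    otherwise:     vcpu->arch.tsc_offset
 */
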
@@ -11662,8 +12008,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        vmx_segment_cache_clear(vmx);
 
        /* Update any VMCS fields that might have changed while L2 ran */
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
        if (vmx->hv_deadline_tsc == -1)
                vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -11681,7 +12027,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        } else if (!nested_cpu_has_ept(vmcs12) &&
                   nested_cpu_has2(vmcs12,
                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -12134,7 +12480,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
                if (set)
@@ -12222,7 +12568,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .hardware_enable = hardware_enable,
        .hardware_disable = hardware_disable,
        .cpu_has_accelerated_tpr = report_flexpriority,
-       .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+       .has_emulated_msr = vmx_has_emulated_msr,
+
+       .vm_init = vmx_vm_init,
 
        .vcpu_create = vmx_create_vcpu,
        .vcpu_free = vmx_free_vcpu,
@@ -12233,6 +12581,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .vcpu_put = vmx_vcpu_put,
 
        .update_bp_intercept = update_exception_bitmap,
+       .get_msr_feature = vmx_get_msr_feature,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
        .get_segment_base = vmx_get_segment_base,
@@ -12306,6 +12655,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
+       .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
 
        .set_tdp_cr3 = vmx_set_cr3,
@@ -12345,22 +12695,17 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .enable_smi_window = enable_smi_window,
 };
 
-static int __init vmx_init(void)
+static void vmx_cleanup_l1d_flush(void)
 {
-       int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                     __alignof__(struct vcpu_vmx), THIS_MODULE);
-       if (r)
-               return r;
-
-#ifdef CONFIG_KEXEC_CORE
-       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
-                          crash_vmclear_local_loaded_vmcss);
-#endif
-
-       return 0;
+       if (vmx_l1d_flush_pages) {
+               free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+               vmx_l1d_flush_pages = NULL;
+       }
+       /* Restore state so sysfs ignores VMX */
+       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 }
 
-static void __exit vmx_exit(void)
+static void vmx_exit(void)
 {
 #ifdef CONFIG_KEXEC_CORE
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
@@ -12368,7 +12713,40 @@ static void __exit vmx_exit(void)
 #endif
 
        kvm_exit();
+
+       vmx_cleanup_l1d_flush();
 }
+module_exit(vmx_exit)
 
+static int __init vmx_init(void)
+{
+       int r;
+
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       if (r)
+               return r;
+
+       /*
+        * vmx_setup_l1d_flush() must be called after kvm_init() so that
+        * enable_ept is properly set up. Hand in the mitigation parameter
+        * value that was stored by the pre-module-init parser. If no
+        * parameter was given, it contains 'auto', which is turned into the
+        * default 'cond' mitigation mode.
+        */
+       if (boot_cpu_has(X86_BUG_L1TF)) {
+               r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+               if (r) {
+                       vmx_exit();
+                       return r;
+               }
+       }
+
+#ifdef CONFIG_KEXEC_CORE
+       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+                          crash_vmclear_local_loaded_vmcss);
+#endif
+
+       return 0;
+}
 module_init(vmx_init)
-module_exit(vmx_exit)