UBUNTU: SAUCE: Synchronize MDS mitigations with upstream

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 209c05f5e31357c9a190408a28ea7dfa25136cdd..35a21216904320d8542471f926f23b0082c0264d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,9 +70,6 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
 };
 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
 
-static bool __read_mostly nosmt;
-module_param(nosmt, bool, S_IRUGO);
-
 static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -198,46 +195,154 @@ module_param(ple_window_max, int, S_IRUGO);
 extern const ulong vmx_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
 
-static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush = VMENTER_L1D_FLUSH_COND;
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
 
 static const struct {
        const char *option;
-       enum vmx_l1d_flush_state cmd;
+       bool for_parse;
 } vmentry_l1d_param[] = {
-       {"auto",        VMENTER_L1D_FLUSH_AUTO},
-       {"never",       VMENTER_L1D_FLUSH_NEVER},
-       {"cond",        VMENTER_L1D_FLUSH_COND},
-       {"always",      VMENTER_L1D_FLUSH_ALWAYS},
+       [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
+       [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
+       [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
+       [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
+       [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+       [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
 };
 
-static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 {
+       struct page *page;
        unsigned int i;
 
-       if (!s)
-               return -EINVAL;
+       if (!enable_ept) {
+               l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+               return 0;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+               u64 msr;
 
-       for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
-               if (!strcmp(s, vmentry_l1d_param[i].option)) {
-                       vmentry_l1d_flush = vmentry_l1d_param[i].cmd;
+               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+               if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+                       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
                        return 0;
                }
        }
 
+       /* If set to auto use the default l1tf mitigation method */
+       if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+               switch (l1tf_mitigation) {
+               case L1TF_MITIGATION_OFF:
+                       l1tf = VMENTER_L1D_FLUSH_NEVER;
+                       break;
+               case L1TF_MITIGATION_FLUSH_NOWARN:
+               case L1TF_MITIGATION_FLUSH:
+               case L1TF_MITIGATION_FLUSH_NOSMT:
+                       l1tf = VMENTER_L1D_FLUSH_COND;
+                       break;
+               case L1TF_MITIGATION_FULL:
+               case L1TF_MITIGATION_FULL_FORCE:
+                       l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+                       break;
+               }
+       } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+               l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+       }
+
+       if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+           !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+               if (!page)
+                       return -ENOMEM;
+               vmx_l1d_flush_pages = page_address(page);
+
+               /*
+                * Initialize each page with a different pattern in
+                * order to protect against KSM in the nested
+                * virtualization case.
+                */
+               for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+                       memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+                              PAGE_SIZE);
+               }
+       }
+
+       l1tf_vmx_mitigation = l1tf;
+
+       if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+               static_branch_enable(&vmx_l1d_should_flush);
+       else
+               static_branch_disable(&vmx_l1d_should_flush);
+
+       if (l1tf == VMENTER_L1D_FLUSH_COND)
+               static_branch_enable(&vmx_l1d_flush_cond);
+       else
+               static_branch_disable(&vmx_l1d_flush_cond);
+       return 0;
+}
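
When the admin leaves vmentry_l1d_flush at 'auto', the new vmx_setup_l1d_flush() derives the VM-entry flush mode from the system-wide l1tf_mitigation policy, and a forced full mitigation overrides any explicit 'never'/'cond' request. Below is a minimal userspace sketch of that decision table; the enum and function names are invented for illustration and only mirror the kernel's logic, they are not kernel code.

#include <stdio.h>

enum l1tf_mitigation { OFF, FLUSH_NOWARN, FLUSH, FLUSH_NOSMT, FULL, FULL_FORCE };
enum vmenter_flush   { FLUSH_AUTO, FLUSH_NEVER, FLUSH_COND, FLUSH_ALWAYS };

/* Resolve 'auto' to a concrete flush mode, mirroring vmx_setup_l1d_flush(). */
static enum vmenter_flush resolve(enum vmenter_flush req, enum l1tf_mitigation sys)
{
        if (req != FLUSH_AUTO)
                /* An explicit request wins unless full mitigation was forced. */
                return sys == FULL_FORCE ? FLUSH_ALWAYS : req;

        switch (sys) {
        case OFF:
                return FLUSH_NEVER;
        case FLUSH_NOWARN:
        case FLUSH:
        case FLUSH_NOSMT:
                return FLUSH_COND;
        case FULL:
        case FULL_FORCE:
                return FLUSH_ALWAYS;
        }
        return FLUSH_COND;
}

int main(void)
{
        printf("%d\n", resolve(FLUSH_AUTO, FLUSH));       /* 2 == FLUSH_COND */
        printf("%d\n", resolve(FLUSH_NEVER, FULL_FORCE)); /* 3 == FLUSH_ALWAYS */
        return 0;
}
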
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+       unsigned int i;
+
+       if (s) {
+               for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+                       if (vmentry_l1d_param[i].for_parse &&
+                           sysfs_streq(s, vmentry_l1d_param[i].option))
+                               return i;
+               }
+       }
        return -EINVAL;
 }
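
The vmentry_l1d_param[] table above is now indexed by the enum itself, and for_parse marks which entries may be written by the user versus which exist only so the getter can print internal states such as "EPT disabled". A stand-alone sketch of that lookup pattern, assuming plain strcmp() in place of the kernel's sysfs_streq():

#include <stdio.h>
#include <string.h>
#include <errno.h>

enum flush_state { ST_AUTO, ST_NEVER, ST_COND, ST_ALWAYS,
                   ST_EPT_DISABLED, ST_NOT_REQUIRED, ST_MAX };

static const struct {
        const char *option;
        int for_parse;                  /* accepted as user input? */
} params[ST_MAX] = {
        [ST_AUTO]         = { "auto",         1 },
        [ST_NEVER]        = { "never",        1 },
        [ST_COND]         = { "cond",         1 },
        [ST_ALWAYS]       = { "always",       1 },
        [ST_EPT_DISABLED] = { "EPT disabled", 0 },
        [ST_NOT_REQUIRED] = { "not required", 0 },
};

/* Return the enum index for a user-supplied string, or -EINVAL. */
static int parse(const char *s)
{
        unsigned int i;

        for (i = 0; i < ST_MAX; i++)
                if (params[i].for_parse && !strcmp(s, params[i].option))
                        return i;
        return -EINVAL;
}

int main(void)
{
        /* 2, then -EINVAL (-22 on Linux) twice */
        printf("%d %d %d\n", parse("cond"), parse("EPT disabled"), parse("bogus"));
        return 0;
}
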
 
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+       int l1tf, ret;
+
+       l1tf = vmentry_l1d_flush_parse(s);
+       if (l1tf < 0)
+               return l1tf;
+
+       if (!boot_cpu_has(X86_BUG_L1TF))
+               return 0;
+
+       /*
+        * Has vmx_init() run already? If not then this is the pre init
+        * parameter parsing. In that case just store the value and let
+        * vmx_init() do the proper setup after enable_ept has been
+        * established.
+        */
+       if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+               vmentry_l1d_flush_param = l1tf;
+               return 0;
+       }
+
+       mutex_lock(&vmx_l1d_flush_mutex);
+       ret = vmx_setup_l1d_flush(l1tf);
+       mutex_unlock(&vmx_l1d_flush_mutex);
+       return ret;
+}
+
 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 {
-       return sprintf(s, "%s\n", vmentry_l1d_param[vmentry_l1d_flush].option);
+       if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+               return sprintf(s, "???\n");
+
+       return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 }
 
 static const struct kernel_param_ops vmentry_l1d_flush_ops = {
        .set = vmentry_l1d_flush_set,
        .get = vmentry_l1d_flush_get,
 };
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, &vmentry_l1d_flush, S_IRUGO);
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
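
Registering the parameter with module_param_cb(), a NULL arg and mode 0644 means there is no plain backing variable any more: every write, including runtime writes through sysfs, goes through vmentry_l1d_flush_set(), and reads report the live l1tf_vmx_mitigation state. A minimal out-of-tree module sketch of the same pattern; the demo_param name, the backing integer and the kstrtoint() conversion are made up for illustration:

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>

static int demo_state;  /* backing state, only updated via the setter */

static int demo_set(const char *val, const struct kernel_param *kp)
{
        int v;
        int ret;

        ret = kstrtoint(val, 10, &v);
        if (ret)
                return ret;
        demo_state = v;         /* validate/translate here instead of storing raw */
        return 0;
}

static int demo_get(char *buf, const struct kernel_param *kp)
{
        return sprintf(buf, "%d\n", demo_state);
}

static const struct kernel_param_ops demo_ops = {
        .set = demo_set,
        .get = demo_get,
};
/* Writable at runtime via /sys/module/<module>/parameters/demo_param */
module_param_cb(demo_param, &demo_ops, NULL, 0644);

MODULE_LICENSE("GPL");
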
 
 #define NR_AUTOLOAD_MSRS 8
 
@@ -2658,6 +2763,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
                return;
        }
 
+       WARN_ON_ONCE(vmx->emulation_required);
+
        if (kvm_exception_is_soft(nr)) {
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                             vmx->vcpu.arch.event_exit_inst_len);
@@ -2733,18 +2840,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                vmx_update_msr_bitmap(&vmx->vcpu);
 }
 
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
 {
-       u64 host_tsc, tsc_offset;
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       if (is_guest_mode(vcpu) &&
+           (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+               return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
 
-       host_tsc = rdtsc();
-       tsc_offset = vmcs_read64(TSC_OFFSET);
-       return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
+       return vcpu->arch.tsc_offset;
 }
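
While L2 is running, vcpu->arch.tsc_offset holds the combined offset (L1's own offset plus vmcs12->tsc_offset), so vmx_read_l1_tsc_offset() recovers the L1 value by subtraction; the hunks in enter_vmx_non_root_mode() and nested_vmx_vmexit() further down keep that invariant by adding the vmcs12 offset on nested VM-entry and subtracting it again on VM-exit or on a failed entry. A small arithmetic sketch of the bookkeeping, with arbitrary example values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t l1_offset = 1000;     /* offset L1 programmed for itself */
        uint64_t vmcs12_offset = 250;  /* extra offset L1 programs for L2 */
        uint64_t arch_tsc_offset = l1_offset;

        arch_tsc_offset += vmcs12_offset;   /* nested VM-entry */
        printf("L2 active : %llu\n", (unsigned long long)arch_tsc_offset);  /* 1250 */
        printf("L1 view   : %llu\n",
               (unsigned long long)(arch_tsc_offset - vmcs12_offset));      /* 1000 */

        arch_tsc_offset -= vmcs12_offset;   /* nested VM-exit or failed entry */
        printf("back to L1: %llu\n", (unsigned long long)arch_tsc_offset);  /* 1000 */
        return 0;
}
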
 
 /*
@@ -3337,6 +3441,11 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
        return !(val & ~valid_bits);
 }
 
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+       return 1;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -3361,9 +3470,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 #endif
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_info);
-       case MSR_IA32_TSC:
-               msr_info->data = guest_read_tsc(vcpu);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -3481,9 +3587,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vmcs_write64(GUEST_BNDCFGS, data);
                break;
-       case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr_info);
-               break;
        case MSR_IA32_SPEC_CTRL:
                if (!msr_info->host_initiated &&
                    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -4345,12 +4448,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
        __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
 }
 
-static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
-{
-       if (enable_ept)
-               vmx_flush_tlb(vcpu);
-}
-
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -5809,8 +5906,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                ++vmx->nmsrs;
        }
 
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
+       vmx->arch_capabilities = kvm_get_arch_capabilities();
 
        vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
 
@@ -5841,6 +5937,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmx->rmode.vm86_active = 0;
        vmx->spec_ctrl = 0;
 
+       vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(vcpu, 0);
 
@@ -6904,12 +7001,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                        goto out;
                }
 
-               if (err != EMULATE_DONE) {
-                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-                       vcpu->run->internal.ndata = 0;
-                       return 0;
-               }
+               if (err != EMULATE_DONE)
+                       goto emulation_error;
+
+               if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+                   vcpu->arch.exception.pending)
+                       goto emulation_error;
 
                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
@@ -6925,6 +7022,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
 out:
        return ret;
+
+emulation_error:
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 0;
+       return 0;
 }
 
 static int __grow_ple_window(int val)
@@ -7390,8 +7493,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
                        vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
                return 1;
 
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
-                               sizeof(*vmpointer), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -7472,6 +7574,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
+       /* CPL=0 must be checked manually. */
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
        if (vmx->nested.vmxon) {
                nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
                return kvm_skip_emulated_instruction(vcpu);
@@ -7531,6 +7639,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
  */
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 {
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
        if (!to_vmx(vcpu)->nested.vmxon) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;
@@ -7575,6 +7688,7 @@ static void free_nested(struct vcpu_vmx *vmx)
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
 
+       hrtimer_cancel(&vmx->nested.preemption_timer);
        vmx->nested.vmxon = false;
        vmx->nested.smm.vmxon = false;
        free_vpid(vmx->nested.vpid02);
@@ -7864,9 +7978,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, true, &gva))
                        return 1;
-               /* _system ok, as hardware has verified cpl=0 */
-               kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
-                            &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+               /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+               kvm_write_guest_virt_system(vcpu, gva, &field_value,
+                                           (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
 
        nested_vmx_succeed(vcpu);
@@ -7902,8 +8016,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                if (get_vmx_mem_address(vcpu, exit_qualification,
                                vmx_instruction_info, false, &gva))
                        return 1;
-               if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
-                          &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+               if (kvm_read_guest_virt(vcpu, gva, &field_value,
+                                       (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
                        kvm_inject_page_fault(vcpu, &e);
                        return 1;
                }
@@ -8007,10 +8121,10 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, exit_qualification,
                        vmx_instruction_info, true, &vmcs_gva))
                return 1;
-       /* ok to use *_system, as hardware has verified cpl=0 */
-       if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
-                                (void *)&to_vmx(vcpu)->nested.current_vmptr,
-                                sizeof(u64), &e)) {
+       /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+       if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
+                                       (void *)&to_vmx(vcpu)->nested.current_vmptr,
+                                       sizeof(u64), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8057,8 +8171,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -8122,8 +8235,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                        vmx_instruction_info, false, &gva))
                return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
@@ -9067,18 +9179,32 @@ static void *vmx_l1d_flush_pages;
 static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
 {
        int size = PAGE_SIZE << L1D_CACHE_ORDER;
-       bool always;
 
        /*
         * This code is only executed when the flush mode is 'cond' or
         * 'always'
-        *
-        * If 'flush always', keep the flush bit set, otherwise clear
-        * it. The flush bit gets set again either from vcpu_run() or from
-        * one of the unsafe VMEXIT handlers.
         */
-       always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
-       vcpu->arch.l1tf_flush_l1d = always;
+       if (static_branch_likely(&vmx_l1d_flush_cond)) {
+               bool flush_l1d;
+
+               /*
+                * Clear the per-vcpu flush bit, it gets set again
+                * either from vcpu_run() or from one of the unsafe
+                * VMEXIT handlers.
+                */
+               flush_l1d = vcpu->arch.l1tf_flush_l1d;
+               vcpu->arch.l1tf_flush_l1d = false;
+
+               /*
+                * Clear the per-cpu flush bit, it gets set again from
+                * the interrupt handlers.
+                */
+               flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+               kvm_clear_cpu_l1tf_flush_l1d();
+
+               if (!flush_l1d)
+                       return;
+       }
 
        vcpu->stat.l1d_flush++;
 
@@ -9091,7 +9217,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
                /* First ensure the pages are in the TLB */
                "xorl   %%eax, %%eax\n"
                ".Lpopulate_tlb:\n\t"
-               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
                "addl   $4096, %%eax\n\t"
                "cmpl   %%eax, %[size]\n\t"
                "jne    .Lpopulate_tlb\n\t"
@@ -9100,12 +9226,12 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
                /* Now fill the cache */
                "xorl   %%eax, %%eax\n"
                ".Lfill_cache:\n"
-               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
                "addl   $64, %%eax\n\t"
                "cmpl   %%eax, %[size]\n\t"
                "jne    .Lfill_cache\n\t"
                "lfence\n"
-               :: [empty_zp] "r" (vmx_l1d_flush_pages),
+               :: [flush_pages] "r" (vmx_l1d_flush_pages),
                    [size] "r" (size)
                : "eax", "ebx", "ecx", "edx");
 }
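
In 'cond' mode the flush is performed only if either the per-vcpu hint (vcpu->arch.l1tf_flush_l1d) or the per-cpu hint set by interrupt handlers was raised since the last VM entry, and both hints are consumed before the decision. A stand-alone sketch of just that decision, with plain booleans standing in for the per-vcpu field and the per-cpu variable:

#include <stdio.h>
#include <stdbool.h>

static bool vcpu_flag;  /* stands in for vcpu->arch.l1tf_flush_l1d */
static bool cpu_flag;   /* stands in for the per-cpu flush hint */

/* Returns true if an L1D flush should be performed on this VM entry. */
static bool should_flush_cond(void)
{
        bool flush = vcpu_flag;

        vcpu_flag = false;      /* consumed: set again by unsafe VMEXIT handlers */
        flush |= cpu_flag;
        cpu_flag = false;       /* consumed: set again from interrupt handlers */
        return flush;
}

int main(void)
{
        vcpu_flag = true;
        printf("%d\n", should_flush_cond());  /* 1: flush */
        printf("%d\n", should_flush_cond());  /* 0: both hints were cleared */
        cpu_flag = true;
        printf("%d\n", should_flush_cond());  /* 1: flush */
        return 0;
}
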
@@ -9150,7 +9276,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
        } else {
                sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
                sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
@@ -9178,7 +9304,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
            !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                             SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
 }
 
@@ -9351,7 +9477,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
-               vcpu->arch.l1tf_flush_l1d = true;
        }
 }
 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
@@ -9607,10 +9732,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        vmx->__launched = vmx->loaded_vmcs->launched;
 
-       if (static_branch_unlikely(&vmx_l1d_should_flush)) {
-               if (vcpu->arch.l1tf_flush_l1d)
-                       vmx_l1d_flush(vcpu);
-       }
+       /* L1D Flush includes CPU buffer clear to mitigate MDS */
+       if (static_branch_unlikely(&vmx_l1d_should_flush))
+               vmx_l1d_flush(vcpu);
+       else if (static_branch_unlikely(&mds_user_clear))
+               mds_clear_cpu_buffers();
 
        asm(
                /* Store host registers */
@@ -9961,16 +10087,33 @@ free_vcpu:
        return ERR_PTR(err);
 }
 
-#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3620 for details.\n"
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
 
 static int vmx_vm_init(struct kvm *kvm)
 {
-       if (boot_cpu_has(X86_BUG_L1TF) && cpu_smt_control == CPU_SMT_ENABLED) {
-               if (nosmt) {
-                       pr_err(L1TF_MSG);
-                       return -EOPNOTSUPP;
+       if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+               switch (l1tf_mitigation) {
+               case L1TF_MITIGATION_OFF:
+               case L1TF_MITIGATION_FLUSH_NOWARN:
+                       /* 'I explicitly don't care' is set */
+                       break;
+               case L1TF_MITIGATION_FLUSH:
+               case L1TF_MITIGATION_FLUSH_NOSMT:
+               case L1TF_MITIGATION_FULL:
+                       /*
+                        * Warn upon starting the first VM in a potentially
+                        * insecure environment.
+                        */
+                       if (cpu_smt_control == CPU_SMT_ENABLED)
+                               pr_warn_once(L1TF_MSG_SMT);
+                       if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+                               pr_warn_once(L1TF_MSG_L1D);
+                       break;
+               case L1TF_MITIGATION_FULL_FORCE:
+                       /* Flush is enforced */
+                       break;
                }
-               pr_warn(L1TF_MSG);
        }
        return 0;
 }
@@ -10292,6 +10435,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                        kunmap(vmx->nested.pi_desc_page);
                        kvm_release_page_dirty(vmx->nested.pi_desc_page);
                        vmx->nested.pi_desc_page = NULL;
+                       vmx->nested.pi_desc = NULL;
+                       vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
                }
                page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
                if (is_error_page(page))
@@ -10456,6 +10601,16 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
        return true;
 }
 
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+           !page_address_valid(vcpu, vmcs12->apic_access_addr))
+               return -EINVAL;
+       else
+               return 0;
+}
+
 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
                                           struct vmcs12 *vmcs12)
 {
@@ -10983,11 +11138,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
            vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
                vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
 
-       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-               vmcs_write64(TSC_OFFSET,
-                       vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-       else
-               vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+       vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
        if (kvm_has_tsc_control)
                decache_tsc_multiplier(vmx);
 
@@ -11035,7 +11187,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                }
        } else if (nested_cpu_has2(vmcs12,
                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
 
        /*
@@ -11099,6 +11251,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+       if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
        if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -11217,6 +11372,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        u32 msr_entry_idx;
        u32 exit_qual;
+       int r;
 
        enter_guest_mode(vcpu);
 
@@ -11226,26 +11382,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
        vmx_segment_cache_clear(vmx);
 
-       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                                        EXIT_REASON_INVALID_STATE, exit_qual);
-               return 1;
-       }
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+       r = EXIT_REASON_INVALID_STATE;
+       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+               goto fail;
 
        nested_get_vmcs12_pages(vcpu, vmcs12);
 
+       r = EXIT_REASON_MSR_LOAD_FAIL;
        msr_entry_idx = nested_vmx_load_msr(vcpu,
                                            vmcs12->vm_entry_msr_load_addr,
                                            vmcs12->vm_entry_msr_load_count);
-       if (msr_entry_idx) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-               return 1;
-       }
+       if (msr_entry_idx)
+               goto fail;
 
        /*
         * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -11254,6 +11405,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
         * the success flag) when L2 exits (see nested_vmx_vmexit()).
         */
        return 0;
+
+fail:
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+       leave_guest_mode(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+       nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+       return 1;
 }
 
 /*
@@ -11336,7 +11495,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        /* Hide L1D cache contents from the nested guest.  */
        vmx->vcpu.arch.l1tf_flush_l1d = true;
 
-       if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+       /*
+        * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
+        * by event injection, halt vcpu.
+        */
+       if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
+           !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
                return kvm_vcpu_halt(vcpu);
 
        vmx->nested.nested_run_pending = 1;
@@ -11826,6 +11990,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        leave_guest_mode(vcpu);
 
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
        if (likely(!vmx->fail)) {
                if (exit_reason == -1)
                        sync_vmcs12(vcpu, vmcs12);
@@ -11863,7 +12030,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        } else if (!nested_cpu_has_ept(vmcs12) &&
                   nested_cpu_has2(vmcs12,
                                   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
        }
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -12316,7 +12483,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                vcpu_info.vector = irq.vector;
 
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
                                vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
                if (set)
@@ -12417,6 +12584,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .vcpu_put = vmx_vcpu_put,
 
        .update_bp_intercept = update_exception_bitmap,
+       .get_msr_feature = vmx_get_msr_feature,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
        .get_segment_base = vmx_get_segment_base,
@@ -12490,6 +12658,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
+       .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
 
        .set_tdp_cr3 = vmx_set_cr3,
@@ -12529,29 +12698,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .enable_smi_window = enable_smi_window,
 };
 
-static int __init vmx_setup_l1d_flush(void)
-{
-       struct page *page;
-
-       if (!boot_cpu_has_bug(X86_BUG_L1TF))
-               return 0;
-
-       l1tf_vmx_mitigation = vmentry_l1d_flush;
-
-       if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER)
-               return 0;
-
-       if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
-               if (!page)
-                       return -ENOMEM;
-               vmx_l1d_flush_pages = page_address(page);
-       }
-
-       static_branch_enable(&vmx_l1d_should_flush);
-       return 0;
-}
-
 static void vmx_cleanup_l1d_flush(void)
 {
        if (vmx_l1d_flush_pages) {
@@ -12562,19 +12708,41 @@ static void vmx_cleanup_l1d_flush(void)
        l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
 }
 
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+       RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+       synchronize_rcu();
+#endif
+
+       kvm_exit();
+
+       vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit)
+
 static int __init vmx_init(void)
 {
        int r;
 
-       r = vmx_setup_l1d_flush();
-       if (r)
-               return r;
-
        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
                     __alignof__(struct vcpu_vmx), THIS_MODULE);
-       if (r) {
-               vmx_cleanup_l1d_flush();
+       if (r)
                return r;
+
+       /*
+        * Must be called after kvm_init() so enable_ept is properly set
+        * up. Pass in the mitigation value that was stored by the pre
+        * module init parameter parser. If no parameter was given, it
+        * will contain 'auto', which will be turned into the default
+        * 'cond' mitigation mode.
+        */
+       if (boot_cpu_has(X86_BUG_L1TF)) {
+               r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+               if (r) {
+                       vmx_exit();
+                       return r;
+               }
        }
 
 #ifdef CONFIG_KEXEC_CORE
@@ -12584,18 +12752,4 @@ static int __init vmx_init(void)
 
        return 0;
 }
-
-static void __exit vmx_exit(void)
-{
-#ifdef CONFIG_KEXEC_CORE
-       RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
-       synchronize_rcu();
-#endif
-
-       kvm_exit();
-
-       vmx_cleanup_l1d_flush();
-}
-
 module_init(vmx_init)
-module_exit(vmx_exit)