x86/KVM/VMX: Add L1D flush logic

[mirror_ubuntu-bionic-kernel.git] / arch / x86 / kvm / vmx.c
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index a3e3e67b5574aac364f1be765c626c74fa46b411..2bd8c0c944f4bbcf071fc18644d89adad0f145e0 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -52,7 +52,7 @@
  #include <asm/irq_remapping.h>
  #include <asm/mmu_context.h>
  #include <asm/microcode.h>
-#include <asm/nospec-branch.h>
+#include <asm/spec-ctrl.h>
  
  #include "trace.h"
  #include "pmu.h"
@@ -70,6 +70,9 @@ static const struct x86_cpu_id vmx_cpu_id[] = {
  };
  MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
  
+/*
+ * When set, vmx_vm_init() refuses to create a VM (-EOPNOTSUPP) if the CPU
+ * has the L1TF bug and SMT is enabled; when clear it only prints a warning.
+ */
+static bool __read_mostly nosmt;
+module_param(nosmt, bool, S_IRUGO);
+
  static bool __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
  
@@ -194,6 +197,54 @@ module_param(ple_window_max, int, S_IRUGO);
  
  extern const ulong vmx_return;
  
+/*
+ * Enabled by vmx_setup_l1d_flush(); vmx_vcpu_run() only considers flushing
+ * the L1D cache while this key is on.
+ */
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+
+/* These MUST be in sync with vmentry_l1d_param order. */
+enum vmx_l1d_flush_state {
+       /* vmx_setup_l1d_flush() leaves the static key disabled. */
+       VMENTER_L1D_FLUSH_NEVER,
+       /* vmx_l1d_flush() clears vcpu->arch.l1tf_flush_l1d after a flush. */
+       VMENTER_L1D_FLUSH_COND,
+       /* vmx_l1d_flush() keeps vcpu->arch.l1tf_flush_l1d set. */
+       VMENTER_L1D_FLUSH_ALWAYS,
+};
+
+/* Selected mode; written by vmentry_l1d_flush_set(), defaults to "cond". */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush = VMENTER_L1D_FLUSH_COND;
+
+/*
+ * Option strings accepted by the vmentry_l1d_flush module parameter and the
+ * mode each one selects.  The table is indexed by mode so that
+ * vmentry_l1d_flush_get() can map the current mode back to its string.
+ */
+static const struct {
+       const char *option;
+       enum vmx_l1d_flush_state cmd;
+} vmentry_l1d_param[] = {
+       [VMENTER_L1D_FLUSH_NEVER] = {
+               .option = "never",
+               .cmd    = VMENTER_L1D_FLUSH_NEVER,
+       },
+       [VMENTER_L1D_FLUSH_COND] = {
+               .option = "cond",
+               .cmd    = VMENTER_L1D_FLUSH_COND,
+       },
+       [VMENTER_L1D_FLUSH_ALWAYS] = {
+               .option = "always",
+               .cmd    = VMENTER_L1D_FLUSH_ALWAYS,
+       },
+};
+
+/*
+ * Setter for the vmentry_l1d_flush module parameter.
+ *
+ * Looks @s up in vmentry_l1d_param[] (exact string match) and stores the
+ * matching mode in vmentry_l1d_flush.  @kp is unused.
+ *
+ * Return: 0 on a match, -EINVAL if @s is NULL or not a known option.
+ */
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+       unsigned int idx;
+
+       if (!s)
+               return -EINVAL;
+
+       for (idx = 0; idx < ARRAY_SIZE(vmentry_l1d_param); idx++) {
+               if (strcmp(s, vmentry_l1d_param[idx].option) != 0)
+                       continue;
+
+               vmentry_l1d_flush = vmentry_l1d_param[idx].cmd;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Getter for the vmentry_l1d_flush module parameter.
+ *
+ * Writes the option string of the current mode, followed by a newline,
+ * into @s and returns what sprintf() returns.  @kp is unused.
+ */
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+       const char *name = vmentry_l1d_param[vmentry_l1d_flush].option;
+
+       return sprintf(s, "%s\n", name);
+}
+
+/* Callbacks that parse and print the vmentry_l1d_flush module parameter. */
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+       .set = vmentry_l1d_flush_set,
+       .get = vmentry_l1d_flush_get,
+};
+/* Registered with S_IRUGO permissions, i.e. read-only through sysfs. */
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, &vmentry_l1d_flush, S_IRUGO);
+
  #define NR_AUTOLOAD_MSRS 8
  
  struct vmcs {
@@ -3296,7 +3347,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 break;
         case MSR_IA32_SPEC_CTRL:
                 if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                         return 1;
  
@@ -3417,12 +3467,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 break;
         case MSR_IA32_SPEC_CTRL:
                 if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                         return 1;
  
                 /* The STIBP bit doesn't fault even if it's not advertised */
-               if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+               if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
                         return 1;
  
                 vmx->spec_ctrl = data;
@@ -3448,7 +3497,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 break;
         case MSR_IA32_PRED_CMD:
                 if (!msr_info->host_initiated &&
-                   !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
                     !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
                         return 1;
  
@@ -6769,7 +6817,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
         if (!is_guest_mode(vcpu) &&
             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                 trace_kvm_fast_mmio(gpa);
-               return kvm_skip_emulated_instruction(vcpu);
+               /*
+                * Doing kvm_skip_emulated_instruction() depends on undefined
+                * behavior: Intel's manual doesn't mandate
+                * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
+                * occurs and while on real hardware it was observed to be set,
+                * other hypervisors (namely Hyper-V) don't set it, we end up
+                * advancing IP with some random value. Disable fast mmio when
+                * running nested and keep it for real hardware in hope that
+                * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
+                */
+               if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+                       return kvm_skip_emulated_instruction(vcpu);
+               else
+                       return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
+                                                      NULL, 0) == EMULATE_DONE;
         }
  
         ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -8970,6 +9032,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
         }
  }
  
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+/*
+ * Flush the L1D cache for @vcpu.
+ *
+ * Called from vmx_vcpu_run() when vmx_l1d_should_flush is enabled and
+ * vcpu->arch.l1tf_flush_l1d is set.  Uses the MSR_IA32_FLUSH_CMD MSR when
+ * the CPU has X86_FEATURE_FLUSH_L1D; otherwise reads through the
+ * vmx_l1d_flush_pages buffer.
+ */
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+       /* Number of bytes the software fallback reads. */
+       int size = PAGE_SIZE << L1D_CACHE_ORDER;
+       bool always;
+
+       /*
+        * If the mitigation mode is 'flush always', keep the flush bit
+        * set, otherwise clear it. It gets set again either from
+        * vcpu_run() or from one of the unsafe VMEXIT handlers.
+        */
+       always = vmentry_l1d_flush == VMENTER_L1D_FLUSH_ALWAYS;
+       vcpu->arch.l1tf_flush_l1d = always;
+
+       /* Count every flush, whichever method is used below. */
+       vcpu->stat.l1d_flush++;
+
+       /* Preferred path: a single MSR write requests the flush. */
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               return;
+       }
+
+       /*
+        * Software fallback.  eax is the running byte offset into the
+        * buffer and ecx receives each loaded byte.  The first loop reads
+        * one byte every 4096 bytes, the second one byte every 64 bytes
+        * (presumably one read per cache line -- confirm).  The cpuid in
+        * between (eax = 0) is presumably there as a serializing
+        * instruction; it also writes ebx and edx, hence the clobber list.
+        */
+       asm volatile(
+               /* First ensure the pages are in the TLB */
+               "xorl   %%eax, %%eax\n"
+               ".Lpopulate_tlb:\n\t"
+               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $4096, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lpopulate_tlb\n\t"
+               "xorl   %%eax, %%eax\n\t"
+               "cpuid\n\t"
+               /* Now fill the cache */
+               "xorl   %%eax, %%eax\n"
+               ".Lfill_cache:\n"
+               "movzbl (%[empty_zp], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $64, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lfill_cache\n\t"
+               "lfence\n"
+               :: [empty_zp] "r" (vmx_l1d_flush_pages),
+                   [size] "r" (size)
+               : "eax", "ebx", "ecx", "edx");
+}
+
  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
  {
         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -9211,13 +9329,26 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
                         [ss]"i"(__KERNEL_DS),
                         [cs]"i"(__KERNEL_CS)
                         );
+               vcpu->arch.l1tf_flush_l1d = true;
         }
  }
  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
  
-static bool vmx_has_high_real_mode_segbase(void)
+/*
+ * Report whether the MSR @index can be emulated on VMX.
+ *
+ * Returns true for every MSR except the two handled explicitly below:
+ * MSR_IA32_SMBASE depends on the unrestricted-guest / invalid-guest-state
+ * emulation settings, and MSR_AMD64_VIRT_SPEC_CTRL is never available.
+ */
+static bool vmx_has_emulated_msr(int index)
  {
-       return enable_unrestricted_guest || emulate_invalid_guest_state;
+       switch (index) {
+       case MSR_IA32_SMBASE:
+               /*
+                * We cannot do SMM unless we can run the guest in big
+                * real mode.
+                */
+               return enable_unrestricted_guest || emulate_invalid_guest_state;
+       case MSR_AMD64_VIRT_SPEC_CTRL:
+               /* This is AMD only.  */
+               return false;
+       default:
+               return true;
+       }
  }
  
  static bool vmx_mpx_supported(void)
@@ -9450,9 +9581,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
          * is no need to worry about the conditional branch over the wrmsr
          * being speculatively taken.
          */
-       x86_spec_ctrl_set_guest(vmx->spec_ctrl);
+       x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
  
         vmx->__launched = vmx->loaded_vmcs->launched;
+
+       if (static_branch_unlikely(&vmx_l1d_should_flush)) {
+               if (vcpu->arch.l1tf_flush_l1d)
+                       vmx_l1d_flush(vcpu);
+       }
+
         asm(
                 /* Store host registers */
                 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9588,7 +9725,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
         if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
                 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
  
-       x86_spec_ctrl_restore_host(vmx->spec_ctrl);
+       x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
  
         /* Eliminate branch target predictions from guest mode */
         vmexit_fill_RSB();
@@ -9802,6 +9939,20 @@ free_vcpu:
         return ERR_PTR(err);
  }
  
+/*
+ * CVE-2018-3646 is the L1TF variant that affects hypervisors (VMM), which
+ * is the case this message warns about; CVE-2018-3620 covers OS/SMM.
+ */
+#define L1TF_MSG "SMT enabled with L1TF CPU bug present. Refer to CVE-2018-3646 for details.\n"
+
+/*
+ * Per-VM initialisation hook (installed as kvm_x86_ops.vm_init).
+ *
+ * If the CPU has the L1TF bug and SMT is enabled, either refuse to create
+ * the VM (when the nosmt module parameter is set) or just warn.
+ *
+ * Return: 0 on success, -EOPNOTSUPP when VM creation is refused.
+ */
+static int vmx_vm_init(struct kvm *kvm)
+{
+       /* Nothing to report unless the bug and SMT are both present. */
+       if (!boot_cpu_has_bug(X86_BUG_L1TF) ||
+           cpu_smt_control != CPU_SMT_ENABLED)
+               return 0;
+
+       if (nosmt) {
+               pr_err(L1TF_MSG);
+               return -EOPNOTSUPP;
+       }
+
+       pr_warn(L1TF_MSG);
+       return 0;
+}
+
  static void __init vmx_check_processor_compat(void *rtn)
  {
         struct vmcs_config vmcs_conf;
@@ -11160,6 +11311,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         if (ret)
                 return ret;
  
+       /* Hide L1D cache contents from the nested guest.  */
+       vmx->vcpu.arch.l1tf_flush_l1d = true;
+
         if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
                 return kvm_vcpu_halt(vcpu);
  
@@ -12228,7 +12382,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .hardware_enable = hardware_enable,
         .hardware_disable = hardware_disable,
         .cpu_has_accelerated_tpr = report_flexpriority,
-       .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+       .has_emulated_msr = vmx_has_emulated_msr,
+
+       .vm_init = vmx_vm_init,
  
         .vcpu_create = vmx_create_vcpu,
         .vcpu_free = vmx_free_vcpu,
@@ -12351,13 +12507,48 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .enable_smi_window = enable_smi_window,
  };
  
+/*
+ * Prepare the L1D flush mitigation at module load time.
+ *
+ * Does nothing when the mode is "never" or the CPU does not have the L1TF
+ * bug.  Otherwise, if the CPU lacks X86_FEATURE_FLUSH_L1D, allocates and
+ * initialises the buffer used by the software flush in vmx_l1d_flush(),
+ * then enables the vmx_l1d_should_flush static key.
+ *
+ * Return: 0 on success (including the nothing-to-do case), -ENOMEM if the
+ * flush buffer cannot be allocated.
+ */
+static int __init vmx_setup_l1d_flush(void)
+{
+       struct page *page;
+       unsigned int i;
+
+       if (vmentry_l1d_flush == VMENTER_L1D_FLUSH_NEVER ||
+           !boot_cpu_has_bug(X86_BUG_L1TF))
+               return 0;
+
+       if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+               if (!page)
+                       return -ENOMEM;
+               vmx_l1d_flush_pages = page_address(page);
+
+               /*
+                * The pages come back uninitialised.  Fill each one with a
+                * different non-zero pattern so that the flush loop never
+                * pulls stale data into the cache, and so that page merging
+                * (e.g. KSM on an outer hypervisor when running nested)
+                * cannot collapse the buffer into fewer physical pages,
+                * which would defeat the flush.
+                */
+               for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+                       memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+                              PAGE_SIZE);
+               }
+       }
+
+       static_branch_enable(&vmx_l1d_should_flush);
+       return 0;
+}
+
+/*
+ * Release the software L1D flush buffer if one was allocated and reset the
+ * pointer, so a second call is a no-op.
+ */
+static void vmx_free_l1d_flush_pages(void)
+{
+       unsigned long addr = (unsigned long)vmx_l1d_flush_pages;
+
+       if (!addr)
+               return;
+
+       free_pages(addr, L1D_CACHE_ORDER);
+       vmx_l1d_flush_pages = NULL;
+}
+
  static int __init vmx_init(void)
  {
-       int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                     __alignof__(struct vcpu_vmx), THIS_MODULE);
+       int r;
+
+       r = vmx_setup_l1d_flush();
         if (r)
                 return r;
  
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       if (r) {
+               vmx_free_l1d_flush_pages();
+               return r;
+       }
+
  #ifdef CONFIG_KEXEC_CORE
         rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                            crash_vmclear_local_loaded_vmcss);
@@ -12374,6 +12565,8 @@ static void __exit vmx_exit(void)
  #endif
  
         kvm_exit();
+
+       vmx_free_l1d_flush_pages();
  }
  
  module_init(vmx_init)