UBUNTU: SAUCE: Synchronize MDS mitigations with upstream

[mirror_ubuntu-bionic-kernel.git] / arch / x86 / kvm / vmx.c
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c

index c829d89e2e63f85bc36c6821b0ca1d7712297043..35a21216904320d8542471f926f23b0082c0264d 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -34,6 +34,7 @@
  #include <linux/tboot.h>
  #include <linux/hrtimer.h>
  #include <linux/frame.h>
+#include <linux/nospec.h>
  #include "kvm_cache_regs.h"
  #include "x86.h"
  
@@ -50,7 +51,8 @@
  #include <asm/apic.h>
  #include <asm/irq_remapping.h>
  #include <asm/mmu_context.h>
-#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
+#include <asm/spec-ctrl.h>
  
  #include "trace.h"
  #include "pmu.h"
@@ -111,6 +113,14 @@ static u64 __read_mostly host_xss;
  static bool __read_mostly enable_pml = 1;
  module_param_named(pml, enable_pml, bool, S_IRUGO);
  
+#define MSR_TYPE_R     1
+#define MSR_TYPE_W     2
+#define MSR_TYPE_RW    3
+
+#define MSR_BITMAP_MODE_X2APIC         1
+#define MSR_BITMAP_MODE_X2APIC_APICV   2
+#define MSR_BITMAP_MODE_LM             4
+
  #define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
  
  /* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
@@ -184,8 +194,157 @@ module_param(ple_window_max, int, S_IRUGO);
  
  extern const ulong vmx_return;
  
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+       const char *option;
+       bool for_parse;
+} vmentry_l1d_param[] = {
+       [VMENTER_L1D_FLUSH_AUTO]         = {"auto", true},
+       [VMENTER_L1D_FLUSH_NEVER]        = {"never", true},
+       [VMENTER_L1D_FLUSH_COND]         = {"cond", true},
+       [VMENTER_L1D_FLUSH_ALWAYS]       = {"always", true},
+       [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+       [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+       struct page *page;
+       unsigned int i;
+
+       if (!enable_ept) {
+               l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+               return 0;
+       }
+
+       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+               u64 msr;
+
+               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+               if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+                       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+                       return 0;
+               }
+       }
+
+       /* If set to auto use the default l1tf mitigation method */
+       if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+               switch (l1tf_mitigation) {
+               case L1TF_MITIGATION_OFF:
+                       l1tf = VMENTER_L1D_FLUSH_NEVER;
+                       break;
+               case L1TF_MITIGATION_FLUSH_NOWARN:
+               case L1TF_MITIGATION_FLUSH:
+               case L1TF_MITIGATION_FLUSH_NOSMT:
+                       l1tf = VMENTER_L1D_FLUSH_COND;
+                       break;
+               case L1TF_MITIGATION_FULL:
+               case L1TF_MITIGATION_FULL_FORCE:
+                       l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+                       break;
+               }
+       } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+               l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+       }
+
+       if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+           !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+               if (!page)
+                       return -ENOMEM;
+               vmx_l1d_flush_pages = page_address(page);
+
+               /*
+                * Initialize each page with a different pattern in
+                * order to protect against KSM in the nested
+                * virtualization case.
+                */
+               for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+                       memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+                              PAGE_SIZE);
+               }
+       }
+
+       l1tf_vmx_mitigation = l1tf;
+
+       if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+               static_branch_enable(&vmx_l1d_should_flush);
+       else
+               static_branch_disable(&vmx_l1d_should_flush);
+
+       if (l1tf == VMENTER_L1D_FLUSH_COND)
+               static_branch_enable(&vmx_l1d_flush_cond);
+       else
+               static_branch_disable(&vmx_l1d_flush_cond);
+       return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+       unsigned int i;
+
+       if (s) {
+               for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+                       if (vmentry_l1d_param[i].for_parse &&
+                           sysfs_streq(s, vmentry_l1d_param[i].option))
+                               return i;
+               }
+       }
+       return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+       int l1tf, ret;
+
+       l1tf = vmentry_l1d_flush_parse(s);
+       if (l1tf < 0)
+               return l1tf;
+
+       if (!boot_cpu_has(X86_BUG_L1TF))
+               return 0;
+
+       /*
+        * Has vmx_init() run already? If not then this is the pre init
+        * parameter parsing. In that case just store the value and let
+        * vmx_init() do the proper setup after enable_ept has been
+        * established.
+        */
+       if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+               vmentry_l1d_flush_param = l1tf;
+               return 0;
+       }
+
+       mutex_lock(&vmx_l1d_flush_mutex);
+       ret = vmx_setup_l1d_flush(l1tf);
+       mutex_unlock(&vmx_l1d_flush_mutex);
+       return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+       if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+               return sprintf(s, "???\n");
+
+       return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+       .set = vmentry_l1d_flush_set,
+       .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
  #define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
  
  struct vmcs {
         u32 revision_id;
@@ -210,6 +369,7 @@ struct loaded_vmcs {
         int soft_vnmi_blocked;
         ktime_t entry_time;
         s64 vnmi_blocked_time;
+       unsigned long *msr_bitmap;
         struct list_head loaded_vmcss_on_cpu_link;
  };
  
@@ -226,7 +386,7 @@ struct shared_msr_entry {
   * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
   * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
   * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
   * underlying hardware which will be used to run L2.
   * This structure is packed to ensure that its layout is identical across
   * machines (necessary for live migration).
@@ -409,13 +569,6 @@ struct __packed vmcs12 {
   */
  #define VMCS12_SIZE 0x1000
  
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
-       struct list_head list;
-       gpa_t vmptr;
-       struct loaded_vmcs vmcs02;
-};
-
  /*
   * The nested_vmx structure is part of vcpu_vmx, and holds information we need
   * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -440,15 +593,15 @@ struct nested_vmx {
          */
         bool sync_shadow_vmcs;
  
-       /* vmcs02_list cache of VMCSs recently used to run L2 guests */
-       struct list_head vmcs02_pool;
-       int vmcs02_num;
         bool change_vmcs01_virtual_x2apic_mode;
         /* L2 must run next, and mustn't decide to exit to L1. */
         bool nested_run_pending;
+
+       struct loaded_vmcs vmcs02;
+
         /*
-        * Guest pages referred to in vmcs02 with host-physical pointers, so
-        * we must keep them pinned while L2 runs.
+        * Guest pages referred to in the vmcs02 with host-physical
+        * pointers, so we must keep them pinned while L2 runs.
          */
         struct page *apic_access_page;
         struct page *virtual_apic_page;
@@ -457,8 +610,6 @@ struct nested_vmx {
         bool pi_pending;
         u16 posted_intr_nv;
  
-       unsigned long *msr_bitmap;
-
         struct hrtimer preemption_timer;
         bool preemption_timer_expired;
  
@@ -577,10 +728,16 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
                         (unsigned long *)&pi_desc->control);
  }
  
+struct vmx_msrs {
+       unsigned int            nr;
+       struct vmx_msr_entry    val[NR_AUTOLOAD_MSRS];
+};
+
  struct vcpu_vmx {
         struct kvm_vcpu       vcpu;
         unsigned long         host_rsp;
         u8                    fail;
+       u8                    msr_bitmap_mode;
         u32                   exit_intr_info;
         u32                   idt_vectoring_info;
         ulong                 rflags;
@@ -592,6 +749,10 @@ struct vcpu_vmx {
         u64                   msr_host_kernel_gs_base;
         u64                   msr_guest_kernel_gs_base;
  #endif
+
+       u64                   arch_capabilities;
+       u64                   spec_ctrl;
+
         u32 vm_entry_controls_shadow;
         u32 vm_exit_controls_shadow;
         u32 secondary_exec_control;
@@ -605,9 +766,8 @@ struct vcpu_vmx {
         struct loaded_vmcs   *loaded_vmcs;
         bool                  __launched; /* temporary, used in vmx_vcpu_run */
         struct msr_autoload {
-               unsigned nr;
-               struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
-               struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+               struct vmx_msrs guest;
+               struct vmx_msrs host;
         } msr_autoload;
         struct {
                 int           loaded;
@@ -898,21 +1058,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
  
  static inline short vmcs_field_to_offset(unsigned long field)
  {
-       BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+       const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
+       unsigned short offset;
  
-       if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+       BUILD_BUG_ON(size > SHRT_MAX);
+       if (field >= size)
                 return -ENOENT;
  
-       /*
-        * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
-        * generic mechanism.
-        */
-       asm("lfence");
-
-       if (vmcs_field_to_offset_table[field] == 0)
+       field = array_index_nospec(field, size);
+       offset = vmcs_field_to_offset_table[field];
+       if (offset == 0)
                 return -ENOENT;
-
-       return vmcs_field_to_offset_table[field];
+       return offset;
  }
  
  static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
@@ -935,6 +1092,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
  static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
  static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
                                             u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+                                                         u32 msr, int type);
  
  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -954,12 +1114,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
  enum {
         VMX_IO_BITMAP_A,
         VMX_IO_BITMAP_B,
-       VMX_MSR_BITMAP_LEGACY,
-       VMX_MSR_BITMAP_LONGMODE,
-       VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
-       VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
-       VMX_MSR_BITMAP_LEGACY_X2APIC,
-       VMX_MSR_BITMAP_LONGMODE_X2APIC,
         VMX_VMREAD_BITMAP,
         VMX_VMWRITE_BITMAP,
         VMX_BITMAP_NR
@@ -969,12 +1123,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
  
  #define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
  #define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
-#define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
-#define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
-#define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
-#define vmx_msr_bitmap_legacy_x2apic         (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
-#define vmx_msr_bitmap_longmode_x2apic       (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
  #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
  #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
  
@@ -1085,6 +1233,13 @@ static inline bool is_machine_check(u32 intr_info)
                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
  }
  
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+               == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
  static inline bool cpu_has_vmx_msr_bitmap(void)
  {
         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
@@ -1918,6 +2073,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
         vmcs_write32(EXCEPTION_BITMAP, eb);
  }
  
+/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+       unsigned long *msr_bitmap;
+       int f = sizeof(unsigned long);
+
+       if (!cpu_has_vmx_msr_bitmap())
+               return true;
+
+       msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+       if (msr <= 0x1fff) {
+               return !!test_bit(msr, msr_bitmap + 0x800 / f);
+       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+               msr &= 0x1fff;
+               return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+       }
+
+       return true;
+}
+
+/*
+ * Check if MSR is intercepted for L01 MSR bitmap.
+ */
+static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+{
+       unsigned long *msr_bitmap;
+       int f = sizeof(unsigned long);
+
+       if (!cpu_has_vmx_msr_bitmap())
+               return true;
+
+       msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+
+       if (msr <= 0x1fff) {
+               return !!test_bit(msr, msr_bitmap + 0x800 / f);
+       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+               msr &= 0x1fff;
+               return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+       }
+
+       return true;
+}
+
  static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                 unsigned long entry, unsigned long exit)
  {
@@ -1925,9 +2126,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
         vm_exit_controls_clearbit(vmx, exit);
  }
  
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+       unsigned int i;
+
+       for (i = 0; i < m->nr; ++i) {
+               if (m->val[i].index == msr)
+                       return i;
+       }
+       return -ENOENT;
+}
+
  static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
  {
-       unsigned i;
+       int i;
         struct msr_autoload *m = &vmx->msr_autoload;
  
         switch (msr) {
@@ -1948,18 +2160,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
                 }
                 break;
         }
+       i = find_msr(&m->guest, msr);
+       if (i < 0)
+               goto skip_guest;
+       --m->guest.nr;
+       m->guest.val[i] = m->guest.val[m->guest.nr];
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
  
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
-
-       if (i == m->nr)
+skip_guest:
+       i = find_msr(&m->host, msr);
+       if (i < 0)
                 return;
-       --m->nr;
-       m->guest[i] = m->guest[m->nr];
-       m->host[i] = m->host[m->nr];
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+
+       --m->host.nr;
+       m->host.val[i] = m->host.val[m->host.nr];
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
  }
  
  static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
@@ -1974,9 +2189,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
  }
  
  static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
-                                 u64 guest_val, u64 host_val)
+                                 u64 guest_val, u64 host_val, bool entry_only)
  {
-       unsigned i;
+       int i, j = 0;
         struct msr_autoload *m = &vmx->msr_autoload;
  
         switch (msr) {
@@ -2011,24 +2226,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
                 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
         }
  
-       for (i = 0; i < m->nr; ++i)
-               if (m->guest[i].index == msr)
-                       break;
+       i = find_msr(&m->guest, msr);
+       if (!entry_only)
+               j = find_msr(&m->host, msr);
  
-       if (i == NR_AUTOLOAD_MSRS) {
+       if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
                 printk_once(KERN_WARNING "Not enough msr switch entries. "
                                 "Can't add msr %x\n", msr);
                 return;
-       } else if (i == m->nr) {
-               ++m->nr;
-               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
-               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
         }
+       if (i < 0) {
+               i = m->guest.nr++;
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+       }
+       m->guest.val[i].index = msr;
+       m->guest.val[i].value = guest_val;
+
+       if (entry_only)
+               return;
  
-       m->guest[i].index = msr;
-       m->guest[i].value = guest_val;
-       m->host[i].index = msr;
-       m->host[i].value = host_val;
+       if (j < 0) {
+               j = m->host.nr++;
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+       }
+       m->host.val[j].index = msr;
+       m->host.val[j].value = host_val;
  }
  
  static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
@@ -2072,7 +2294,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
                         guest_efer &= ~EFER_LME;
                 if (guest_efer != host_efer)
                         add_atomic_switch_msr(vmx, MSR_EFER,
-                                             guest_efer, host_efer);
+                                             guest_efer, host_efer, false);
                 return false;
         } else {
                 guest_efer &= ~ignore_bits;
@@ -2296,6 +2518,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
         if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
                 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
                 vmcs_load(vmx->loaded_vmcs->vmcs);
+               indirect_branch_prediction_barrier();
         }
  
         if (!already_loaded) {
@@ -2540,6 +2763,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
                 return;
         }
  
+       WARN_ON_ONCE(vmx->emulation_required);
+
         if (kvm_exception_is_soft(nr)) {
                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
                              vmx->vcpu.arch.event_exit_inst_len);
@@ -2572,36 +2797,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
         vmx->guest_msrs[from] = tmp;
  }
  
-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
-{
-       unsigned long *msr_bitmap;
-
-       if (is_guest_mode(vcpu))
-               msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
-       else if (cpu_has_secondary_exec_ctrls() &&
-                (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
-                 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-               if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
-                       if (is_long_mode(vcpu))
-                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
-                       else
-                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
-               } else {
-                       if (is_long_mode(vcpu))
-                               msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-                       else
-                               msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
-               }
-       } else {
-               if (is_long_mode(vcpu))
-                       msr_bitmap = vmx_msr_bitmap_longmode;
-               else
-                       msr_bitmap = vmx_msr_bitmap_legacy;
-       }
-
-       vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
-}
-
  /*
   * Set up the vmcs to automatically save and restore system
   * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
@@ -2642,21 +2837,18 @@ static void setup_msrs(struct vcpu_vmx *vmx)
         vmx->save_nmsrs = save_nmsrs;
  
         if (cpu_has_vmx_msr_bitmap())
-               vmx_set_msr_bitmap(&vmx->vcpu);
+               vmx_update_msr_bitmap(&vmx->vcpu);
  }
  
-/*
- * reads and returns guest's timestamp counter "register"
- * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset
- * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
- */
-static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
  {
-       u64 host_tsc, tsc_offset;
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+       if (is_guest_mode(vcpu) &&
+           (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+               return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
  
-       host_tsc = rdtsc();
-       tsc_offset = vmcs_read64(TSC_OFFSET);
-       return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
+       return vcpu->arch.tsc_offset;
  }
  
  /*
@@ -3249,6 +3441,11 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
         return !(val & ~valid_bits);
  }
  
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+       return 1;
+}
+
  /*
   * Reads an msr value (of 'msr_index') into 'pdata'.
   * Returns 0 on success, non-0 otherwise.
@@ -3273,8 +3470,18 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
  #endif
         case MSR_EFER:
                 return kvm_get_msr_common(vcpu, msr_info);
-       case MSR_IA32_TSC:
-               msr_info->data = guest_read_tsc(vcpu);
+       case MSR_IA32_SPEC_CTRL:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+                       return 1;
+
+               msr_info->data = to_vmx(vcpu)->spec_ctrl;
+               break;
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+                       return 1;
+               msr_info->data = to_vmx(vcpu)->arch_capabilities;
                 break;
         case MSR_IA32_SYSENTER_CS:
                 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -3380,8 +3587,67 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                         return 1;
                 vmcs_write64(GUEST_BNDCFGS, data);
                 break;
-       case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, msr_info);
+       case MSR_IA32_SPEC_CTRL:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+                       return 1;
+
+               /* The STIBP bit doesn't fault even if it's not advertised */
+               if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+                       return 1;
+
+               vmx->spec_ctrl = data;
+
+               if (!data)
+                       break;
+
+               /*
+                * For non-nested:
+                * When it's written (to non-zero) for the first time, pass
+                * it through.
+                *
+                * For nested:
+                * The handling of the MSR bitmap for L2 guests is done in
+                * nested_vmx_merge_msr_bitmap. We should not touch the
+                * vmcs02.msr_bitmap here since it gets completely overwritten
+                * in the merging. We update the vmcs01 here for L1 as well
+                * since it will end up touching the MSR anyway now.
+                */
+               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+                                             MSR_IA32_SPEC_CTRL,
+                                             MSR_TYPE_RW);
+               break;
+       case MSR_IA32_PRED_CMD:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+                       return 1;
+
+               if (data & ~PRED_CMD_IBPB)
+                       return 1;
+
+               if (!data)
+                       break;
+
+               wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+
+               /*
+                * For non-nested:
+                * When it's written (to non-zero) for the first time, pass
+                * it through.
+                *
+                * For nested:
+                * The handling of the MSR bitmap for L2 guests is done in
+                * nested_vmx_merge_msr_bitmap. We should not touch the
+                * vmcs02.msr_bitmap here since it gets completely overwritten
+                * in the merging.
+                */
+               vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+                                             MSR_TYPE_W);
+               break;
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated)
+                       return 1;
+               vmx->arch_capabilities = data;
                 break;
         case MSR_IA32_CR_PAT:
                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -3431,7 +3697,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                 vcpu->arch.ia32_xss = data;
                 if (vcpu->arch.ia32_xss != host_xss)
                         add_atomic_switch_msr(vmx, MSR_IA32_XSS,
-                               vcpu->arch.ia32_xss, host_xss);
+                               vcpu->arch.ia32_xss, host_xss, false);
                 else
                         clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
                 break;
@@ -3837,11 +4103,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
         return vmcs;
  }
  
-static struct vmcs *alloc_vmcs(void)
-{
-       return alloc_vmcs_cpu(raw_smp_processor_id());
-}
-
  static void free_vmcs(struct vmcs *vmcs)
  {
         free_pages((unsigned long)vmcs, vmcs_config.order);
@@ -3857,9 +4118,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
         loaded_vmcs_clear(loaded_vmcs);
         free_vmcs(loaded_vmcs->vmcs);
         loaded_vmcs->vmcs = NULL;
+       if (loaded_vmcs->msr_bitmap)
+               free_page((unsigned long)loaded_vmcs->msr_bitmap);
         WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
  }
  
+static struct vmcs *alloc_vmcs(void)
+{
+       return alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+       loaded_vmcs->vmcs = alloc_vmcs();
+       if (!loaded_vmcs->vmcs)
+               return -ENOMEM;
+
+       loaded_vmcs->shadow_vmcs = NULL;
+       loaded_vmcs_init(loaded_vmcs);
+
+       if (cpu_has_vmx_msr_bitmap()) {
+               loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+               if (!loaded_vmcs->msr_bitmap)
+                       goto out_vmcs;
+               memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+       }
+       return 0;
+
+out_vmcs:
+       free_loaded_vmcs(loaded_vmcs);
+       return -ENOMEM;
+}
+
  static void free_kvm_area(void)
  {
         int cpu;
@@ -4158,12 +4448,6 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
         __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
  }
  
-static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
-{
-       if (enable_ept)
-               vmx_flush_tlb(vcpu);
-}
-
  static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
  {
         ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -4918,10 +5202,8 @@ static void free_vpid(int vpid)
         spin_unlock(&vmx_vpid_lock);
  }
  
-#define MSR_TYPE_R     1
-#define MSR_TYPE_W     2
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-                                               u32 msr, int type)
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+                                                         u32 msr, int type)
  {
         int f = sizeof(unsigned long);
  
@@ -4955,6 +5237,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
         }
  }
  
+static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+                                                        u32 msr, int type)
+{
+       int f = sizeof(unsigned long);
+
+       if (!cpu_has_vmx_msr_bitmap())
+               return;
+
+       /*
+        * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+        * have the write-low and read-high bitmap offsets the wrong way round.
+        * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+        */
+       if (msr <= 0x1fff) {
+               if (type & MSR_TYPE_R)
+                       /* read-low */
+                       __set_bit(msr, msr_bitmap + 0x000 / f);
+
+               if (type & MSR_TYPE_W)
+                       /* write-low */
+                       __set_bit(msr, msr_bitmap + 0x800 / f);
+
+       } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+               msr &= 0x1fff;
+               if (type & MSR_TYPE_R)
+                       /* read-high */
+                       __set_bit(msr, msr_bitmap + 0x400 / f);
+
+               if (type & MSR_TYPE_W)
+                       /* write-high */
+                       __set_bit(msr, msr_bitmap + 0xc00 / f);
+
+       }
+}
+
+static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+                                                     u32 msr, int type, bool value)
+{
+       if (value)
+               vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+       else
+               vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+}
+
  /*
   * If a msr is allowed by L0, we should check whether it is allowed by L1.
   * The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -5001,30 +5327,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
         }
  }
  
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
  {
-       if (!longmode_only)
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
-                                               msr, MSR_TYPE_R | MSR_TYPE_W);
-       __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
-                                               msr, MSR_TYPE_R | MSR_TYPE_W);
+       u8 mode = 0;
+
+       if (cpu_has_secondary_exec_ctrls() &&
+           (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+            SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+               mode |= MSR_BITMAP_MODE_X2APIC;
+               if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+                       mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+       }
+
+       if (is_long_mode(vcpu))
+               mode |= MSR_BITMAP_MODE_LM;
+
+       return mode;
  }
  
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+                                        u8 mode)
  {
-       if (apicv_active) {
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
-                               msr, type);
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
-                               msr, type);
-       } else {
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-                               msr, type);
-               __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-                               msr, type);
+       int msr;
+
+       for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+               unsigned word = msr / BITS_PER_LONG;
+               msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+               msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+       }
+
+       if (mode & MSR_BITMAP_MODE_X2APIC) {
+               /*
+                * TPR reads and writes can be virtualized even if virtual interrupt
+                * delivery is not in use.
+                */
+               vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+               if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+                       vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+                       vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+               }
         }
  }
  
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+       u8 mode = vmx_msr_bitmap_mode(vcpu);
+       u8 changed = mode ^ vmx->msr_bitmap_mode;
+
+       if (!changed)
+               return;
+
+       vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
+                                 !(mode & MSR_BITMAP_MODE_LM));
+
+       if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+               vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+
+       vmx->msr_bitmap_mode = mode;
+}
+
  static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
  {
         return enable_apicv;
@@ -5129,14 +5495,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
  
         if (is_guest_mode(vcpu) &&
             vector == vmx->nested.posted_intr_nv) {
-               /* the PIR and ON have been set by L1. */
-               kvm_vcpu_trigger_posted_interrupt(vcpu, true);
                 /*
                  * If a posted intr is not recognized by hardware,
                  * we will accomplish it in the next vmentry.
                  */
                 vmx->nested.pi_pending = true;
                 kvm_make_request(KVM_REQ_EVENT, vcpu);
+               /* the PIR and ON have been set by L1. */
+               if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
+                       kvm_vcpu_kick(vcpu);
                 return 0;
         }
         return -1;
@@ -5274,7 +5641,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
         }
  
         if (cpu_has_vmx_msr_bitmap())
-               vmx_set_msr_bitmap(vcpu);
+               vmx_update_msr_bitmap(vcpu);
  }
  
  static u32 vmx_exec_control(struct vcpu_vmx *vmx)
@@ -5461,7 +5828,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
         }
         if (cpu_has_vmx_msr_bitmap())
-               vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
+               vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
  
         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
  
@@ -5517,9 +5884,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
  
         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
  
         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
@@ -5539,6 +5906,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
                 ++vmx->nmsrs;
         }
  
+       vmx->arch_capabilities = kvm_get_arch_capabilities();
  
         vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
  
@@ -5567,7 +5935,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
         u64 cr0;
  
         vmx->rmode.vm86_active = 0;
+       vmx->spec_ctrl = 0;
  
+       vcpu->arch.microcode_version = 0x100000000ULL;
         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
         kvm_set_cr8(vcpu, 0);
  
@@ -5974,7 +6344,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
                         vcpu->arch.dr6 &= ~15;
                         vcpu->arch.dr6 |= dr6 | DR6_RTM;
-                       if (!(dr6 & ~DR6_RESERVED)) /* icebp */
+                       if (is_icebp(intr_info))
                                 skip_emulated_instruction(vcpu);
  
                         kvm_queue_exception(vcpu, DB_VECTOR);
@@ -6563,7 +6933,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
         if (!is_guest_mode(vcpu) &&
             !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                 trace_kvm_fast_mmio(gpa);
-               return kvm_skip_emulated_instruction(vcpu);
+               /*
+                * Doing kvm_skip_emulated_instruction() depends on undefined
+                * behavior: Intel's manual doesn't mandate
+                * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
+                * occurs and while on real hardware it was observed to be set,
+                * other hypervisors (namely Hyper-V) don't set it, we end up
+                * advancing IP with some random value. Disable fast mmio when
+                * running nested and keep it for real hardware in hope that
+                * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
+                */
+               if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+                       return kvm_skip_emulated_instruction(vcpu);
+               else
+                       return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
+                                                      NULL, 0) == EMULATE_DONE;
         }
  
         ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -6617,12 +7001,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                         goto out;
                 }
  
-               if (err != EMULATE_DONE) {
-                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-                       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-                       vcpu->run->internal.ndata = 0;
-                       return 0;
-               }
+               if (err != EMULATE_DONE)
+                       goto emulation_error;
+
+               if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+                   vcpu->arch.exception.pending)
+                       goto emulation_error;
  
                 if (vcpu->arch.halt_request) {
                         vcpu->arch.halt_request = 0;
@@ -6638,6 +7022,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
  
  out:
         return ret;
+
+emulation_error:
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+       vcpu->run->internal.ndata = 0;
+       return 0;
  }
  
  static int __grow_ple_window(int val)
@@ -6744,7 +7134,7 @@ void vmx_enable_tdp(void)
  
  static __init int hardware_setup(void)
  {
-       int r = -ENOMEM, i, msr;
+       int r = -ENOMEM, i;
  
         rdmsrl_safe(MSR_EFER, &host_efer);
  
@@ -6764,9 +7154,6 @@ static __init int hardware_setup(void)
  
         memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
  
-       memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
-       memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
         if (setup_vmcs_config(&vmcs_config) < 0) {
                 r = -EIO;
                 goto out;
@@ -6835,42 +7222,8 @@ static __init int hardware_setup(void)
                 kvm_tsc_scaling_ratio_frac_bits = 48;
         }
  
-       vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-
-       memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
-                       vmx_msr_bitmap_legacy, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
-                       vmx_msr_bitmap_longmode, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_legacy_x2apic,
-                       vmx_msr_bitmap_legacy, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_longmode_x2apic,
-                       vmx_msr_bitmap_longmode, PAGE_SIZE);
-
         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
  
-       for (msr = 0x800; msr <= 0x8ff; msr++) {
-               if (msr == 0x839 /* TMCCT */)
-                       continue;
-               vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
-       }
-
-       /*
-        * TPR reads and writes can be virtualized even if virtual interrupt
-        * delivery is not in use.
-        */
-       vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
-       vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
-
-       /* EOI */
-       vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
-       /* SELF-IPI */
-       vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
-
         if (enable_ept)
                 vmx_enable_tdp();
         else
@@ -6973,94 +7326,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
         return handle_nop(vcpu);
  }
  
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
-       struct vmcs02_list *item;
-       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-               if (item->vmptr == vmx->nested.current_vmptr) {
-                       list_move(&item->list, &vmx->nested.vmcs02_pool);
-                       return &item->vmcs02;
-               }
-
-       if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
-               /* Recycle the least recently used VMCS. */
-               item = list_last_entry(&vmx->nested.vmcs02_pool,
-                                      struct vmcs02_list, list);
-               item->vmptr = vmx->nested.current_vmptr;
-               list_move(&item->list, &vmx->nested.vmcs02_pool);
-               return &item->vmcs02;
-       }
-
-       /* Create a new VMCS */
-       item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-       if (!item)
-               return NULL;
-       item->vmcs02.vmcs = alloc_vmcs();
-       item->vmcs02.shadow_vmcs = NULL;
-       if (!item->vmcs02.vmcs) {
-               kfree(item);
-               return NULL;
-       }
-       loaded_vmcs_init(&item->vmcs02);
-       item->vmptr = vmx->nested.current_vmptr;
-       list_add(&(item->list), &(vmx->nested.vmcs02_pool));
-       vmx->nested.vmcs02_num++;
-       return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
-       struct vmcs02_list *item;
-       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-               if (item->vmptr == vmptr) {
-                       free_loaded_vmcs(&item->vmcs02);
-                       list_del(&item->list);
-                       kfree(item);
-                       vmx->nested.vmcs02_num--;
-                       return;
-               }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
-       struct vmcs02_list *item, *n;
-
-       WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
-       list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-               /*
-                * Something will leak if the above WARN triggers.  Better than
-                * a use-after-free.
-                */
-               if (vmx->loaded_vmcs == &item->vmcs02)
-                       continue;
-
-               free_loaded_vmcs(&item->vmcs02);
-               list_del(&item->list);
-               kfree(item);
-               vmx->nested.vmcs02_num--;
-       }
-}
-
  /*
   * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
   * set the success or error code of an emulated VMX instruction, as specified
@@ -7228,8 +7493,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
                         vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
                 return 1;
  
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, vmpointer,
-                               sizeof(*vmpointer), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
                 kvm_inject_page_fault(vcpu, &e);
                 return 1;
         }
@@ -7241,13 +7505,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         struct vmcs *shadow_vmcs;
+       int r;
  
-       if (cpu_has_vmx_msr_bitmap()) {
-               vmx->nested.msr_bitmap =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-               if (!vmx->nested.msr_bitmap)
-                       goto out_msr_bitmap;
-       }
+       r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+       if (r < 0)
+               goto out_vmcs02;
  
         vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
         if (!vmx->nested.cached_vmcs12)
@@ -7264,9 +7526,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
                 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
         }
  
-       INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
-       vmx->nested.vmcs02_num = 0;
-
         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL_PINNED);
         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7278,9 +7537,9 @@ out_shadow_vmcs:
         kfree(vmx->nested.cached_vmcs12);
  
  out_cached_vmcs12:
-       free_page((unsigned long)vmx->nested.msr_bitmap);
+       free_loaded_vmcs(&vmx->nested.vmcs02);
  
-out_msr_bitmap:
+out_vmcs02:
         return -ENOMEM;
  }
  
@@ -7315,6 +7574,12 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                 return 1;
         }
  
+       /* CPL=0 must be checked manually. */
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
         if (vmx->nested.vmxon) {
                 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
                 return kvm_skip_emulated_instruction(vcpu);
@@ -7374,6 +7639,11 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
   */
  static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
  {
+       if (vmx_get_cpl(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
         if (!to_vmx(vcpu)->nested.vmxon) {
                 kvm_queue_exception(vcpu, UD_VECTOR);
                 return 0;
@@ -7418,15 +7688,12 @@ static void free_nested(struct vcpu_vmx *vmx)
         if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                 return;
  
+       hrtimer_cancel(&vmx->nested.preemption_timer);
         vmx->nested.vmxon = false;
         vmx->nested.smm.vmxon = false;
         free_vpid(vmx->nested.vpid02);
         vmx->nested.posted_intr_nv = -1;
         vmx->nested.current_vmptr = -1ull;
-       if (vmx->nested.msr_bitmap) {
-               free_page((unsigned long)vmx->nested.msr_bitmap);
-               vmx->nested.msr_bitmap = NULL;
-       }
         if (enable_shadow_vmcs) {
                 vmx_disable_shadow_vmcs(vmx);
                 vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -7434,7 +7701,7 @@ static void free_nested(struct vcpu_vmx *vmx)
                 vmx->vmcs01.shadow_vmcs = NULL;
         }
         kfree(vmx->nested.cached_vmcs12);
-       /* Unpin physical memory we referred to in current vmcs02 */
+       /* Unpin physical memory we referred to in the vmcs02 */
         if (vmx->nested.apic_access_page) {
                 kvm_release_page_dirty(vmx->nested.apic_access_page);
                 vmx->nested.apic_access_page = NULL;
@@ -7450,7 +7717,7 @@ static void free_nested(struct vcpu_vmx *vmx)
                 vmx->nested.pi_desc = NULL;
         }
  
-       nested_free_all_saved_vmcss(vmx);
+       free_loaded_vmcs(&vmx->nested.vmcs02);
  }
  
  /* Emulate the VMXOFF instruction */
@@ -7493,8 +7760,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
                         vmptr + offsetof(struct vmcs12, launch_state),
                         &zero, sizeof(zero));
  
-       nested_free_vmcs02(vmx, vmptr);
-
         nested_vmx_succeed(vcpu);
         return kvm_skip_emulated_instruction(vcpu);
  }
@@ -7713,9 +7978,9 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                 if (get_vmx_mem_address(vcpu, exit_qualification,
                                 vmx_instruction_info, true, &gva))
                         return 1;
-               /* _system ok, as hardware has verified cpl=0 */
-               kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
-                            &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+               /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+               kvm_write_guest_virt_system(vcpu, gva, &field_value,
+                                           (is_long_mode(vcpu) ? 8 : 4), NULL);
         }
  
         nested_vmx_succeed(vcpu);
@@ -7751,8 +8016,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                 if (get_vmx_mem_address(vcpu, exit_qualification,
                                 vmx_instruction_info, false, &gva))
                         return 1;
-               if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
-                          &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+               if (kvm_read_guest_virt(vcpu, gva, &field_value,
+                                       (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
                         kvm_inject_page_fault(vcpu, &e);
                         return 1;
                 }
@@ -7856,10 +8121,10 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
         if (get_vmx_mem_address(vcpu, exit_qualification,
                         vmx_instruction_info, true, &vmcs_gva))
                 return 1;
-       /* ok to use *_system, as hardware has verified cpl=0 */
-       if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
-                                (void *)&to_vmx(vcpu)->nested.current_vmptr,
-                                sizeof(u64), &e)) {
+       /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+       if (kvm_write_guest_virt_system(vcpu, vmcs_gva,
+                                       (void *)&to_vmx(vcpu)->nested.current_vmptr,
+                                       sizeof(u64), &e)) {
                 kvm_inject_page_fault(vcpu, &e);
                 return 1;
         }
@@ -7906,8 +8171,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                         vmx_instruction_info, false, &gva))
                 return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                 kvm_inject_page_fault(vcpu, &e);
                 return 1;
         }
@@ -7971,8 +8235,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
                         vmx_instruction_info, false, &gva))
                 return 1;
-       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
-                               sizeof(operand), &e)) {
+       if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
                 kvm_inject_page_fault(vcpu, &e);
                 return 1;
         }
@@ -8406,10 +8669,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
  
         /*
          * The host physical addresses of some pages of guest memory
-        * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
-        * may write to these pages via their host physical address while
-        * L2 is running, bypassing any address-translation-based dirty
-        * tracking (e.g. EPT write protection).
+        * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+        * Page). The CPU may write to these pages via their host
+        * physical address while L2 is running, bypassing any
+        * address-translation-based dirty tracking (e.g. EPT write
+        * protection).
          *
          * Mark them dirty on every exit from L2 to prevent them from
          * getting out of sync with dirty tracking.
@@ -8899,6 +9163,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
         }
  }
  
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+       int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+       /*
+        * This code is only executed when the the flush mode is 'cond' or
+        * 'always'
+        */
+       if (static_branch_likely(&vmx_l1d_flush_cond)) {
+               bool flush_l1d;
+
+               /*
+                * Clear the per-vcpu flush bit, it gets set again
+                * either from vcpu_run() or from one of the unsafe
+                * VMEXIT handlers.
+                */
+               flush_l1d = vcpu->arch.l1tf_flush_l1d;
+               vcpu->arch.l1tf_flush_l1d = false;
+
+               /*
+                * Clear the per-cpu flush bit, it gets set again from
+                * the interrupt handlers.
+                */
+               flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+               kvm_clear_cpu_l1tf_flush_l1d();
+
+               if (!flush_l1d)
+                       return;
+       }
+
+       vcpu->stat.l1d_flush++;
+
+       if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+               wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+               return;
+       }
+
+       asm volatile(
+               /* First ensure the pages are in the TLB */
+               "xorl   %%eax, %%eax\n"
+               ".Lpopulate_tlb:\n\t"
+               "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $4096, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lpopulate_tlb\n\t"
+               "xorl   %%eax, %%eax\n\t"
+               "cpuid\n\t"
+               /* Now fill the cache */
+               "xorl   %%eax, %%eax\n"
+               ".Lfill_cache:\n"
+               "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+               "addl   $64, %%eax\n\t"
+               "cmpl   %%eax, %[size]\n\t"
+               "jne    .Lfill_cache\n\t"
+               "lfence\n"
+               :: [flush_pages] "r" (vmx_l1d_flush_pages),
+                   [size] "r" (size)
+               : "eax", "ebx", "ecx", "edx");
+}
+
  static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
  {
         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -8939,11 +9276,11 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
         } else {
                 sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
                 sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
         }
         vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
  
-       vmx_set_msr_bitmap(vcpu);
+       vmx_update_msr_bitmap(vcpu);
  }
  
  static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
@@ -8967,7 +9304,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
             !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
                              SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
                 vmcs_write64(APIC_ACCESS_ADDR, hpa);
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
         }
  }
  
@@ -9129,14 +9466,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
  #endif
                         "pushf\n\t"
                         __ASM_SIZE(push) " $%c[cs]\n\t"
-                       "call *%[entry]\n\t"
+                       CALL_NOSPEC
                         :
  #ifdef CONFIG_X86_64
                         [sp]"=&r"(tmp),
  #endif
                         ASM_CALL_CONSTRAINT
                         :
-                       [entry]"r"(entry),
+                       THUNK_TARGET(entry),
                         [ss]"i"(__KERNEL_DS),
                         [cs]"i"(__KERNEL_CS)
                         );
@@ -9144,9 +9481,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
  }
  STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
  
-static bool vmx_has_high_real_mode_segbase(void)
+static bool vmx_has_emulated_msr(int index)
  {
-       return enable_unrestricted_guest || emulate_invalid_guest_state;
+       switch (index) {
+       case MSR_IA32_SMBASE:
+               /*
+                * We cannot do SMM unless we can run the guest in big
+                * real mode.
+                */
+               return enable_unrestricted_guest || emulate_invalid_guest_state;
+       case MSR_AMD64_VIRT_SPEC_CTRL:
+               /* This is AMD only.  */
+               return false;
+       default:
+               return true;
+       }
  }
  
  static bool vmx_mpx_supported(void)
@@ -9290,7 +9639,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
                         clear_atomic_switch_msr(vmx, msrs[i].msr);
                 else
                         add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
-                                       msrs[i].host);
+                                       msrs[i].host, false);
  }
  
  static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
@@ -9373,7 +9722,22 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  
         vmx_arm_hv_timer(vcpu);
  
+       /*
+        * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+        * it's non-zero. Since vmentry is serialising on affected CPUs, there
+        * is no need to worry about the conditional branch over the wrmsr
+        * being speculatively taken.
+        */
+       x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+
         vmx->__launched = vmx->loaded_vmcs->launched;
+
+       /* L1D Flush includes CPU buffer clear to mitigate MDS */
+       if (static_branch_unlikely(&vmx_l1d_should_flush))
+               vmx_l1d_flush(vcpu);
+       else if (static_branch_unlikely(&mds_user_clear))
+               mds_clear_cpu_buffers();
+
         asm(
                 /* Store host registers */
                 "push %%" _ASM_DX "; push %%" _ASM_BP ";"
@@ -9491,6 +9855,26 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  #endif
               );
  
+       /*
+        * We do not use IBRS in the kernel. If this vCPU has used the
+        * SPEC_CTRL MSR it may have left it on; save the value and
+        * turn it off. This is much more efficient than blindly adding
+        * it to the atomic save/restore list. Especially as the former
+        * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+        *
+        * For non-nested case:
+        * If the L01 MSR bitmap does not intercept the MSR, then we need to
+        * save it.
+        *
+        * For nested case:
+        * If the L02 MSR bitmap does not intercept the MSR, then we need to
+        * save it.
+        */
+       if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+               vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+
+       x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
+
         /* Eliminate branch target predictions from guest mode */
         vmexit_fill_RSB();
  
@@ -9604,6 +9988,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
  {
         int err;
         struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       unsigned long *msr_bitmap;
         int cpu;
  
         if (!vmx)
@@ -9636,13 +10021,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
         if (!vmx->guest_msrs)
                 goto free_pml;
  
-       vmx->loaded_vmcs = &vmx->vmcs01;
-       vmx->loaded_vmcs->vmcs = alloc_vmcs();
-       vmx->loaded_vmcs->shadow_vmcs = NULL;
-       if (!vmx->loaded_vmcs->vmcs)
+       err = alloc_loaded_vmcs(&vmx->vmcs01);
+       if (err < 0)
                 goto free_msrs;
-       loaded_vmcs_init(vmx->loaded_vmcs);
  
+       msr_bitmap = vmx->vmcs01.msr_bitmap;
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+       vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+       vmx->msr_bitmap_mode = 0;
+
+       vmx->loaded_vmcs = &vmx->vmcs01;
         cpu = get_cpu();
         vmx_vcpu_load(&vmx->vcpu, cpu);
         vmx->vcpu.cpu = cpu;
@@ -9695,6 +10087,37 @@ free_vcpu:
         return ERR_PTR(err);
  }
  
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+
+static int vmx_vm_init(struct kvm *kvm)
+{
+       if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+               switch (l1tf_mitigation) {
+               case L1TF_MITIGATION_OFF:
+               case L1TF_MITIGATION_FLUSH_NOWARN:
+                       /* 'I explicitly don't care' is set */
+                       break;
+               case L1TF_MITIGATION_FLUSH:
+               case L1TF_MITIGATION_FLUSH_NOSMT:
+               case L1TF_MITIGATION_FULL:
+                       /*
+                        * Warn upon starting the first VM in a potentially
+                        * insecure environment.
+                        */
+                       if (cpu_smt_control == CPU_SMT_ENABLED)
+                               pr_warn_once(L1TF_MSG_SMT);
+                       if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+                               pr_warn_once(L1TF_MSG_L1D);
+                       break;
+               case L1TF_MITIGATION_FULL_FORCE:
+                       /* Flush is enforced */
+                       break;
+               }
+       }
+       return 0;
+}
+
  static void __init vmx_check_processor_compat(void *rtn)
  {
         struct vmcs_config vmcs_conf;
@@ -10012,6 +10435,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                         kunmap(vmx->nested.pi_desc_page);
                         kvm_release_page_dirty(vmx->nested.pi_desc_page);
                         vmx->nested.pi_desc_page = NULL;
+                       vmx->nested.pi_desc = NULL;
+                       vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
                 }
                 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
                 if (is_error_page(page))
@@ -10030,7 +10455,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
         if (cpu_has_vmx_msr_bitmap() &&
             nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
             nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
-               ;
+               vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+                             CPU_BASED_USE_MSR_BITMAPS);
         else
                 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
                                 CPU_BASED_USE_MSR_BITMAPS);
@@ -10105,10 +10531,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
         int msr;
         struct page *page;
         unsigned long *msr_bitmap_l1;
-       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
+       unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+       /*
+        * pred_cmd & spec_ctrl are trying to verify two things:
+        *
+        * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+        *    ensures that we do not accidentally generate an L02 MSR bitmap
+        *    from the L12 MSR bitmap that is too permissive.
+        * 2. That L1 or L2s have actually used the MSR. This avoids
+        *    unnecessarily merging of the bitmap if the MSR is unused. This
+        *    works properly because we only update the L01 MSR bitmap lazily.
+        *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+        *    updated to reflect this when L1 (or its L2s) actually write to
+        *    the MSR.
+        */
+       bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+       bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
  
-       /* This shortcut is ok because we support only x2APIC MSRs so far. */
-       if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+       if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+           !pred_cmd && !spec_ctrl)
                 return false;
  
         page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
@@ -10141,12 +10582,35 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
                                 MSR_TYPE_W);
                 }
         }
+
+       if (spec_ctrl)
+               nested_vmx_disable_intercept_for_msr(
+                                       msr_bitmap_l1, msr_bitmap_l0,
+                                       MSR_IA32_SPEC_CTRL,
+                                       MSR_TYPE_R | MSR_TYPE_W);
+
+       if (pred_cmd)
+               nested_vmx_disable_intercept_for_msr(
+                                       msr_bitmap_l1, msr_bitmap_l0,
+                                       MSR_IA32_PRED_CMD,
+                                       MSR_TYPE_W);
+
         kunmap(page);
         kvm_release_page_clean(page);
  
         return true;
  }
  
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+                                         struct vmcs12 *vmcs12)
+{
+       if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+           !page_address_valid(vcpu, vmcs12->apic_access_addr))
+               return -EINVAL;
+       else
+               return 0;
+}
+
  static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
                                            struct vmcs12 *vmcs12)
  {
@@ -10594,10 +11058,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
          * Set the MSR load/store lists to match L0's settings.
          */
         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+       vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
  
         /*
          * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -10674,14 +11138,14 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
             vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
                 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
  
-       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
-               vmcs_write64(TSC_OFFSET,
-                       vcpu->arch.tsc_offset + vmcs12->tsc_offset);
-       else
-               vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+       vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
         if (kvm_has_tsc_control)
                 decache_tsc_multiplier(vmx);
  
+       if (cpu_has_vmx_msr_bitmap())
+               vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+
         if (enable_vpid) {
                 /*
                  * There is no direct mapping between vpid02 and vpid12, the
@@ -10723,7 +11187,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                 }
         } else if (nested_cpu_has2(vmcs12,
                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
         }
  
         /*
@@ -10787,6 +11251,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
  
+       if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
         if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
                 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
  
@@ -10903,42 +11370,33 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
  {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
         struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       struct loaded_vmcs *vmcs02;
         u32 msr_entry_idx;
         u32 exit_qual;
-
-       vmcs02 = nested_get_current_vmcs02(vmx);
-       if (!vmcs02)
-               return -ENOMEM;
+       int r;
  
         enter_guest_mode(vcpu);
  
         if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
  
-       vmx_switch_vmcs(vcpu, vmcs02);
+       vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
         vmx_segment_cache_clear(vmx);
  
-       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                                        EXIT_REASON_INVALID_STATE, exit_qual);
-               return 1;
-       }
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+       r = EXIT_REASON_INVALID_STATE;
+       if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual))
+               goto fail;
  
         nested_get_vmcs12_pages(vcpu, vmcs12);
  
+       r = EXIT_REASON_MSR_LOAD_FAIL;
         msr_entry_idx = nested_vmx_load_msr(vcpu,
                                             vmcs12->vm_entry_msr_load_addr,
                                             vmcs12->vm_entry_msr_load_count);
-       if (msr_entry_idx) {
-               leave_guest_mode(vcpu);
-               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-               nested_vmx_entry_failure(vcpu, vmcs12,
-                               EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
-               return 1;
-       }
+       if (msr_entry_idx)
+               goto fail;
  
         /*
          * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -10947,6 +11405,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
          * the success flag) when L2 exits (see nested_vmx_vmexit()).
          */
         return 0;
+
+fail:
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+       leave_guest_mode(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+       nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
+       return 1;
  }
  
  /*
@@ -11026,7 +11492,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         if (ret)
                 return ret;
  
-       if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+       /* Hide L1D cache contents from the nested guest.  */
+       vmx->vcpu.arch.l1tf_flush_l1d = true;
+
+       /*
+        * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
+        * by event injection, halt vcpu.
+        */
+       if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
+           !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK))
                 return kvm_vcpu_halt(vcpu);
  
         vmx->nested.nested_run_pending = 1;
@@ -11128,7 +11602,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
                 if (block_nested_events)
                         return -EBUSY;
                 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-               vcpu->arch.exception.pending = false;
                 return 0;
         }
  
@@ -11485,7 +11958,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
  
         if (cpu_has_vmx_msr_bitmap())
-               vmx_set_msr_bitmap(vcpu);
+               vmx_update_msr_bitmap(vcpu);
  
         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
                                 vmcs12->vm_exit_msr_load_count))
@@ -11517,6 +11990,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
  
         leave_guest_mode(vcpu);
  
+       if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+               vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
         if (likely(!vmx->fail)) {
                 if (exit_reason == -1)
                         sync_vmcs12(vcpu, vmcs12);
@@ -11534,13 +12010,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         vm_exit_controls_reset_shadow(vmx);
         vmx_segment_cache_clear(vmx);
  
-       /* if no vmcs02 cache requested, remove the one we used */
-       if (VMCS02_POOL_SIZE == 0)
-               nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
         /* Update any VMCS fields that might have changed while L2 ran */
-       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+       vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
         vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
         if (vmx->hv_deadline_tsc == -1)
                 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -11558,7 +12030,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         } else if (!nested_cpu_has_ept(vmcs12) &&
                    nested_cpu_has2(vmcs12,
                                    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-               vmx_flush_tlb_ept_only(vcpu);
+               vmx_flush_tlb(vcpu);
         }
  
         /* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -12011,7 +12483,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                 vcpu_info.vector = irq.vector;
  
-               trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
+               trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
                                 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
  
                 if (set)
@@ -12099,7 +12571,9 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .hardware_enable = hardware_enable,
         .hardware_disable = hardware_disable,
         .cpu_has_accelerated_tpr = report_flexpriority,
-       .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
+       .has_emulated_msr = vmx_has_emulated_msr,
+
+       .vm_init = vmx_vm_init,
  
         .vcpu_create = vmx_create_vcpu,
         .vcpu_free = vmx_free_vcpu,
@@ -12110,6 +12584,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .vcpu_put = vmx_vcpu_put,
  
         .update_bp_intercept = update_exception_bitmap,
+       .get_msr_feature = vmx_get_msr_feature,
         .get_msr = vmx_get_msr,
         .set_msr = vmx_set_msr,
         .get_segment_base = vmx_get_segment_base,
@@ -12183,6 +12658,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
  
         .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
  
+       .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
         .write_tsc_offset = vmx_write_tsc_offset,
  
         .set_tdp_cr3 = vmx_set_cr3,
@@ -12222,22 +12698,17 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
         .enable_smi_window = enable_smi_window,
  };
  
-static int __init vmx_init(void)
+static void vmx_cleanup_l1d_flush(void)
  {
-       int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                     __alignof__(struct vcpu_vmx), THIS_MODULE);
-       if (r)
-               return r;
-
-#ifdef CONFIG_KEXEC_CORE
-       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
-                          crash_vmclear_local_loaded_vmcss);
-#endif
-
-       return 0;
+       if (vmx_l1d_flush_pages) {
+               free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+               vmx_l1d_flush_pages = NULL;
+       }
+       /* Restore state so sysfs ignores VMX */
+       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
  }
  
-static void __exit vmx_exit(void)
+static void vmx_exit(void)
  {
  #ifdef CONFIG_KEXEC_CORE
         RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
@@ -12245,7 +12716,40 @@ static void __exit vmx_exit(void)
  #endif
  
         kvm_exit();
+
+       vmx_cleanup_l1d_flush();
  }
+module_exit(vmx_exit)
+
+static int __init vmx_init(void)
+{
+       int r;
+
+       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       if (r)
+               return r;
+
+       /*
+        * Must be called after kvm_init() so enable_ept is properly set
+        * up. Hand the parameter mitigation value in which was stored in
+        * the pre module init parser. If no parameter was given, it will
+        * contain 'auto' which will be turned into the default 'cond'
+        * mitigation mode.
+        */
+       if (boot_cpu_has(X86_BUG_L1TF)) {
+               r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+               if (r) {
+                       vmx_exit();
+                       return r;
+               }
+       }
  
+#ifdef CONFIG_KEXEC_CORE
+       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+                          crash_vmclear_local_loaded_vmcss);
+#endif
+
+       return 0;
+}
  module_init(vmx_init)
-module_exit(vmx_exit)