]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blobdiff - arch/x86/kvm/vmx.c
x86/bugs, KVM: Support the combination of guest and host IBRS
[mirror_ubuntu-artful-kernel.git] / arch / x86 / kvm / vmx.c
index 84e62acf2dd861023b17382e61f9c98c8df82f68..7a7bd88b2e14e01c4fd127d09dfa78d329d61e37 100644 (file)
@@ -50,6 +50,8 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>
+#include <asm/microcode.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -198,7 +200,14 @@ struct loaded_vmcs {
        struct vmcs *vmcs;
        struct vmcs *shadow_vmcs;
        int cpu;
-       int launched;
+       bool launched;
+       bool nmi_known_unmasked;
+       unsigned long vmcs_host_cr3;    /* May not match real cr3 */
+       unsigned long vmcs_host_cr4;    /* May not match real cr4 */
+       /* Support for vnmi-less CPUs */
+       int soft_vnmi_blocked;
+       ktime_t entry_time;
+       s64 vnmi_blocked_time;
        struct list_head loaded_vmcss_on_cpu_link;
 };
 
@@ -415,13 +424,10 @@ struct nested_vmx {
 
        /* The guest-physical address of the current VMCS L1 keeps for L2 */
        gpa_t current_vmptr;
-       /* The host-usable pointer to the above */
-       struct page *current_vmcs12_page;
-       struct vmcs12 *current_vmcs12;
        /*
         * Cache of the guest's VMCS, existing outside of guest memory.
         * Loaded from guest memory during VMPTRLD. Flushed to guest
-        * memory during VMXOFF, VMCLEAR, VMPTRLD.
+        * memory during VMCLEAR and VMPTRLD.
         */
        struct vmcs12 *cached_vmcs12;
        /*
@@ -562,7 +568,6 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
        u8                    fail;
-       bool                  nmi_known_unmasked;
        u32                   exit_intr_info;
        u32                   idt_vectoring_info;
        ulong                 rflags;
@@ -598,8 +603,6 @@ struct vcpu_vmx {
                int           gs_ldt_reload_needed;
                int           fs_reload_needed;
                u64           msr_host_bndcfgs;
-               unsigned long vmcs_host_cr3;    /* May not match real cr3 */
-               unsigned long vmcs_host_cr4;    /* May not match real cr4 */
        } host_state;
        struct {
                int vm86_active;
@@ -639,8 +642,6 @@ struct vcpu_vmx {
 
        u64 current_tsc_ratio;
 
-       bool guest_pkru_valid;
-       u32 guest_pkru;
        u32 host_pkru;
 
        /*
@@ -927,6 +928,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+                                           u16 error_code);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1077,6 +1082,13 @@ static inline bool is_machine_check(u32 intr_info)
                (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 }
 
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+       return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+               == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
 static inline bool cpu_has_vmx_msr_bitmap(void)
 {
        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
@@ -1289,6 +1301,11 @@ static inline bool cpu_has_vmx_invpcid(void)
                SECONDARY_EXEC_ENABLE_INVPCID;
 }
 
+static inline bool cpu_has_virtual_nmis(void)
+{
+       return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
 static inline bool cpu_has_vmx_wbinvd_exit(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1340,11 +1357,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
                (vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
-{
-       return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
-}
-
 static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
 {
        return vmcs12->pin_based_vm_exec_control &
@@ -1484,6 +1496,7 @@ static void vmcs_load(struct vmcs *vmcs)
        if (error)
                printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
                       vmcs, phys_addr);
+
 }
 
 #ifdef CONFIG_KEXEC_CORE
@@ -2188,46 +2201,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
        struct pi_desc old, new;
        unsigned int dest;
 
-       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
-               !kvm_vcpu_apicv_active(vcpu))
+       /*
+        * In case of hot-plug or hot-unplug, we may have to undo
+        * vmx_vcpu_pi_put even if there is no assigned device.  And we
+        * always keep PI.NDST up to date for simplicity: it makes the
+        * code easier, and CPU migration is not a fast path.
+        */
+       if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
                return;
 
+       /*
+        * First handle the simple case where no cmpxchg is necessary; just
+        * allow posting non-urgent interrupts.
+        *
+        * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+        * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+        * expects the VCPU to be on the blocked_vcpu_list that matches
+        * PI.NDST.
+        */
+       if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+           vcpu->cpu == cpu) {
+               pi_clear_sn(pi_desc);
+               return;
+       }
+
+       /* The full case.  */
        do {
                old.control = new.control = pi_desc->control;
 
-               /*
-                * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
-                * are two possible cases:
-                * 1. After running 'pre_block', context switch
-                *    happened. For this case, 'sn' was set in
-                *    vmx_vcpu_put(), so we need to clear it here.
-                * 2. After running 'pre_block', we were blocked,
-                *    and woken up by some other guy. For this case,
-                *    we don't need to do anything, 'pi_post_block'
-                *    will do everything for us. However, we cannot
-                *    check whether it is case #1 or case #2 here
-                *    (maybe, not needed), so we also clear sn here,
-                *    I think it is not a big deal.
-                */
-               if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
-                       if (vcpu->cpu != cpu) {
-                               dest = cpu_physical_id(cpu);
-
-                               if (x2apic_enabled())
-                                       new.ndst = dest;
-                               else
-                                       new.ndst = (dest << 8) & 0xFF00;
-                       }
+               dest = cpu_physical_id(cpu);
 
-                       /* set 'NV' to 'notification vector' */
-                       new.nv = POSTED_INTR_VECTOR;
-               }
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
 
-               /* Allow posting non-urgent interrupts */
                new.sn = 0;
-       } while (cmpxchg(&pi_desc->control, old.control,
-                       new.control) != old.control);
+       } while (cmpxchg64(&pi_desc->control, old.control,
+                          new.control) != old.control);
 }
 
 static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
@@ -2266,6 +2277,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
                per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
                vmcs_load(vmx->loaded_vmcs->vmcs);
+               if (ibpb_inuse)
+                       native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
        }
 
        if (!already_loaded) {
@@ -2279,7 +2292,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
-                           (unsigned long)this_cpu_ptr(&cpu_tss));
+                           (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
 
                /*
@@ -2326,6 +2339,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
        __vmx_load_host_state(to_vmx(vcpu));
 }
 
+static bool emulation_required(struct kvm_vcpu *vcpu)
+{
+       return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
 /*
@@ -2363,6 +2381,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+       unsigned long old_rflags = vmx_get_rflags(vcpu);
+
        __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
        to_vmx(vcpu)->rflags = rflags;
        if (to_vmx(vcpu)->rmode.vm86_active) {
@@ -2370,11 +2390,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
                rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
        }
        vmcs_writel(GUEST_RFLAGS, rflags);
-}
 
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
-       return to_vmx(vcpu)->guest_pkru;
+       if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
+               to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
 }
 
 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -2418,6 +2436,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+                                              unsigned long exit_qual)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       unsigned int nr = vcpu->arch.exception.nr;
+       u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+       if (vcpu->arch.exception.has_error_code) {
+               vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+               intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+       }
+
+       if (kvm_exception_is_soft(nr))
+               intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+       else
+               intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+       if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+           vmx_get_nmi_mask(vcpu))
+               intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -2427,23 +2469,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned int nr = vcpu->arch.exception.nr;
 
-       if (!((vmcs12->exception_bitmap & (1u << nr)) ||
-               (nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
-               return 0;
+       if (nr == PF_VECTOR) {
+               if (vcpu->arch.exception.nested_apf) {
+                       nested_vmx_inject_exception_vmexit(vcpu,
+                                                          vcpu->arch.apf.nested_apf_token);
+                       return 1;
+               }
+               /*
+                * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+                * The fix is to add the ancillary datum (CR2 or DR6) to structs
+                * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+                * can be written only when inject_pending_event runs.  This should be
+                * conditional on a new capability---if the capability is disabled,
+                * kvm_multiple_exception would write the ancillary information to
+                * CR2 or DR6, for backwards ABI-compatibility.
+                */
+               if (nested_vmx_is_page_fault_vmexit(vmcs12,
+                                                   vcpu->arch.exception.error_code)) {
+                       nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+                       return 1;
+               }
+       } else {
+               unsigned long exit_qual = 0;
+               if (nr == DB_VECTOR)
+                       exit_qual = vcpu->arch.dr6;
 
-       if (vcpu->arch.exception.nested_apf) {
-               vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                       PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-                       INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
-                       vcpu->arch.apf.nested_apf_token);
-               return 1;
+               if (vmcs12->exception_bitmap & (1u << nr)) {
+                       nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+                       return 1;
+               }
        }
 
-       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                         vmcs_read32(VM_EXIT_INTR_INFO),
-                         vmcs_readl(EXIT_QUALIFICATION));
-       return 1;
+       return 0;
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
@@ -2657,7 +2714,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
         * reason is that if one of these bits is necessary, it will appear
         * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
         * fields of vmcs01 and vmcs02, will turn these bits off - and
-        * nested_vmx_exit_handled() will not pass related exits to L1.
+        * nested_vmx_exit_reflected() will not pass related exits to L1.
         * These rules have exceptions below.
         */
 
@@ -3202,6 +3259,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_TSC:
                msr_info->data = guest_read_tsc(vcpu);
                break;
+       case MSR_IA32_SPEC_CTRL:
+               msr_info->data = vcpu->arch.spec_ctrl;
+               break;
        case MSR_IA32_SYSENTER_CS:
                msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
                break;
@@ -3306,6 +3366,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_TSC:
                kvm_write_tsc(vcpu, msr_info);
                break;
+       case MSR_IA32_SPEC_CTRL:
+               vcpu->arch.spec_ctrl = msr_info->data;
+               break;
        case MSR_IA32_CR_PAT:
                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
                        if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -3635,9 +3698,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                                &_vmexit_control) < 0)
                return -EIO;
 
-       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
-               PIN_BASED_VIRTUAL_NMIS;
-       opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
+       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+                PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
@@ -3857,11 +3920,6 @@ static __init int alloc_kvm_area(void)
        return 0;
 }
 
-static bool emulation_required(struct kvm_vcpu *vcpu)
-{
-       return emulate_invalid_guest_state && !guest_state_valid(vcpu);
-}
-
 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
                struct kvm_segment *save)
 {
@@ -4950,6 +5008,28 @@ static bool vmx_get_enable_apicv(void)
        return enable_apicv;
 }
 
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       gfn_t gfn;
+
+       /*
+        * Don't need to mark the APIC access page dirty; it is never
+        * written to by the CPU during APIC virtualization.
+        */
+
+       if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+               gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+               kvm_vcpu_mark_page_dirty(vcpu, gfn);
+       }
+
+       if (nested_cpu_has_posted_intr(vmcs12)) {
+               gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+               kvm_vcpu_mark_page_dirty(vcpu, gfn);
+       }
+}
+
+
 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4957,18 +5037,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
        void *vapic_page;
        u16 status;
 
-       if (vmx->nested.pi_desc &&
-           vmx->nested.pi_pending) {
-               vmx->nested.pi_pending = false;
-               if (!pi_test_and_clear_on(vmx->nested.pi_desc))
-                       return;
-
-               max_irr = find_last_bit(
-                       (unsigned long *)vmx->nested.pi_desc->pir, 256);
+       if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+               return;
 
-               if (max_irr == 256)
-                       return;
+       vmx->nested.pi_pending = false;
+       if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+               return;
 
+       max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+       if (max_irr != 256) {
                vapic_page = kmap(vmx->nested.virtual_apic_page);
                __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
                kunmap(vmx->nested.virtual_apic_page);
@@ -4980,30 +5057,43 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
                        vmcs_write16(GUEST_INTR_STATUS, status);
                }
        }
+
+       nested_mark_vmcs12_pages_dirty(vcpu);
 }
 
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+                                                    bool nested)
 {
 #ifdef CONFIG_SMP
-       if (vcpu->mode == IN_GUEST_MODE) {
-               struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
 
+       if (vcpu->mode == IN_GUEST_MODE) {
                /*
-                * Currently, we don't support urgent interrupt,
-                * all interrupts are recognized as non-urgent
-                * interrupt, so we cannot post interrupts when
-                * 'SN' is set.
+                * The vector of the interrupt to be delivered to vcpu has
+                * been set in PIR before this function is called.
+                *
+                * The following cases can reach this block, and we
+                * always send a notification event in all of them, as
+                * explained below.
+                *
+                * Case 1: vcpu keeps in non-root mode. Sending a
+                * notification event posts the interrupt to vcpu.
                 *
-                * If the vcpu is in guest mode, it means it is
-                * running instead of being scheduled out and
-                * waiting in the run queue, and that's the only
-                * case when 'SN' is set currently, warning if
-                * 'SN' is set.
+                * Case 2: vcpu exits to root mode and is still
+                * runnable. PIR will be synced to vIRR before the
+                * next vcpu entry. Sending a notification event in
+                * this case has no effect, as vcpu is no longer in
+                * non-root mode.
+                *
+                * Case 3: vcpu exits to root mode and is blocked.
+                * vcpu_block() has already synced PIR to vIRR and
+                * never blocks vcpu if vIRR is not cleared. Therefore,
+                * a blocked vcpu here does not wait for any requested
+                * interrupts in PIR, and sending a notification event
+                * which has no effect is safe here.
                 */
-               WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
 
-               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
-                               POSTED_INTR_VECTOR);
+               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
                return true;
        }
 #endif
@@ -5018,7 +5108,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
                /* the PIR and ON have been set by L1. */
-               kvm_vcpu_trigger_posted_interrupt(vcpu);
+               kvm_vcpu_trigger_posted_interrupt(vcpu, true);
                /*
                 * If a posted intr is not recognized by hardware,
                 * we will accomplish it in the next vmentry.
@@ -5052,7 +5142,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
        if (pi_test_and_set_on(&vmx->pi_desc))
                return;
 
-       if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+       if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
                kvm_vcpu_kick(vcpu);
 }
 
@@ -5079,12 +5169,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
         */
        cr3 = __read_cr3();
        vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
-       vmx->host_state.vmcs_host_cr3 = cr3;
+       vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
 
        /* Save the most likely value for this task's CR4 in the VMCS. */
        cr4 = cr4_read_shadow();
        vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
-       vmx->host_state.vmcs_host_cr4 = cr4;
+       vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
 
        vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
@@ -5102,7 +5192,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
        vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
        vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-       native_store_idt(&dt);
+       store_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
        vmx->host_idt_base = dt.address;
 
@@ -5470,7 +5560,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
-       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+       if (!cpu_has_virtual_nmis() ||
+           vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
                enable_irq_window(vcpu);
                return;
        }
@@ -5510,11 +5601,22 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (!is_guest_mode(vcpu)) {
-               ++vcpu->stat.nmi_injections;
-               vmx->nmi_known_unmasked = false;
+       if (!cpu_has_virtual_nmis()) {
+               /*
+                * Tracking the NMI-blocked state in software is built upon
+                * finding the next open IRQ window. This, in turn, depends on
+                * well-behaving guests: They have to keep IRQs disabled at
+                * least as long as the NMI handler runs. Otherwise we may
+                * cause NMI nesting, maybe breaking the guest. But as this is
+                * highly unlikely, we can live with the residual risk.
+                */
+               vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+               vmx->loaded_vmcs->vnmi_blocked_time = 0;
        }
 
+       ++vcpu->stat.nmi_injections;
+       vmx->loaded_vmcs->nmi_known_unmasked = false;
+
        if (vmx->rmode.vm86_active) {
                if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
                        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -5527,22 +5629,36 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       if (to_vmx(vcpu)->nmi_known_unmasked)
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       bool masked;
+
+       if (!cpu_has_virtual_nmis())
+               return vmx->loaded_vmcs->soft_vnmi_blocked;
+       if (vmx->loaded_vmcs->nmi_known_unmasked)
                return false;
-       return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+       masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+       vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+       return masked;
 }
 
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       vmx->nmi_known_unmasked = !masked;
-       if (masked)
-               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-                             GUEST_INTR_STATE_NMI);
-       else
-               vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-                               GUEST_INTR_STATE_NMI);
+       if (!cpu_has_virtual_nmis()) {
+               if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+                       vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+                       vmx->loaded_vmcs->vnmi_blocked_time = 0;
+               }
+       } else {
+               vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+               if (masked)
+                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                     GUEST_INTR_STATE_NMI);
+               else
+                       vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                       GUEST_INTR_STATE_NMI);
+       }
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5550,6 +5666,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return 0;
 
+       if (!cpu_has_virtual_nmis() &&
+           to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+               return 0;
+
        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
                   | GUEST_INTR_STATE_NMI));
@@ -5731,7 +5851,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
                        vcpu->arch.dr6 &= ~15;
                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
-                       if (!(dr6 & ~DR6_RESERVED)) /* icebp */
+                       if (is_icebp(intr_info))
                                skip_emulated_instruction(vcpu);
 
                        kvm_queue_exception(vcpu, DB_VECTOR);
@@ -6044,6 +6164,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
 
        msr_info.index = ecx;
        msr_info.host_initiated = false;
+
        if (vmx_get_msr(vcpu, &msr_info)) {
                trace_kvm_msr_read_ex(ecx);
                kvm_inject_gp(vcpu, 0);
@@ -6277,6 +6398,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
         * AAK134, BY25.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+                       cpu_has_virtual_nmis() &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
 
@@ -6517,12 +6639,7 @@ static __init int hardware_setup(void)
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
-       /*
-        * Allow direct access to the PC debug port (it is often used for I/O
-        * delays, but the vmexits simply slow things down).
-        */
        memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-       clear_bit(0x80, vmx_io_bitmap_a);
 
        memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
@@ -6596,6 +6713,8 @@ static __init int hardware_setup(void)
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
        vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
 
        memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
                        vmx_msr_bitmap_legacy, PAGE_SIZE);
@@ -6749,7 +6868,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
        }
 
        /* Create a new VMCS */
-       item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+       item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
        if (!item)
                return NULL;
        item->vmcs02.vmcs = alloc_vmcs();
@@ -7124,34 +7243,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
-       /* current_vmptr and current_vmcs12 are always set/reset together */
-       if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
-               return;
-
        if (enable_shadow_vmcs) {
                /* copy to memory all shadowed fields in case
                   they were modified */
                copy_shadow_to_vmcs12(vmx);
                vmx->nested.sync_shadow_vmcs = false;
-               vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-                               SECONDARY_EXEC_SHADOW_VMCS);
-               vmcs_write64(VMCS_LINK_POINTER, -1ull);
+               vmx_disable_shadow_vmcs(vmx);
        }
        vmx->nested.posted_intr_nv = -1;
 
        /* Flush VMCS12 to guest memory */
-       memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
-              VMCS12_SIZE);
+       kvm_vcpu_write_guest_page(&vmx->vcpu,
+                                 vmx->nested.current_vmptr >> PAGE_SHIFT,
+                                 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
-       kunmap(vmx->nested.current_vmcs12_page);
-       nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
-       vmx->nested.current_vmcs12 = NULL;
 }
 
 /*
@@ -7165,12 +7282,14 @@ static void free_nested(struct vcpu_vmx *vmx)
 
        vmx->nested.vmxon = false;
        free_vpid(vmx->nested.vpid02);
-       nested_release_vmcs12(vmx);
+       vmx->nested.posted_intr_nv = -1;
+       vmx->nested.current_vmptr = -1ull;
        if (vmx->nested.msr_bitmap) {
                free_page((unsigned long)vmx->nested.msr_bitmap);
                vmx->nested.msr_bitmap = NULL;
        }
        if (enable_shadow_vmcs) {
+               vmx_disable_shadow_vmcs(vmx);
                vmcs_clear(vmx->vmcs01.shadow_vmcs);
                free_vmcs(vmx->vmcs01.shadow_vmcs);
                vmx->vmcs01.shadow_vmcs = NULL;
@@ -7569,14 +7688,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                }
 
                nested_release_vmcs12(vmx);
-               vmx->nested.current_vmcs12 = new_vmcs12;
-               vmx->nested.current_vmcs12_page = page;
                /*
                 * Load VMCS12 from guest memory since it is not already
                 * cached.
                 */
-               memcpy(vmx->nested.cached_vmcs12,
-                      vmx->nested.current_vmcs12, VMCS12_SIZE);
+               memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+               kunmap(page);
+               nested_release_page_clean(page);
+
                set_current_vmptr(vmx, vmptr);
        }
 
@@ -7766,6 +7885,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
         * "blocked by NMI" bit has to be set before next VM entry.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+                       cpu_has_virtual_nmis() &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                GUEST_INTR_STATE_NMI);
@@ -8009,12 +8129,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  * should handle it ourselves in L0 (and then continue L2). Only call this
  * when in is_guest_mode (L2).
  */
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       u32 exit_reason = vmx->exit_reason;
 
        trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
                                vmcs_readl(EXIT_QUALIFICATION),
@@ -8023,6 +8142,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
                                KVM_ISA_VMX);
 
+       /*
+        * The host physical addresses of some pages of guest memory
+        * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+        * may write to these pages via their host physical address while
+        * L2 is running, bypassing any address-translation-based dirty
+        * tracking (e.g. EPT write protection).
+        *
+        * Mark them dirty on every exit from L2 to prevent them from
+        * getting out of sync with dirty tracking.
+        */
+       nested_mark_vmcs12_pages_dirty(vcpu);
+
        if (vmx->nested.nested_run_pending)
                return false;
 
@@ -8159,6 +8290,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        }
 }
 
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+       u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+       /*
+        * At this point, the exit interruption info in exit_intr_info
+        * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
+        * we need to query the in-kernel LAPIC.
+        */
+       WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+       if ((exit_intr_info &
+            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               vmcs12->vm_exit_intr_error_code =
+                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       }
+
+       nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+                         vmcs_readl(EXIT_QUALIFICATION));
+       return 1;
+}
+
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
        *info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8405,12 +8559,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-               nested_vmx_vmexit(vcpu, exit_reason,
-                                 vmcs_read32(VM_EXIT_INTR_INFO),
-                                 vmcs_readl(EXIT_QUALIFICATION));
-               return 1;
-       }
+       if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+               return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@ -8453,6 +8603,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
                return 0;
        }
 
+       if (unlikely(!cpu_has_virtual_nmis() &&
+                    vmx->loaded_vmcs->soft_vnmi_blocked)) {
+               if (vmx_interrupt_allowed(vcpu)) {
+                       vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+               } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+                          vcpu->arch.nmi_pending) {
+                       /*
+                        * This CPU doesn't support us in finding the end of an
+                        * NMI-blocked window if the guest runs with IRQs
+                        * disabled. So we pull the trigger after 1 s of
+                        * futile waiting, but inform the user about this.
+                        */
+                       printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+                              "state on VCPU %d after 1 s timeout\n",
+                              __func__, vcpu->vcpu_id);
+                       vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+               }
+       }
+
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);
@@ -8671,7 +8840,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 {
        u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       register void *__sp asm(_ASM_SP);
 
        if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
                        == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
@@ -8685,7 +8853,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 
                vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
                desc = (gate_desc *)vmx->host_idt_base + vector;
-               entry = gate_offset(*desc);
+               entry = gate_offset(desc);
                asm volatile(
 #ifdef CONFIG_X86_64
                        "mov %%" _ASM_SP ", %[sp]\n\t"
@@ -8695,14 +8863,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 #endif
                        "pushf\n\t"
                        __ASM_SIZE(push) " $%c[cs]\n\t"
-                       "call *%[entry]\n\t"
+                       CALL_NOSPEC
                        :
 #ifdef CONFIG_X86_64
                        [sp]"=&r"(tmp),
 #endif
-                       "+r"(__sp)
+                       ASM_CALL_CONSTRAINT
                        :
-                       [entry]"r"(entry),
+                       THUNK_TARGET(entry),
                        [ss]"i"(__KERNEL_DS),
                        [cs]"i"(__KERNEL_CS)
                        );
@@ -8736,33 +8904,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 
        idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-       if (vmx->nmi_known_unmasked)
-               return;
-       /*
-        * Can't use vmx->exit_intr_info since we're not sure what
-        * the exit reason is.
-        */
-       exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
-       vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
-       /*
-        * SDM 3: 27.7.1.2 (September 2008)
-        * Re-set bit "block by NMI" before VM entry if vmexit caused by
-        * a guest IRET fault.
-        * SDM 3: 23.2.2 (September 2008)
-        * Bit 12 is undefined in any of the following cases:
-        *  If the VM exit sets the valid bit in the IDT-vectoring
-        *   information field.
-        *  If the VM exit is due to a double fault.
-        */
-       if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
-           vector != DF_VECTOR && !idtv_info_valid)
-               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-                             GUEST_INTR_STATE_NMI);
-       else
-               vmx->nmi_known_unmasked =
-                       !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
-                         & GUEST_INTR_STATE_NMI);
+       if (cpu_has_virtual_nmis()) {
+               if (vmx->loaded_vmcs->nmi_known_unmasked)
+                       return;
+               /*
+                * Can't use vmx->exit_intr_info since we're not sure what
+                * the exit reason is.
+                */
+               exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+               unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+               vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+               /*
+                * SDM 3: 27.7.1.2 (September 2008)
+                * Re-set bit "block by NMI" before VM entry if vmexit caused by
+                * a guest IRET fault.
+                * SDM 3: 23.2.2 (September 2008)
+                * Bit 12 is undefined in any of the following cases:
+                *  If the VM exit sets the valid bit in the IDT-vectoring
+                *   information field.
+                *  If the VM exit is due to a double fault.
+                */
+               if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+                   vector != DF_VECTOR && !idtv_info_valid)
+                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                     GUEST_INTR_STATE_NMI);
+               else
+                       vmx->loaded_vmcs->nmi_known_unmasked =
+                               !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+                                 & GUEST_INTR_STATE_NMI);
+       } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+               vmx->loaded_vmcs->vnmi_blocked_time +=
+                       ktime_to_ns(ktime_sub(ktime_get(),
+                                             vmx->loaded_vmcs->entry_time));
 }
 
 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -8879,6 +9052,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long debugctlmsr, cr3, cr4;
 
+       /* Record the guest's net vcpu time for enforced NMI injections. */
+       if (unlikely(!cpu_has_virtual_nmis() &&
+                    vmx->loaded_vmcs->soft_vnmi_blocked))
+               vmx->loaded_vmcs->entry_time = ktime_get();
+
        /* Don't enter VMX if guest state is invalid, let the exit handler
           start emulation until we arrive back to a valid state */
        if (vmx->emulation_required)
@@ -8900,15 +9078,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
        cr3 = __get_current_cr3_fast();
-       if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
+       if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
                vmcs_writel(HOST_CR3, cr3);
-               vmx->host_state.vmcs_host_cr3 = cr3;
+               vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
        }
 
        cr4 = cr4_read_shadow();
-       if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+       if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
                vmcs_writel(HOST_CR4, cr4);
-               vmx->host_state.vmcs_host_cr4 = cr4;
+               vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
        }
 
        /* When single-stepping over STI and MOV SS, we must clear the
@@ -8919,10 +9097,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vmx_set_interrupt_shadow(vcpu, 0);
 
-       if (vmx->guest_pkru_valid)
-               __write_pkru(vmx->guest_pkru);
+       if (static_cpu_has(X86_FEATURE_PKU) &&
+           kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+           vcpu->arch.pkru != vmx->host_pkru)
+               __write_pkru(vcpu->arch.pkru);
 
        atomic_switch_perf_msrs(vmx);
+
+       /* SMB: Ignore ibrs_inuse but rely on vcpu value */
+       x86_spec_ctrl_set_guest(vcpu->arch.spec_ctrl);
+
        debugctlmsr = get_debugctlmsr();
 
        vmx_arm_hv_timer(vcpu);
@@ -8975,6 +9159,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                /* Save guest registers, load host registers, keep flags */
                "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
                "pop %0 \n\t"
+               "setbe %c[fail](%0)\n\t"
                "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
                "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
                __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
@@ -8991,12 +9176,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
+               "xor %%r8d,  %%r8d \n\t"
+               "xor %%r9d,  %%r9d \n\t"
+               "xor %%r10d, %%r10d \n\t"
+               "xor %%r11d, %%r11d \n\t"
+               "xor %%r12d, %%r12d \n\t"
+               "xor %%r13d, %%r13d \n\t"
+               "xor %%r14d, %%r14d \n\t"
+               "xor %%r15d, %%r15d \n\t"
 #endif
                "mov %%cr2, %%" _ASM_AX "   \n\t"
                "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
 
+               "xor %%eax, %%eax \n\t"
+               "xor %%ebx, %%ebx \n\t"
+               "xor %%esi, %%esi \n\t"
+               "xor %%edi, %%edi \n\t"
                "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
-               "setbe %c[fail](%0) \n\t"
                ".pushsection .rodata \n\t"
                ".global vmx_return \n\t"
                "vmx_return: " _ASM_PTR " 2b \n\t"
@@ -9033,6 +9229,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
              );
 
+       x86_spec_ctrl_restore_host(vcpu->arch.spec_ctrl);
+
+       /* Eliminate branch target predictions from guest mode */
+       vmexit_fill_RSB();
+
        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
        if (debugctlmsr)
                update_debugctlmsr(debugctlmsr);
@@ -9068,13 +9269,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
         * back on host, so it is safe to read guest PKRU from current
         * XSAVE.
         */
-       if (boot_cpu_has(X86_FEATURE_OSPKE)) {
-               vmx->guest_pkru = __read_pkru();
-               if (vmx->guest_pkru != vmx->host_pkru) {
-                       vmx->guest_pkru_valid = true;
+       if (static_cpu_has(X86_FEATURE_PKU) &&
+           kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+               vcpu->arch.pkru = __read_pkru();
+               if (vcpu->arch.pkru != vmx->host_pkru)
                        __write_pkru(vmx->host_pkru);
-               } else
-                       vmx->guest_pkru_valid = false;
        }
 
        /*
@@ -9213,10 +9412,16 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
-       vmx->nested.current_vmcs12 = NULL;
 
        vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
+       /*
+        * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+        * or POSTED_INTR_WAKEUP_VECTOR.
+        */
+       vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+       vmx->pi_desc.sn = 1;
+
        return &vmx->vcpu;
 
 free_vmcs:
@@ -9499,12 +9704,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
        WARN_ON(!is_guest_mode(vcpu));
 
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
-               nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
-                                 vmcs_read32(VM_EXIT_INTR_INFO),
-                                 vmcs_readl(EXIT_QUALIFICATION));
-       else
+       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+               vmcs12->vm_exit_intr_error_code = fault->error_code;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+                                 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+                                 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+                                 fault->address);
+       } else {
                kvm_inject_page_fault(vcpu, fault);
+       }
 }
 
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -10032,6 +10240,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                             vmcs12->vm_entry_instruction_len);
                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
                             vmcs12->guest_interruptibility_info);
+               vmx->loaded_vmcs->nmi_known_unmasked =
+                       !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
        } else {
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
        }
@@ -10056,13 +10266,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
-               /*
-                * Note that we use L0's vector here and in
-                * vmx_deliver_nested_posted_interrupt.
-                */
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
                vmx->nested.pi_pending = false;
-               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
@@ -10086,12 +10292,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
-        *
-        * A problem with this approach (when !enable_ept) is that L1 may be
-        * injected with more page faults than it asked for. This could have
-        * caused problems, but in practice existing hypervisors don't care.
-        * To fix this, we will need to emulate the PFEC checking (on the L1
-        * page tables), using walk_addr(), when injecting PFs to L1.
         */
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
                enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10178,6 +10378,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        if (exec_control & CPU_BASED_TPR_SHADOW) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
                vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+       } else {
+#ifdef CONFIG_X86_64
+               exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+                               CPU_BASED_CR8_STORE_EXITING;
+#endif
        }
 
        /*
@@ -10488,6 +10693,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 {
        struct vmcs12 *vmcs12;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
        u32 exit_qual;
        int ret;
 
@@ -10512,6 +10718,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * for misconfigurations which will anyway be caught by the processor
         * when using the merged vmcs02.
         */
+       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
+               nested_vmx_failValid(vcpu,
+                                    VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+               goto out;
+       }
+
        if (vmcs12->launch_state == launch) {
                nested_vmx_failValid(vcpu,
                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
@@ -10832,13 +11044,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        vmcs12->vm_exit_reason = exit_reason;
        vmcs12->exit_qualification = exit_qualification;
-
        vmcs12->vm_exit_intr_info = exit_intr_info;
-       if ((vmcs12->vm_exit_intr_info &
-            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
-               vmcs12->vm_exit_intr_error_code =
-                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
        vmcs12->idt_vectoring_info_field = 0;
        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -10904,7 +11111,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
        /* Same as above - no reason to call set_cr4_guest_host_mask().  */
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
-       kvm_set_cr4(vcpu, vmcs12->host_cr4);
+       vmx_set_cr4(vcpu, vmcs12->host_cr4);
 
        nested_ept_uninit_mmu_context(vcpu);
 
@@ -10926,7 +11133,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                 */
                vmx_flush_tlb(vcpu);
        }
-
+       /* Restore posted intr vector. */
+       if (nested_cpu_has_posted_intr(vmcs12))
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -11032,8 +11241,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
-       if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-           && nested_exit_intr_ack_set(vcpu)) {
+       /*
+        * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+        * the VM-exit interrupt information (valid interrupt) is always set to
+        * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+        * kvm_cpu_has_interrupt().  See the commit message for details.
+        */
+       if (nested_exit_intr_ack_set(vcpu) &&
+           exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+           kvm_cpu_has_interrupt(vcpu)) {
                int irq = kvm_cpu_get_interrupt(vcpu);
                WARN_ON(irq < 0);
                vmcs12->vm_exit_intr_info = irq |
@@ -11290,6 +11506,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
        kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct pi_desc old, new;
+       unsigned int dest;
+
+       do {
+               old.control = new.control = pi_desc->control;
+               WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+                    "Wakeup handler not enabled while the VCPU is blocked\n");
+
+               dest = cpu_physical_id(vcpu->cpu);
+
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+
+               /* set 'NV' to 'notification vector' */
+               new.nv = POSTED_INTR_VECTOR;
+       } while (cmpxchg64(&pi_desc->control, old.control,
+                          new.control) != old.control);
+
+       if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               list_del(&vcpu->blocked_vcpu_list);
+               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               vcpu->pre_pcpu = -1;
+       }
+}
+
 /*
  * This routine does the following things for vCPU which is going
  * to be blocked if VT-d PI is enabled.
@@ -11305,7 +11552,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
  */
 static int pi_pre_block(struct kvm_vcpu *vcpu)
 {
-       unsigned long flags;
        unsigned int dest;
        struct pi_desc old, new;
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -11315,34 +11561,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
                !kvm_vcpu_apicv_active(vcpu))
                return 0;
 
-       vcpu->pre_pcpu = vcpu->cpu;
-       spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-                         vcpu->pre_pcpu), flags);
-       list_add_tail(&vcpu->blocked_vcpu_list,
-                     &per_cpu(blocked_vcpu_on_cpu,
-                     vcpu->pre_pcpu));
-       spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
-                              vcpu->pre_pcpu), flags);
+       WARN_ON(irqs_disabled());
+       local_irq_disable();
+       if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+               vcpu->pre_pcpu = vcpu->cpu;
+               spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               list_add_tail(&vcpu->blocked_vcpu_list,
+                             &per_cpu(blocked_vcpu_on_cpu,
+                                      vcpu->pre_pcpu));
+               spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+       }
 
        do {
                old.control = new.control = pi_desc->control;
 
-               /*
-                * We should not block the vCPU if
-                * an interrupt is posted for it.
-                */
-               if (pi_test_on(pi_desc) == 1) {
-                       spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
-                                         vcpu->pre_pcpu), flags);
-                       list_del(&vcpu->blocked_vcpu_list);
-                       spin_unlock_irqrestore(
-                                       &per_cpu(blocked_vcpu_on_cpu_lock,
-                                       vcpu->pre_pcpu), flags);
-                       vcpu->pre_pcpu = -1;
-
-                       return 1;
-               }
-
                WARN((pi_desc->sn == 1),
                     "Warning: SN field of posted-interrupts "
                     "is set before blocking\n");
@@ -11364,10 +11596,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu)
 
                /* set 'NV' to 'wakeup vector' */
                new.nv = POSTED_INTR_WAKEUP_VECTOR;
-       } while (cmpxchg(&pi_desc->control, old.control,
-                       new.control) != old.control);
+       } while (cmpxchg64(&pi_desc->control, old.control,
+                          new.control) != old.control);
 
-       return 0;
+       /* We should not block the vCPU if an interrupt is posted for it.  */
+       if (pi_test_on(pi_desc) == 1)
+               __pi_post_block(vcpu);
+
+       local_irq_enable();
+       return (vcpu->pre_pcpu == -1);
 }
 
 static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -11383,44 +11620,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 
 static void pi_post_block(struct kvm_vcpu *vcpu)
 {
-       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct pi_desc old, new;
-       unsigned int dest;
-       unsigned long flags;
-
-       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
-               !kvm_vcpu_apicv_active(vcpu))
+       if (vcpu->pre_pcpu == -1)
                return;
 
-       do {
-               old.control = new.control = pi_desc->control;
-
-               dest = cpu_physical_id(vcpu->cpu);
-
-               if (x2apic_enabled())
-                       new.ndst = dest;
-               else
-                       new.ndst = (dest << 8) & 0xFF00;
-
-               /* Allow posting non-urgent interrupts */
-               new.sn = 0;
-
-               /* set 'NV' to 'notification vector' */
-               new.nv = POSTED_INTR_VECTOR;
-       } while (cmpxchg(&pi_desc->control, old.control,
-                       new.control) != old.control);
-
-       if(vcpu->pre_pcpu != -1) {
-               spin_lock_irqsave(
-                       &per_cpu(blocked_vcpu_on_cpu_lock,
-                       vcpu->pre_pcpu), flags);
-               list_del(&vcpu->blocked_vcpu_list);
-               spin_unlock_irqrestore(
-                       &per_cpu(blocked_vcpu_on_cpu_lock,
-                       vcpu->pre_pcpu), flags);
-               vcpu->pre_pcpu = -1;
-       }
+       WARN_ON(irqs_disabled());
+       local_irq_disable();
+       __pi_post_block(vcpu);
+       local_irq_enable();
 }
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
@@ -11448,7 +11654,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
        struct kvm_lapic_irq irq;
        struct kvm_vcpu *vcpu;
        struct vcpu_data vcpu_info;
-       int idx, ret = -EINVAL;
+       int idx, ret = 0;
 
        if (!kvm_arch_has_assigned_device(kvm) ||
                !irq_remapping_cap(IRQ_POSTING_CAP) ||
@@ -11457,7 +11663,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
        idx = srcu_read_lock(&kvm->irq_srcu);
        irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-       BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+       if (guest_irq >= irq_rt->nr_rt_entries ||
+           hlist_empty(&irq_rt->map[guest_irq])) {
+               pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+                            guest_irq, irq_rt->nr_rt_entries);
+               goto out;
+       }
 
        hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
                if (e->type != KVM_IRQ_ROUTING_MSI)
@@ -11500,12 +11711,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 
                if (set)
                        ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-               else {
-                       /* suppress notification event before unposting */
-                       pi_set_sn(vcpu_to_pi_desc(vcpu));
+               else
                        ret = irq_set_vcpu_affinity(host_irq, NULL);
-                       pi_clear_sn(vcpu_to_pi_desc(vcpu));
-               }
 
                if (ret < 0) {
                        printk(KERN_INFO "%s: failed to update PI IRTE\n",
@@ -11576,8 +11783,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
 
-       .get_pkru = vmx_get_pkru,
-
        .tlb_flush = vmx_flush_tlb,
 
        .run = vmx_vcpu_run,