kvm: nVMX: Add support for fast unprotection of nested guest page tables
index 29fd8af5c347a6c642e131458fcfa254b6d7d854..ed1074e98b8e8faa505b93fb782409ef41187b81 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -243,11 +243,13 @@ struct __packed vmcs12 {
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
        u64 posted_intr_desc_addr;
+       u64 vm_function_control;
        u64 ept_pointer;
        u64 eoi_exit_bitmap0;
        u64 eoi_exit_bitmap1;
        u64 eoi_exit_bitmap2;
        u64 eoi_exit_bitmap3;
+       u64 eptp_list_address;
        u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
@@ -416,13 +418,10 @@ struct nested_vmx {
 
        /* The guest-physical address of the current VMCS L1 keeps for L2 */
        gpa_t current_vmptr;
-       /* The host-usable pointer to the above */
-       struct page *current_vmcs12_page;
-       struct vmcs12 *current_vmcs12;
        /*
         * Cache of the guest's VMCS, existing outside of guest memory.
         * Loaded from guest memory during VMPTRLD. Flushed to guest
-        * memory during VMXOFF, VMCLEAR, VMPTRLD.
+        * memory during VMCLEAR and VMPTRLD.
         */
        struct vmcs12 *cached_vmcs12;
        /*
@@ -484,6 +483,7 @@ struct nested_vmx {
        u64 nested_vmx_cr4_fixed0;
        u64 nested_vmx_cr4_fixed1;
        u64 nested_vmx_vmcs_enum;
+       u64 nested_vmx_vmfunc_controls;
 };
 
 #define POSTED_INTR_ON  0
@@ -563,7 +563,6 @@ struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
        u8                    fail;
-       bool                  nmi_known_unmasked;
        u32                   exit_intr_info;
        u32                   idt_vectoring_info;
        ulong                 rflags;
@@ -767,11 +766,13 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
        FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+       FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
        FIELD64(EPT_POINTER, ept_pointer),
        FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
        FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
        FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
        FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+       FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
        FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -895,25 +896,6 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
        return to_vmx(vcpu)->nested.cached_vmcs12;
 }
 
-static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
-{
-       struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
-       if (is_error_page(page))
-               return NULL;
-
-       return page;
-}
-
-static void nested_release_page(struct page *page)
-{
-       kvm_release_page_dirty(page);
-}
-
-static void nested_release_page_clean(struct page *page)
-{
-       kvm_release_page_clean(page);
-}
-
 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
@@ -928,6 +910,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+                                           u16 error_code);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1319,6 +1305,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
                SECONDARY_EXEC_TSC_SCALING;
 }
 
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
 static inline bool report_flexpriority(void)
 {
        return flexpriority_enabled;
@@ -1393,6 +1385,18 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 }
 
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has_vmfunc(vmcs12) &&
+               (vmcs12->vm_function_control &
+                VMX_VMFUNC_EPTP_SWITCHING);
+}
+
 static inline bool is_nmi(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2429,6 +2433,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+                                              unsigned long exit_qual)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       unsigned int nr = vcpu->arch.exception.nr;
+       u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+       if (vcpu->arch.exception.has_error_code) {
+               vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+               intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+       }
+
+       if (kvm_exception_is_soft(nr))
+               intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+       else
+               intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+       if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+           vmx_get_nmi_mask(vcpu))
+               intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
 /*
  * KVM wants to inject page-faults which it got to the guest. This function
  * checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -2438,23 +2466,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        unsigned int nr = vcpu->arch.exception.nr;
 
-       if (!((vmcs12->exception_bitmap & (1u << nr)) ||
-               (nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
-               return 0;
+       if (nr == PF_VECTOR) {
+               if (vcpu->arch.exception.nested_apf) {
+                       nested_vmx_inject_exception_vmexit(vcpu,
+                                                          vcpu->arch.apf.nested_apf_token);
+                       return 1;
+               }
+               /*
+                * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+                * The fix is to add the ancillary datum (CR2 or DR6) to structs
+                * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+                * can be written only when inject_pending_event runs.  This should be
+                * conditional on a new capability---if the capability is disabled,
+                * kvm_multiple_exception would write the ancillary information to
+                * CR2 or DR6, for backwards ABI-compatibility.
+                */
+               if (nested_vmx_is_page_fault_vmexit(vmcs12,
+                                                   vcpu->arch.exception.error_code)) {
+                       nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+                       return 1;
+               }
+       } else {
+               unsigned long exit_qual = 0;
+               if (nr == DB_VECTOR)
+                       exit_qual = vcpu->arch.dr6;
 
-       if (vcpu->arch.exception.nested_apf) {
-               vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
-               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                       PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
-                       INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
-                       vcpu->arch.apf.nested_apf_token);
-               return 1;
+               if (vmcs12->exception_bitmap & (1u << nr)) {
+                       nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+                       return 1;
+               }
        }
 
-       nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
-                         vmcs_read32(VM_EXIT_INTR_INFO),
-                         vmcs_readl(EXIT_QUALIFICATION));
-       return 1;
+       return 0;
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
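Note (not part of this diff): nested_vmx_is_page_fault_vmexit() is only forward-declared in the hunks shown here. A minimal sketch of the PFEC_MASK/PFEC_MATCH test it is expected to implement, following the SDM's page-fault filtering rules (bit 14 of L1's exception bitmap inverts the sense of the mask/match comparison):

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
                                            u16 error_code)
{
        bool bit, inequality;

        /* Does L1 intercept #PF at all (bit 14 of the exception bitmap)? */
        bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;

        /* Masked error code vs. PFEC_MATCH; a mismatch inverts the intercept. */
        inequality = (error_code & vmcs12->page_fault_error_code_mask) !=
                      vmcs12->page_fault_error_code_match;

        return inequality ^ bit;
}
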
@@ -2568,7 +2611,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                if (index >= 0)
                        move_msr_up(vmx, index, save_nmsrs++);
                index = __find_msr_index(vmx, MSR_TSC_AUX);
-               if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
+               if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
                        move_msr_up(vmx, index, save_nmsrs++);
                /*
                 * MSR_STAR is only needed on long mode guests, and only
@@ -2628,12 +2671,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        }
 }
 
-static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
-{
-       struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
-       return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
-}
-
 /*
  * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
  * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
@@ -2642,7 +2679,7 @@ static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
  */
 static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
 {
-       return nested && guest_cpuid_has_vmx(vcpu);
+       return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
 }
 
 /*
@@ -2668,7 +2705,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
         * reason is that if one of these bits is necessary, it will appear
         * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
         * fields of vmcs01 and vmcs02, will turn these bits off - and
-        * nested_vmx_exit_handled() will not pass related exits to L1.
+        * nested_vmx_exit_reflected() will not pass related exits to L1.
         * These rules have exceptions below.
         */
 
@@ -2802,6 +2839,17 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
        } else
                vmx->nested.nested_vmx_ept_caps = 0;
 
+       if (cpu_has_vmx_vmfunc()) {
+               vmx->nested.nested_vmx_secondary_ctls_high |=
+                       SECONDARY_EXEC_ENABLE_VMFUNC;
+               /*
+                * Advertise EPTP switching unconditionally
+                * since we emulate it
+                */
+               vmx->nested.nested_vmx_vmfunc_controls =
+                       VMX_VMFUNC_EPTP_SWITCHING;
+       }
+
        /*
         * Old versions of KVM use the single-context version without
         * checking for support, so declare that it is supported even
@@ -3171,6 +3219,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                *pdata = vmx->nested.nested_vmx_ept_caps |
                        ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
                break;
+       case MSR_IA32_VMX_VMFUNC:
+               *pdata = vmx->nested.nested_vmx_vmfunc_controls;
+               break;
        default:
                return 1;
        }
@@ -3224,7 +3275,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
-                   (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+                   (!msr_info->host_initiated &&
+                    !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
                        return 1;
                msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                break;
@@ -3248,7 +3300,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vcpu->arch.ia32_xss;
                break;
        case MSR_TSC_AUX:
-               if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
                        return 1;
                /* Otherwise falls through */
        default:
@@ -3307,7 +3360,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_BNDCFGS:
                if (!kvm_mpx_supported() ||
-                   (!msr_info->host_initiated && !guest_cpuid_has_mpx(vcpu)))
+                   (!msr_info->host_initiated &&
+                    !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
                        return 1;
                if (is_noncanonical_address(data & PAGE_MASK) ||
                    (data & MSR_IA32_BNDCFGS_RSVD))
@@ -3370,7 +3424,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
                break;
        case MSR_TSC_AUX:
-               if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
                        return 1;
                /* Check reserved bit, higher 32 bits should be zero */
                if ((data >> 32) != 0)
@@ -3608,7 +3663,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
                        SECONDARY_EXEC_ENABLE_PML |
-                       SECONDARY_EXEC_TSC_SCALING;
+                       SECONDARY_EXEC_TSC_SCALING |
+                       SECONDARY_EXEC_ENABLE_VMFUNC;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -4956,6 +5012,28 @@ static bool vmx_get_enable_apicv(void)
        return enable_apicv;
 }
 
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       gfn_t gfn;
+
+       /*
+        * Don't need to mark the APIC access page dirty; it is never
+        * written to by the CPU during APIC virtualization.
+        */
+
+       if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+               gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+               kvm_vcpu_mark_page_dirty(vcpu, gfn);
+       }
+
+       if (nested_cpu_has_posted_intr(vmcs12)) {
+               gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+               kvm_vcpu_mark_page_dirty(vcpu, gfn);
+       }
+}
+
+
 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4963,18 +5041,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
        void *vapic_page;
        u16 status;
 
-       if (vmx->nested.pi_desc &&
-           vmx->nested.pi_pending) {
-               vmx->nested.pi_pending = false;
-               if (!pi_test_and_clear_on(vmx->nested.pi_desc))
-                       return;
-
-               max_irr = find_last_bit(
-                       (unsigned long *)vmx->nested.pi_desc->pir, 256);
+       if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+               return;
 
-               if (max_irr == 256)
-                       return;
+       vmx->nested.pi_pending = false;
+       if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+               return;
 
+       max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+       if (max_irr != 256) {
                vapic_page = kmap(vmx->nested.virtual_apic_page);
                __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
                kunmap(vmx->nested.virtual_apic_page);
@@ -4986,11 +5061,16 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
                        vmcs_write16(GUEST_INTR_STATUS, status);
                }
        }
+
+       nested_mark_vmcs12_pages_dirty(vcpu);
 }
 
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+                                                    bool nested)
 {
 #ifdef CONFIG_SMP
+       int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+
        if (vcpu->mode == IN_GUEST_MODE) {
                struct vcpu_vmx *vmx = to_vmx(vcpu);
 
@@ -5008,8 +5088,7 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
                 */
                WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
 
-               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
-                               POSTED_INTR_VECTOR);
+               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
                return true;
        }
 #endif
@@ -5024,7 +5103,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
        if (is_guest_mode(vcpu) &&
            vector == vmx->nested.posted_intr_nv) {
                /* the PIR and ON have been set by L1. */
-               kvm_vcpu_trigger_posted_interrupt(vcpu);
+               kvm_vcpu_trigger_posted_interrupt(vcpu, true);
                /*
                 * If a posted intr is not recognized by hardware,
                 * we will accomplish it in the next vmentry.
@@ -5058,7 +5137,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
        if (pi_test_and_set_on(&vmx->pi_desc))
                return;
 
-       if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+       if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
                kvm_vcpu_kick(vcpu);
 }
 
@@ -5302,6 +5381,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 #endif
 
+       if (cpu_has_vmx_vmfunc())
+               vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
@@ -5780,6 +5862,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
 static int handle_triple_fault(struct kvm_vcpu *vcpu)
 {
        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+       vcpu->mmio_needed = 0;
        return 0;
 }
 
@@ -6275,7 +6358,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
        gpa_t gpa;
-       u32 error_code;
+       u64 error_code;
 
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
@@ -6307,6 +6390,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
                        EPT_VIOLATION_EXECUTABLE))
                      ? PFERR_PRESENT_MASK : 0;
 
+       error_code |= (exit_qualification & 0x100) != 0 ?
+              PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
        vcpu->arch.gpa_available = true;
        vcpu->arch.exit_qualification = exit_qualification;
 
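Note (not part of this diff): exit-qualification bit 8 is set when the faulting guest-physical access was the final translation of a guest-linear address, and clear when it hit a guest paging-structure entry; the new PFERR_GUEST_FINAL_MASK/PFERR_GUEST_PAGE_MASK bits carry that distinction into the MMU. The MMU-side consumer that the commit subject above refers to is not in this blobdiff; a rough, hypothetical sketch (the helper name is illustrative) of how a fault handler could use these bits to unprotect a write-protected nested guest page table instead of emulating the write:

/* Hypothetical sketch only; the real logic lives in the x86 MMU code. */
static bool fast_unprotect_nested_pt(struct kvm_vcpu *vcpu, gpa_t cr2,
                                     u64 error_code)
{
        const u64 nested_pt_write = PFERR_GUEST_PAGE_MASK |
                                    PFERR_WRITE_MASK | PFERR_PRESENT_MASK;

        /*
         * L2 wrote to one of its own (write-protected) page tables:
         * drop the write protection and retry instead of emulating.
         */
        if ((error_code & nested_pt_write) == nested_pt_write) {
                kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
                return true;
        }

        return false;
}
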
@@ -6699,7 +6785,13 @@ static int handle_pause(struct kvm_vcpu *vcpu)
        if (ple_gap)
                grow_ple_window(vcpu);
 
-       kvm_vcpu_on_spin(vcpu);
+       /*
+        * Intel SDM vol. 3, sec. 25.1.3: the "PAUSE-loop exiting"
+        * VM-execution control is ignored if CPL > 0. OTOH, KVM
+        * never sets PAUSE_EXITING and only sets PLE if supported,
+        * so the vcpu must be at CPL 0 if it gets a PAUSE exit.
+        */
+       kvm_vcpu_on_spin(vcpu, true);
        return kvm_skip_emulated_instruction(vcpu);
 }
 
@@ -7094,19 +7186,19 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return kvm_skip_emulated_instruction(vcpu);
        }
 
-       page = nested_get_page(vcpu, vmptr);
-       if (page == NULL) {
+       page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+       if (is_error_page(page)) {
                nested_vmx_failInvalid(vcpu);
                return kvm_skip_emulated_instruction(vcpu);
        }
        if (*(u32 *)kmap(page) != VMCS12_REVISION) {
                kunmap(page);
-               nested_release_page_clean(page);
+               kvm_release_page_clean(page);
                nested_vmx_failInvalid(vcpu);
                return kvm_skip_emulated_instruction(vcpu);
        }
        kunmap(page);
-       nested_release_page_clean(page);
+       kvm_release_page_clean(page);
 
        vmx->nested.vmxon_ptr = vmptr;
        ret = enter_vmx_operation(vcpu);
@@ -7133,34 +7225,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+       vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
-       /* current_vmptr and current_vmcs12 are always set/reset together */
-       if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
-               return;
-
        if (enable_shadow_vmcs) {
                /* copy to memory all shadowed fields in case
                   they were modified */
                copy_shadow_to_vmcs12(vmx);
                vmx->nested.sync_shadow_vmcs = false;
-               vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
-                               SECONDARY_EXEC_SHADOW_VMCS);
-               vmcs_write64(VMCS_LINK_POINTER, -1ull);
+               vmx_disable_shadow_vmcs(vmx);
        }
        vmx->nested.posted_intr_nv = -1;
 
        /* Flush VMCS12 to guest memory */
-       memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
-              VMCS12_SIZE);
+       kvm_vcpu_write_guest_page(&vmx->vcpu,
+                                 vmx->nested.current_vmptr >> PAGE_SHIFT,
+                                 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
-       kunmap(vmx->nested.current_vmcs12_page);
-       nested_release_page(vmx->nested.current_vmcs12_page);
        vmx->nested.current_vmptr = -1ull;
-       vmx->nested.current_vmcs12 = NULL;
 }
 
 /*
@@ -7174,12 +7264,14 @@ static void free_nested(struct vcpu_vmx *vmx)
 
        vmx->nested.vmxon = false;
        free_vpid(vmx->nested.vpid02);
-       nested_release_vmcs12(vmx);
+       vmx->nested.posted_intr_nv = -1;
+       vmx->nested.current_vmptr = -1ull;
        if (vmx->nested.msr_bitmap) {
                free_page((unsigned long)vmx->nested.msr_bitmap);
                vmx->nested.msr_bitmap = NULL;
        }
        if (enable_shadow_vmcs) {
+               vmx_disable_shadow_vmcs(vmx);
                vmcs_clear(vmx->vmcs01.shadow_vmcs);
                free_vmcs(vmx->vmcs01.shadow_vmcs);
                vmx->vmcs01.shadow_vmcs = NULL;
@@ -7187,16 +7279,16 @@ static void free_nested(struct vcpu_vmx *vmx)
        kfree(vmx->nested.cached_vmcs12);
        /* Unpin physical memory we referred to in current vmcs02 */
        if (vmx->nested.apic_access_page) {
-               nested_release_page(vmx->nested.apic_access_page);
+               kvm_release_page_dirty(vmx->nested.apic_access_page);
                vmx->nested.apic_access_page = NULL;
        }
        if (vmx->nested.virtual_apic_page) {
-               nested_release_page(vmx->nested.virtual_apic_page);
+               kvm_release_page_dirty(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
        if (vmx->nested.pi_desc_page) {
                kunmap(vmx->nested.pi_desc_page);
-               nested_release_page(vmx->nested.pi_desc_page);
+               kvm_release_page_dirty(vmx->nested.pi_desc_page);
                vmx->nested.pi_desc_page = NULL;
                vmx->nested.pi_desc = NULL;
        }
@@ -7563,29 +7655,29 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
        if (vmx->nested.current_vmptr != vmptr) {
                struct vmcs12 *new_vmcs12;
                struct page *page;
-               page = nested_get_page(vcpu, vmptr);
-               if (page == NULL) {
+               page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+               if (is_error_page(page)) {
                        nested_vmx_failInvalid(vcpu);
                        return kvm_skip_emulated_instruction(vcpu);
                }
                new_vmcs12 = kmap(page);
                if (new_vmcs12->revision_id != VMCS12_REVISION) {
                        kunmap(page);
-                       nested_release_page_clean(page);
+                       kvm_release_page_clean(page);
                        nested_vmx_failValid(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
                        return kvm_skip_emulated_instruction(vcpu);
                }
 
                nested_release_vmcs12(vmx);
-               vmx->nested.current_vmcs12 = new_vmcs12;
-               vmx->nested.current_vmcs12_page = page;
                /*
                 * Load VMCS12 from guest memory since it is not already
                 * cached.
                 */
-               memcpy(vmx->nested.cached_vmcs12,
-                      vmx->nested.current_vmcs12, VMCS12_SIZE);
+               memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+               kunmap(page);
+               kvm_release_page_clean(page);
+
                set_current_vmptr(vmx, vmptr);
        }
 
@@ -7792,6 +7884,125 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u64 mask = address & 0x7;
+       int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+       /* Check for memory type validity */
+       switch (mask) {
+       case 0:
+               if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT))
+                       return false;
+               break;
+       case 6:
+               if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT))
+                       return false;
+               break;
+       default:
+               return false;
+       }
+
+       /* Bits 5:3 must be 3 */
+       if (((address >> VMX_EPT_GAW_EPTP_SHIFT) & 0x7) != VMX_EPT_DEFAULT_GAW)
+               return false;
+
+       /* Reserved bits should not be set */
+       if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+               return false;
+
+       /* AD, if set, should be supported */
+       if ((address & VMX_EPT_AD_ENABLE_BIT)) {
+               if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT))
+                       return false;
+       }
+
+       return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+                                    struct vmcs12 *vmcs12)
+{
+       u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+       u64 address;
+       bool accessed_dirty;
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+       if (!nested_cpu_has_eptp_switching(vmcs12) ||
+           !nested_cpu_has_ept(vmcs12))
+               return 1;
+
+       if (index >= VMFUNC_EPTP_ENTRIES)
+               return 1;
+
+
+       if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+                                    &address, index * 8, 8))
+               return 1;
+
+       accessed_dirty = !!(address & VMX_EPT_AD_ENABLE_BIT);
+
+       /*
+        * If the (L2) guest does a vmfunc to the currently
+        * active ept pointer, we don't have to do anything else
+        */
+       if (vmcs12->ept_pointer != address) {
+               if (!valid_ept_address(vcpu, address))
+                       return 1;
+
+               kvm_mmu_unload(vcpu);
+               mmu->ept_ad = accessed_dirty;
+               mmu->base_role.ad_disabled = !accessed_dirty;
+               vmcs12->ept_pointer = address;
+               /*
+                * TODO: decide on the correct approach if the MMU
+                * reload fails. Currently we just let the next
+                * reload potentially fail.
+                */
+               kvm_mmu_reload(vcpu);
+       }
+
+       return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmcs12 *vmcs12;
+       u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+       /*
+        * VMFUNC is only supported for nested guests, but we always enable the
+        * secondary control for simplicity; for non-nested mode, fake that we
+        * didn't enable it by injecting #UD.
+        */
+       if (!is_guest_mode(vcpu)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmcs12 = get_vmcs12(vcpu);
+       if ((vmcs12->vm_function_control & (1 << function)) == 0)
+               goto fail;
+
+       switch (function) {
+       case 0:
+               if (nested_vmx_eptp_switching(vcpu, vmcs12))
+                       goto fail;
+               break;
+       default:
+               goto fail;
+       }
+       return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+       nested_vmx_vmexit(vcpu, vmx->exit_reason,
+                         vmcs_read32(VM_EXIT_INTR_INFO),
+                         vmcs_readl(EXIT_QUALIFICATION));
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
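Note (not part of this diff): for context, VMFUNC leaf 0 is what an L2 guest executes to request the EPTP switch emulated above, with EAX selecting the VM function and ECX the index into the EPTP list. An illustrative guest-side snippet, assuming L1 has advertised EPTP switching:

/* Illustrative only: how an L2 guest would invoke VM function 0. */
static inline void l2_switch_eptp(unsigned int eptp_index)
{
        /* VMFUNC is encoded as 0f 01 d4; function 0 = EPTP switching. */
        asm volatile(".byte 0x0f, 0x01, 0xd4"
                     : : "a" (0), "c" (eptp_index)
                     : "memory");
}
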
@@ -7842,6 +8053,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
+       [EXIT_REASON_VMFUNC]                  = handle_vmfunc,
        [EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
 };
 
@@ -8018,12 +8230,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  * should handle it ourselves in L0 (and then continue L2). Only call this
  * when in is_guest_mode (L2).
  */
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 {
        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-       u32 exit_reason = vmx->exit_reason;
 
        trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
                                vmcs_readl(EXIT_QUALIFICATION),
@@ -8032,6 +8243,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                                vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
                                KVM_ISA_VMX);
 
+       /*
+        * The host physical addresses of some pages of guest memory
+        * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+        * may write to these pages via their host physical address while
+        * L2 is running, bypassing any address-translation-based dirty
+        * tracking (e.g. EPT write protection).
+        *
+        * Mark them dirty on every exit from L2 to prevent them from
+        * getting out of sync with dirty tracking.
+        */
+       nested_mark_vmcs12_pages_dirty(vcpu);
+
        if (vmx->nested.nested_run_pending)
                return false;
 
@@ -8146,6 +8369,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * table is L0's fault.
                 */
                return false;
+       case EXIT_REASON_INVPCID:
+               return
+                       nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+                       nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
@@ -8163,11 +8390,37 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_PML_FULL:
                /* We emulate PML support to L1. */
                return false;
+       case EXIT_REASON_VMFUNC:
+               /* VM functions are emulated through L2->L0 vmexits. */
+               return false;
        default:
                return true;
        }
 }
 
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+       u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+       /*
+        * At this point, the exit interruption info in exit_intr_info
+        * is only valid for EXCEPTION_NMI exits.  For EXTERNAL_INTERRUPT
+        * we need to query the in-kernel LAPIC.
+        */
+       WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+       if ((exit_intr_info &
+            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               vmcs12->vm_exit_intr_error_code =
+                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       }
+
+       nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+                         vmcs_readl(EXIT_QUALIFICATION));
+       return 1;
+}
+
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
        *info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8414,12 +8667,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        if (vmx->emulation_required)
                return handle_invalid_guest_state(vcpu);
 
-       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
-               nested_vmx_vmexit(vcpu, exit_reason,
-                                 vmcs_read32(VM_EXIT_INTR_INFO),
-                                 vmcs_readl(EXIT_QUALIFICATION));
-               return 1;
-       }
+       if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+               return nested_vmx_reflect_vmexit(vcpu, exit_reason);
 
        if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
                dump_vmcs();
@@ -9222,7 +9471,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
-       vmx->nested.current_vmcs12 = NULL;
 
        vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
 
@@ -9378,12 +9626,11 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
 
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
-       struct kvm_cpuid_entry2 *best;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
 
        if (vmx_rdtscp_supported()) {
-               bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+               bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
                if (!rdtscp_enabled)
                        secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
 
@@ -9397,15 +9644,25 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
                }
        }
 
-       /* Exposing INVPCID only when PCID is exposed */
-       best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
-       if (vmx_invpcid_supported() &&
-           (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
-           !guest_cpuid_has_pcid(vcpu))) {
-               secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+       if (vmx_invpcid_supported()) {
+               /* Exposing INVPCID only when PCID is exposed */
+               bool invpcid_enabled =
+                       guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
+                       guest_cpuid_has(vcpu, X86_FEATURE_PCID);
+
+               if (!invpcid_enabled) {
+                       secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+                       guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+               }
 
-               if (best)
-                       best->ebx &= ~bit(X86_FEATURE_INVPCID);
+               if (nested) {
+                       if (invpcid_enabled)
+                               vmx->nested.nested_vmx_secondary_ctls_high |=
+                                       SECONDARY_EXEC_ENABLE_INVPCID;
+                       else
+                               vmx->nested.nested_vmx_secondary_ctls_high &=
+                                       ~SECONDARY_EXEC_ENABLE_INVPCID;
+               }
        }
 
        if (cpu_has_secondary_exec_ctrls())
@@ -9508,12 +9765,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
        WARN_ON(!is_guest_mode(vcpu));
 
-       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
-               nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
-                                 vmcs_read32(VM_EXIT_INTR_INFO),
-                                 vmcs_readl(EXIT_QUALIFICATION));
-       else
+       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+               vmcs12->vm_exit_intr_error_code = fault->error_code;
+               nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+                                 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+                                 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+                                 fault->address);
+       } else {
                kvm_inject_page_fault(vcpu, fault);
+       }
 }
 
 static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -9523,6 +9783,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                                        struct vmcs12 *vmcs12)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct page *page;
        u64 hpa;
 
        if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -9532,17 +9793,19 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                 * physical address remains valid. We keep a reference
                 * to it so we can release it later.
                 */
-               if (vmx->nested.apic_access_page) /* shouldn't happen */
-                       nested_release_page(vmx->nested.apic_access_page);
-               vmx->nested.apic_access_page =
-                       nested_get_page(vcpu, vmcs12->apic_access_addr);
+               if (vmx->nested.apic_access_page) { /* shouldn't happen */
+                       kvm_release_page_dirty(vmx->nested.apic_access_page);
+                       vmx->nested.apic_access_page = NULL;
+               }
+               page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
                /*
                 * If translation failed, no matter: This feature asks
                 * to exit when accessing the given address, and if it
                 * can never be accessed, this feature won't do
                 * anything anyway.
                 */
-               if (vmx->nested.apic_access_page) {
+               if (!is_error_page(page)) {
+                       vmx->nested.apic_access_page = page;
                        hpa = page_to_phys(vmx->nested.apic_access_page);
                        vmcs_write64(APIC_ACCESS_ADDR, hpa);
                } else {
@@ -9557,10 +9820,11 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
        }
 
        if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-               if (vmx->nested.virtual_apic_page) /* shouldn't happen */
-                       nested_release_page(vmx->nested.virtual_apic_page);
-               vmx->nested.virtual_apic_page =
-                       nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
+               if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
+                       kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+                       vmx->nested.virtual_apic_page = NULL;
+               }
+               page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
 
                /*
                 * If translation failed, VM entry will fail because
@@ -9575,7 +9839,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
                 * control.  But such a configuration is useless, so
                 * let's keep the code simple.
                 */
-               if (vmx->nested.virtual_apic_page) {
+               if (!is_error_page(page)) {
+                       vmx->nested.virtual_apic_page = page;
                        hpa = page_to_phys(vmx->nested.virtual_apic_page);
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
                }
@@ -9584,16 +9849,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
        if (nested_cpu_has_posted_intr(vmcs12)) {
                if (vmx->nested.pi_desc_page) { /* shouldn't happen */
                        kunmap(vmx->nested.pi_desc_page);
-                       nested_release_page(vmx->nested.pi_desc_page);
+                       kvm_release_page_dirty(vmx->nested.pi_desc_page);
+                       vmx->nested.pi_desc_page = NULL;
                }
-               vmx->nested.pi_desc_page =
-                       nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
-               vmx->nested.pi_desc =
-                       (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
-               if (!vmx->nested.pi_desc) {
-                       nested_release_page_clean(vmx->nested.pi_desc_page);
+               page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
+               if (is_error_page(page))
                        return;
-               }
+               vmx->nested.pi_desc_page = page;
+               vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
                vmx->nested.pi_desc =
                        (struct pi_desc *)((void *)vmx->nested.pi_desc +
                        (unsigned long)(vmcs12->posted_intr_desc_addr &
@@ -9675,8 +9938,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
        if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
                return false;
 
-       page = nested_get_page(vcpu, vmcs12->msr_bitmap);
-       if (!page)
+       page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
+       if (is_error_page(page))
                return false;
        msr_bitmap_l1 = (unsigned long *)kmap(page);
 
@@ -9706,7 +9969,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
                }
        }
        kunmap(page);
-       nested_release_page_clean(page);
+       kvm_release_page_clean(page);
 
        return true;
 }
@@ -10041,6 +10304,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                             vmcs12->vm_entry_instruction_len);
                vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
                             vmcs12->guest_interruptibility_info);
+               vmx->loaded_vmcs->nmi_known_unmasked =
+                       !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
        } else {
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
        }
@@ -10065,13 +10330,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        /* Posted interrupts setting is only taken from vmcs12.  */
        if (nested_cpu_has_posted_intr(vmcs12)) {
-               /*
-                * Note that we use L0's vector here and in
-                * vmx_deliver_nested_posted_interrupt.
-                */
                vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
                vmx->nested.pi_pending = false;
-               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
        } else {
                exec_control &= ~PIN_BASED_POSTED_INTR;
        }
@@ -10095,12 +10356,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
         * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
         * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
         * !enable_ept, EB.PF is 1, so the "or" will always be 1.
-        *
-        * A problem with this approach (when !enable_ept) is that L1 may be
-        * injected with more page faults than it asked for. This could have
-        * caused problems, but in practice existing hypervisors don't care.
-        * To fix this, we will need to emulate the PFEC checking (on the L1
-        * page tables), using walk_addr(), when injecting PFs to L1.
         */
        vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
                enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10112,9 +10367,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+                                 SECONDARY_EXEC_ENABLE_INVPCID |
                                  SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                                 SECONDARY_EXEC_APIC_REGISTER_VIRT);
+                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                 SECONDARY_EXEC_ENABLE_VMFUNC);
                if (nested_cpu_has(vmcs12,
                                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
                        vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
@@ -10122,6 +10379,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                        exec_control |= vmcs12_exec_ctrl;
                }
 
+               /* All VMFUNCs are currently emulated through L0 vmexits.  */
+               if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
+                       vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
                if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
                        vmcs_write64(EOI_EXIT_BITMAP0,
                                vmcs12->eoi_exit_bitmap0);
@@ -10374,6 +10635,18 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                                vmx->nested.nested_vmx_entry_ctls_high))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+       if (nested_cpu_has_vmfunc(vmcs12)) {
+               if (vmcs12->vm_function_control &
+                   ~vmx->nested.nested_vmx_vmfunc_controls)
+                       return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+               if (nested_cpu_has_eptp_switching(vmcs12)) {
+                       if (!nested_cpu_has_ept(vmcs12) ||
+                           !page_address_valid(vcpu, vmcs12->eptp_list_address))
+                               return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+               }
+       }
+
        if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
                return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
@@ -10848,13 +11121,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 
        vmcs12->vm_exit_reason = exit_reason;
        vmcs12->exit_qualification = exit_qualification;
-
        vmcs12->vm_exit_intr_info = exit_intr_info;
-       if ((vmcs12->vm_exit_intr_info &
-            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
-           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
-               vmcs12->vm_exit_intr_error_code =
-                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
        vmcs12->idt_vectoring_info_field = 0;
        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -10942,7 +11210,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                 */
                vmx_flush_tlb(vcpu);
        }
-
+       /* Restore posted intr vector. */
+       if (nested_cpu_has_posted_intr(vmcs12))
+               vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -11048,8 +11318,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
-       if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-           && nested_exit_intr_ack_set(vcpu)) {
+       /*
+        * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+        * the VM-exit interrupt information (valid interrupt) is always set to
+        * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+        * kvm_cpu_has_interrupt().  See the commit message for details.
+        */
+       if (nested_exit_intr_ack_set(vcpu) &&
+           exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+           kvm_cpu_has_interrupt(vcpu)) {
                int irq = kvm_cpu_get_interrupt(vcpu);
                WARN_ON(irq < 0);
                vmcs12->vm_exit_intr_info = irq |
@@ -11101,16 +11378,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
        /* Unpin physical memory we referred to in vmcs02 */
        if (vmx->nested.apic_access_page) {
-               nested_release_page(vmx->nested.apic_access_page);
+               kvm_release_page_dirty(vmx->nested.apic_access_page);
                vmx->nested.apic_access_page = NULL;
        }
        if (vmx->nested.virtual_apic_page) {
-               nested_release_page(vmx->nested.virtual_apic_page);
+               kvm_release_page_dirty(vmx->nested.virtual_apic_page);
                vmx->nested.virtual_apic_page = NULL;
        }
        if (vmx->nested.pi_desc_page) {
                kunmap(vmx->nested.pi_desc_page);
-               nested_release_page(vmx->nested.pi_desc_page);
+               kvm_release_page_dirty(vmx->nested.pi_desc_page);
                vmx->nested.pi_desc_page = NULL;
                vmx->nested.pi_desc = NULL;
        }
@@ -11286,14 +11563,14 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
 
                gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
 
-               page = nested_get_page(vcpu, vmcs12->pml_address);
-               if (!page)
+               page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
+               if (is_error_page(page))
                        return 0;
 
                pml_address = kmap(page);
                pml_address[vmcs12->guest_pml_index--] = gpa;
                kunmap(page);
-               nested_release_page_clean(page);
+               kvm_release_page_clean(page);
        }
 
        return 0;