KVM: nVMX: Introduce vmcs12: a VMCS structure for L1
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4c3fa0f6746970cef9bb5d26e5808d8bbdabcb1f..914dc4e9b37f622dcc7775f506005cbb96eb6c17 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,6 +43,8 @@
 #include "trace.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+       ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
@@ -72,6 +74,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static int __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be hypervisors for their own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static int __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST                          \
        (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK                                             \
@@ -116,17 +126,77 @@ struct vmcs {
        char data[0];
 };
 
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+       struct vmcs *vmcs;
+       int cpu;
+       int launched;
+       struct list_head loaded_vmcss_on_cpu_link;
+};
+
 struct shared_msr_entry {
        unsigned index;
        u64 data;
        u64 mask;
 };
 
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ * If there are changes in this struct, VMCS12_REVISION must be changed.
+ */
+struct __packed vmcs12 {
+       /* According to the Intel spec, a VMCS region must start with the
+        * following two fields. Then follow implementation-specific data.
+        */
+       u32 revision_id;
+       u32 abort;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) bytes are used by
+ * the current implementation, 4K is reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+       /* Has the level1 guest done vmxon? */
+       bool vmxon;
+
+       /* The guest-physical address of the current VMCS L1 keeps for L2 */
+       gpa_t current_vmptr;
+       /* The host-usable pointer to the above */
+       struct page *current_vmcs12_page;
+       struct vmcs12 *current_vmcs12;
+};
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
-       struct list_head      local_vcpus_link;
        unsigned long         host_rsp;
-       int                   launched;
        u8                    fail;
        u8                    cpl;
        bool                  nmi_known_unmasked;
@@ -140,7 +210,14 @@ struct vcpu_vmx {
        u64                   msr_host_kernel_gs_base;
        u64                   msr_guest_kernel_gs_base;
 #endif
-       struct vmcs          *vmcs;
+       /*
+        * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+        * non-nested (L1) guest, it always points to vmcs01. For a nested
+        * guest (L2), it points to a different VMCS.
+        */
+       struct loaded_vmcs    vmcs01;
+       struct loaded_vmcs   *loaded_vmcs;
+       bool                  __launched; /* temporary, used in vmx_vcpu_run */
        struct msr_autoload {
                unsigned nr;
                struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +253,9 @@ struct vcpu_vmx {
        u32 exit_reason;
 
        bool rdtscp_enabled;
+
+       /* Support for a guest hypervisor (nested VMX) */
+       struct nested_vmx nested;
 };
 
 enum segment_cache_field {
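
The vmcs12 layout and the two constants above interact as follows: L1 allocates a full VMCS12_SIZE (4K) region, stores the revision id reported by MSR_IA32_VMX_BASIC at its start, and hands the region's address to VMPTRLD, which checks that id. A minimal stand-alone C sketch of that contract (illustrative only; vmptrld_would_accept() and the user-space allocation are not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define VMCS12_REVISION 0x11e57ed0      /* same arbitrary id as in the patch */
#define VMCS12_SIZE     0x1000          /* L1 must reserve a full 4K region */

struct vmcs12 {
        uint32_t revision_id;           /* must match VMCS12_REVISION */
        uint32_t abort;                 /* VMX-abort indicator */
        /* implementation-specific fields are added by later patches */
};

/* Hypothetical stand-in for the revision check a VMPTRLD emulation does. */
static int vmptrld_would_accept(const void *region)
{
        const struct vmcs12 *v = region;
        return v->revision_id == VMCS12_REVISION;
}

int main(void)
{
        struct vmcs12 *region = calloc(1, VMCS12_SIZE); /* L1's allocation */

        if (!region)
                return 1;
        region->revision_id = VMCS12_REVISION;
        printf("accepted: %d\n", vmptrld_would_accept(region));
        free(region);
        return 0;
}
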
@@ -192,6 +272,31 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+       struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+       if (is_error_page(page)) {
+               kvm_release_page_clean(page);
+               return NULL;
+       }
+       return page;
+}
+
+static void nested_release_page(struct page *page)
+{
+       kvm_release_page_dirty(page);
+}
+
+static void nested_release_page_clean(struct page *page)
+{
+       kvm_release_page_clean(page);
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -200,7 +305,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -501,6 +610,13 @@ static void vmcs_clear(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+       vmcs_clear(loaded_vmcs->vmcs);
+       loaded_vmcs->cpu = -1;
+       loaded_vmcs->launched = 0;
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
        u64 phys_addr = __pa(vmcs);
@@ -514,25 +630,24 @@ static void vmcs_load(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
 {
-       struct vcpu_vmx *vmx = arg;
+       struct loaded_vmcs *loaded_vmcs = arg;
        int cpu = raw_smp_processor_id();
 
-       if (vmx->vcpu.cpu == cpu)
-               vmcs_clear(vmx->vmcs);
-       if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+       if (loaded_vmcs->cpu != cpu)
+               return; /* vcpu migration can race with cpu offline */
+       if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
-       list_del(&vmx->local_vcpus_link);
-       vmx->vcpu.cpu = -1;
-       vmx->launched = 0;
+       list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+       loaded_vmcs_init(loaded_vmcs);
 }
 
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-       if (vmx->vcpu.cpu == -1)
-               return;
-       smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+       if (loaded_vmcs->cpu != -1)
+               smp_call_function_single(
+                       loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
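
To make the loaded_vmcs bookkeeping easier to follow, here is a simplified user-space model of the per-CPU list and the clear path that runs when a CPU goes down (plain C, no kernel list API, no IPIs; all names are illustrative stand-ins for the functions in this patch):

#include <stdio.h>

#define NR_CPUS 4

struct loaded_vmcs {
        int cpu;                        /* -1 when not loaded on any CPU */
        int launched;
        struct loaded_vmcs *next;       /* stands in for loaded_vmcss_on_cpu_link */
};

static struct loaded_vmcs *loaded_vmcss_on_cpu[NR_CPUS];

/* Model of vmx_vcpu_load(): attach the VMCS to the list of its new CPU. */
static void model_load(struct loaded_vmcs *v, int cpu)
{
        v->next = loaded_vmcss_on_cpu[cpu];
        loaded_vmcss_on_cpu[cpu] = v;
        v->cpu = cpu;
}

/* Model of vmclear_local_loaded_vmcss(), run on a CPU that is going down. */
static void model_cpu_down(int cpu)
{
        struct loaded_vmcs *v = loaded_vmcss_on_cpu[cpu];

        while (v) {
                struct loaded_vmcs *next = v->next;

                v->cpu = -1;            /* what loaded_vmcs_init() does after VMCLEAR */
                v->launched = 0;
                v = next;
        }
        loaded_vmcss_on_cpu[cpu] = NULL;
}

int main(void)
{
        struct loaded_vmcs a = { -1, 0, NULL }, b = { -1, 0, NULL };

        model_load(&a, 1);
        model_load(&b, 1);
        model_cpu_down(1);
        printf("a.cpu=%d b.cpu=%d\n", a.cpu, b.cpu);    /* both back to -1 */
        return 0;
}
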
@@ -585,26 +700,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
        }
 }
 
-static unsigned long vmcs_readl(unsigned long field)
+static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
-       unsigned long value = 0;
+       unsigned long value;
 
-       asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-                     : "+a"(value) : "d"(field) : "cc");
+       asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+                     : "=a"(value) : "d"(field) : "cc");
        return value;
 }
 
-static u16 vmcs_read16(unsigned long field)
+static __always_inline u16 vmcs_read16(unsigned long field)
 {
        return vmcs_readl(field);
 }
 
-static u32 vmcs_read32(unsigned long field)
+static __always_inline u32 vmcs_read32(unsigned long field)
 {
        return vmcs_readl(field);
 }
 
-static u64 vmcs_read64(unsigned long field)
+static __always_inline u64 vmcs_read64(unsigned long field)
 {
 #ifdef CONFIG_X86_64
        return vmcs_readl(field);
@@ -971,22 +1086,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
        if (!vmm_exclusive)
                kvm_cpu_vmxon(phys_addr);
-       else if (vcpu->cpu != cpu)
-               vcpu_clear(vmx);
+       else if (vmx->loaded_vmcs->cpu != cpu)
+               loaded_vmcs_clear(vmx->loaded_vmcs);
 
-       if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-               per_cpu(current_vmcs, cpu) = vmx->vmcs;
-               vmcs_load(vmx->vmcs);
+       if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+               per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+               vmcs_load(vmx->loaded_vmcs->vmcs);
        }
 
-       if (vcpu->cpu != cpu) {
+       if (vmx->loaded_vmcs->cpu != cpu) {
                struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
                unsigned long sysenter_esp;
 
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                local_irq_disable();
-               list_add(&vmx->local_vcpus_link,
-                        &per_cpu(vcpus_on_cpu, cpu));
+               list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+                        &per_cpu(loaded_vmcss_on_cpu, cpu));
                local_irq_enable();
 
                /*
@@ -998,6 +1113,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+               vmx->loaded_vmcs->cpu = cpu;
        }
 }
 
@@ -1005,7 +1121,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
        __vmx_load_host_state(to_vmx(vcpu));
        if (!vmm_exclusive) {
-               __vcpu_clear(to_vmx(vcpu));
+               __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+               vcpu->cpu = -1;
                kvm_cpu_vmxoff();
        }
 }
@@ -1261,6 +1378,23 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
        return target_tsc - native_read_tsc();
 }
 
+static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+{
+       struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
+       return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+       return nested && guest_cpuid_has_vmx(vcpu);
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
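
guest_cpuid_has_vmx() above simply tests bit 5 (VMX) of CPUID.1:ECX in the guest's emulated CPUID table; X86_FEATURE_VMX & 31 evaluates to 5. For reference, the equivalent host-side probe in user space looks roughly like this (GCC/Clang on x86, using <cpuid.h>; an illustration, not code from the patch):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                return 1;
        /* X86_FEATURE_VMX is bit 5 of CPUID.1:ECX */
        printf("VMX supported: %s\n", (ecx & (1u << 5)) ? "yes" : "no");
        return 0;
}
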
@@ -1469,7 +1603,7 @@ static int hardware_enable(void *garbage)
        if (read_cr4() & X86_CR4_VMXE)
                return -EBUSY;
 
-       INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+       INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
        test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +1627,14 @@ static int hardware_enable(void *garbage)
        return 0;
 }
 
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
 {
        int cpu = raw_smp_processor_id();
-       struct vcpu_vmx *vmx, *n;
+       struct loaded_vmcs *v, *n;
 
-       list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
-                                local_vcpus_link)
-               __vcpu_clear(vmx);
+       list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+                                loaded_vmcss_on_cpu_link)
+               __loaded_vmcs_clear(v);
 }
 
 
@@ -1515,7 +1649,7 @@ static void kvm_cpu_vmxoff(void)
 static void hardware_disable(void *garbage)
 {
        if (vmm_exclusive) {
-               vmclear_local_vcpus();
+               vmclear_local_loaded_vmcss();
                kvm_cpu_vmxoff();
        }
        write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +1830,18 @@ static void free_vmcs(struct vmcs *vmcs)
        free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+       if (!loaded_vmcs->vmcs)
+               return;
+       loaded_vmcs_clear(loaded_vmcs);
+       free_vmcs(loaded_vmcs->vmcs);
+       loaded_vmcs->vmcs = NULL;
+}
+
 static void free_kvm_area(void)
 {
        int cpu;
@@ -2041,13 +2187,14 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
                  (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 
 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
                                        unsigned long cr0,
                                        struct kvm_vcpu *vcpu)
 {
-       vmx_decache_cr3(vcpu);
+       if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+               vmx_decache_cr3(vcpu);
        if (!(cr0 & X86_CR0_PG)) {
                /* From paging/starting to nonpaging */
                vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -2138,11 +2285,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
                    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
+       if (cr4 & X86_CR4_VMXE) {
+               /*
+                * To use VMXON (and later other VMX instructions), a guest
+                * must first be able to turn on cr4.VMXE (see handle_vmon()).
+                * So basically the check on whether to allow nested VMX
+                * is here.
+                */
+               if (!nested_vmx_allowed(vcpu))
+                       return 1;
+       } else if (to_vmx(vcpu)->nested.vmxon)
+               return 1;
+
        vcpu->arch.cr4 = cr4;
        if (enable_ept) {
                if (!is_paging(vcpu)) {
@@ -2155,6 +2314,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_CR4, hw_cr4);
+       return 0;
 }
 
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
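
The new CR4 handling boils down to two rules: CR4.VMXE may only be set when nested VMX is allowed, and it may not be cleared while the guest is in VMX operation (after VMXON). A compact stand-alone restatement of just that decision, mirroring the int return value vmx_set_cr4() gains in this patch (illustrative; not the kernel function):

#include <stdbool.h>
#include <stdio.h>

#define X86_CR4_VMXE (1ul << 13)

/* Returns 0 if the CR4 write is allowed, 1 if the caller should fault it. */
static int cr4_vmxe_check(unsigned long cr4, bool nested_allowed, bool vmxon)
{
        if (cr4 & X86_CR4_VMXE)
                return nested_allowed ? 0 : 1;
        return vmxon ? 1 : 0;   /* cannot clear VMXE while in VMX operation */
}

int main(void)
{
        printf("%d %d %d\n",
               cr4_vmxe_check(X86_CR4_VMXE, false, false), /* 1: nested off  */
               cr4_vmxe_check(X86_CR4_VMXE, true, false),  /* 0: allowed     */
               cr4_vmxe_check(0, true, true));             /* 1: still vmxon */
        return 0;
}
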
@@ -3864,6 +4024,105 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       /* The Intel VMX Instruction Reference lists a bunch of bits that
+        * are prerequisite to running VMXON, most notably cr4.VMXE must be
+        * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
+        * Otherwise, we should fail with #UD. We test these now:
+        */
+       if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
+           !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
+           (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       if (is_long_mode(vcpu) && !cs.l) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (vmx_get_cpl(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       vmx->nested.vmxon = true;
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!vmx->nested.vmxon) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+           (is_long_mode(vcpu) && !cs.l)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 0;
+       }
+
+       if (vmx_get_cpl(vcpu)) {
+               kvm_inject_gp(vcpu, 0);
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+       if (!vmx->nested.vmxon)
+               return;
+       vmx->nested.vmxon = false;
+       if (vmx->nested.current_vmptr != -1ull) {
+               kunmap(vmx->nested.current_vmcs12_page);
+               nested_release_page(vmx->nested.current_vmcs12_page);
+               vmx->nested.current_vmptr = -1ull;
+               vmx->nested.current_vmcs12 = NULL;
+       }
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+       free_nested(to_vmx(vcpu));
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
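
handle_vmon() and nested_vmx_check_permission() encode the same SDM privilege rules: real or virtual-8086 mode, or a non-64-bit code segment in long mode, yields #UD, and CPL > 0 yields #GP(0). A simplified pure-function restatement of that decision (the enum and parameter names are invented for illustration and merge the VMXON and non-VMXON cases):

#include <stdbool.h>
#include <stdio.h>

enum vmx_insn_fault { VMX_INSN_OK, VMX_INSN_UD, VMX_INSN_GP };

static enum vmx_insn_fault vmx_insn_check(bool cr4_vmxe_or_vmxon,
                                          bool protected_mode, bool v86,
                                          bool long_mode, bool cs_l, int cpl)
{
        if (!cr4_vmxe_or_vmxon || !protected_mode || v86)
                return VMX_INSN_UD;             /* #UD, as in the patch */
        if (long_mode && !cs_l)
                return VMX_INSN_UD;             /* compatibility mode: #UD */
        if (cpl > 0)
                return VMX_INSN_GP;             /* #GP(0) for CPL > 0 */
        return VMX_INSN_OK;
}

int main(void)
{
        printf("%d %d\n",
               vmx_insn_check(true, true, false, true, true, 0),  /* 0: OK  */
               vmx_insn_check(true, true, false, true, true, 3)); /* 2: #GP */
        return 0;
}
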
@@ -3892,8 +4151,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
        [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
        [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-       [EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-       [EXIT_REASON_VMON]                    = handle_vmx_insn,
+       [EXIT_REASON_VMOFF]                   = handle_vmoff,
+       [EXIT_REASON_VMON]                    = handle_vmon,
        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
        [EXIT_REASON_WBINVD]                  = handle_wbinvd,
@@ -4166,6 +4425,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                vmx_set_interrupt_shadow(vcpu, 0);
 
+       vmx->__launched = vmx->loaded_vmcs->launched;
        asm(
                /* Store host registers */
                "push %%"R"dx; push %%"R"bp;"
@@ -4236,7 +4496,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
                "pop  %%"R"bp; pop  %%"R"dx \n\t"
                "setbe %c[fail](%0) \n\t"
              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-               [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+               [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
                [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
                [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,7 +4536,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
        asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-       vmx->launched = 1;
+       vmx->loaded_vmcs->launched = 1;
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
@@ -4288,41 +4548,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #undef R
 #undef Q
 
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       if (vmx->vmcs) {
-               vcpu_clear(vmx);
-               free_vmcs(vmx->vmcs);
-               vmx->vmcs = NULL;
-       }
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        free_vpid(vmx);
-       vmx_free_vmcs(vcpu);
+       free_nested(vmx);
+       free_loaded_vmcs(vmx->loaded_vmcs);
        kfree(vmx->guest_msrs);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
-static inline void vmcs_init(struct vmcs *vmcs)
-{
-       u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
-       if (!vmm_exclusive)
-               kvm_cpu_vmxon(phys_addr);
-
-       vmcs_clear(vmcs);
-
-       if (!vmm_exclusive)
-               kvm_cpu_vmxoff();
-}
-
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
        int err;
@@ -4344,11 +4581,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                goto uninit_vcpu;
        }
 
-       vmx->vmcs = alloc_vmcs();
-       if (!vmx->vmcs)
+       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx->loaded_vmcs->vmcs = alloc_vmcs();
+       if (!vmx->loaded_vmcs->vmcs)
                goto free_msrs;
-
-       vmcs_init(vmx->vmcs);
+       if (!vmm_exclusive)
+               kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+       loaded_vmcs_init(vmx->loaded_vmcs);
+       if (!vmm_exclusive)
+               kvm_cpu_vmxoff();
 
        cpu = get_cpu();
        vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4374,10 +4615,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                        goto free_vmcs;
        }
 
+       vmx->nested.current_vmptr = -1ull;
+       vmx->nested.current_vmcs12 = NULL;
+
        return &vmx->vcpu;
 
 free_vmcs:
-       free_vmcs(vmx->vmcs);
+       free_vmcs(vmx->loaded_vmcs->vmcs);
 free_msrs:
        kfree(vmx->guest_msrs);
 uninit_vcpu: