#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+ ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static int __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static int __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
char data[0];
};
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ int cpu;
+ int launched;
+ struct list_head loaded_vmcss_on_cpu_link;
+};
+
struct shared_msr_entry {
unsigned index;
u64 data;
u64 mask;
};
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ * If there are changes in this struct, VMCS12_REVISION must be changed.
+ */
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ u32 revision_id;
+ u32 abort;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+ /* Has the level1 guest done vmxon? */
+ bool vmxon;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /* The host-usable pointer to the above */
+ struct page *current_vmcs12_page;
+ struct vmcs12 *current_vmcs12;
+};
+
struct vcpu_vmx {
struct kvm_vcpu vcpu;
- struct list_head local_vcpus_link;
unsigned long host_rsp;
- int launched;
u8 fail;
u8 cpl;
bool nmi_known_unmasked;
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
- struct vmcs *vmcs;
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+ bool __launched; /* temporary, used in vmx_vcpu_run */
struct msr_autoload {
unsigned nr;
struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
u32 exit_reason;
bool rdtscp_enabled;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
};
enum segment_cache_field {
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ return NULL;
+ }
+ return page;
+}
+
+static void nested_release_page(struct page *page)
+{
+ kvm_release_page_dirty(page);
+}
+
+static void nested_release_page_clean(struct page *page)
+{
+ kvm_release_page_clean(page);
+}
+
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
vmcs, phys_addr);
}
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+ vmcs_clear(loaded_vmcs->vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
static void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
vmcs, phys_addr);
}
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
{
- struct vcpu_vmx *vmx = arg;
+ struct loaded_vmcs *loaded_vmcs = arg;
int cpu = raw_smp_processor_id();
- if (vmx->vcpu.cpu == cpu)
- vmcs_clear(vmx->vmcs);
- if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+ if (loaded_vmcs->cpu != cpu)
+ return; /* vcpu migration can race with cpu offline */
+ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- list_del(&vmx->local_vcpus_link);
- vmx->vcpu.cpu = -1;
- vmx->launched = 0;
+ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+ loaded_vmcs_init(loaded_vmcs);
}
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
- if (vmx->vcpu.cpu == -1)
- return;
- smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+ if (loaded_vmcs->cpu != -1)
+ smp_call_function_single(
+ loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
}
static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
}
}
-static unsigned long vmcs_readl(unsigned long field)
+static __always_inline unsigned long vmcs_readl(unsigned long field)
{
- unsigned long value = 0;
+ unsigned long value;
- asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "+a"(value) : "d"(field) : "cc");
+ asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+ : "=a"(value) : "d"(field) : "cc");
return value;
}
-static u16 vmcs_read16(unsigned long field)
+static __always_inline u16 vmcs_read16(unsigned long field)
{
return vmcs_readl(field);
}
-static u32 vmcs_read32(unsigned long field)
+static __always_inline u32 vmcs_read32(unsigned long field)
{
return vmcs_readl(field);
}
-static u64 vmcs_read64(unsigned long field)
+static __always_inline u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
return vmcs_readl(field);
if (!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
- else if (vcpu->cpu != cpu)
- vcpu_clear(vmx);
+ else if (vmx->loaded_vmcs->cpu != cpu)
+ loaded_vmcs_clear(vmx->loaded_vmcs);
- if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- per_cpu(current_vmcs, cpu) = vmx->vmcs;
- vmcs_load(vmx->vmcs);
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
}
- if (vcpu->cpu != cpu) {
+ if (vmx->loaded_vmcs->cpu != cpu) {
struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
+ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+ &per_cpu(loaded_vmcss_on_cpu, cpu));
local_irq_enable();
/*
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ vmx->loaded_vmcs->cpu = cpu;
}
}
{
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
- __vcpu_clear(to_vmx(vcpu));
+ __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+ vcpu->cpu = -1;
kvm_cpu_vmxoff();
}
}
return target_tsc - native_read_tsc();
}
+static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+ return nested && guest_cpuid_has_vmx(vcpu);
+}
+
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
if (read_cr4() & X86_CR4_VMXE)
return -EBUSY;
- INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
return 0;
}
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
- struct vcpu_vmx *vmx, *n;
+ struct loaded_vmcs *v, *n;
- list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
- local_vcpus_link)
- __vcpu_clear(vmx);
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
}
static void hardware_disable(void *garbage)
{
if (vmm_exclusive) {
- vmclear_local_vcpus();
+ vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
write_cr4(read_cr4() & ~X86_CR4_VMXE);
free_pages((unsigned long)vmcs, vmcs_config.order);
}
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ if (!loaded_vmcs->vmcs)
+ return;
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
+}
+
static void free_kvm_area(void)
{
int cpu;
(unsigned long *)&vcpu->arch.regs_dirty);
}
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
- vmx_decache_cr3(vcpu);
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ if (cr4 & X86_CR4_VMXE) {
+ /*
+ * To use VMXON (and later other VMX instructions), a guest
+ * must first be able to turn on cr4.VMXE (see handle_vmon()).
+ * So basically the check on whether to allow nested VMX
+ * is here.
+ */
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ } else if (to_vmx(vcpu)->nested.vmxon)
+ return 1;
+
vcpu->arch.cr4 = cr4;
if (enable_ept) {
if (!is_paging(vcpu)) {
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
+ return 0;
}
static void vmx_get_segment(struct kvm_vcpu *vcpu,
return 1;
}
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /* The Intel VMX Instruction Reference lists a bunch of bits that
+ * are prerequisite to running VMXON, most notably cr4.VMXE must be
+ * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * Otherwise, we should fail with #UD. We test these now:
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
+ !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
+ (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ if (is_long_mode(vcpu) && !cs.l) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ vmx->nested.vmxon = true;
+
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!vmx->nested.vmxon) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+ (is_long_mode(vcpu) && !cs.l)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+ if (!vmx->nested.vmxon)
+ return;
+ vmx->nested.vmxon = false;
+ if (vmx->nested.current_vmptr != -1ull) {
+ kunmap(vmx->nested.current_vmcs12_page);
+ nested_release_page(vmx->nested.current_vmcs12_page);
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
+ }
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+ free_nested(to_vmx(vcpu));
+ skip_emulated_instruction(vcpu);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
[EXIT_REASON_VMREAD] = handle_vmx_insn,
[EXIT_REASON_VMRESUME] = handle_vmx_insn,
[EXIT_REASON_VMWRITE] = handle_vmx_insn,
- [EXIT_REASON_VMOFF] = handle_vmx_insn,
- [EXIT_REASON_VMON] = handle_vmx_insn,
+ [EXIT_REASON_VMOFF] = handle_vmoff,
+ [EXIT_REASON_VMON] = handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
"pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
- [launched]"i"(offsetof(struct vcpu_vmx, launched)),
+ [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
- vmx->launched = 1;
+ vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
#undef R
#undef Q
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->vmcs) {
- vcpu_clear(vmx);
- free_vmcs(vmx->vmcs);
- vmx->vmcs = NULL;
- }
-}
-
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- vmx_free_vmcs(vcpu);
+ free_nested(vmx);
+ free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
}
-static inline void vmcs_init(struct vmcs *vmcs)
-{
- u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
- if (!vmm_exclusive)
- kvm_cpu_vmxon(phys_addr);
-
- vmcs_clear(vmcs);
-
- if (!vmm_exclusive)
- kvm_cpu_vmxoff();
-}
-
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
goto uninit_vcpu;
}
- vmx->vmcs = alloc_vmcs();
- if (!vmx->vmcs)
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx->loaded_vmcs->vmcs = alloc_vmcs();
+ if (!vmx->loaded_vmcs->vmcs)
goto free_msrs;
-
- vmcs_init(vmx->vmcs);
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+ loaded_vmcs_init(vmx->loaded_vmcs);
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
goto free_vmcs;
}
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
+
return &vmx->vcpu;
free_vmcs:
- free_vmcs(vmx->vmcs);
+ free_vmcs(vmx->loaded_vmcs->vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu: