// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/fpu/internal.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>

#include "capabilities.h"
#include "kvm_cache_regs.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for its own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);
#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))
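
/*
 * Note: MSR_IA32_RTIT_STATUS_MASK is the complement of the status bits a
 * guest is allowed to write (FilterEn, ContextEn, TriggerEn, Error, Stopped,
 * ByteCnt); vmx_set_msr() rejects any write that touches the remaining,
 * hardware-managed bits.
 */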
/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these x2apic and PT MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};
/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicate if ple enabled.
 *             According to test, this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}
static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
							  u32 msr, int type);

void vmx_vmexit(void);
#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

asmlinkage void vmread_error(unsigned long field, bool fault)
{
	if (fault)
		kvm_spurious_fault();
	else
		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
}

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;

#define VMX_SEGMENT_FIELD(seg)				\
	[VCPU_SREG_##seg] = {				\
		.selector = GUEST_##seg##_SELECTOR,	\
		.base = GUEST_##seg##_BASE,		\
		.limit = GUEST_##seg##_LIMIT,		\
		.ar_bytes = GUEST_##seg##_AR_BYTES,	\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};
static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static unsigned long host_idt_base;
/*
 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
 * will emulate SYSCALL in legacy mode if the vendor string in guest
 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
 * support this emulation, IA32_STAR must always be included in
 * vmx_uret_msrs_list[], even in i386 builds.
 */
static const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
	MSR_IA32_TSX_CTRL,
};
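
/*
 * When KVM itself runs on Hyper-V, the enlightened VMCS and Hyper-V's
 * ranged guest-mapping flush hypercalls can be used to accelerate nested
 * operation; the helpers below are only built when CONFIG_HYPERV is enabled.
 */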
471 #if IS_ENABLED(CONFIG_HYPERV)
472 static bool __read_mostly enlightened_vmcs
= true;
473 module_param(enlightened_vmcs
, bool, 0444);
475 /* check_ept_pointer() should be under protection of ept_pointer_lock. */
476 static void check_ept_pointer_match(struct kvm
*kvm
)
478 struct kvm_vcpu
*vcpu
;
479 u64 tmp_eptp
= INVALID_PAGE
;
482 kvm_for_each_vcpu(i
, vcpu
, kvm
) {
483 if (!VALID_PAGE(tmp_eptp
)) {
484 tmp_eptp
= to_vmx(vcpu
)->ept_pointer
;
485 } else if (tmp_eptp
!= to_vmx(vcpu
)->ept_pointer
) {
486 to_kvm_vmx(kvm
)->ept_pointers_match
487 = EPT_POINTERS_MISMATCH
;
492 to_kvm_vmx(kvm
)->ept_pointers_match
= EPT_POINTERS_MATCH
;
495 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list
*flush
,
498 struct kvm_tlb_range
*range
= data
;
500 return hyperv_fill_flush_guest_mapping_list(flush
, range
->start_gfn
,
504 static inline int __hv_remote_flush_tlb_with_range(struct kvm
*kvm
,
505 struct kvm_vcpu
*vcpu
, struct kvm_tlb_range
*range
)
507 u64 ept_pointer
= to_vmx(vcpu
)->ept_pointer
;
510 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
511 * of the base of EPT PML4 table, strip off EPT configuration
515 return hyperv_flush_guest_mapping_range(ept_pointer
& PAGE_MASK
,
516 kvm_fill_hv_flush_list_func
, (void *)range
);
518 return hyperv_flush_guest_mapping(ept_pointer
& PAGE_MASK
);
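
/*
 * If every vCPU in the VM uses the same EPT pointer, one ranged hypercall
 * flushes the whole VM; otherwise each vCPU's EPT context must be flushed
 * individually.
 */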
521 static int hv_remote_flush_tlb_with_range(struct kvm
*kvm
,
522 struct kvm_tlb_range
*range
)
524 struct kvm_vcpu
*vcpu
;
527 spin_lock(&to_kvm_vmx(kvm
)->ept_pointer_lock
);
529 if (to_kvm_vmx(kvm
)->ept_pointers_match
== EPT_POINTERS_CHECK
)
530 check_ept_pointer_match(kvm
);
532 if (to_kvm_vmx(kvm
)->ept_pointers_match
!= EPT_POINTERS_MATCH
) {
533 kvm_for_each_vcpu(i
, vcpu
, kvm
) {
534 /* If ept_pointer is invalid pointer, bypass flush request. */
535 if (VALID_PAGE(to_vmx(vcpu
)->ept_pointer
))
536 ret
|= __hv_remote_flush_tlb_with_range(
540 ret
= __hv_remote_flush_tlb_with_range(kvm
,
541 kvm_get_vcpu(kvm
, 0), range
);
544 spin_unlock(&to_kvm_vmx(kvm
)->ept_pointer_lock
);
547 static int hv_remote_flush_tlb(struct kvm
*kvm
)
549 return hv_remote_flush_tlb_with_range(kvm
, NULL
);
552 static int hv_enable_direct_tlbflush(struct kvm_vcpu
*vcpu
)
554 struct hv_enlightened_vmcs
*evmcs
;
555 struct hv_partition_assist_pg
**p_hv_pa_pg
=
556 &to_kvm_hv(vcpu
->kvm
)->hv_pa_pg
;
	/*
	 * Synthetic VM-Exit is not enabled in current code and so all
	 * evmcs in a single VM share the same assist page.
	 */
562 *p_hv_pa_pg
= kzalloc(PAGE_SIZE
, GFP_KERNEL
);
567 evmcs
= (struct hv_enlightened_vmcs
*)to_vmx(vcpu
)->loaded_vmcs
->vmcs
;
569 evmcs
->partition_assist_page
=
571 evmcs
->hv_vm_id
= (unsigned long)vcpu
->kvm
;
572 evmcs
->hv_enlightenments_control
.nested_flush_hypercall
= 1;
577 #endif /* IS_ENABLED(CONFIG_HYPERV) */
580 * Comment's format: document - errata name - stepping - processor name.
582 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
584 static u32 vmx_preemption_cpu_tfms
[] = {
585 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
587 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */
588 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
589 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
591 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
593 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
594 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
596 * 320767.pdf - AAP86 - B1 -
597 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
600 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */
602 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */
604 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
606 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
607 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
608 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
610 /* Xeon E3-1220 V2 */
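
/*
 * cpuid_eax(1) returns family/model/stepping; the reserved bits (14:15 and
 * 31:28) are cleared before comparing against the erratum list above.
 */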
614 static inline bool cpu_has_broken_vmx_preemption_timer(void)
616 u32 eax
= cpuid_eax(0x00000001), i
;
618 /* Clear the reserved bits */
619 eax
&= ~(0x3U
<< 14 | 0xfU
<< 28);
620 for (i
= 0; i
< ARRAY_SIZE(vmx_preemption_cpu_tfms
); i
++)
621 if (eax
== vmx_preemption_cpu_tfms
[i
])
627 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu
*vcpu
)
629 return flexpriority_enabled
&& lapic_in_kernel(vcpu
);
632 static inline bool report_flexpriority(void)
634 return flexpriority_enabled
;
637 static int possible_passthrough_msr_slot(u32 msr
)
641 for (i
= 0; i
< ARRAY_SIZE(vmx_possible_passthrough_msrs
); i
++)
642 if (vmx_possible_passthrough_msrs
[i
] == msr
)
648 static bool is_valid_passthrough_msr(u32 msr
)
653 case 0x800 ... 0x8ff:
654 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
656 case MSR_IA32_RTIT_STATUS
:
657 case MSR_IA32_RTIT_OUTPUT_BASE
:
658 case MSR_IA32_RTIT_OUTPUT_MASK
:
659 case MSR_IA32_RTIT_CR3_MATCH
:
660 case MSR_IA32_RTIT_ADDR0_A
... MSR_IA32_RTIT_ADDR3_B
:
661 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
664 case MSR_LBR_INFO_0
... MSR_LBR_INFO_0
+ 31:
665 case MSR_LBR_NHM_FROM
... MSR_LBR_NHM_FROM
+ 31:
666 case MSR_LBR_NHM_TO
... MSR_LBR_NHM_TO
+ 31:
667 case MSR_LBR_CORE_FROM
... MSR_LBR_CORE_FROM
+ 8:
668 case MSR_LBR_CORE_TO
... MSR_LBR_CORE_TO
+ 8:
669 /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
673 r
= possible_passthrough_msr_slot(msr
) != -ENOENT
;
675 WARN(!r
, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr
);
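
/*
 * "uret" (user return) MSRs hold guest values while in the guest and are
 * restored to their host values lazily, when the CPU returns to userspace,
 * via the common kvm_user_return_msr machinery.
 */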
680 static inline int __vmx_find_uret_msr(struct vcpu_vmx
*vmx
, u32 msr
)
684 for (i
= 0; i
< vmx
->nr_uret_msrs
; ++i
)
685 if (vmx_uret_msrs_list
[vmx
->guest_uret_msrs
[i
].slot
] == msr
)
690 struct vmx_uret_msr
*vmx_find_uret_msr(struct vcpu_vmx
*vmx
, u32 msr
)
694 i
= __vmx_find_uret_msr(vmx
, msr
);
696 return &vmx
->guest_uret_msrs
[i
];
700 static int vmx_set_guest_uret_msr(struct vcpu_vmx
*vmx
,
701 struct vmx_uret_msr
*msr
, u64 data
)
705 u64 old_msr_data
= msr
->data
;
707 if (msr
- vmx
->guest_uret_msrs
< vmx
->nr_active_uret_msrs
) {
709 ret
= kvm_set_user_return_msr(msr
->slot
, msr
->data
, msr
->mask
);
712 msr
->data
= old_msr_data
;
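
/*
 * On a crash/kexec, any VMCS still loaded on this CPU must be VMCLEARed so
 * its cached state is written back to memory before the new kernel starts.
 */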
717 #ifdef CONFIG_KEXEC_CORE
718 static void crash_vmclear_local_loaded_vmcss(void)
720 int cpu
= raw_smp_processor_id();
721 struct loaded_vmcs
*v
;
723 list_for_each_entry(v
, &per_cpu(loaded_vmcss_on_cpu
, cpu
),
724 loaded_vmcss_on_cpu_link
)
727 #endif /* CONFIG_KEXEC_CORE */
729 static void __loaded_vmcs_clear(void *arg
)
731 struct loaded_vmcs
*loaded_vmcs
= arg
;
732 int cpu
= raw_smp_processor_id();
734 if (loaded_vmcs
->cpu
!= cpu
)
735 return; /* vcpu migration can race with cpu offline */
736 if (per_cpu(current_vmcs
, cpu
) == loaded_vmcs
->vmcs
)
737 per_cpu(current_vmcs
, cpu
) = NULL
;
739 vmcs_clear(loaded_vmcs
->vmcs
);
740 if (loaded_vmcs
->shadow_vmcs
&& loaded_vmcs
->launched
)
741 vmcs_clear(loaded_vmcs
->shadow_vmcs
);
743 list_del(&loaded_vmcs
->loaded_vmcss_on_cpu_link
);
746 * Ensure all writes to loaded_vmcs, including deleting it from its
747 * current percpu list, complete before setting loaded_vmcs->vcpu to
748 * -1, otherwise a different cpu can see vcpu == -1 first and add
749 * loaded_vmcs to its percpu list before it's deleted from this cpu's
750 * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
754 loaded_vmcs
->cpu
= -1;
755 loaded_vmcs
->launched
= 0;
758 void loaded_vmcs_clear(struct loaded_vmcs
*loaded_vmcs
)
760 int cpu
= loaded_vmcs
->cpu
;
763 smp_call_function_single(cpu
,
764 __loaded_vmcs_clear
, loaded_vmcs
, 1);
767 static bool vmx_segment_cache_test_set(struct vcpu_vmx
*vmx
, unsigned seg
,
771 u32 mask
= 1 << (seg
* SEG_FIELD_NR
+ field
);
773 if (!kvm_register_is_available(&vmx
->vcpu
, VCPU_EXREG_SEGMENTS
)) {
774 kvm_register_mark_available(&vmx
->vcpu
, VCPU_EXREG_SEGMENTS
);
775 vmx
->segment_cache
.bitmask
= 0;
777 ret
= vmx
->segment_cache
.bitmask
& mask
;
778 vmx
->segment_cache
.bitmask
|= mask
;
782 static u16
vmx_read_guest_seg_selector(struct vcpu_vmx
*vmx
, unsigned seg
)
784 u16
*p
= &vmx
->segment_cache
.seg
[seg
].selector
;
786 if (!vmx_segment_cache_test_set(vmx
, seg
, SEG_FIELD_SEL
))
787 *p
= vmcs_read16(kvm_vmx_segment_fields
[seg
].selector
);
791 static ulong
vmx_read_guest_seg_base(struct vcpu_vmx
*vmx
, unsigned seg
)
793 ulong
*p
= &vmx
->segment_cache
.seg
[seg
].base
;
795 if (!vmx_segment_cache_test_set(vmx
, seg
, SEG_FIELD_BASE
))
796 *p
= vmcs_readl(kvm_vmx_segment_fields
[seg
].base
);
800 static u32
vmx_read_guest_seg_limit(struct vcpu_vmx
*vmx
, unsigned seg
)
802 u32
*p
= &vmx
->segment_cache
.seg
[seg
].limit
;
804 if (!vmx_segment_cache_test_set(vmx
, seg
, SEG_FIELD_LIMIT
))
805 *p
= vmcs_read32(kvm_vmx_segment_fields
[seg
].limit
);
809 static u32
vmx_read_guest_seg_ar(struct vcpu_vmx
*vmx
, unsigned seg
)
811 u32
*p
= &vmx
->segment_cache
.seg
[seg
].ar
;
813 if (!vmx_segment_cache_test_set(vmx
, seg
, SEG_FIELD_AR
))
814 *p
= vmcs_read32(kvm_vmx_segment_fields
[seg
].ar_bytes
);
818 void vmx_update_exception_bitmap(struct kvm_vcpu
*vcpu
)
822 eb
= (1u << PF_VECTOR
) | (1u << UD_VECTOR
) | (1u << MC_VECTOR
) |
823 (1u << DB_VECTOR
) | (1u << AC_VECTOR
);
825 * Guest access to VMware backdoor ports could legitimately
826 * trigger #GP because of TSS I/O permission bitmap.
827 * We intercept those #GP and allow access to them anyway
830 if (enable_vmware_backdoor
)
831 eb
|= (1u << GP_VECTOR
);
832 if ((vcpu
->guest_debug
&
833 (KVM_GUESTDBG_ENABLE
| KVM_GUESTDBG_USE_SW_BP
)) ==
834 (KVM_GUESTDBG_ENABLE
| KVM_GUESTDBG_USE_SW_BP
))
835 eb
|= 1u << BP_VECTOR
;
836 if (to_vmx(vcpu
)->rmode
.vm86_active
)
838 if (!vmx_need_pf_intercept(vcpu
))
839 eb
&= ~(1u << PF_VECTOR
);
841 /* When we are running a nested L2 guest and L1 specified for it a
842 * certain exception bitmap, we must trap the same exceptions and pass
843 * them to L1. When running L2, we will only handle the exceptions
844 * specified above if L1 did not want them.
846 if (is_guest_mode(vcpu
))
847 eb
|= get_vmcs12(vcpu
)->exception_bitmap
;
850 * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
851 * between guest and host. In that case we only care about present
852 * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
853 * prepare_vmcs02_rare.
855 bool selective_pf_trap
= enable_ept
&& (eb
& (1u << PF_VECTOR
));
856 int mask
= selective_pf_trap
? PFERR_PRESENT_MASK
: 0;
857 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK
, mask
);
858 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH
, mask
);
861 vmcs_write32(EXCEPTION_BITMAP
, eb
);
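
/*
 * The 4K MSR bitmap is split in four 1K chunks: bits for reads of MSRs
 * 0x00000000-0x00001fff start at offset 0x000, reads of 0xc0000000-0xc0001fff
 * at 0x400, and the corresponding write bitmaps at 0x800 and 0xc00
 * respectively, one bit per MSR.
 */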
865 * Check if MSR is intercepted for currently loaded MSR bitmap.
867 static bool msr_write_intercepted(struct kvm_vcpu
*vcpu
, u32 msr
)
869 unsigned long *msr_bitmap
;
870 int f
= sizeof(unsigned long);
872 if (!cpu_has_vmx_msr_bitmap())
875 msr_bitmap
= to_vmx(vcpu
)->loaded_vmcs
->msr_bitmap
;
878 return !!test_bit(msr
, msr_bitmap
+ 0x800 / f
);
879 } else if ((msr
>= 0xc0000000) && (msr
<= 0xc0001fff)) {
881 return !!test_bit(msr
, msr_bitmap
+ 0xc00 / f
);
887 static void clear_atomic_switch_msr_special(struct vcpu_vmx
*vmx
,
888 unsigned long entry
, unsigned long exit
)
890 vm_entry_controls_clearbit(vmx
, entry
);
891 vm_exit_controls_clearbit(vmx
, exit
);
894 int vmx_find_loadstore_msr_slot(struct vmx_msrs
*m
, u32 msr
)
898 for (i
= 0; i
< m
->nr
; ++i
) {
899 if (m
->val
[i
].index
== msr
)
905 static void clear_atomic_switch_msr(struct vcpu_vmx
*vmx
, unsigned msr
)
908 struct msr_autoload
*m
= &vmx
->msr_autoload
;
912 if (cpu_has_load_ia32_efer()) {
913 clear_atomic_switch_msr_special(vmx
,
914 VM_ENTRY_LOAD_IA32_EFER
,
915 VM_EXIT_LOAD_IA32_EFER
);
919 case MSR_CORE_PERF_GLOBAL_CTRL
:
920 if (cpu_has_load_perf_global_ctrl()) {
921 clear_atomic_switch_msr_special(vmx
,
922 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL
,
923 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL
);
928 i
= vmx_find_loadstore_msr_slot(&m
->guest
, msr
);
932 m
->guest
.val
[i
] = m
->guest
.val
[m
->guest
.nr
];
933 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT
, m
->guest
.nr
);
936 i
= vmx_find_loadstore_msr_slot(&m
->host
, msr
);
941 m
->host
.val
[i
] = m
->host
.val
[m
->host
.nr
];
942 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT
, m
->host
.nr
);
945 static void add_atomic_switch_msr_special(struct vcpu_vmx
*vmx
,
946 unsigned long entry
, unsigned long exit
,
947 unsigned long guest_val_vmcs
, unsigned long host_val_vmcs
,
948 u64 guest_val
, u64 host_val
)
950 vmcs_write64(guest_val_vmcs
, guest_val
);
951 if (host_val_vmcs
!= HOST_IA32_EFER
)
952 vmcs_write64(host_val_vmcs
, host_val
);
953 vm_entry_controls_setbit(vmx
, entry
);
954 vm_exit_controls_setbit(vmx
, exit
);
957 static void add_atomic_switch_msr(struct vcpu_vmx
*vmx
, unsigned msr
,
958 u64 guest_val
, u64 host_val
, bool entry_only
)
961 struct msr_autoload
*m
= &vmx
->msr_autoload
;
965 if (cpu_has_load_ia32_efer()) {
966 add_atomic_switch_msr_special(vmx
,
967 VM_ENTRY_LOAD_IA32_EFER
,
968 VM_EXIT_LOAD_IA32_EFER
,
971 guest_val
, host_val
);
975 case MSR_CORE_PERF_GLOBAL_CTRL
:
976 if (cpu_has_load_perf_global_ctrl()) {
977 add_atomic_switch_msr_special(vmx
,
978 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL
,
979 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL
,
980 GUEST_IA32_PERF_GLOBAL_CTRL
,
981 HOST_IA32_PERF_GLOBAL_CTRL
,
982 guest_val
, host_val
);
986 case MSR_IA32_PEBS_ENABLE
:
987 /* PEBS needs a quiescent period after being disabled (to write
988 * a record). Disabling PEBS through VMX MSR swapping doesn't
989 * provide that period, so a CPU could write host's record into
992 wrmsrl(MSR_IA32_PEBS_ENABLE
, 0);
995 i
= vmx_find_loadstore_msr_slot(&m
->guest
, msr
);
997 j
= vmx_find_loadstore_msr_slot(&m
->host
, msr
);
999 if ((i
< 0 && m
->guest
.nr
== MAX_NR_LOADSTORE_MSRS
) ||
1000 (j
< 0 && m
->host
.nr
== MAX_NR_LOADSTORE_MSRS
)) {
1001 printk_once(KERN_WARNING
"Not enough msr switch entries. "
1002 "Can't add msr %x\n", msr
);
1007 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT
, m
->guest
.nr
);
1009 m
->guest
.val
[i
].index
= msr
;
1010 m
->guest
.val
[i
].value
= guest_val
;
1017 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT
, m
->host
.nr
);
1019 m
->host
.val
[j
].index
= msr
;
1020 m
->host
.val
[j
].value
= host_val
;
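
/*
 * Decide how guest EFER is switched: via the atomic VMCS MSR load/store
 * lists (required when EFER.NX differs from the host under EPT, or when
 * "load IA32_EFER" is available), or lazily through the user-return MSR
 * mechanism otherwise.
 */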
1023 static bool update_transition_efer(struct vcpu_vmx
*vmx
)
1025 u64 guest_efer
= vmx
->vcpu
.arch
.efer
;
1026 u64 ignore_bits
= 0;
1029 /* Shadow paging assumes NX to be available. */
1031 guest_efer
|= EFER_NX
;
1034 * LMA and LME handled by hardware; SCE meaningless outside long mode.
1036 ignore_bits
|= EFER_SCE
;
1037 #ifdef CONFIG_X86_64
1038 ignore_bits
|= EFER_LMA
| EFER_LME
;
1039 /* SCE is meaningful only in long mode on Intel */
1040 if (guest_efer
& EFER_LMA
)
1041 ignore_bits
&= ~(u64
)EFER_SCE
;
1045 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1046 * On CPUs that support "load IA32_EFER", always switch EFER
1047 * atomically, since it's faster than switching it manually.
1049 if (cpu_has_load_ia32_efer() ||
1050 (enable_ept
&& ((vmx
->vcpu
.arch
.efer
^ host_efer
) & EFER_NX
))) {
1051 if (!(guest_efer
& EFER_LMA
))
1052 guest_efer
&= ~EFER_LME
;
1053 if (guest_efer
!= host_efer
)
1054 add_atomic_switch_msr(vmx
, MSR_EFER
,
1055 guest_efer
, host_efer
, false);
1057 clear_atomic_switch_msr(vmx
, MSR_EFER
);
1061 i
= __vmx_find_uret_msr(vmx
, MSR_EFER
);
1065 clear_atomic_switch_msr(vmx
, MSR_EFER
);
1067 guest_efer
&= ~ignore_bits
;
1068 guest_efer
|= host_efer
& ignore_bits
;
1070 vmx
->guest_uret_msrs
[i
].data
= guest_efer
;
1071 vmx
->guest_uret_msrs
[i
].mask
= ~ignore_bits
;
1076 #ifdef CONFIG_X86_32
1078 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1079 * VMCS rather than the segment table. KVM uses this helper to figure
1080 * out the current bases to poke them into the VMCS before entry.
1082 static unsigned long segment_base(u16 selector
)
1084 struct desc_struct
*table
;
1087 if (!(selector
& ~SEGMENT_RPL_MASK
))
1090 table
= get_current_gdt_ro();
1092 if ((selector
& SEGMENT_TI_MASK
) == SEGMENT_LDT
) {
1093 u16 ldt_selector
= kvm_read_ldt();
1095 if (!(ldt_selector
& ~SEGMENT_RPL_MASK
))
1098 table
= (struct desc_struct
*)segment_base(ldt_selector
);
1100 v
= get_desc_base(&table
[selector
>> 3]);
1105 static inline bool pt_can_write_msr(struct vcpu_vmx
*vmx
)
1107 return vmx_pt_mode_is_host_guest() &&
1108 !(vmx
->pt_desc
.guest
.ctl
& RTIT_CTL_TRACEEN
);
1111 static inline bool pt_output_base_valid(struct kvm_vcpu
*vcpu
, u64 base
)
1113 /* The base must be 128-byte aligned and a legal physical address. */
1114 return kvm_vcpu_is_legal_aligned_gpa(vcpu
, base
, 128);
1117 static inline void pt_load_msr(struct pt_ctx
*ctx
, u32 addr_range
)
1121 wrmsrl(MSR_IA32_RTIT_STATUS
, ctx
->status
);
1122 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE
, ctx
->output_base
);
1123 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK
, ctx
->output_mask
);
1124 wrmsrl(MSR_IA32_RTIT_CR3_MATCH
, ctx
->cr3_match
);
1125 for (i
= 0; i
< addr_range
; i
++) {
1126 wrmsrl(MSR_IA32_RTIT_ADDR0_A
+ i
* 2, ctx
->addr_a
[i
]);
1127 wrmsrl(MSR_IA32_RTIT_ADDR0_B
+ i
* 2, ctx
->addr_b
[i
]);
1131 static inline void pt_save_msr(struct pt_ctx
*ctx
, u32 addr_range
)
1135 rdmsrl(MSR_IA32_RTIT_STATUS
, ctx
->status
);
1136 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE
, ctx
->output_base
);
1137 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK
, ctx
->output_mask
);
1138 rdmsrl(MSR_IA32_RTIT_CR3_MATCH
, ctx
->cr3_match
);
1139 for (i
= 0; i
< addr_range
; i
++) {
1140 rdmsrl(MSR_IA32_RTIT_ADDR0_A
+ i
* 2, ctx
->addr_a
[i
]);
1141 rdmsrl(MSR_IA32_RTIT_ADDR0_B
+ i
* 2, ctx
->addr_b
[i
]);
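
/*
 * Context-switch Intel PT state around VM-Entry/VM-Exit when PT is
 * virtualized in host/guest mode: host RTIT_CTL is saved and tracing is
 * stopped before the guest's PT MSRs are loaded, and the reverse is done
 * on exit.
 */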
1145 static void pt_guest_enter(struct vcpu_vmx
*vmx
)
1147 if (vmx_pt_mode_is_system())
1151 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1152 * Save host state before VM entry.
1154 rdmsrl(MSR_IA32_RTIT_CTL
, vmx
->pt_desc
.host
.ctl
);
1155 if (vmx
->pt_desc
.guest
.ctl
& RTIT_CTL_TRACEEN
) {
1156 wrmsrl(MSR_IA32_RTIT_CTL
, 0);
1157 pt_save_msr(&vmx
->pt_desc
.host
, vmx
->pt_desc
.addr_range
);
1158 pt_load_msr(&vmx
->pt_desc
.guest
, vmx
->pt_desc
.addr_range
);
1162 static void pt_guest_exit(struct vcpu_vmx
*vmx
)
1164 if (vmx_pt_mode_is_system())
1167 if (vmx
->pt_desc
.guest
.ctl
& RTIT_CTL_TRACEEN
) {
1168 pt_save_msr(&vmx
->pt_desc
.guest
, vmx
->pt_desc
.addr_range
);
1169 pt_load_msr(&vmx
->pt_desc
.host
, vmx
->pt_desc
.addr_range
);
1172 /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1173 wrmsrl(MSR_IA32_RTIT_CTL
, vmx
->pt_desc
.host
.ctl
);
1176 void vmx_set_host_fs_gs(struct vmcs_host_state
*host
, u16 fs_sel
, u16 gs_sel
,
1177 unsigned long fs_base
, unsigned long gs_base
)
1179 if (unlikely(fs_sel
!= host
->fs_sel
)) {
1181 vmcs_write16(HOST_FS_SELECTOR
, fs_sel
);
1183 vmcs_write16(HOST_FS_SELECTOR
, 0);
1184 host
->fs_sel
= fs_sel
;
1186 if (unlikely(gs_sel
!= host
->gs_sel
)) {
1188 vmcs_write16(HOST_GS_SELECTOR
, gs_sel
);
1190 vmcs_write16(HOST_GS_SELECTOR
, 0);
1191 host
->gs_sel
= gs_sel
;
1193 if (unlikely(fs_base
!= host
->fs_base
)) {
1194 vmcs_writel(HOST_FS_BASE
, fs_base
);
1195 host
->fs_base
= fs_base
;
1197 if (unlikely(gs_base
!= host
->gs_base
)) {
1198 vmcs_writel(HOST_GS_BASE
, gs_base
);
1199 host
->gs_base
= gs_base
;
1203 void vmx_prepare_switch_to_guest(struct kvm_vcpu
*vcpu
)
1205 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1206 struct vmcs_host_state
*host_state
;
1207 #ifdef CONFIG_X86_64
1208 int cpu
= raw_smp_processor_id();
1210 unsigned long fs_base
, gs_base
;
1214 vmx
->req_immediate_exit
= false;
1217 * Note that guest MSRs to be saved/restored can also be changed
1218 * when guest state is loaded. This happens when guest transitions
1219 * to/from long-mode by setting MSR_EFER.LMA.
1221 if (!vmx
->guest_uret_msrs_loaded
) {
1222 vmx
->guest_uret_msrs_loaded
= true;
1223 for (i
= 0; i
< vmx
->nr_active_uret_msrs
; ++i
)
1224 kvm_set_user_return_msr(vmx
->guest_uret_msrs
[i
].slot
,
1225 vmx
->guest_uret_msrs
[i
].data
,
1226 vmx
->guest_uret_msrs
[i
].mask
);
1230 if (vmx
->nested
.need_vmcs12_to_shadow_sync
)
1231 nested_sync_vmcs12_to_shadow(vcpu
);
1233 if (vmx
->guest_state_loaded
)
1236 host_state
= &vmx
->loaded_vmcs
->host_state
;
1239 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1240 * allow segment selectors with cpl > 0 or ti == 1.
1242 host_state
->ldt_sel
= kvm_read_ldt();
1244 #ifdef CONFIG_X86_64
1245 savesegment(ds
, host_state
->ds_sel
);
1246 savesegment(es
, host_state
->es_sel
);
1248 gs_base
= cpu_kernelmode_gs_base(cpu
);
1249 if (likely(is_64bit_mm(current
->mm
))) {
1250 current_save_fsgs();
1251 fs_sel
= current
->thread
.fsindex
;
1252 gs_sel
= current
->thread
.gsindex
;
1253 fs_base
= current
->thread
.fsbase
;
1254 vmx
->msr_host_kernel_gs_base
= current
->thread
.gsbase
;
1256 savesegment(fs
, fs_sel
);
1257 savesegment(gs
, gs_sel
);
1258 fs_base
= read_msr(MSR_FS_BASE
);
1259 vmx
->msr_host_kernel_gs_base
= read_msr(MSR_KERNEL_GS_BASE
);
1262 wrmsrl(MSR_KERNEL_GS_BASE
, vmx
->msr_guest_kernel_gs_base
);
1264 savesegment(fs
, fs_sel
);
1265 savesegment(gs
, gs_sel
);
1266 fs_base
= segment_base(fs_sel
);
1267 gs_base
= segment_base(gs_sel
);
1270 vmx_set_host_fs_gs(host_state
, fs_sel
, gs_sel
, fs_base
, gs_base
);
1271 vmx
->guest_state_loaded
= true;
1274 static void vmx_prepare_switch_to_host(struct vcpu_vmx
*vmx
)
1276 struct vmcs_host_state
*host_state
;
1278 if (!vmx
->guest_state_loaded
)
1281 host_state
= &vmx
->loaded_vmcs
->host_state
;
1283 ++vmx
->vcpu
.stat
.host_state_reload
;
1285 #ifdef CONFIG_X86_64
1286 rdmsrl(MSR_KERNEL_GS_BASE
, vmx
->msr_guest_kernel_gs_base
);
1288 if (host_state
->ldt_sel
|| (host_state
->gs_sel
& 7)) {
1289 kvm_load_ldt(host_state
->ldt_sel
);
1290 #ifdef CONFIG_X86_64
1291 load_gs_index(host_state
->gs_sel
);
1293 loadsegment(gs
, host_state
->gs_sel
);
1296 if (host_state
->fs_sel
& 7)
1297 loadsegment(fs
, host_state
->fs_sel
);
1298 #ifdef CONFIG_X86_64
1299 if (unlikely(host_state
->ds_sel
| host_state
->es_sel
)) {
1300 loadsegment(ds
, host_state
->ds_sel
);
1301 loadsegment(es
, host_state
->es_sel
);
1304 invalidate_tss_limit();
1305 #ifdef CONFIG_X86_64
1306 wrmsrl(MSR_KERNEL_GS_BASE
, vmx
->msr_host_kernel_gs_base
);
1308 load_fixmap_gdt(raw_smp_processor_id());
1309 vmx
->guest_state_loaded
= false;
1310 vmx
->guest_uret_msrs_loaded
= false;
1313 #ifdef CONFIG_X86_64
1314 static u64
vmx_read_guest_kernel_gs_base(struct vcpu_vmx
*vmx
)
1317 if (vmx
->guest_state_loaded
)
1318 rdmsrl(MSR_KERNEL_GS_BASE
, vmx
->msr_guest_kernel_gs_base
);
1320 return vmx
->msr_guest_kernel_gs_base
;
1323 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx
*vmx
, u64 data
)
1326 if (vmx
->guest_state_loaded
)
1327 wrmsrl(MSR_KERNEL_GS_BASE
, data
);
1329 vmx
->msr_guest_kernel_gs_base
= data
;
1333 void vmx_vcpu_load_vmcs(struct kvm_vcpu
*vcpu
, int cpu
,
1334 struct loaded_vmcs
*buddy
)
1336 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1337 bool already_loaded
= vmx
->loaded_vmcs
->cpu
== cpu
;
1340 if (!already_loaded
) {
1341 loaded_vmcs_clear(vmx
->loaded_vmcs
);
1342 local_irq_disable();
1345 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1346 * this cpu's percpu list, otherwise it may not yet be deleted
1347 * from its previous cpu's percpu list. Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();
1352 list_add(&vmx
->loaded_vmcs
->loaded_vmcss_on_cpu_link
,
1353 &per_cpu(loaded_vmcss_on_cpu
, cpu
));
1357 prev
= per_cpu(current_vmcs
, cpu
);
1358 if (prev
!= vmx
->loaded_vmcs
->vmcs
) {
1359 per_cpu(current_vmcs
, cpu
) = vmx
->loaded_vmcs
->vmcs
;
1360 vmcs_load(vmx
->loaded_vmcs
->vmcs
);
1363 * No indirect branch prediction barrier needed when switching
1364 * the active VMCS within a guest, e.g. on nested VM-Enter.
1365 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
1367 if (!buddy
|| WARN_ON_ONCE(buddy
->vmcs
!= prev
))
1368 indirect_branch_prediction_barrier();
1371 if (!already_loaded
) {
1372 void *gdt
= get_current_gdt_ro();
1373 unsigned long sysenter_esp
;
1376 * Flush all EPTP/VPID contexts, the new pCPU may have stale
1377 * TLB entries from its previous association with the vCPU.
1379 kvm_make_request(KVM_REQ_TLB_FLUSH
, vcpu
);
1382 * Linux uses per-cpu TSS and GDT, so set these when switching
1383 * processors. See 22.2.4.
1385 vmcs_writel(HOST_TR_BASE
,
1386 (unsigned long)&get_cpu_entry_area(cpu
)->tss
.x86_tss
);
1387 vmcs_writel(HOST_GDTR_BASE
, (unsigned long)gdt
); /* 22.2.4 */
1389 rdmsrl(MSR_IA32_SYSENTER_ESP
, sysenter_esp
);
1390 vmcs_writel(HOST_IA32_SYSENTER_ESP
, sysenter_esp
); /* 22.2.3 */
1392 vmx
->loaded_vmcs
->cpu
= cpu
;
1395 /* Setup TSC multiplier */
1396 if (kvm_has_tsc_control
&&
1397 vmx
->current_tsc_ratio
!= vcpu
->arch
.tsc_scaling_ratio
)
1398 decache_tsc_multiplier(vmx
);
1402 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1403 * vcpu mutex is already taken.
1405 static void vmx_vcpu_load(struct kvm_vcpu
*vcpu
, int cpu
)
1407 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1409 vmx_vcpu_load_vmcs(vcpu
, cpu
, NULL
);
1411 vmx_vcpu_pi_load(vcpu
, cpu
);
1413 vmx
->host_debugctlmsr
= get_debugctlmsr();
1416 static void vmx_vcpu_put(struct kvm_vcpu
*vcpu
)
1418 vmx_vcpu_pi_put(vcpu
);
1420 vmx_prepare_switch_to_host(to_vmx(vcpu
));
1423 static bool emulation_required(struct kvm_vcpu
*vcpu
)
1425 return emulate_invalid_guest_state
&& !vmx_guest_state_valid(vcpu
);
1428 unsigned long vmx_get_rflags(struct kvm_vcpu
*vcpu
)
1430 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1431 unsigned long rflags
, save_rflags
;
1433 if (!kvm_register_is_available(vcpu
, VCPU_EXREG_RFLAGS
)) {
1434 kvm_register_mark_available(vcpu
, VCPU_EXREG_RFLAGS
);
1435 rflags
= vmcs_readl(GUEST_RFLAGS
);
1436 if (vmx
->rmode
.vm86_active
) {
1437 rflags
&= RMODE_GUEST_OWNED_EFLAGS_BITS
;
1438 save_rflags
= vmx
->rmode
.save_rflags
;
1439 rflags
|= save_rflags
& ~RMODE_GUEST_OWNED_EFLAGS_BITS
;
1441 vmx
->rflags
= rflags
;
1446 void vmx_set_rflags(struct kvm_vcpu
*vcpu
, unsigned long rflags
)
1448 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1449 unsigned long old_rflags
;
1451 if (is_unrestricted_guest(vcpu
)) {
1452 kvm_register_mark_available(vcpu
, VCPU_EXREG_RFLAGS
);
1453 vmx
->rflags
= rflags
;
1454 vmcs_writel(GUEST_RFLAGS
, rflags
);
1458 old_rflags
= vmx_get_rflags(vcpu
);
1459 vmx
->rflags
= rflags
;
1460 if (vmx
->rmode
.vm86_active
) {
1461 vmx
->rmode
.save_rflags
= rflags
;
1462 rflags
|= X86_EFLAGS_IOPL
| X86_EFLAGS_VM
;
1464 vmcs_writel(GUEST_RFLAGS
, rflags
);
1466 if ((old_rflags
^ vmx
->rflags
) & X86_EFLAGS_VM
)
1467 vmx
->emulation_required
= emulation_required(vcpu
);
1470 u32
vmx_get_interrupt_shadow(struct kvm_vcpu
*vcpu
)
1472 u32 interruptibility
= vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
);
1475 if (interruptibility
& GUEST_INTR_STATE_STI
)
1476 ret
|= KVM_X86_SHADOW_INT_STI
;
1477 if (interruptibility
& GUEST_INTR_STATE_MOV_SS
)
1478 ret
|= KVM_X86_SHADOW_INT_MOV_SS
;
1483 void vmx_set_interrupt_shadow(struct kvm_vcpu
*vcpu
, int mask
)
1485 u32 interruptibility_old
= vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
);
1486 u32 interruptibility
= interruptibility_old
;
1488 interruptibility
&= ~(GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_MOV_SS
);
1490 if (mask
& KVM_X86_SHADOW_INT_MOV_SS
)
1491 interruptibility
|= GUEST_INTR_STATE_MOV_SS
;
1492 else if (mask
& KVM_X86_SHADOW_INT_STI
)
1493 interruptibility
|= GUEST_INTR_STATE_STI
;
1495 if ((interruptibility
!= interruptibility_old
))
1496 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO
, interruptibility
);
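
/*
 * Validate a guest write to IA32_RTIT_CTL against the PT capabilities
 * exposed to this vCPU; a non-zero return means the write must #GP.
 */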
1499 static int vmx_rtit_ctl_check(struct kvm_vcpu
*vcpu
, u64 data
)
1501 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1502 unsigned long value
;
1505 * Any MSR write that attempts to change bits marked reserved will
1508 if (data
& vmx
->pt_desc
.ctl_bitmask
)
1512 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1513 * result in a #GP unless the same write also clears TraceEn.
1515 if ((vmx
->pt_desc
.guest
.ctl
& RTIT_CTL_TRACEEN
) &&
1516 ((vmx
->pt_desc
.guest
.ctl
^ data
) & ~RTIT_CTL_TRACEEN
))
1520 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
1521 * and FabricEn would cause #GP, if
1522 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1524 if ((data
& RTIT_CTL_TRACEEN
) && !(data
& RTIT_CTL_TOPA
) &&
1525 !(data
& RTIT_CTL_FABRIC_EN
) &&
1526 !intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1527 PT_CAP_single_range_output
))
	/*
	 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
	 * utilizes encodings marked reserved will cause a #GP fault.
	 */
1534 value
= intel_pt_validate_cap(vmx
->pt_desc
.caps
, PT_CAP_mtc_periods
);
1535 if (intel_pt_validate_cap(vmx
->pt_desc
.caps
, PT_CAP_mtc
) &&
1536 !test_bit((data
& RTIT_CTL_MTC_RANGE
) >>
1537 RTIT_CTL_MTC_RANGE_OFFSET
, &value
))
1539 value
= intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1540 PT_CAP_cycle_thresholds
);
1541 if (intel_pt_validate_cap(vmx
->pt_desc
.caps
, PT_CAP_psb_cyc
) &&
1542 !test_bit((data
& RTIT_CTL_CYC_THRESH
) >>
1543 RTIT_CTL_CYC_THRESH_OFFSET
, &value
))
1545 value
= intel_pt_validate_cap(vmx
->pt_desc
.caps
, PT_CAP_psb_periods
);
1546 if (intel_pt_validate_cap(vmx
->pt_desc
.caps
, PT_CAP_psb_cyc
) &&
1547 !test_bit((data
& RTIT_CTL_PSB_FREQ
) >>
1548 RTIT_CTL_PSB_FREQ_OFFSET
, &value
))
	/*
	 * If ADDRx_CFG is reserved or the encoding is >2, the write will
	 * cause a #GP fault.
	 */
1555 value
= (data
& RTIT_CTL_ADDR0
) >> RTIT_CTL_ADDR0_OFFSET
;
1556 if ((value
&& (vmx
->pt_desc
.addr_range
< 1)) || (value
> 2))
1558 value
= (data
& RTIT_CTL_ADDR1
) >> RTIT_CTL_ADDR1_OFFSET
;
1559 if ((value
&& (vmx
->pt_desc
.addr_range
< 2)) || (value
> 2))
1561 value
= (data
& RTIT_CTL_ADDR2
) >> RTIT_CTL_ADDR2_OFFSET
;
1562 if ((value
&& (vmx
->pt_desc
.addr_range
< 3)) || (value
> 2))
1564 value
= (data
& RTIT_CTL_ADDR3
) >> RTIT_CTL_ADDR3_OFFSET
;
1565 if ((value
&& (vmx
->pt_desc
.addr_range
< 4)) || (value
> 2))
1571 static bool vmx_can_emulate_instruction(struct kvm_vcpu
*vcpu
, void *insn
, int insn_len
)
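
/*
 * Advance RIP past the just-executed instruction, using
 * VM_EXIT_INSTRUCTION_LEN when the hardware value can be trusted and
 * falling back to EMULTYPE_SKIP emulation otherwise.
 */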
1576 static int skip_emulated_instruction(struct kvm_vcpu
*vcpu
)
1578 unsigned long rip
, orig_rip
;
1581 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1582 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1583 * set when EPT misconfig occurs. In practice, real hardware updates
1584 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1585 * (namely Hyper-V) don't set it due to it being undefined behavior,
1586 * i.e. we end up advancing IP with some random value.
1588 if (!static_cpu_has(X86_FEATURE_HYPERVISOR
) ||
1589 to_vmx(vcpu
)->exit_reason
.basic
!= EXIT_REASON_EPT_MISCONFIG
) {
1590 orig_rip
= kvm_rip_read(vcpu
);
1591 rip
= orig_rip
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN
);
1592 #ifdef CONFIG_X86_64
1594 * We need to mask out the high 32 bits of RIP if not in 64-bit
1595 * mode, but just finding out that we are in 64-bit mode is
1596 * quite expensive. Only do it if there was a carry.
1598 if (unlikely(((rip
^ orig_rip
) >> 31) == 3) && !is_64_bit_mode(vcpu
))
1601 kvm_rip_write(vcpu
, rip
);
1603 if (!kvm_emulate_instruction(vcpu
, EMULTYPE_SKIP
))
1607 /* skipping an emulated instruction also counts */
1608 vmx_set_interrupt_shadow(vcpu
, 0);
/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.
 */
1617 static void vmx_update_emulated_instruction(struct kvm_vcpu
*vcpu
)
1619 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
1620 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1622 if (!is_guest_mode(vcpu
))
1626 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1627 * T-bit traps. As instruction emulation is completed (i.e. at the
1628 * instruction boundary), any #DB exception pending delivery must be a
1629 * debug-trap. Record the pending MTF state to be delivered in
1630 * vmx_check_nested_events().
1632 if (nested_cpu_has_mtf(vmcs12
) &&
1633 (!vcpu
->arch
.exception
.pending
||
1634 vcpu
->arch
.exception
.nr
== DB_VECTOR
))
1635 vmx
->nested
.mtf_pending
= true;
1637 vmx
->nested
.mtf_pending
= false;
1640 static int vmx_skip_emulated_instruction(struct kvm_vcpu
*vcpu
)
1642 vmx_update_emulated_instruction(vcpu
);
1643 return skip_emulated_instruction(vcpu
);
1646 static void vmx_clear_hlt(struct kvm_vcpu
*vcpu
)
1649 * Ensure that we clear the HLT state in the VMCS. We don't need to
1650 * explicitly skip the instruction because if the HLT state is set,
1651 * then the instruction is already executing and RIP has already been
1654 if (kvm_hlt_in_guest(vcpu
->kvm
) &&
1655 vmcs_read32(GUEST_ACTIVITY_STATE
) == GUEST_ACTIVITY_HLT
)
1656 vmcs_write32(GUEST_ACTIVITY_STATE
, GUEST_ACTIVITY_ACTIVE
);
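
/*
 * Inject the exception pending in vcpu->arch.exception, either through
 * VM_ENTRY_INTR_INFO_FIELD or, for vm86 real mode, by emulating real-mode
 * interrupt delivery.
 */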
1659 static void vmx_queue_exception(struct kvm_vcpu
*vcpu
)
1661 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1662 unsigned nr
= vcpu
->arch
.exception
.nr
;
1663 bool has_error_code
= vcpu
->arch
.exception
.has_error_code
;
1664 u32 error_code
= vcpu
->arch
.exception
.error_code
;
1665 u32 intr_info
= nr
| INTR_INFO_VALID_MASK
;
1667 kvm_deliver_exception_payload(vcpu
);
1669 if (has_error_code
) {
1670 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE
, error_code
);
1671 intr_info
|= INTR_INFO_DELIVER_CODE_MASK
;
1674 if (vmx
->rmode
.vm86_active
) {
1676 if (kvm_exception_is_soft(nr
))
1677 inc_eip
= vcpu
->arch
.event_exit_inst_len
;
1678 kvm_inject_realmode_interrupt(vcpu
, nr
, inc_eip
);
1682 WARN_ON_ONCE(vmx
->emulation_required
);
1684 if (kvm_exception_is_soft(nr
)) {
1685 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN
,
1686 vmx
->vcpu
.arch
.event_exit_inst_len
);
1687 intr_info
|= INTR_TYPE_SOFT_EXCEPTION
;
1689 intr_info
|= INTR_TYPE_HARD_EXCEPTION
;
1691 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
, intr_info
);
1693 vmx_clear_hlt(vcpu
);
1696 static void vmx_setup_uret_msr(struct vcpu_vmx
*vmx
, unsigned int msr
)
1698 struct vmx_uret_msr tmp
;
1701 from
= __vmx_find_uret_msr(vmx
, msr
);
1704 to
= vmx
->nr_active_uret_msrs
++;
1706 tmp
= vmx
->guest_uret_msrs
[to
];
1707 vmx
->guest_uret_msrs
[to
] = vmx
->guest_uret_msrs
[from
];
1708 vmx
->guest_uret_msrs
[from
] = tmp
;
1712 * Set up the vmcs to automatically save and restore system
1713 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
1714 * mode, as fiddling with msrs is very expensive.
1716 static void setup_msrs(struct vcpu_vmx
*vmx
)
1718 vmx
->guest_uret_msrs_loaded
= false;
1719 vmx
->nr_active_uret_msrs
= 0;
1720 #ifdef CONFIG_X86_64
1722 * The SYSCALL MSRs are only needed on long mode guests, and only
1723 * when EFER.SCE is set.
1725 if (is_long_mode(&vmx
->vcpu
) && (vmx
->vcpu
.arch
.efer
& EFER_SCE
)) {
1726 vmx_setup_uret_msr(vmx
, MSR_STAR
);
1727 vmx_setup_uret_msr(vmx
, MSR_LSTAR
);
1728 vmx_setup_uret_msr(vmx
, MSR_SYSCALL_MASK
);
1731 if (update_transition_efer(vmx
))
1732 vmx_setup_uret_msr(vmx
, MSR_EFER
);
1734 if (guest_cpuid_has(&vmx
->vcpu
, X86_FEATURE_RDTSCP
))
1735 vmx_setup_uret_msr(vmx
, MSR_TSC_AUX
);
1737 vmx_setup_uret_msr(vmx
, MSR_IA32_TSX_CTRL
);
1739 if (cpu_has_vmx_msr_bitmap())
1740 vmx_update_msr_bitmap(&vmx
->vcpu
);
1743 static u64
vmx_write_l1_tsc_offset(struct kvm_vcpu
*vcpu
, u64 offset
)
1745 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
1746 u64 g_tsc_offset
= 0;
1749 * We're here if L1 chose not to trap WRMSR to TSC. According
1750 * to the spec, this should set L1's TSC; The offset that L1
1751 * set for L2 remains unchanged, and still needs to be added
1752 * to the newly set TSC to get L2's TSC.
1754 if (is_guest_mode(vcpu
) &&
1755 (vmcs12
->cpu_based_vm_exec_control
& CPU_BASED_USE_TSC_OFFSETTING
))
1756 g_tsc_offset
= vmcs12
->tsc_offset
;
1758 trace_kvm_write_tsc_offset(vcpu
->vcpu_id
,
1759 vcpu
->arch
.tsc_offset
- g_tsc_offset
,
1761 vmcs_write64(TSC_OFFSET
, offset
+ g_tsc_offset
);
1762 return offset
+ g_tsc_offset
;
1766 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1767 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1768 * all guests if the "nested" module option is off, and can also be disabled
1769 * for a single guest by disabling its VMX cpuid bit.
1771 bool nested_vmx_allowed(struct kvm_vcpu
*vcpu
)
1773 return nested
&& guest_cpuid_has(vcpu
, X86_FEATURE_VMX
);
1776 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu
*vcpu
,
1779 uint64_t valid_bits
= to_vmx(vcpu
)->msr_ia32_feature_control_valid_bits
;
1781 return !(val
& ~valid_bits
);
1784 static int vmx_get_msr_feature(struct kvm_msr_entry
*msr
)
1786 switch (msr
->index
) {
1787 case MSR_IA32_VMX_BASIC
... MSR_IA32_VMX_VMFUNC
:
1790 return vmx_get_vmx_msr(&vmcs_config
.nested
, msr
->index
, &msr
->data
);
1791 case MSR_IA32_PERF_CAPABILITIES
:
1792 msr
->data
= vmx_get_perf_capabilities();
1795 return KVM_MSR_RET_INVALID
;
1800 * Reads an msr value (of 'msr_index') into 'pdata'.
1801 * Returns 0 on success, non-0 otherwise.
1802 * Assumes vcpu_load() was already called.
1804 static int vmx_get_msr(struct kvm_vcpu
*vcpu
, struct msr_data
*msr_info
)
1806 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1807 struct vmx_uret_msr
*msr
;
1810 switch (msr_info
->index
) {
1811 #ifdef CONFIG_X86_64
1813 msr_info
->data
= vmcs_readl(GUEST_FS_BASE
);
1816 msr_info
->data
= vmcs_readl(GUEST_GS_BASE
);
1818 case MSR_KERNEL_GS_BASE
:
1819 msr_info
->data
= vmx_read_guest_kernel_gs_base(vmx
);
1823 return kvm_get_msr_common(vcpu
, msr_info
);
1824 case MSR_IA32_TSX_CTRL
:
1825 if (!msr_info
->host_initiated
&&
1826 !(vcpu
->arch
.arch_capabilities
& ARCH_CAP_TSX_CTRL_MSR
))
1829 case MSR_IA32_UMWAIT_CONTROL
:
1830 if (!msr_info
->host_initiated
&& !vmx_has_waitpkg(vmx
))
1833 msr_info
->data
= vmx
->msr_ia32_umwait_control
;
1835 case MSR_IA32_SPEC_CTRL
:
1836 if (!msr_info
->host_initiated
&&
1837 !guest_has_spec_ctrl_msr(vcpu
))
1840 msr_info
->data
= to_vmx(vcpu
)->spec_ctrl
;
1842 case MSR_IA32_SYSENTER_CS
:
1843 msr_info
->data
= vmcs_read32(GUEST_SYSENTER_CS
);
1845 case MSR_IA32_SYSENTER_EIP
:
1846 msr_info
->data
= vmcs_readl(GUEST_SYSENTER_EIP
);
1848 case MSR_IA32_SYSENTER_ESP
:
1849 msr_info
->data
= vmcs_readl(GUEST_SYSENTER_ESP
);
1851 case MSR_IA32_BNDCFGS
:
1852 if (!kvm_mpx_supported() ||
1853 (!msr_info
->host_initiated
&&
1854 !guest_cpuid_has(vcpu
, X86_FEATURE_MPX
)))
1856 msr_info
->data
= vmcs_read64(GUEST_BNDCFGS
);
1858 case MSR_IA32_MCG_EXT_CTL
:
1859 if (!msr_info
->host_initiated
&&
1860 !(vmx
->msr_ia32_feature_control
&
1861 FEAT_CTL_LMCE_ENABLED
))
1863 msr_info
->data
= vcpu
->arch
.mcg_ext_ctl
;
1865 case MSR_IA32_FEAT_CTL
:
1866 msr_info
->data
= vmx
->msr_ia32_feature_control
;
1868 case MSR_IA32_VMX_BASIC
... MSR_IA32_VMX_VMFUNC
:
1869 if (!nested_vmx_allowed(vcpu
))
1871 if (vmx_get_vmx_msr(&vmx
->nested
.msrs
, msr_info
->index
,
1875 * Enlightened VMCS v1 doesn't have certain fields, but buggy
1876 * Hyper-V versions are still trying to use corresponding
1877 * features when they are exposed. Filter out the essential
1880 if (!msr_info
->host_initiated
&&
1881 vmx
->nested
.enlightened_vmcs_enabled
)
1882 nested_evmcs_filter_control_msr(msr_info
->index
,
1885 case MSR_IA32_RTIT_CTL
:
1886 if (!vmx_pt_mode_is_host_guest())
1888 msr_info
->data
= vmx
->pt_desc
.guest
.ctl
;
1890 case MSR_IA32_RTIT_STATUS
:
1891 if (!vmx_pt_mode_is_host_guest())
1893 msr_info
->data
= vmx
->pt_desc
.guest
.status
;
1895 case MSR_IA32_RTIT_CR3_MATCH
:
1896 if (!vmx_pt_mode_is_host_guest() ||
1897 !intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1898 PT_CAP_cr3_filtering
))
1900 msr_info
->data
= vmx
->pt_desc
.guest
.cr3_match
;
1902 case MSR_IA32_RTIT_OUTPUT_BASE
:
1903 if (!vmx_pt_mode_is_host_guest() ||
1904 (!intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1905 PT_CAP_topa_output
) &&
1906 !intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1907 PT_CAP_single_range_output
)))
1909 msr_info
->data
= vmx
->pt_desc
.guest
.output_base
;
1911 case MSR_IA32_RTIT_OUTPUT_MASK
:
1912 if (!vmx_pt_mode_is_host_guest() ||
1913 (!intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1914 PT_CAP_topa_output
) &&
1915 !intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1916 PT_CAP_single_range_output
)))
1918 msr_info
->data
= vmx
->pt_desc
.guest
.output_mask
;
1920 case MSR_IA32_RTIT_ADDR0_A
... MSR_IA32_RTIT_ADDR3_B
:
1921 index
= msr_info
->index
- MSR_IA32_RTIT_ADDR0_A
;
1922 if (!vmx_pt_mode_is_host_guest() ||
1923 (index
>= 2 * intel_pt_validate_cap(vmx
->pt_desc
.caps
,
1924 PT_CAP_num_address_ranges
)))
1927 msr_info
->data
= vmx
->pt_desc
.guest
.addr_b
[index
/ 2];
1929 msr_info
->data
= vmx
->pt_desc
.guest
.addr_a
[index
/ 2];
1932 if (!msr_info
->host_initiated
&&
1933 !guest_cpuid_has(vcpu
, X86_FEATURE_RDTSCP
))
1936 case MSR_IA32_DEBUGCTLMSR
:
1937 msr_info
->data
= vmcs_read64(GUEST_IA32_DEBUGCTL
);
1941 msr
= vmx_find_uret_msr(vmx
, msr_info
->index
);
1943 msr_info
->data
= msr
->data
;
1946 return kvm_get_msr_common(vcpu
, msr_info
);
1952 static u64
nested_vmx_truncate_sysenter_addr(struct kvm_vcpu
*vcpu
,
1955 #ifdef CONFIG_X86_64
1956 if (!guest_cpuid_has(vcpu
, X86_FEATURE_LM
))
1959 return (unsigned long)data
;
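
/*
 * Mask of IA32_DEBUGCTL bits the guest may set; LBR bits are only allowed
 * when a guest LBR perf event can be created for this vCPU.
 */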
1962 static u64
vcpu_supported_debugctl(struct kvm_vcpu
*vcpu
)
1964 u64 debugctl
= vmx_supported_debugctl();
1966 if (!intel_pmu_lbr_is_enabled(vcpu
))
1967 debugctl
&= ~DEBUGCTLMSR_LBR_MASK
;
/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_uret_msr *msr;
	int ret = 0;
	u32 msr_index = msr_info->index;
	u64 data = msr_info->data;
	u32 index;

	switch (msr_index) {
	case MSR_EFER:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_FS_BASE, data);
		break;
	case MSR_GS_BASE:
		vmx_segment_cache_clear(vmx);
		vmcs_writel(GUEST_GS_BASE, data);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_write_guest_kernel_gs_base(vmx, data);
		break;
#endif
	case MSR_IA32_SYSENTER_CS:
		if (is_guest_mode(vcpu))
			get_vmcs12(vcpu)->guest_sysenter_cs = data;
		vmcs_write32(GUEST_SYSENTER_CS, data);
		break;
	case MSR_IA32_SYSENTER_EIP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_eip = data;
		}
		vmcs_writel(GUEST_SYSENTER_EIP, data);
		break;
	case MSR_IA32_SYSENTER_ESP:
		if (is_guest_mode(vcpu)) {
			data = nested_vmx_truncate_sysenter_addr(vcpu, data);
			get_vmcs12(vcpu)->guest_sysenter_esp = data;
		}
		vmcs_writel(GUEST_SYSENTER_ESP, data);
		break;
	case MSR_IA32_DEBUGCTLMSR: {
		u64 invalid = data & ~vcpu_supported_debugctl(vcpu);

		if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
			if (report_ignored_msrs)
				vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
					    __func__, data);
			data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
			invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
		}

		if (invalid)
			return 1;

		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
						VM_EXIT_SAVE_DEBUG_CONTROLS)
			get_vmcs12(vcpu)->guest_ia32_debugctl = data;

		vmcs_write64(GUEST_IA32_DEBUGCTL, data);
		if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
		    (data & DEBUGCTLMSR_LBR))
			intel_pmu_create_guest_lbr_event(vcpu);
		return 0;
	}
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
		    (data & MSR_IA32_BNDCFGS_RSVD))
			return 1;
		vmcs_write64(GUEST_BNDCFGS, data);
		break;
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
			return 1;

		vmx->msr_ia32_umwait_control = data;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_has_spec_ctrl_msr(vcpu))
			return 1;

		if (kvm_spec_ctrl_test_value(data))
			return 1;

		vmx->spec_ctrl = data;
		if (!data)
			break;

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging. We update the vmcs01 here for L1 as well
		 * since it will end up touching the MSR anyway now.
		 */
		vmx_disable_intercept_for_msr(vcpu,
					      MSR_IA32_SPEC_CTRL,
					      MSR_TYPE_RW);
		break;
	case MSR_IA32_TSX_CTRL:
		if (!msr_info->host_initiated &&
		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
			return 1;
		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
			return 1;
		goto find_uret_msr;
	case MSR_IA32_PRED_CMD:
		if (!msr_info->host_initiated &&
		    !guest_has_pred_cmd_msr(vcpu))
			return 1;

		if (data & ~PRED_CMD_IBPB)
			return 1;
		if (!boot_cpu_has(X86_FEATURE_IBPB))
			return 1;
		if (!data)
			break;

		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);

		/*
		 * For non-nested:
		 * When it's written (to non-zero) for the first time, pass
		 * it through.
		 *
		 * For nested:
		 * The handling of the MSR bitmap for L2 guests is done in
		 * nested_vmx_prepare_msr_bitmap. We should not touch the
		 * vmcs02.msr_bitmap here since it gets completely overwritten
		 * in the merging.
		 */
		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
		break;
	case MSR_IA32_CR_PAT:
		if (!kvm_pat_valid(data))
			return 1;

		if (is_guest_mode(vcpu) &&
		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
			get_vmcs12(vcpu)->guest_ia32_pat = data;

		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
			vmcs_write64(GUEST_IA32_PAT, data);
			vcpu->arch.pat = data;
			break;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_TSC_ADJUST:
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if ((!msr_info->host_initiated &&
		     !(to_vmx(vcpu)->msr_ia32_feature_control &
		       FEAT_CTL_LMCE_ENABLED)) ||
		    (data & ~MCG_EXT_CTL_LMCE_EN))
			return 1;
		vcpu->arch.mcg_ext_ctl = data;
		break;
	case MSR_IA32_FEAT_CTL:
		if (!vmx_feature_control_msr_valid(vcpu, data) ||
		    (to_vmx(vcpu)->msr_ia32_feature_control &
		     FEAT_CTL_LOCKED && !msr_info->host_initiated))
			return 1;
		vmx->msr_ia32_feature_control = data;
		if (msr_info->host_initiated && data == 0)
			vmx_leave_nested(vcpu);
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!msr_info->host_initiated)
			return 1; /* they are read-only */
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_set_vmx_msr(vcpu, msr_index, data);
	case MSR_IA32_RTIT_CTL:
		if (!vmx_pt_mode_is_host_guest() ||
			vmx_rtit_ctl_check(vcpu, data) ||
			vmx->nested.vmxon)
			return 1;
		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
		vmx->pt_desc.guest.ctl = data;
		pt_update_intercept_for_msr(vcpu);
		break;
	case MSR_IA32_RTIT_STATUS:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (data & MSR_IA32_RTIT_STATUS_MASK)
			return 1;
		vmx->pt_desc.guest.status = data;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_cr3_filtering))
			return 1;
		vmx->pt_desc.guest.cr3_match = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		if (!pt_output_base_valid(vcpu, data))
			return 1;
		vmx->pt_desc.guest.output_base = data;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if (!pt_can_write_msr(vmx))
			return 1;
		if (!intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_topa_output) &&
		    !intel_pt_validate_cap(vmx->pt_desc.caps,
					   PT_CAP_single_range_output))
			return 1;
		vmx->pt_desc.guest.output_mask = data;
		break;
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		if (!pt_can_write_msr(vmx))
			return 1;
		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
		if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
						       PT_CAP_num_address_ranges))
			return 1;
		if (is_noncanonical_address(data, vcpu))
			return 1;
		if (index % 2)
			vmx->pt_desc.guest.addr_b[index / 2] = data;
		else
			vmx->pt_desc.guest.addr_a[index / 2] = data;
		break;
	case MSR_TSC_AUX:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			return 1;
		/* Check reserved bit, higher 32 bits should be zero */
		if ((data >> 32) != 0)
			return 1;
		goto find_uret_msr;
	case MSR_IA32_PERF_CAPABILITIES:
		if (data && !vcpu_to_pmu(vcpu)->version)
			return 1;
		if (data & PMU_CAP_LBR_FMT) {
			if ((data & PMU_CAP_LBR_FMT) !=
			    (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
				return 1;
			if (!intel_pmu_lbr_is_compatible(vcpu))
				return 1;
		}
		ret = kvm_set_msr_common(vcpu, msr_info);
		break;

	default:
	find_uret_msr:
		msr = vmx_find_uret_msr(vmx, msr_index);
		if (msr)
			ret = vmx_set_guest_uret_msr(vmx, msr, data);
		else
			ret = kvm_set_msr_common(vcpu, msr_info);
	}

	return ret;
}
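/*
 * MSRs that reach the find_uret_msr label above (e.g. MSR_TSC_AUX and
 * MSR_IA32_TSX_CTRL) are not switched through VMCS fields; their guest
 * values are tracked in vmx->guest_uret_msrs and the host values are
 * restored lazily by KVM's user-return MSR notifier.
 */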
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	unsigned long guest_owned_bits;

	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_REGS_RSP:
		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
		break;
	case VCPU_REGS_RIP:
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		break;
	case VCPU_EXREG_PDPTR:
		if (enable_ept)
			ept_save_pdptrs(vcpu);
		break;
	case VCPU_EXREG_CR0:
		guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;

		vcpu->arch.cr0 &= ~guest_owned_bits;
		vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
		break;
	case VCPU_EXREG_CR3:
		if (is_unrestricted_guest(vcpu) ||
		    (enable_ept && is_paging(vcpu)))
			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
		break;
	case VCPU_EXREG_CR4:
		guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;

		vcpu->arch.cr4 &= ~guest_owned_bits;
		vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}
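/*
 * In the CR0/CR4 cases above, only the bits listed in
 * cr0/cr4_guest_owned_bits can be changed by the guest without a
 * VM-Exit, so only those bits need to be re-read from the VMCS; the
 * rest of the cached value is already up to date because writes to it
 * are intercepted.
 */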
static __init int cpu_has_kvm_support(void)
{
	return cpu_has_vmx();
}

static __init int vmx_disabled_by_bios(void)
{
	return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
	       !boot_cpu_has(X86_FEATURE_VMX);
}

static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
	u64 msr;

	cr4_set_bits(X86_CR4_VMXE);

	asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
			  _ASM_EXTABLE(1b, %l[fault])
			  : : [vmxon_pointer] "m"(vmxon_pointer)
			  : : fault);
	return 0;

fault:
	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
		  rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
	cr4_clear_bits(X86_CR4_VMXE);

	return -EFAULT;
}
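/*
 * CR4.VMXE must be set before executing VMXON, otherwise the instruction
 * raises #UD.  A fault taken here usually means VMX is disabled or left
 * unlocked in IA32_FEAT_CTL by firmware, which is why the WARN above
 * dumps that MSR.
 */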
static int hardware_enable(void)
{
	int cpu = raw_smp_processor_id();
	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
	int r;

	if (cr4_read_shadow() & X86_CR4_VMXE)
		return -EBUSY;

	/*
	 * This can happen if we hot-added a CPU but failed to allocate
	 * VP assist page for it.
	 */
	if (static_branch_unlikely(&enable_evmcs) &&
	    !hv_get_vp_assist_page(cpu))
		return -EFAULT;

	intel_pt_handle_vmx(1);

	r = kvm_cpu_vmxon(phys_addr);
	if (r) {
		intel_pt_handle_vmx(0);
		return r;
	}

	if (enable_ept)
		ept_sync_global();

	return 0;
}

static void vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v, *n;

	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
				 loaded_vmcss_on_cpu_link)
		__loaded_vmcs_clear(v);
}

static void hardware_disable(void)
{
	vmclear_local_loaded_vmcss();

	if (cpu_vmxoff())
		kvm_spurious_fault();

	intel_pt_handle_vmx(0);
}

/*
 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
 * directly instead of going through cpu_has(), to ensure KVM is trapping
 * ENCLS whenever it's supported in hardware. It does not matter whether
 * the host OS supports or has enabled SGX.
 */
static bool cpu_has_sgx(void)
{
	return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
				      u32 msr, u32 *result)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 ctl = ctl_min | ctl_opt;

	rdmsr(msr, vmx_msr_low, vmx_msr_high);

	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */

	/* Ensure minimum (required) set of control bits are supported. */
	if (ctl_min & ~ctl)
		return -EIO;

	*result = ctl;
	return 0;
}
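/*
 * Each VMX capability MSR reports the "allowed 0-settings" of a control
 * field in its low 32 bits (a 1 there means the bit is fixed to 1) and
 * the "allowed 1-settings" in its high 32 bits (a 0 there means the bit
 * is fixed to 0).  For example, if ctl_min requires a bit that the high
 * word does not allow, the (ctl_min & ~ctl) check above fails and
 * setup_vmcs_config() bails out with -EIO.
 */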
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
				    struct vmx_capability *vmx_cap)
{
	u32 vmx_msr_low, vmx_msr_high;
	u32 min, opt, min2, opt2;
	u32 _pin_based_exec_control = 0;
	u32 _cpu_based_exec_control = 0;
	u32 _cpu_based_2nd_exec_control = 0;
	u32 _vmexit_control = 0;
	u32 _vmentry_control = 0;

	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
	min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
	      CPU_BASED_CR8_LOAD_EXITING |
	      CPU_BASED_CR8_STORE_EXITING |
#endif
	      CPU_BASED_CR3_LOAD_EXITING |
	      CPU_BASED_CR3_STORE_EXITING |
	      CPU_BASED_UNCOND_IO_EXITING |
	      CPU_BASED_MOV_DR_EXITING |
	      CPU_BASED_USE_TSC_OFFSETTING |
	      CPU_BASED_MWAIT_EXITING |
	      CPU_BASED_MONITOR_EXITING |
	      CPU_BASED_INVLPG_EXITING |
	      CPU_BASED_RDPMC_EXITING;

	opt = CPU_BASED_TPR_SHADOW |
	      CPU_BASED_USE_MSR_BITMAPS |
	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
				&_cpu_based_exec_control) < 0)
		return -EIO;
#ifdef CONFIG_X86_64
	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
					   ~CPU_BASED_CR8_STORE_EXITING;
#endif
	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
		min2 = 0;
		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
			SECONDARY_EXEC_WBINVD_EXITING |
			SECONDARY_EXEC_ENABLE_VPID |
			SECONDARY_EXEC_ENABLE_EPT |
			SECONDARY_EXEC_UNRESTRICTED_GUEST |
			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
			SECONDARY_EXEC_DESC |
			SECONDARY_EXEC_ENABLE_RDTSCP |
			SECONDARY_EXEC_ENABLE_INVPCID |
			SECONDARY_EXEC_APIC_REGISTER_VIRT |
			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
			SECONDARY_EXEC_SHADOW_VMCS |
			SECONDARY_EXEC_XSAVES |
			SECONDARY_EXEC_RDSEED_EXITING |
			SECONDARY_EXEC_RDRAND_EXITING |
			SECONDARY_EXEC_ENABLE_PML |
			SECONDARY_EXEC_TSC_SCALING |
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
			SECONDARY_EXEC_PT_USE_GPA |
			SECONDARY_EXEC_PT_CONCEAL_VMX |
			SECONDARY_EXEC_ENABLE_VMFUNC |
			SECONDARY_EXEC_BUS_LOCK_DETECTION;
		if (cpu_has_sgx())
			opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
		if (adjust_vmx_controls(min2, opt2,
					MSR_IA32_VMX_PROCBASED_CTLS2,
					&_cpu_based_2nd_exec_control) < 0)
			return -EIO;
	}
#ifndef CONFIG_X86_64
	if (!(_cpu_based_2nd_exec_control &
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif

	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
		_cpu_based_2nd_exec_control &= ~(
				SECONDARY_EXEC_APIC_REGISTER_VIRT |
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);

	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
		&vmx_cap->ept, &vmx_cap->vpid);

	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
		   enabled */
		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
					     CPU_BASED_CR3_STORE_EXITING |
					     CPU_BASED_INVLPG_EXITING);
	} else if (vmx_cap->ept) {
		vmx_cap->ept = 0;
		pr_warn_once("EPT CAP should not exist if not support "
				"1-setting enable EPT VM-execution control\n");
	}
	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
	    vmx_cap->vpid) {
		vmx_cap->vpid = 0;
		pr_warn_once("VPID CAP should not exist if not support "
				"1-setting enable VPID VM-execution control\n");
	}

	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
	opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
	      VM_EXIT_LOAD_IA32_PAT |
	      VM_EXIT_LOAD_IA32_EFER |
	      VM_EXIT_CLEAR_BNDCFGS |
	      VM_EXIT_PT_CONCEAL_PIP |
	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
				&_vmexit_control) < 0)
		return -EIO;

	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
	      PIN_BASED_VMX_PREEMPTION_TIMER;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
				&_pin_based_exec_control) < 0)
		return -EIO;

	if (cpu_has_broken_vmx_preemption_timer())
		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	if (!(_cpu_based_2nd_exec_control &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;

	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
	opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
	      VM_ENTRY_LOAD_IA32_PAT |
	      VM_ENTRY_LOAD_IA32_EFER |
	      VM_ENTRY_LOAD_BNDCFGS |
	      VM_ENTRY_PT_CONCEAL_PIP |
	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
				&_vmentry_control) < 0)
		return -EIO;

	/*
	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
	 * can't be used due to an errata where VM Exit may incorrectly clear
	 * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
	 */
	if (boot_cpu_data.x86 == 0x6) {
		switch (boot_cpu_data.x86_model) {
		case 26: /* AAK155 */
		case 30: /* AAP115 */
		case 37: /* AAT100 */
		case 44: /* BC86,AAY89,BD102 */
			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
			_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
			pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
					"does not work properly. Using workaround\n");
			break;
		default:
			break;
		}
	}

	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
		return -EIO;

#ifdef CONFIG_X86_64
	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
	if (vmx_msr_high & (1u<<16))
		return -EIO;
#endif

	/* Require Write-Back (WB) memory type for VMCS accesses. */
	if (((vmx_msr_high >> 18) & 15) != 6)
		return -EIO;

	vmcs_conf->size = vmx_msr_high & 0x1fff;
	vmcs_conf->order = get_order(vmcs_conf->size);
	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;

	vmcs_conf->revision_id = vmx_msr_low;

	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
	vmcs_conf->vmexit_ctrl         = _vmexit_control;
	vmcs_conf->vmentry_ctrl        = _vmentry_control;

#if IS_ENABLED(CONFIG_HYPERV)
	if (enlightened_vmcs)
		evmcs_sanitize_exec_ctrls(vmcs_conf);
#endif

	return 0;
}
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
	int node = cpu_to_node(cpu);
	struct page *pages;
	struct vmcs *vmcs;

	pages = __alloc_pages_node(node, flags, vmcs_config.order);
	if (!pages)
		return NULL;
	vmcs = page_address(pages);
	memset(vmcs, 0, vmcs_config.size);

	/* KVM supports Enlightened VMCS v1 only */
	if (static_branch_unlikely(&enable_evmcs))
		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
	else
		vmcs->hdr.revision_id = vmcs_config.revision_id;

	if (shadow)
		vmcs->hdr.shadow_vmcs = 1;
	return vmcs;
}

void free_vmcs(struct vmcs *vmcs)
{
	free_pages((unsigned long)vmcs, vmcs_config.order);
}

/*
 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
 */
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	if (!loaded_vmcs->vmcs)
		return;
	loaded_vmcs_clear(loaded_vmcs);
	free_vmcs(loaded_vmcs->vmcs);
	loaded_vmcs->vmcs = NULL;
	if (loaded_vmcs->msr_bitmap)
		free_page((unsigned long)loaded_vmcs->msr_bitmap);
	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}

int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
	loaded_vmcs->vmcs = alloc_vmcs(false);
	if (!loaded_vmcs->vmcs)
		return -ENOMEM;

	vmcs_clear(loaded_vmcs->vmcs);

	loaded_vmcs->shadow_vmcs = NULL;
	loaded_vmcs->hv_timer_soft_disabled = false;
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;

	if (cpu_has_vmx_msr_bitmap()) {
		loaded_vmcs->msr_bitmap = (unsigned long *)
				__get_free_page(GFP_KERNEL_ACCOUNT);
		if (!loaded_vmcs->msr_bitmap)
			goto out_vmcs;
		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);

		if (IS_ENABLED(CONFIG_HYPERV) &&
		    static_branch_unlikely(&enable_evmcs) &&
		    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
			struct hv_enlightened_vmcs *evmcs =
				(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;

			evmcs->hv_enlightenments_control.msr_bitmap = 1;
		}
	}

	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
	memset(&loaded_vmcs->controls_shadow, 0,
		sizeof(struct vmcs_controls_shadow));

	return 0;

out_vmcs:
	free_loaded_vmcs(loaded_vmcs);
	return -ENOMEM;
}

static void free_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		free_vmcs(per_cpu(vmxarea, cpu));
		per_cpu(vmxarea, cpu) = NULL;
	}
}

static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		/*
		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
		 * revision_id reported by MSR_IA32_VMX_BASIC.
		 *
		 * However, even though not explicitly documented by
		 * TLFS, VMXArea passed as VMXON argument should
		 * still be marked with revision_id reported by
		 * physical CPU.
		 */
		if (static_branch_unlikely(&enable_evmcs))
			vmcs->hdr.revision_id = vmcs_config.revision_id;

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}
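/*
 * The per-cpu vmxarea allocated above is only ever handed to VMXON as the
 * VMXON region; it is never VMPTRLDed as a working VMCS, which is why it
 * only needs a valid revision_id and no further initialization.
 */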
2739 static void fix_pmode_seg(struct kvm_vcpu
*vcpu
, int seg
,
2740 struct kvm_segment
*save
)
2742 if (!emulate_invalid_guest_state
) {
2744 * CS and SS RPL should be equal during guest entry according
2745 * to VMX spec, but in reality it is not always so. Since vcpu
2746 * is in the middle of the transition from real mode to
2747 * protected mode it is safe to assume that RPL 0 is a good
2750 if (seg
== VCPU_SREG_CS
|| seg
== VCPU_SREG_SS
)
2751 save
->selector
&= ~SEGMENT_RPL_MASK
;
2752 save
->dpl
= save
->selector
& SEGMENT_RPL_MASK
;
2755 vmx_set_segment(vcpu
, save
, seg
);
2758 static void enter_pmode(struct kvm_vcpu
*vcpu
)
2760 unsigned long flags
;
2761 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
2764 * Update real mode segment cache. It may be not up-to-date if sement
2765 * register was written while vcpu was in a guest mode.
2767 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_ES
], VCPU_SREG_ES
);
2768 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_DS
], VCPU_SREG_DS
);
2769 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_FS
], VCPU_SREG_FS
);
2770 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_GS
], VCPU_SREG_GS
);
2771 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_SS
], VCPU_SREG_SS
);
2772 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_CS
], VCPU_SREG_CS
);
2774 vmx
->rmode
.vm86_active
= 0;
2776 vmx_set_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_TR
], VCPU_SREG_TR
);
2778 flags
= vmcs_readl(GUEST_RFLAGS
);
2779 flags
&= RMODE_GUEST_OWNED_EFLAGS_BITS
;
2780 flags
|= vmx
->rmode
.save_rflags
& ~RMODE_GUEST_OWNED_EFLAGS_BITS
;
2781 vmcs_writel(GUEST_RFLAGS
, flags
);
2783 vmcs_writel(GUEST_CR4
, (vmcs_readl(GUEST_CR4
) & ~X86_CR4_VME
) |
2784 (vmcs_readl(CR4_READ_SHADOW
) & X86_CR4_VME
));
2786 vmx_update_exception_bitmap(vcpu
);
2788 fix_pmode_seg(vcpu
, VCPU_SREG_CS
, &vmx
->rmode
.segs
[VCPU_SREG_CS
]);
2789 fix_pmode_seg(vcpu
, VCPU_SREG_SS
, &vmx
->rmode
.segs
[VCPU_SREG_SS
]);
2790 fix_pmode_seg(vcpu
, VCPU_SREG_ES
, &vmx
->rmode
.segs
[VCPU_SREG_ES
]);
2791 fix_pmode_seg(vcpu
, VCPU_SREG_DS
, &vmx
->rmode
.segs
[VCPU_SREG_DS
]);
2792 fix_pmode_seg(vcpu
, VCPU_SREG_FS
, &vmx
->rmode
.segs
[VCPU_SREG_FS
]);
2793 fix_pmode_seg(vcpu
, VCPU_SREG_GS
, &vmx
->rmode
.segs
[VCPU_SREG_GS
]);
2796 static void fix_rmode_seg(int seg
, struct kvm_segment
*save
)
2798 const struct kvm_vmx_segment_field
*sf
= &kvm_vmx_segment_fields
[seg
];
2799 struct kvm_segment var
= *save
;
2802 if (seg
== VCPU_SREG_CS
)
2805 if (!emulate_invalid_guest_state
) {
2806 var
.selector
= var
.base
>> 4;
2807 var
.base
= var
.base
& 0xffff0;
2817 if (save
->base
& 0xf)
2818 printk_once(KERN_WARNING
"kvm: segment base is not "
2819 "paragraph aligned when entering "
2820 "protected mode (seg=%d)", seg
);
2823 vmcs_write16(sf
->selector
, var
.selector
);
2824 vmcs_writel(sf
->base
, var
.base
);
2825 vmcs_write32(sf
->limit
, var
.limit
);
2826 vmcs_write32(sf
->ar_bytes
, vmx_segment_access_rights(&var
));
2829 static void enter_rmode(struct kvm_vcpu
*vcpu
)
2831 unsigned long flags
;
2832 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
2833 struct kvm_vmx
*kvm_vmx
= to_kvm_vmx(vcpu
->kvm
);
2835 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_TR
], VCPU_SREG_TR
);
2836 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_ES
], VCPU_SREG_ES
);
2837 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_DS
], VCPU_SREG_DS
);
2838 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_FS
], VCPU_SREG_FS
);
2839 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_GS
], VCPU_SREG_GS
);
2840 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_SS
], VCPU_SREG_SS
);
2841 vmx_get_segment(vcpu
, &vmx
->rmode
.segs
[VCPU_SREG_CS
], VCPU_SREG_CS
);
2843 vmx
->rmode
.vm86_active
= 1;
2846 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
2847 * vcpu. Warn the user that an update is overdue.
2849 if (!kvm_vmx
->tss_addr
)
2850 printk_once(KERN_WARNING
"kvm: KVM_SET_TSS_ADDR need to be "
2851 "called before entering vcpu\n");
2853 vmx_segment_cache_clear(vmx
);
2855 vmcs_writel(GUEST_TR_BASE
, kvm_vmx
->tss_addr
);
2856 vmcs_write32(GUEST_TR_LIMIT
, RMODE_TSS_SIZE
- 1);
2857 vmcs_write32(GUEST_TR_AR_BYTES
, 0x008b);
2859 flags
= vmcs_readl(GUEST_RFLAGS
);
2860 vmx
->rmode
.save_rflags
= flags
;
2862 flags
|= X86_EFLAGS_IOPL
| X86_EFLAGS_VM
;
2864 vmcs_writel(GUEST_RFLAGS
, flags
);
2865 vmcs_writel(GUEST_CR4
, vmcs_readl(GUEST_CR4
) | X86_CR4_VME
);
2866 vmx_update_exception_bitmap(vcpu
);
2868 fix_rmode_seg(VCPU_SREG_SS
, &vmx
->rmode
.segs
[VCPU_SREG_SS
]);
2869 fix_rmode_seg(VCPU_SREG_CS
, &vmx
->rmode
.segs
[VCPU_SREG_CS
]);
2870 fix_rmode_seg(VCPU_SREG_ES
, &vmx
->rmode
.segs
[VCPU_SREG_ES
]);
2871 fix_rmode_seg(VCPU_SREG_DS
, &vmx
->rmode
.segs
[VCPU_SREG_DS
]);
2872 fix_rmode_seg(VCPU_SREG_GS
, &vmx
->rmode
.segs
[VCPU_SREG_GS
]);
2873 fix_rmode_seg(VCPU_SREG_FS
, &vmx
->rmode
.segs
[VCPU_SREG_FS
]);
2875 kvm_mmu_reset_context(vcpu
);
2878 int vmx_set_efer(struct kvm_vcpu
*vcpu
, u64 efer
)
2880 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
2881 struct vmx_uret_msr
*msr
= vmx_find_uret_msr(vmx
, MSR_EFER
);
2883 /* Nothing to do if hardware doesn't support EFER. */
2887 vcpu
->arch
.efer
= efer
;
2888 if (efer
& EFER_LMA
) {
2889 vm_entry_controls_setbit(to_vmx(vcpu
), VM_ENTRY_IA32E_MODE
);
2892 vm_entry_controls_clearbit(to_vmx(vcpu
), VM_ENTRY_IA32E_MODE
);
2894 msr
->data
= efer
& ~EFER_LME
;
2900 #ifdef CONFIG_X86_64
2902 static void enter_lmode(struct kvm_vcpu
*vcpu
)
2906 vmx_segment_cache_clear(to_vmx(vcpu
));
2908 guest_tr_ar
= vmcs_read32(GUEST_TR_AR_BYTES
);
2909 if ((guest_tr_ar
& VMX_AR_TYPE_MASK
) != VMX_AR_TYPE_BUSY_64_TSS
) {
2910 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2912 vmcs_write32(GUEST_TR_AR_BYTES
,
2913 (guest_tr_ar
& ~VMX_AR_TYPE_MASK
)
2914 | VMX_AR_TYPE_BUSY_64_TSS
);
2916 vmx_set_efer(vcpu
, vcpu
->arch
.efer
| EFER_LMA
);
2919 static void exit_lmode(struct kvm_vcpu
*vcpu
)
2921 vm_entry_controls_clearbit(to_vmx(vcpu
), VM_ENTRY_IA32E_MODE
);
2922 vmx_set_efer(vcpu
, vcpu
->arch
.efer
& ~EFER_LMA
);
2927 static void vmx_flush_tlb_all(struct kvm_vcpu
*vcpu
)
2929 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
2932 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
2933 * the CPU is not required to invalidate guest-physical mappings on
2934 * VM-Entry, even if VPID is disabled. Guest-physical mappings are
2935 * associated with the root EPT structure and not any particular VPID
2936 * (INVVPID also isn't required to invalidate guest-physical mappings).
2940 } else if (enable_vpid
) {
2941 if (cpu_has_vmx_invvpid_global()) {
2942 vpid_sync_vcpu_global();
2944 vpid_sync_vcpu_single(vmx
->vpid
);
2945 vpid_sync_vcpu_single(vmx
->nested
.vpid02
);
2950 static void vmx_flush_tlb_current(struct kvm_vcpu
*vcpu
)
2952 struct kvm_mmu
*mmu
= vcpu
->arch
.mmu
;
2953 u64 root_hpa
= mmu
->root_hpa
;
2955 /* No flush required if the current context is invalid. */
2956 if (!VALID_PAGE(root_hpa
))
2960 ept_sync_context(construct_eptp(vcpu
, root_hpa
,
2961 mmu
->shadow_root_level
));
2962 else if (!is_guest_mode(vcpu
))
2963 vpid_sync_context(to_vmx(vcpu
)->vpid
);
2965 vpid_sync_context(nested_get_vpid02(vcpu
));
2968 static void vmx_flush_tlb_gva(struct kvm_vcpu
*vcpu
, gva_t addr
)
2971 * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
2972 * vmx_flush_tlb_guest() for an explanation of why this is ok.
2974 vpid_sync_vcpu_addr(to_vmx(vcpu
)->vpid
, addr
);
2977 static void vmx_flush_tlb_guest(struct kvm_vcpu
*vcpu
)
2980 * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
2981 * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
2982 * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
2983 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
2984 * i.e. no explicit INVVPID is necessary.
2986 vpid_sync_context(to_vmx(vcpu
)->vpid
);
2989 void vmx_ept_load_pdptrs(struct kvm_vcpu
*vcpu
)
2991 struct kvm_mmu
*mmu
= vcpu
->arch
.walk_mmu
;
2993 if (!kvm_register_is_dirty(vcpu
, VCPU_EXREG_PDPTR
))
2996 if (is_pae_paging(vcpu
)) {
2997 vmcs_write64(GUEST_PDPTR0
, mmu
->pdptrs
[0]);
2998 vmcs_write64(GUEST_PDPTR1
, mmu
->pdptrs
[1]);
2999 vmcs_write64(GUEST_PDPTR2
, mmu
->pdptrs
[2]);
3000 vmcs_write64(GUEST_PDPTR3
, mmu
->pdptrs
[3]);
3004 void ept_save_pdptrs(struct kvm_vcpu
*vcpu
)
3006 struct kvm_mmu
*mmu
= vcpu
->arch
.walk_mmu
;
3008 if (WARN_ON_ONCE(!is_pae_paging(vcpu
)))
3011 mmu
->pdptrs
[0] = vmcs_read64(GUEST_PDPTR0
);
3012 mmu
->pdptrs
[1] = vmcs_read64(GUEST_PDPTR1
);
3013 mmu
->pdptrs
[2] = vmcs_read64(GUEST_PDPTR2
);
3014 mmu
->pdptrs
[3] = vmcs_read64(GUEST_PDPTR3
);
3016 kvm_register_mark_dirty(vcpu
, VCPU_EXREG_PDPTR
);
3019 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0
,
3021 struct kvm_vcpu
*vcpu
)
3023 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3025 if (!kvm_register_is_available(vcpu
, VCPU_EXREG_CR3
))
3026 vmx_cache_reg(vcpu
, VCPU_EXREG_CR3
);
3027 if (!(cr0
& X86_CR0_PG
)) {
3028 /* From paging/starting to nonpaging */
3029 exec_controls_setbit(vmx
, CPU_BASED_CR3_LOAD_EXITING
|
3030 CPU_BASED_CR3_STORE_EXITING
);
3031 vcpu
->arch
.cr0
= cr0
;
3032 vmx_set_cr4(vcpu
, kvm_read_cr4(vcpu
));
3033 } else if (!is_paging(vcpu
)) {
3034 /* From nonpaging to paging */
3035 exec_controls_clearbit(vmx
, CPU_BASED_CR3_LOAD_EXITING
|
3036 CPU_BASED_CR3_STORE_EXITING
);
3037 vcpu
->arch
.cr0
= cr0
;
3038 vmx_set_cr4(vcpu
, kvm_read_cr4(vcpu
));
3041 if (!(cr0
& X86_CR0_WP
))
3042 *hw_cr0
&= ~X86_CR0_WP
;
3045 void vmx_set_cr0(struct kvm_vcpu
*vcpu
, unsigned long cr0
)
3047 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3048 unsigned long hw_cr0
;
3050 hw_cr0
= (cr0
& ~KVM_VM_CR0_ALWAYS_OFF
);
3051 if (is_unrestricted_guest(vcpu
))
3052 hw_cr0
|= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST
;
3054 hw_cr0
|= KVM_VM_CR0_ALWAYS_ON
;
3056 if (vmx
->rmode
.vm86_active
&& (cr0
& X86_CR0_PE
))
3059 if (!vmx
->rmode
.vm86_active
&& !(cr0
& X86_CR0_PE
))
3063 #ifdef CONFIG_X86_64
3064 if (vcpu
->arch
.efer
& EFER_LME
) {
3065 if (!is_paging(vcpu
) && (cr0
& X86_CR0_PG
))
3067 if (is_paging(vcpu
) && !(cr0
& X86_CR0_PG
))
3072 if (enable_ept
&& !is_unrestricted_guest(vcpu
))
3073 ept_update_paging_mode_cr0(&hw_cr0
, cr0
, vcpu
);
3075 vmcs_writel(CR0_READ_SHADOW
, cr0
);
3076 vmcs_writel(GUEST_CR0
, hw_cr0
);
3077 vcpu
->arch
.cr0
= cr0
;
3078 kvm_register_mark_available(vcpu
, VCPU_EXREG_CR0
);
3080 /* depends on vcpu->arch.cr0 to be set to a new value */
3081 vmx
->emulation_required
= emulation_required(vcpu
);
static int vmx_get_max_tdp_level(void)
{
	if (cpu_has_vmx_ept_5levels())
		return 5;
	return 4;
}

u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
		   int root_level)
{
	u64 eptp = VMX_EPTP_MT_WB;

	eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;

	if (enable_ept_ad_bits &&
	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
		eptp |= VMX_EPTP_AD_ENABLE_BIT;
	eptp |= (root_hpa & PAGE_MASK);

	return eptp;
}
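/*
 * EPTP layout assembled above (Intel SDM Vol 3, Extended-Page-Table
 * Pointer), roughly:
 *   bits  2:0   memory type for EPT structures (6 = write-back)
 *   bits  5:3   EPT page-walk length minus one (3 = 4-level, 4 = 5-level)
 *   bit   6     enable accessed/dirty flags
 *   bits 51:12  physical address of the root EPT table
 * e.g. a 4-level, write-back EPTP with A/D bits enabled is root_hpa | 0x5e.
 */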
3106 static void vmx_load_mmu_pgd(struct kvm_vcpu
*vcpu
, unsigned long pgd
,
3109 struct kvm
*kvm
= vcpu
->kvm
;
3110 bool update_guest_cr3
= true;
3111 unsigned long guest_cr3
;
3115 eptp
= construct_eptp(vcpu
, pgd
, pgd_level
);
3116 vmcs_write64(EPT_POINTER
, eptp
);
3118 if (kvm_x86_ops
.tlb_remote_flush
) {
3119 spin_lock(&to_kvm_vmx(kvm
)->ept_pointer_lock
);
3120 to_vmx(vcpu
)->ept_pointer
= eptp
;
3121 to_kvm_vmx(kvm
)->ept_pointers_match
3122 = EPT_POINTERS_CHECK
;
3123 spin_unlock(&to_kvm_vmx(kvm
)->ept_pointer_lock
);
3126 if (!enable_unrestricted_guest
&& !is_paging(vcpu
))
3127 guest_cr3
= to_kvm_vmx(kvm
)->ept_identity_map_addr
;
3128 else if (test_bit(VCPU_EXREG_CR3
, (ulong
*)&vcpu
->arch
.regs_avail
))
3129 guest_cr3
= vcpu
->arch
.cr3
;
3130 else /* vmcs01.GUEST_CR3 is already up-to-date. */
3131 update_guest_cr3
= false;
3132 vmx_ept_load_pdptrs(vcpu
);
3137 if (update_guest_cr3
)
3138 vmcs_writel(GUEST_CR3
, guest_cr3
);
3141 static bool vmx_is_valid_cr4(struct kvm_vcpu
*vcpu
, unsigned long cr4
)
3144 * We operate under the default treatment of SMM, so VMX cannot be
3145 * enabled under SMM. Note, whether or not VMXE is allowed at all is
3146 * handled by kvm_is_valid_cr4().
3148 if ((cr4
& X86_CR4_VMXE
) && is_smm(vcpu
))
3151 if (to_vmx(vcpu
)->nested
.vmxon
&& !nested_cr4_valid(vcpu
, cr4
))
3157 void vmx_set_cr4(struct kvm_vcpu
*vcpu
, unsigned long cr4
)
3159 unsigned long old_cr4
= vcpu
->arch
.cr4
;
3160 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3162 * Pass through host's Machine Check Enable value to hw_cr4, which
3163 * is in force while we are in guest mode. Do not let guests control
3164 * this bit, even if host CR4.MCE == 0.
3166 unsigned long hw_cr4
;
3168 hw_cr4
= (cr4_read_shadow() & X86_CR4_MCE
) | (cr4
& ~X86_CR4_MCE
);
3169 if (is_unrestricted_guest(vcpu
))
3170 hw_cr4
|= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST
;
3171 else if (vmx
->rmode
.vm86_active
)
3172 hw_cr4
|= KVM_RMODE_VM_CR4_ALWAYS_ON
;
3174 hw_cr4
|= KVM_PMODE_VM_CR4_ALWAYS_ON
;
3176 if (!boot_cpu_has(X86_FEATURE_UMIP
) && vmx_umip_emulated()) {
3177 if (cr4
& X86_CR4_UMIP
) {
3178 secondary_exec_controls_setbit(vmx
, SECONDARY_EXEC_DESC
);
3179 hw_cr4
&= ~X86_CR4_UMIP
;
3180 } else if (!is_guest_mode(vcpu
) ||
3181 !nested_cpu_has2(get_vmcs12(vcpu
), SECONDARY_EXEC_DESC
)) {
3182 secondary_exec_controls_clearbit(vmx
, SECONDARY_EXEC_DESC
);
3186 vcpu
->arch
.cr4
= cr4
;
3187 kvm_register_mark_available(vcpu
, VCPU_EXREG_CR4
);
3189 if (!is_unrestricted_guest(vcpu
)) {
3191 if (!is_paging(vcpu
)) {
3192 hw_cr4
&= ~X86_CR4_PAE
;
3193 hw_cr4
|= X86_CR4_PSE
;
3194 } else if (!(cr4
& X86_CR4_PAE
)) {
3195 hw_cr4
&= ~X86_CR4_PAE
;
3200 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3201 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3202 * to be manually disabled when guest switches to non-paging
3205 * If !enable_unrestricted_guest, the CPU is always running
3206 * with CR0.PG=1 and CR4 needs to be modified.
3207 * If enable_unrestricted_guest, the CPU automatically
3208 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3210 if (!is_paging(vcpu
))
3211 hw_cr4
&= ~(X86_CR4_SMEP
| X86_CR4_SMAP
| X86_CR4_PKE
);
3214 vmcs_writel(CR4_READ_SHADOW
, cr4
);
3215 vmcs_writel(GUEST_CR4
, hw_cr4
);
3217 if ((cr4
^ old_cr4
) & (X86_CR4_OSXSAVE
| X86_CR4_PKE
))
3218 kvm_update_cpuid_runtime(vcpu
);
3221 void vmx_get_segment(struct kvm_vcpu
*vcpu
, struct kvm_segment
*var
, int seg
)
3223 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3226 if (vmx
->rmode
.vm86_active
&& seg
!= VCPU_SREG_LDTR
) {
3227 *var
= vmx
->rmode
.segs
[seg
];
3228 if (seg
== VCPU_SREG_TR
3229 || var
->selector
== vmx_read_guest_seg_selector(vmx
, seg
))
3231 var
->base
= vmx_read_guest_seg_base(vmx
, seg
);
3232 var
->selector
= vmx_read_guest_seg_selector(vmx
, seg
);
3235 var
->base
= vmx_read_guest_seg_base(vmx
, seg
);
3236 var
->limit
= vmx_read_guest_seg_limit(vmx
, seg
);
3237 var
->selector
= vmx_read_guest_seg_selector(vmx
, seg
);
3238 ar
= vmx_read_guest_seg_ar(vmx
, seg
);
3239 var
->unusable
= (ar
>> 16) & 1;
3240 var
->type
= ar
& 15;
3241 var
->s
= (ar
>> 4) & 1;
3242 var
->dpl
= (ar
>> 5) & 3;
3244 * Some userspaces do not preserve unusable property. Since usable
3245 * segment has to be present according to VMX spec we can use present
3246 * property to amend userspace bug by making unusable segment always
3247 * nonpresent. vmx_segment_access_rights() already marks nonpresent
3248 * segment as unusable.
3250 var
->present
= !var
->unusable
;
3251 var
->avl
= (ar
>> 12) & 1;
3252 var
->l
= (ar
>> 13) & 1;
3253 var
->db
= (ar
>> 14) & 1;
3254 var
->g
= (ar
>> 15) & 1;
3257 static u64
vmx_get_segment_base(struct kvm_vcpu
*vcpu
, int seg
)
3259 struct kvm_segment s
;
3261 if (to_vmx(vcpu
)->rmode
.vm86_active
) {
3262 vmx_get_segment(vcpu
, &s
, seg
);
3265 return vmx_read_guest_seg_base(to_vmx(vcpu
), seg
);
3268 int vmx_get_cpl(struct kvm_vcpu
*vcpu
)
3270 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3272 if (unlikely(vmx
->rmode
.vm86_active
))
3275 int ar
= vmx_read_guest_seg_ar(vmx
, VCPU_SREG_SS
);
3276 return VMX_AR_DPL(ar
);
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
	u32 ar;

	if (var->unusable || !var->present)
		ar = 1 << 16;
	else {
		ar = var->type & 15;
		ar |= (var->s & 1) << 4;
		ar |= (var->dpl & 3) << 5;
		ar |= (var->present & 1) << 7;
		ar |= (var->avl & 1) << 12;
		ar |= (var->l & 1) << 13;
		ar |= (var->db & 1) << 14;
		ar |= (var->g & 1) << 15;
	}

	return ar;
}
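/*
 * The VMX access-rights encoding built above mirrors the attribute bits of
 * a segment descriptor: type in bits 3:0, S in bit 4, DPL in bits 6:5, P in
 * bit 7, AVL/L/D.B/G in bits 12-15, plus the VMX-only "unusable" flag in
 * bit 16.
 */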
3300 void vmx_set_segment(struct kvm_vcpu
*vcpu
, struct kvm_segment
*var
, int seg
)
3302 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3303 const struct kvm_vmx_segment_field
*sf
= &kvm_vmx_segment_fields
[seg
];
3305 vmx_segment_cache_clear(vmx
);
3307 if (vmx
->rmode
.vm86_active
&& seg
!= VCPU_SREG_LDTR
) {
3308 vmx
->rmode
.segs
[seg
] = *var
;
3309 if (seg
== VCPU_SREG_TR
)
3310 vmcs_write16(sf
->selector
, var
->selector
);
3312 fix_rmode_seg(seg
, &vmx
->rmode
.segs
[seg
]);
3316 vmcs_writel(sf
->base
, var
->base
);
3317 vmcs_write32(sf
->limit
, var
->limit
);
3318 vmcs_write16(sf
->selector
, var
->selector
);
3321 * Fix the "Accessed" bit in AR field of segment registers for older
3323 * IA32 arch specifies that at the time of processor reset the
3324 * "Accessed" bit in the AR field of segment registers is 1. And qemu
3325 * is setting it to 0 in the userland code. This causes invalid guest
3326 * state vmexit when "unrestricted guest" mode is turned on.
3327 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3328 * tree. Newer qemu binaries with that qemu fix would not need this
3331 if (is_unrestricted_guest(vcpu
) && (seg
!= VCPU_SREG_LDTR
))
3332 var
->type
|= 0x1; /* Accessed */
3334 vmcs_write32(sf
->ar_bytes
, vmx_segment_access_rights(var
));
3337 vmx
->emulation_required
= emulation_required(vcpu
);
3340 static void vmx_get_cs_db_l_bits(struct kvm_vcpu
*vcpu
, int *db
, int *l
)
3342 u32 ar
= vmx_read_guest_seg_ar(to_vmx(vcpu
), VCPU_SREG_CS
);
3344 *db
= (ar
>> 14) & 1;
3345 *l
= (ar
>> 13) & 1;
3348 static void vmx_get_idt(struct kvm_vcpu
*vcpu
, struct desc_ptr
*dt
)
3350 dt
->size
= vmcs_read32(GUEST_IDTR_LIMIT
);
3351 dt
->address
= vmcs_readl(GUEST_IDTR_BASE
);
3354 static void vmx_set_idt(struct kvm_vcpu
*vcpu
, struct desc_ptr
*dt
)
3356 vmcs_write32(GUEST_IDTR_LIMIT
, dt
->size
);
3357 vmcs_writel(GUEST_IDTR_BASE
, dt
->address
);
3360 static void vmx_get_gdt(struct kvm_vcpu
*vcpu
, struct desc_ptr
*dt
)
3362 dt
->size
= vmcs_read32(GUEST_GDTR_LIMIT
);
3363 dt
->address
= vmcs_readl(GUEST_GDTR_BASE
);
3366 static void vmx_set_gdt(struct kvm_vcpu
*vcpu
, struct desc_ptr
*dt
)
3368 vmcs_write32(GUEST_GDTR_LIMIT
, dt
->size
);
3369 vmcs_writel(GUEST_GDTR_BASE
, dt
->address
);
3372 static bool rmode_segment_valid(struct kvm_vcpu
*vcpu
, int seg
)
3374 struct kvm_segment var
;
3377 vmx_get_segment(vcpu
, &var
, seg
);
3379 if (seg
== VCPU_SREG_CS
)
3381 ar
= vmx_segment_access_rights(&var
);
3383 if (var
.base
!= (var
.selector
<< 4))
3385 if (var
.limit
!= 0xffff)
3393 static bool code_segment_valid(struct kvm_vcpu
*vcpu
)
3395 struct kvm_segment cs
;
3396 unsigned int cs_rpl
;
3398 vmx_get_segment(vcpu
, &cs
, VCPU_SREG_CS
);
3399 cs_rpl
= cs
.selector
& SEGMENT_RPL_MASK
;
3403 if (~cs
.type
& (VMX_AR_TYPE_CODE_MASK
|VMX_AR_TYPE_ACCESSES_MASK
))
3407 if (cs
.type
& VMX_AR_TYPE_WRITEABLE_MASK
) {
3408 if (cs
.dpl
> cs_rpl
)
3411 if (cs
.dpl
!= cs_rpl
)
3417 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
3421 static bool stack_segment_valid(struct kvm_vcpu
*vcpu
)
3423 struct kvm_segment ss
;
3424 unsigned int ss_rpl
;
3426 vmx_get_segment(vcpu
, &ss
, VCPU_SREG_SS
);
3427 ss_rpl
= ss
.selector
& SEGMENT_RPL_MASK
;
3431 if (ss
.type
!= 3 && ss
.type
!= 7)
3435 if (ss
.dpl
!= ss_rpl
) /* DPL != RPL */
3443 static bool data_segment_valid(struct kvm_vcpu
*vcpu
, int seg
)
3445 struct kvm_segment var
;
3448 vmx_get_segment(vcpu
, &var
, seg
);
3449 rpl
= var
.selector
& SEGMENT_RPL_MASK
;
3457 if (~var
.type
& (VMX_AR_TYPE_CODE_MASK
|VMX_AR_TYPE_WRITEABLE_MASK
)) {
3458 if (var
.dpl
< rpl
) /* DPL < RPL */
3462 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3468 static bool tr_valid(struct kvm_vcpu
*vcpu
)
3470 struct kvm_segment tr
;
3472 vmx_get_segment(vcpu
, &tr
, VCPU_SREG_TR
);
3476 if (tr
.selector
& SEGMENT_TI_MASK
) /* TI = 1 */
3478 if (tr
.type
!= 3 && tr
.type
!= 11) /* TODO: Check if guest is in IA32e mode */
3486 static bool ldtr_valid(struct kvm_vcpu
*vcpu
)
3488 struct kvm_segment ldtr
;
3490 vmx_get_segment(vcpu
, &ldtr
, VCPU_SREG_LDTR
);
3494 if (ldtr
.selector
& SEGMENT_TI_MASK
) /* TI = 1 */
3504 static bool cs_ss_rpl_check(struct kvm_vcpu
*vcpu
)
3506 struct kvm_segment cs
, ss
;
3508 vmx_get_segment(vcpu
, &cs
, VCPU_SREG_CS
);
3509 vmx_get_segment(vcpu
, &ss
, VCPU_SREG_SS
);
3511 return ((cs
.selector
& SEGMENT_RPL_MASK
) ==
3512 (ss
.selector
& SEGMENT_RPL_MASK
));
3516 * Check if guest state is valid. Returns true if valid, false if
3518 * We assume that registers are always usable
3520 bool __vmx_guest_state_valid(struct kvm_vcpu
*vcpu
)
3522 /* real mode guest state checks */
3523 if (!is_protmode(vcpu
) || (vmx_get_rflags(vcpu
) & X86_EFLAGS_VM
)) {
3524 if (!rmode_segment_valid(vcpu
, VCPU_SREG_CS
))
3526 if (!rmode_segment_valid(vcpu
, VCPU_SREG_SS
))
3528 if (!rmode_segment_valid(vcpu
, VCPU_SREG_DS
))
3530 if (!rmode_segment_valid(vcpu
, VCPU_SREG_ES
))
3532 if (!rmode_segment_valid(vcpu
, VCPU_SREG_FS
))
3534 if (!rmode_segment_valid(vcpu
, VCPU_SREG_GS
))
3537 /* protected mode guest state checks */
3538 if (!cs_ss_rpl_check(vcpu
))
3540 if (!code_segment_valid(vcpu
))
3542 if (!stack_segment_valid(vcpu
))
3544 if (!data_segment_valid(vcpu
, VCPU_SREG_DS
))
3546 if (!data_segment_valid(vcpu
, VCPU_SREG_ES
))
3548 if (!data_segment_valid(vcpu
, VCPU_SREG_FS
))
3550 if (!data_segment_valid(vcpu
, VCPU_SREG_GS
))
3552 if (!tr_valid(vcpu
))
3554 if (!ldtr_valid(vcpu
))
3558 * - Add checks on RIP
3559 * - Add checks on RFLAGS
3565 static int init_rmode_tss(struct kvm
*kvm
, void __user
*ua
)
3567 const void *zero_page
= (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3571 for (i
= 0; i
< 3; i
++) {
3572 if (__copy_to_user(ua
+ PAGE_SIZE
* i
, zero_page
, PAGE_SIZE
))
3576 data
= TSS_BASE_SIZE
+ TSS_REDIRECTION_SIZE
;
3577 if (__copy_to_user(ua
+ TSS_IOPB_BASE_OFFSET
, &data
, sizeof(u16
)))
3581 if (__copy_to_user(ua
+ RMODE_TSS_SIZE
- 1, &data
, sizeof(u8
)))
3587 static int init_rmode_identity_map(struct kvm
*kvm
)
3589 struct kvm_vmx
*kvm_vmx
= to_kvm_vmx(kvm
);
3594 /* Protect kvm_vmx->ept_identity_pagetable_done. */
3595 mutex_lock(&kvm
->slots_lock
);
3597 if (likely(kvm_vmx
->ept_identity_pagetable_done
))
3600 if (!kvm_vmx
->ept_identity_map_addr
)
3601 kvm_vmx
->ept_identity_map_addr
= VMX_EPT_IDENTITY_PAGETABLE_ADDR
;
3603 uaddr
= __x86_set_memory_region(kvm
,
3604 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT
,
3605 kvm_vmx
->ept_identity_map_addr
,
3607 if (IS_ERR(uaddr
)) {
3612 /* Set up identity-mapping pagetable for EPT in real mode */
3613 for (i
= 0; i
< PT32_ENT_PER_PAGE
; i
++) {
3614 tmp
= (i
<< 22) + (_PAGE_PRESENT
| _PAGE_RW
| _PAGE_USER
|
3615 _PAGE_ACCESSED
| _PAGE_DIRTY
| _PAGE_PSE
);
3616 if (__copy_to_user(uaddr
+ i
* sizeof(tmp
), &tmp
, sizeof(tmp
))) {
3621 kvm_vmx
->ept_identity_pagetable_done
= true;
3624 mutex_unlock(&kvm
->slots_lock
);
3628 static void seg_setup(int seg
)
3630 const struct kvm_vmx_segment_field
*sf
= &kvm_vmx_segment_fields
[seg
];
3633 vmcs_write16(sf
->selector
, 0);
3634 vmcs_writel(sf
->base
, 0);
3635 vmcs_write32(sf
->limit
, 0xffff);
3637 if (seg
== VCPU_SREG_CS
)
3638 ar
|= 0x08; /* code segment */
3640 vmcs_write32(sf
->ar_bytes
, ar
);
3643 static int alloc_apic_access_page(struct kvm
*kvm
)
3649 mutex_lock(&kvm
->slots_lock
);
3650 if (kvm
->arch
.apic_access_page_done
)
3652 hva
= __x86_set_memory_region(kvm
, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
,
3653 APIC_DEFAULT_PHYS_BASE
, PAGE_SIZE
);
3659 page
= gfn_to_page(kvm
, APIC_DEFAULT_PHYS_BASE
>> PAGE_SHIFT
);
3660 if (is_error_page(page
)) {
3666 * Do not pin the page in memory, so that memory hot-unplug
3667 * is able to migrate it.
3670 kvm
->arch
.apic_access_page_done
= true;
3672 mutex_unlock(&kvm
->slots_lock
);
int allocate_vpid(void)
{
	int vpid;

	if (!enable_vpid)
		return 0;
	spin_lock(&vmx_vpid_lock);
	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
	if (vpid < VMX_NR_VPIDS)
		__set_bit(vpid, vmx_vpid_bitmap);
	else
		vpid = 0;
	spin_unlock(&vmx_vpid_lock);
	return vpid;
}

void free_vpid(int vpid)
{
	if (!enable_vpid || vpid == 0)
		return;
	spin_lock(&vmx_vpid_lock);
	__clear_bit(vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
}
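/*
 * VPID 0 is reserved: hardware uses it to tag host (VMX root) mappings,
 * and it is also the fallback when the bitmap is exhausted.  A vCPU left
 * with vpid 0 simply runs with the "enable VPID" execution control
 * cleared, so address-space-tagged flushes degrade to global behaviour.
 */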
static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		__clear_bit(msr, msr_bitmap + 0x000 / f);
	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
		__clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
}

static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		__clear_bit(msr, msr_bitmap + 0x800 / f);
	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
		__clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
}

static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		__set_bit(msr, msr_bitmap + 0x000 / f);
	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
		__set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
}

static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
{
	int f = sizeof(unsigned long);

	if (msr <= 0x1fff)
		__set_bit(msr, msr_bitmap + 0x800 / f);
	else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
		__set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
}
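/*
 * The MSR bitmap manipulated above is a single 4K page split into four
 * 1K regions:
 *   0x000  read bitmap  for MSRs 0x00000000 - 0x00001fff
 *   0x400  read bitmap  for MSRs 0xc0000000 - 0xc0001fff
 *   0x800  write bitmap for MSRs 0x00000000 - 0x00001fff
 *   0xc00  write bitmap for MSRs 0xc0000000 - 0xc0001fff
 * A set bit causes the corresponding RDMSR/WRMSR to exit.
 */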
3741 static __always_inline
void vmx_disable_intercept_for_msr(struct kvm_vcpu
*vcpu
,
3744 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3745 unsigned long *msr_bitmap
= vmx
->vmcs01
.msr_bitmap
;
3747 if (!cpu_has_vmx_msr_bitmap())
3750 if (static_branch_unlikely(&enable_evmcs
))
3751 evmcs_touch_msr_bitmap();
3754 * Mark the desired intercept state in shadow bitmap, this is needed
3755 * for resync when the MSR filters change.
3757 if (is_valid_passthrough_msr(msr
)) {
3758 int idx
= possible_passthrough_msr_slot(msr
);
3760 if (idx
!= -ENOENT
) {
3761 if (type
& MSR_TYPE_R
)
3762 clear_bit(idx
, vmx
->shadow_msr_intercept
.read
);
3763 if (type
& MSR_TYPE_W
)
3764 clear_bit(idx
, vmx
->shadow_msr_intercept
.write
);
3768 if ((type
& MSR_TYPE_R
) &&
3769 !kvm_msr_allowed(vcpu
, msr
, KVM_MSR_FILTER_READ
)) {
3770 vmx_set_msr_bitmap_read(msr_bitmap
, msr
);
3771 type
&= ~MSR_TYPE_R
;
3774 if ((type
& MSR_TYPE_W
) &&
3775 !kvm_msr_allowed(vcpu
, msr
, KVM_MSR_FILTER_WRITE
)) {
3776 vmx_set_msr_bitmap_write(msr_bitmap
, msr
);
3777 type
&= ~MSR_TYPE_W
;
3780 if (type
& MSR_TYPE_R
)
3781 vmx_clear_msr_bitmap_read(msr_bitmap
, msr
);
3783 if (type
& MSR_TYPE_W
)
3784 vmx_clear_msr_bitmap_write(msr_bitmap
, msr
);
3787 static __always_inline
void vmx_enable_intercept_for_msr(struct kvm_vcpu
*vcpu
,
3790 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3791 unsigned long *msr_bitmap
= vmx
->vmcs01
.msr_bitmap
;
3793 if (!cpu_has_vmx_msr_bitmap())
3796 if (static_branch_unlikely(&enable_evmcs
))
3797 evmcs_touch_msr_bitmap();
3800 * Mark the desired intercept state in shadow bitmap, this is needed
3801 * for resync when the MSR filter changes.
3803 if (is_valid_passthrough_msr(msr
)) {
3804 int idx
= possible_passthrough_msr_slot(msr
);
3806 if (idx
!= -ENOENT
) {
3807 if (type
& MSR_TYPE_R
)
3808 set_bit(idx
, vmx
->shadow_msr_intercept
.read
);
3809 if (type
& MSR_TYPE_W
)
3810 set_bit(idx
, vmx
->shadow_msr_intercept
.write
);
3814 if (type
& MSR_TYPE_R
)
3815 vmx_set_msr_bitmap_read(msr_bitmap
, msr
);
3817 if (type
& MSR_TYPE_W
)
3818 vmx_set_msr_bitmap_write(msr_bitmap
, msr
);
3821 void vmx_set_intercept_for_msr(struct kvm_vcpu
*vcpu
,
3822 u32 msr
, int type
, bool value
)
3825 vmx_enable_intercept_for_msr(vcpu
, msr
, type
);
3827 vmx_disable_intercept_for_msr(vcpu
, msr
, type
);
3830 static u8
vmx_msr_bitmap_mode(struct kvm_vcpu
*vcpu
)
3834 if (cpu_has_secondary_exec_ctrls() &&
3835 (secondary_exec_controls_get(to_vmx(vcpu
)) &
3836 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE
)) {
3837 mode
|= MSR_BITMAP_MODE_X2APIC
;
3838 if (enable_apicv
&& kvm_vcpu_apicv_active(vcpu
))
3839 mode
|= MSR_BITMAP_MODE_X2APIC_APICV
;
3845 static void vmx_reset_x2apic_msrs(struct kvm_vcpu
*vcpu
, u8 mode
)
3847 unsigned long *msr_bitmap
= to_vmx(vcpu
)->vmcs01
.msr_bitmap
;
3848 unsigned long read_intercept
;
3851 read_intercept
= (mode
& MSR_BITMAP_MODE_X2APIC_APICV
) ? 0 : ~0;
3853 for (msr
= 0x800; msr
<= 0x8ff; msr
+= BITS_PER_LONG
) {
3854 unsigned int read_idx
= msr
/ BITS_PER_LONG
;
3855 unsigned int write_idx
= read_idx
+ (0x800 / sizeof(long));
3857 msr_bitmap
[read_idx
] = read_intercept
;
3858 msr_bitmap
[write_idx
] = ~0ul;
3862 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu
*vcpu
, u8 mode
)
3864 if (!cpu_has_vmx_msr_bitmap())
3867 vmx_reset_x2apic_msrs(vcpu
, mode
);
3870 * TPR reads and writes can be virtualized even if virtual interrupt
3871 * delivery is not in use.
3873 vmx_set_intercept_for_msr(vcpu
, X2APIC_MSR(APIC_TASKPRI
), MSR_TYPE_RW
,
3874 !(mode
& MSR_BITMAP_MODE_X2APIC
));
3876 if (mode
& MSR_BITMAP_MODE_X2APIC_APICV
) {
3877 vmx_enable_intercept_for_msr(vcpu
, X2APIC_MSR(APIC_TMCCT
), MSR_TYPE_RW
);
3878 vmx_disable_intercept_for_msr(vcpu
, X2APIC_MSR(APIC_EOI
), MSR_TYPE_W
);
3879 vmx_disable_intercept_for_msr(vcpu
, X2APIC_MSR(APIC_SELF_IPI
), MSR_TYPE_W
);
3883 void vmx_update_msr_bitmap(struct kvm_vcpu
*vcpu
)
3885 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3886 u8 mode
= vmx_msr_bitmap_mode(vcpu
);
3887 u8 changed
= mode
^ vmx
->msr_bitmap_mode
;
3892 if (changed
& (MSR_BITMAP_MODE_X2APIC
| MSR_BITMAP_MODE_X2APIC_APICV
))
3893 vmx_update_msr_bitmap_x2apic(vcpu
, mode
);
3895 vmx
->msr_bitmap_mode
= mode
;
3898 void pt_update_intercept_for_msr(struct kvm_vcpu
*vcpu
)
3900 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3901 bool flag
= !(vmx
->pt_desc
.guest
.ctl
& RTIT_CTL_TRACEEN
);
3904 vmx_set_intercept_for_msr(vcpu
, MSR_IA32_RTIT_STATUS
, MSR_TYPE_RW
, flag
);
3905 vmx_set_intercept_for_msr(vcpu
, MSR_IA32_RTIT_OUTPUT_BASE
, MSR_TYPE_RW
, flag
);
3906 vmx_set_intercept_for_msr(vcpu
, MSR_IA32_RTIT_OUTPUT_MASK
, MSR_TYPE_RW
, flag
);
3907 vmx_set_intercept_for_msr(vcpu
, MSR_IA32_RTIT_CR3_MATCH
, MSR_TYPE_RW
, flag
);
3908 for (i
= 0; i
< vmx
->pt_desc
.addr_range
; i
++) {
3909 vmx_set_intercept_for_msr(vcpu
, MSR_IA32_RTIT_ADDR0_A
+ i
* 2, MSR_TYPE_RW
, flag
);
3910 vmx_set_intercept_for_msr(vcpu
, MSR_IA32_RTIT_ADDR0_B
+ i
* 2, MSR_TYPE_RW
, flag
);
3914 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu
*vcpu
)
3916 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3921 if (WARN_ON_ONCE(!is_guest_mode(vcpu
)) ||
3922 !nested_cpu_has_vid(get_vmcs12(vcpu
)) ||
3923 WARN_ON_ONCE(!vmx
->nested
.virtual_apic_map
.gfn
))
3926 rvi
= vmx_get_rvi();
3928 vapic_page
= vmx
->nested
.virtual_apic_map
.hva
;
3929 vppr
= *((u32
*)(vapic_page
+ APIC_PROCPRI
));
3931 return ((rvi
& 0xf0) > (vppr
& 0xf0));
3934 static void vmx_msr_filter_changed(struct kvm_vcpu
*vcpu
)
3936 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3940 * Set intercept permissions for all potentially passed through MSRs
3941 * again. They will automatically get filtered through the MSR filter,
3942 * so we are back in sync after this.
3944 for (i
= 0; i
< ARRAY_SIZE(vmx_possible_passthrough_msrs
); i
++) {
3945 u32 msr
= vmx_possible_passthrough_msrs
[i
];
3946 bool read
= test_bit(i
, vmx
->shadow_msr_intercept
.read
);
3947 bool write
= test_bit(i
, vmx
->shadow_msr_intercept
.write
);
3949 vmx_set_intercept_for_msr(vcpu
, msr
, MSR_TYPE_R
, read
);
3950 vmx_set_intercept_for_msr(vcpu
, msr
, MSR_TYPE_W
, write
);
3953 pt_update_intercept_for_msr(vcpu
);
3954 vmx_update_msr_bitmap_x2apic(vcpu
, vmx_msr_bitmap_mode(vcpu
));
3957 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu
*vcpu
,
3961 int pi_vec
= nested
? POSTED_INTR_NESTED_VECTOR
: POSTED_INTR_VECTOR
;
3963 if (vcpu
->mode
== IN_GUEST_MODE
) {
3965 * The vector of interrupt to be delivered to vcpu had
3966 * been set in PIR before this function.
3968 * Following cases will be reached in this block, and
3969 * we always send a notification event in all cases as
3972 * Case 1: vcpu keeps in non-root mode. Sending a
3973 * notification event posts the interrupt to vcpu.
3975 * Case 2: vcpu exits to root mode and is still
3976 * runnable. PIR will be synced to vIRR before the
3977 * next vcpu entry. Sending a notification event in
3978 * this case has no effect, as vcpu is not in root
3981 * Case 3: vcpu exits to root mode and is blocked.
3982 * vcpu_block() has already synced PIR to vIRR and
3983 * never blocks vcpu if vIRR is not cleared. Therefore,
3984 * a blocked vcpu here does not wait for any requested
3985 * interrupts in PIR, and sending a notification event
3986 * which has no effect is safe here.
3989 apic
->send_IPI_mask(get_cpu_mask(vcpu
->cpu
), pi_vec
);
3996 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu
*vcpu
,
3999 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4001 if (is_guest_mode(vcpu
) &&
4002 vector
== vmx
->nested
.posted_intr_nv
) {
4004 * If a posted intr is not recognized by hardware,
4005 * we will accomplish it in the next vmentry.
4007 vmx
->nested
.pi_pending
= true;
4008 kvm_make_request(KVM_REQ_EVENT
, vcpu
);
4009 /* the PIR and ON have been set by L1. */
4010 if (!kvm_vcpu_trigger_posted_interrupt(vcpu
, true))
4011 kvm_vcpu_kick(vcpu
);
4017 * Send interrupt to vcpu via posted interrupt way.
4018 * 1. If target vcpu is running(non-root mode), send posted interrupt
4019 * notification to vcpu and hardware will sync PIR to vIRR atomically.
4020 * 2. If target vcpu isn't running(root mode), kick it to pick up the
4021 * interrupt from PIR in next vmentry.
4023 static int vmx_deliver_posted_interrupt(struct kvm_vcpu
*vcpu
, int vector
)
4025 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4028 r
= vmx_deliver_nested_posted_interrupt(vcpu
, vector
);
4032 if (!vcpu
->arch
.apicv_active
)
4035 if (pi_test_and_set_pir(vector
, &vmx
->pi_desc
))
4038 /* If a previous notification has sent the IPI, nothing to do. */
4039 if (pi_test_and_set_on(&vmx
->pi_desc
))
4042 if (vcpu
!= kvm_get_running_vcpu() &&
4043 !kvm_vcpu_trigger_posted_interrupt(vcpu
, false))
4044 kvm_vcpu_kick(vcpu
);
4050 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4051 * will not change in the lifetime of the guest.
4052 * Note that host-state that does change is set elsewhere. E.g., host-state
4053 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4055 void vmx_set_constant_host_state(struct vcpu_vmx
*vmx
)
4059 unsigned long cr0
, cr3
, cr4
;
4062 WARN_ON(cr0
& X86_CR0_TS
);
4063 vmcs_writel(HOST_CR0
, cr0
); /* 22.2.3 */
4066 * Save the most likely value for this task's CR3 in the VMCS.
4067 * We can't use __get_current_cr3_fast() because we're not atomic.
4070 vmcs_writel(HOST_CR3
, cr3
); /* 22.2.3 FIXME: shadow tables */
4071 vmx
->loaded_vmcs
->host_state
.cr3
= cr3
;
4073 /* Save the most likely value for this task's CR4 in the VMCS. */
4074 cr4
= cr4_read_shadow();
4075 vmcs_writel(HOST_CR4
, cr4
); /* 22.2.3, 22.2.5 */
4076 vmx
->loaded_vmcs
->host_state
.cr4
= cr4
;
4078 vmcs_write16(HOST_CS_SELECTOR
, __KERNEL_CS
); /* 22.2.4 */
4079 #ifdef CONFIG_X86_64
4081 * Load null selectors, so we can avoid reloading them in
4082 * vmx_prepare_switch_to_host(), in case userspace uses
4083 * the null selectors too (the expected case).
4085 vmcs_write16(HOST_DS_SELECTOR
, 0);
4086 vmcs_write16(HOST_ES_SELECTOR
, 0);
4088 vmcs_write16(HOST_DS_SELECTOR
, __KERNEL_DS
); /* 22.2.4 */
4089 vmcs_write16(HOST_ES_SELECTOR
, __KERNEL_DS
); /* 22.2.4 */
4091 vmcs_write16(HOST_SS_SELECTOR
, __KERNEL_DS
); /* 22.2.4 */
4092 vmcs_write16(HOST_TR_SELECTOR
, GDT_ENTRY_TSS
*8); /* 22.2.4 */
4094 vmcs_writel(HOST_IDTR_BASE
, host_idt_base
); /* 22.2.4 */
4096 vmcs_writel(HOST_RIP
, (unsigned long)vmx_vmexit
); /* 22.2.5 */
4098 rdmsr(MSR_IA32_SYSENTER_CS
, low32
, high32
);
4099 vmcs_write32(HOST_IA32_SYSENTER_CS
, low32
);
4100 rdmsrl(MSR_IA32_SYSENTER_EIP
, tmpl
);
4101 vmcs_writel(HOST_IA32_SYSENTER_EIP
, tmpl
); /* 22.2.3 */
4103 if (vmcs_config
.vmexit_ctrl
& VM_EXIT_LOAD_IA32_PAT
) {
4104 rdmsr(MSR_IA32_CR_PAT
, low32
, high32
);
4105 vmcs_write64(HOST_IA32_PAT
, low32
| ((u64
) high32
<< 32));
4108 if (cpu_has_load_ia32_efer())
4109 vmcs_write64(HOST_IA32_EFER
, host_efer
);
4112 void set_cr4_guest_host_mask(struct vcpu_vmx
*vmx
)
4114 struct kvm_vcpu
*vcpu
= &vmx
->vcpu
;
4116 vcpu
->arch
.cr4_guest_owned_bits
= KVM_POSSIBLE_CR4_GUEST_BITS
&
4117 ~vcpu
->arch
.cr4_guest_rsvd_bits
;
4119 vcpu
->arch
.cr4_guest_owned_bits
&= ~X86_CR4_PGE
;
4120 if (is_guest_mode(&vmx
->vcpu
))
4121 vcpu
->arch
.cr4_guest_owned_bits
&=
4122 ~get_vmcs12(vcpu
)->cr4_guest_host_mask
;
4123 vmcs_writel(CR4_GUEST_HOST_MASK
, ~vcpu
->arch
.cr4_guest_owned_bits
);
4126 u32
vmx_pin_based_exec_ctrl(struct vcpu_vmx
*vmx
)
4128 u32 pin_based_exec_ctrl
= vmcs_config
.pin_based_exec_ctrl
;
4130 if (!kvm_vcpu_apicv_active(&vmx
->vcpu
))
4131 pin_based_exec_ctrl
&= ~PIN_BASED_POSTED_INTR
;
4134 pin_based_exec_ctrl
&= ~PIN_BASED_VIRTUAL_NMIS
;
4136 if (!enable_preemption_timer
)
4137 pin_based_exec_ctrl
&= ~PIN_BASED_VMX_PREEMPTION_TIMER
;
4139 return pin_based_exec_ctrl
;
4142 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu
*vcpu
)
4144 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4146 pin_controls_set(vmx
, vmx_pin_based_exec_ctrl(vmx
));
4147 if (cpu_has_secondary_exec_ctrls()) {
4148 if (kvm_vcpu_apicv_active(vcpu
))
4149 secondary_exec_controls_setbit(vmx
,
4150 SECONDARY_EXEC_APIC_REGISTER_VIRT
|
4151 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY
);
4153 secondary_exec_controls_clearbit(vmx
,
4154 SECONDARY_EXEC_APIC_REGISTER_VIRT
|
4155 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY
);
4158 if (cpu_has_vmx_msr_bitmap())
4159 vmx_update_msr_bitmap(vcpu
);
4162 u32
vmx_exec_control(struct vcpu_vmx
*vmx
)
4164 u32 exec_control
= vmcs_config
.cpu_based_exec_ctrl
;
4166 if (vmx
->vcpu
.arch
.switch_db_regs
& KVM_DEBUGREG_WONT_EXIT
)
4167 exec_control
&= ~CPU_BASED_MOV_DR_EXITING
;
4169 if (!cpu_need_tpr_shadow(&vmx
->vcpu
)) {
4170 exec_control
&= ~CPU_BASED_TPR_SHADOW
;
4171 #ifdef CONFIG_X86_64
4172 exec_control
|= CPU_BASED_CR8_STORE_EXITING
|
4173 CPU_BASED_CR8_LOAD_EXITING
;
4177 exec_control
|= CPU_BASED_CR3_STORE_EXITING
|
4178 CPU_BASED_CR3_LOAD_EXITING
|
4179 CPU_BASED_INVLPG_EXITING
;
4180 if (kvm_mwait_in_guest(vmx
->vcpu
.kvm
))
4181 exec_control
&= ~(CPU_BASED_MWAIT_EXITING
|
4182 CPU_BASED_MONITOR_EXITING
);
4183 if (kvm_hlt_in_guest(vmx
->vcpu
.kvm
))
4184 exec_control
&= ~CPU_BASED_HLT_EXITING
;
4185 return exec_control
;
4189 * Adjust a single secondary execution control bit to intercept/allow an
4190 * instruction in the guest. This is usually done based on whether or not a
4191 * feature has been exposed to the guest in order to correctly emulate faults.
4194 vmx_adjust_secondary_exec_control(struct vcpu_vmx
*vmx
, u32
*exec_control
,
4195 u32 control
, bool enabled
, bool exiting
)
4198 * If the control is for an opt-in feature, clear the control if the
4199 * feature is not exposed to the guest, i.e. not enabled. If the
4200 * control is opt-out, i.e. an exiting control, clear the control if
4201 * the feature _is_ exposed to the guest, i.e. exiting/interception is
4202 * disabled for the associated instruction. Note, the caller is
4203 * responsible presetting exec_control to set all supported bits.
4205 if (enabled
== exiting
)
4206 *exec_control
&= ~control
;
4209 * Update the nested MSR settings so that a nested VMM can/can't set
4210 * controls for features that are/aren't exposed to the guest.
4214 vmx
->nested
.msrs
.secondary_ctls_high
|= control
;
4216 vmx
->nested
.msrs
.secondary_ctls_high
&= ~control
;
4221 * Wrapper macro for the common case of adjusting a secondary execution control
4222 * based on a single guest CPUID bit, with a dedicated feature bit. This also
4223 * verifies that the control is actually supported by KVM and hardware.
4225 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
4229 if (cpu_has_vmx_##name()) { \
4230 __enabled = guest_cpuid_has(&(vmx)->vcpu, \
4231 X86_FEATURE_##feat_name); \
4232 vmx_adjust_secondary_exec_control(vmx, exec_control, \
4233 SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
4237 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
4238 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
4239 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
4241 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
4242 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4244 static void vmx_compute_secondary_exec_control(struct vcpu_vmx
*vmx
)
4246 struct kvm_vcpu
*vcpu
= &vmx
->vcpu
;
4248 u32 exec_control
= vmcs_config
.cpu_based_2nd_exec_ctrl
;
4250 if (vmx_pt_mode_is_system())
4251 exec_control
&= ~(SECONDARY_EXEC_PT_USE_GPA
| SECONDARY_EXEC_PT_CONCEAL_VMX
);
4252 if (!cpu_need_virtualize_apic_accesses(vcpu
))
4253 exec_control
&= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES
;
4255 exec_control
&= ~SECONDARY_EXEC_ENABLE_VPID
;
4257 exec_control
&= ~SECONDARY_EXEC_ENABLE_EPT
;
4258 enable_unrestricted_guest
= 0;
4260 if (!enable_unrestricted_guest
)
4261 exec_control
&= ~SECONDARY_EXEC_UNRESTRICTED_GUEST
;
4262 if (kvm_pause_in_guest(vmx
->vcpu
.kvm
))
4263 exec_control
&= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING
;
4264 if (!kvm_vcpu_apicv_active(vcpu
))
4265 exec_control
&= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT
|
4266 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY
);
4267 exec_control
&= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE
;
4269 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4270 * in vmx_set_cr4. */
4271 exec_control
&= ~SECONDARY_EXEC_DESC
;
4273 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4275 We can NOT enable shadow_vmcs here because we don't have yet
4278 exec_control
&= ~SECONDARY_EXEC_SHADOW_VMCS
;
4281 exec_control
&= ~SECONDARY_EXEC_ENABLE_PML
;
4283 if (cpu_has_vmx_xsaves()) {
4284 /* Exposing XSAVES only when XSAVE is exposed */
4285 bool xsaves_enabled
=
4286 boot_cpu_has(X86_FEATURE_XSAVE
) &&
4287 guest_cpuid_has(vcpu
, X86_FEATURE_XSAVE
) &&
4288 guest_cpuid_has(vcpu
, X86_FEATURE_XSAVES
);
4290 vcpu
->arch
.xsaves_enabled
= xsaves_enabled
;
4292 vmx_adjust_secondary_exec_control(vmx
, &exec_control
,
4293 SECONDARY_EXEC_XSAVES
,
4294 xsaves_enabled
, false);
4297 vmx_adjust_sec_exec_feature(vmx
, &exec_control
, rdtscp
, RDTSCP
);
4300 * Expose INVPCID if and only if PCID is also exposed to the guest.
4301 * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
4302 * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
4303 * behavior from the guest perspective (it would expect #GP or #PF).
4305 if (!guest_cpuid_has(vcpu
, X86_FEATURE_PCID
))
4306 guest_cpuid_clear(vcpu
, X86_FEATURE_INVPCID
);
4307 vmx_adjust_sec_exec_feature(vmx
, &exec_control
, invpcid
, INVPCID
);
4310 vmx_adjust_sec_exec_exiting(vmx
, &exec_control
, rdrand
, RDRAND
);
4311 vmx_adjust_sec_exec_exiting(vmx
, &exec_control
, rdseed
, RDSEED
);
4313 vmx_adjust_sec_exec_control(vmx
, &exec_control
, waitpkg
, WAITPKG
,
4314 ENABLE_USR_WAIT_PAUSE
, false);
4316 if (!vcpu
->kvm
->arch
.bus_lock_detection_enabled
)
4317 exec_control
&= ~SECONDARY_EXEC_BUS_LOCK_DETECTION
;
4319 vmx
->secondary_exec_control
= exec_control
;
4322 static void ept_set_mmio_spte_mask(void)
4325 * EPT Misconfigurations can be generated if the value of bits 2:0
4326 * of an EPT paging-structure entry is 110b (write/execute).
4328 kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE
, 0);
4331 #define VMX_XSS_EXIT_BITMAP 0
4334 * Noting that the initialization of Guest-state Area of VMCS is in
4337 static void init_vmcs(struct vcpu_vmx
*vmx
)
4340 nested_vmx_set_vmcs_shadowing_bitmap();
4342 if (cpu_has_vmx_msr_bitmap())
4343 vmcs_write64(MSR_BITMAP
, __pa(vmx
->vmcs01
.msr_bitmap
));
4345 vmcs_write64(VMCS_LINK_POINTER
, -1ull); /* 22.3.1.5 */
4348 pin_controls_set(vmx
, vmx_pin_based_exec_ctrl(vmx
));
4350 exec_controls_set(vmx
, vmx_exec_control(vmx
));
4352 if (cpu_has_secondary_exec_ctrls()) {
4353 vmx_compute_secondary_exec_control(vmx
);
4354 secondary_exec_controls_set(vmx
, vmx
->secondary_exec_control
);
4357 if (kvm_vcpu_apicv_active(&vmx
->vcpu
)) {
4358 vmcs_write64(EOI_EXIT_BITMAP0
, 0);
4359 vmcs_write64(EOI_EXIT_BITMAP1
, 0);
4360 vmcs_write64(EOI_EXIT_BITMAP2
, 0);
4361 vmcs_write64(EOI_EXIT_BITMAP3
, 0);
4363 vmcs_write16(GUEST_INTR_STATUS
, 0);
4365 vmcs_write16(POSTED_INTR_NV
, POSTED_INTR_VECTOR
);
4366 vmcs_write64(POSTED_INTR_DESC_ADDR
, __pa((&vmx
->pi_desc
)));
4369 if (!kvm_pause_in_guest(vmx
->vcpu
.kvm
)) {
4370 vmcs_write32(PLE_GAP
, ple_gap
);
4371 vmx
->ple_window
= ple_window
;
4372 vmx
->ple_window_dirty
= true;
4375 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK
, 0);
4376 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH
, 0);
4377 vmcs_write32(CR3_TARGET_COUNT
, 0); /* 22.2.1 */
4379 vmcs_write16(HOST_FS_SELECTOR
, 0); /* 22.2.4 */
4380 vmcs_write16(HOST_GS_SELECTOR
, 0); /* 22.2.4 */
4381 vmx_set_constant_host_state(vmx
);
4382 vmcs_writel(HOST_FS_BASE
, 0); /* 22.2.4 */
4383 vmcs_writel(HOST_GS_BASE
, 0); /* 22.2.4 */
4385 if (cpu_has_vmx_vmfunc())
4386 vmcs_write64(VM_FUNCTION_CONTROL
, 0);
4388 vmcs_write32(VM_EXIT_MSR_STORE_COUNT
, 0);
4389 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT
, 0);
4390 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR
, __pa(vmx
->msr_autoload
.host
.val
));
4391 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT
, 0);
4392 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR
, __pa(vmx
->msr_autoload
.guest
.val
));
4394 if (vmcs_config
.vmentry_ctrl
& VM_ENTRY_LOAD_IA32_PAT
)
4395 vmcs_write64(GUEST_IA32_PAT
, vmx
->vcpu
.arch
.pat
);
4397 vm_exit_controls_set(vmx
, vmx_vmexit_ctrl());
4399 /* 22.2.1, 20.8.1 */
4400 vm_entry_controls_set(vmx
, vmx_vmentry_ctrl());
4402 vmx
->vcpu
.arch
.cr0_guest_owned_bits
= KVM_POSSIBLE_CR0_GUEST_BITS
;
4403 vmcs_writel(CR0_GUEST_HOST_MASK
, ~vmx
->vcpu
.arch
.cr0_guest_owned_bits
);
4405 set_cr4_guest_host_mask(vmx
);
4408 vmcs_write16(VIRTUAL_PROCESSOR_ID
, vmx
->vpid
);
4410 if (cpu_has_vmx_xsaves())
4411 vmcs_write64(XSS_EXIT_BITMAP
, VMX_XSS_EXIT_BITMAP
);
4414 vmcs_write64(PML_ADDRESS
, page_to_phys(vmx
->pml_pg
));
4415 vmcs_write16(GUEST_PML_INDEX
, PML_ENTITY_NUM
- 1);
4418 if (cpu_has_vmx_encls_vmexit())
4419 vmcs_write64(ENCLS_EXITING_BITMAP
, -1ull);
4421 if (vmx_pt_mode_is_host_guest()) {
4422 memset(&vmx
->pt_desc
, 0, sizeof(vmx
->pt_desc
));
4423 /* Bit[6~0] are forced to 1, writes are ignored. */
4424 vmx
->pt_desc
.guest
.output_mask
= 0x7F;
4425 vmcs_write64(GUEST_IA32_RTIT_CTL
, 0);
4429 static void vmx_vcpu_reset(struct kvm_vcpu
*vcpu
, bool init_event
)
4431 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4432 struct msr_data apic_base_msr
;
4435 vmx
->rmode
.vm86_active
= 0;
4438 vmx
->msr_ia32_umwait_control
= 0;
4440 vmx
->vcpu
.arch
.regs
[VCPU_REGS_RDX
] = get_rdx_init_val();
4441 vmx
->hv_deadline_tsc
= -1;
4442 kvm_set_cr8(vcpu
, 0);
4445 apic_base_msr
.data
= APIC_DEFAULT_PHYS_BASE
|
4446 MSR_IA32_APICBASE_ENABLE
;
4447 if (kvm_vcpu_is_reset_bsp(vcpu
))
4448 apic_base_msr
.data
|= MSR_IA32_APICBASE_BSP
;
4449 apic_base_msr
.host_initiated
= true;
4450 kvm_set_apic_base(vcpu
, &apic_base_msr
);
4453 vmx_segment_cache_clear(vmx
);
4455 seg_setup(VCPU_SREG_CS
);
4456 vmcs_write16(GUEST_CS_SELECTOR
, 0xf000);
4457 vmcs_writel(GUEST_CS_BASE
, 0xffff0000ul
);
4459 seg_setup(VCPU_SREG_DS
);
4460 seg_setup(VCPU_SREG_ES
);
4461 seg_setup(VCPU_SREG_FS
);
4462 seg_setup(VCPU_SREG_GS
);
4463 seg_setup(VCPU_SREG_SS
);
4465 vmcs_write16(GUEST_TR_SELECTOR
, 0);
4466 vmcs_writel(GUEST_TR_BASE
, 0);
4467 vmcs_write32(GUEST_TR_LIMIT
, 0xffff);
4468 vmcs_write32(GUEST_TR_AR_BYTES
, 0x008b);
4470 vmcs_write16(GUEST_LDTR_SELECTOR
, 0);
4471 vmcs_writel(GUEST_LDTR_BASE
, 0);
4472 vmcs_write32(GUEST_LDTR_LIMIT
, 0xffff);
4473 vmcs_write32(GUEST_LDTR_AR_BYTES
, 0x00082);
4476 vmcs_write32(GUEST_SYSENTER_CS
, 0);
4477 vmcs_writel(GUEST_SYSENTER_ESP
, 0);
4478 vmcs_writel(GUEST_SYSENTER_EIP
, 0);
4479 vmcs_write64(GUEST_IA32_DEBUGCTL
, 0);
4482 kvm_set_rflags(vcpu
, X86_EFLAGS_FIXED
);
4483 kvm_rip_write(vcpu
, 0xfff0);
4485 vmcs_writel(GUEST_GDTR_BASE
, 0);
4486 vmcs_write32(GUEST_GDTR_LIMIT
, 0xffff);
4488 vmcs_writel(GUEST_IDTR_BASE
, 0);
4489 vmcs_write32(GUEST_IDTR_LIMIT
, 0xffff);
4491 vmcs_write32(GUEST_ACTIVITY_STATE
, GUEST_ACTIVITY_ACTIVE
);
4492 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO
, 0);
4493 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS
, 0);
4494 if (kvm_mpx_supported())
4495 vmcs_write64(GUEST_BNDCFGS
, 0);
4499 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
, 0); /* 22.2.1 */
4501 if (cpu_has_vmx_tpr_shadow() && !init_event
) {
4502 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR
, 0);
4503 if (cpu_need_tpr_shadow(vcpu
))
4504 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR
,
4505 __pa(vcpu
->arch
.apic
->regs
));
4506 vmcs_write32(TPR_THRESHOLD
, 0);
4509 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD
, vcpu
);
4511 cr0
= X86_CR0_NW
| X86_CR0_CD
| X86_CR0_ET
;
4512 vmx
->vcpu
.arch
.cr0
= cr0
;
4513 vmx_set_cr0(vcpu
, cr0
); /* enter rmode */
4514 vmx_set_cr4(vcpu
, 0);
4515 vmx_set_efer(vcpu
, 0);
4517 vmx_update_exception_bitmap(vcpu
);
4519 vpid_sync_context(vmx
->vpid
);
4521 vmx_clear_hlt(vcpu
);
4524 static void vmx_enable_irq_window(struct kvm_vcpu
*vcpu
)
4526 exec_controls_setbit(to_vmx(vcpu
), CPU_BASED_INTR_WINDOW_EXITING
);
4529 static void vmx_enable_nmi_window(struct kvm_vcpu
*vcpu
)
4532 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
) & GUEST_INTR_STATE_STI
) {
4533 vmx_enable_irq_window(vcpu
);
4537 exec_controls_setbit(to_vmx(vcpu
), CPU_BASED_NMI_WINDOW_EXITING
);
4540 static void vmx_inject_irq(struct kvm_vcpu
*vcpu
)
4542 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4544 int irq
= vcpu
->arch
.interrupt
.nr
;
4546 trace_kvm_inj_virq(irq
);
4548 ++vcpu
->stat
.irq_injections
;
4549 if (vmx
->rmode
.vm86_active
) {
4551 if (vcpu
->arch
.interrupt
.soft
)
4552 inc_eip
= vcpu
->arch
.event_exit_inst_len
;
4553 kvm_inject_realmode_interrupt(vcpu
, irq
, inc_eip
);
4556 intr
= irq
| INTR_INFO_VALID_MASK
;
4557 if (vcpu
->arch
.interrupt
.soft
) {
4558 intr
|= INTR_TYPE_SOFT_INTR
;
4559 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN
,
4560 vmx
->vcpu
.arch
.event_exit_inst_len
);
4562 intr
|= INTR_TYPE_EXT_INTR
;
4563 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
, intr
);
4565 vmx_clear_hlt(vcpu
);
4568 static void vmx_inject_nmi(struct kvm_vcpu
*vcpu
)
4570 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4574 * Tracking the NMI-blocked state in software is built upon
4575 * finding the next open IRQ window. This, in turn, depends on
4576 * well-behaving guests: They have to keep IRQs disabled at
4577 * least as long as the NMI handler runs. Otherwise we may
4578 * cause NMI nesting, maybe breaking the guest. But as this is
4579 * highly unlikely, we can live with the residual risk.
4581 vmx
->loaded_vmcs
->soft_vnmi_blocked
= 1;
4582 vmx
->loaded_vmcs
->vnmi_blocked_time
= 0;
4585 ++vcpu
->stat
.nmi_injections
;
4586 vmx
->loaded_vmcs
->nmi_known_unmasked
= false;
4588 if (vmx
->rmode
.vm86_active
) {
4589 kvm_inject_realmode_interrupt(vcpu
, NMI_VECTOR
, 0);
4593 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD
,
4594 INTR_TYPE_NMI_INTR
| INTR_INFO_VALID_MASK
| NMI_VECTOR
);
4596 vmx_clear_hlt(vcpu
);
4599 bool vmx_get_nmi_mask(struct kvm_vcpu
*vcpu
)
4601 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4605 return vmx
->loaded_vmcs
->soft_vnmi_blocked
;
4606 if (vmx
->loaded_vmcs
->nmi_known_unmasked
)
4608 masked
= vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
) & GUEST_INTR_STATE_NMI
;
4609 vmx
->loaded_vmcs
->nmi_known_unmasked
= !masked
;
4613 void vmx_set_nmi_mask(struct kvm_vcpu
*vcpu
, bool masked
)
4615 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4618 if (vmx
->loaded_vmcs
->soft_vnmi_blocked
!= masked
) {
4619 vmx
->loaded_vmcs
->soft_vnmi_blocked
= masked
;
4620 vmx
->loaded_vmcs
->vnmi_blocked_time
= 0;
4623 vmx
->loaded_vmcs
->nmi_known_unmasked
= !masked
;
4625 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO
,
4626 GUEST_INTR_STATE_NMI
);
4628 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO
,
4629 GUEST_INTR_STATE_NMI
);
4633 bool vmx_nmi_blocked(struct kvm_vcpu
*vcpu
)
4635 if (is_guest_mode(vcpu
) && nested_exit_on_nmi(vcpu
))
4638 if (!enable_vnmi
&& to_vmx(vcpu
)->loaded_vmcs
->soft_vnmi_blocked
)
4641 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
) &
4642 (GUEST_INTR_STATE_MOV_SS
| GUEST_INTR_STATE_STI
|
4643 GUEST_INTR_STATE_NMI
));
4646 static int vmx_nmi_allowed(struct kvm_vcpu
*vcpu
, bool for_injection
)
4648 if (to_vmx(vcpu
)->nested
.nested_run_pending
)
4651 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
4652 if (for_injection
&& is_guest_mode(vcpu
) && nested_exit_on_nmi(vcpu
))
4655 return !vmx_nmi_blocked(vcpu
);
4658 bool vmx_interrupt_blocked(struct kvm_vcpu
*vcpu
)
4660 if (is_guest_mode(vcpu
) && nested_exit_on_intr(vcpu
))
4663 return !(vmx_get_rflags(vcpu
) & X86_EFLAGS_IF
) ||
4664 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
) &
4665 (GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_MOV_SS
));
4668 static int vmx_interrupt_allowed(struct kvm_vcpu
*vcpu
, bool for_injection
)
4670 if (to_vmx(vcpu
)->nested
.nested_run_pending
)
4674 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
4675 * e.g. if the IRQ arrived asynchronously after checking nested events.
4677 if (for_injection
&& is_guest_mode(vcpu
) && nested_exit_on_intr(vcpu
))
4680 return !vmx_interrupt_blocked(vcpu
);
4683 static int vmx_set_tss_addr(struct kvm
*kvm
, unsigned int addr
)
4687 if (enable_unrestricted_guest
)
4690 mutex_lock(&kvm
->slots_lock
);
4691 ret
= __x86_set_memory_region(kvm
, TSS_PRIVATE_MEMSLOT
, addr
,
4693 mutex_unlock(&kvm
->slots_lock
);
4696 return PTR_ERR(ret
);
4698 to_kvm_vmx(kvm
)->tss_addr
= addr
;
4700 return init_rmode_tss(kvm
, ret
);
4703 static int vmx_set_identity_map_addr(struct kvm
*kvm
, u64 ident_addr
)
4705 to_kvm_vmx(kvm
)->ept_identity_map_addr
= ident_addr
;
4709 static bool rmode_exception(struct kvm_vcpu
*vcpu
, int vec
)
4714 * Update instruction length as we may reinject the exception
4715 * from user space while in guest debugging mode.
4717 to_vmx(vcpu
)->vcpu
.arch
.event_exit_inst_len
=
4718 vmcs_read32(VM_EXIT_INSTRUCTION_LEN
);
4719 if (vcpu
->guest_debug
& KVM_GUESTDBG_USE_SW_BP
)
4723 return !(vcpu
->guest_debug
&
4724 (KVM_GUESTDBG_SINGLESTEP
| KVM_GUESTDBG_USE_HW_BP
));
4738 static int handle_rmode_exception(struct kvm_vcpu
*vcpu
,
4739 int vec
, u32 err_code
)
4742 * Instruction with address size override prefix opcode 0x67
4743 * Cause the #SS fault with 0 error code in VM86 mode.
4745 if (((vec
== GP_VECTOR
) || (vec
== SS_VECTOR
)) && err_code
== 0) {
4746 if (kvm_emulate_instruction(vcpu
, 0)) {
4747 if (vcpu
->arch
.halt_request
) {
4748 vcpu
->arch
.halt_request
= 0;
4749 return kvm_vcpu_halt(vcpu
);
4757 * Forward all other exceptions that are valid in real mode.
4758 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4759 * the required debugging infrastructure rework.
4761 kvm_queue_exception(vcpu
, vec
);
4765 static int handle_machine_check(struct kvm_vcpu
*vcpu
)
4767 /* handled by vmx_vcpu_run() */
4772 * If the host has split lock detection disabled, then #AC is
4773 * unconditionally injected into the guest, which is the pre split lock
4774 * detection behaviour.
4776 * If the host has split lock detection enabled then #AC is
4777 * only injected into the guest when:
4778 * - Guest CPL == 3 (user mode)
4779 * - Guest has #AC detection enabled in CR0
4780 * - Guest EFLAGS has AC bit set
4782 static inline bool guest_inject_ac(struct kvm_vcpu
*vcpu
)
4784 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT
))
4787 return vmx_get_cpl(vcpu
) == 3 && kvm_read_cr0_bits(vcpu
, X86_CR0_AM
) &&
4788 (kvm_get_rflags(vcpu
) & X86_EFLAGS_AC
);
4791 static int handle_exception_nmi(struct kvm_vcpu
*vcpu
)
4793 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4794 struct kvm_run
*kvm_run
= vcpu
->run
;
4795 u32 intr_info
, ex_no
, error_code
;
4796 unsigned long cr2
, rip
, dr6
;
4799 vect_info
= vmx
->idt_vectoring_info
;
4800 intr_info
= vmx_get_intr_info(vcpu
);
4802 if (is_machine_check(intr_info
) || is_nmi(intr_info
))
4803 return 1; /* handled by handle_exception_nmi_irqoff() */
4805 if (is_invalid_opcode(intr_info
))
4806 return handle_ud(vcpu
);
4809 if (intr_info
& INTR_INFO_DELIVER_CODE_MASK
)
4810 error_code
= vmcs_read32(VM_EXIT_INTR_ERROR_CODE
);
4812 if (!vmx
->rmode
.vm86_active
&& is_gp_fault(intr_info
)) {
4813 WARN_ON_ONCE(!enable_vmware_backdoor
);
4816 * VMware backdoor emulation on #GP interception only handles
4817 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
4818 * error code on #GP.
4821 kvm_queue_exception_e(vcpu
, GP_VECTOR
, error_code
);
4824 return kvm_emulate_instruction(vcpu
, EMULTYPE_VMWARE_GP
);
4828 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
4829 * MMIO, it is better to report an internal error.
4830 * See the comments in vmx_handle_exit.
4832 if ((vect_info
& VECTORING_INFO_VALID_MASK
) &&
4833 !(is_page_fault(intr_info
) && !(error_code
& PFERR_RSVD_MASK
))) {
4834 vcpu
->run
->exit_reason
= KVM_EXIT_INTERNAL_ERROR
;
4835 vcpu
->run
->internal
.suberror
= KVM_INTERNAL_ERROR_SIMUL_EX
;
4836 vcpu
->run
->internal
.ndata
= 4;
4837 vcpu
->run
->internal
.data
[0] = vect_info
;
4838 vcpu
->run
->internal
.data
[1] = intr_info
;
4839 vcpu
->run
->internal
.data
[2] = error_code
;
4840 vcpu
->run
->internal
.data
[3] = vcpu
->arch
.last_vmentry_cpu
;
4844 if (is_page_fault(intr_info
)) {
4845 cr2
= vmx_get_exit_qual(vcpu
);
4846 if (enable_ept
&& !vcpu
->arch
.apf
.host_apf_flags
) {
4848 * EPT will cause page fault only if we need to
4849 * detect illegal GPAs.
4851 WARN_ON_ONCE(!allow_smaller_maxphyaddr
);
4852 kvm_fixup_and_inject_pf_error(vcpu
, cr2
, error_code
);
4855 return kvm_handle_page_fault(vcpu
, error_code
, cr2
, NULL
, 0);
4858 ex_no
= intr_info
& INTR_INFO_VECTOR_MASK
;
4860 if (vmx
->rmode
.vm86_active
&& rmode_exception(vcpu
, ex_no
))
4861 return handle_rmode_exception(vcpu
, ex_no
, error_code
);
4865 dr6
= vmx_get_exit_qual(vcpu
);
4866 if (!(vcpu
->guest_debug
&
4867 (KVM_GUESTDBG_SINGLESTEP
| KVM_GUESTDBG_USE_HW_BP
))) {
4868 if (is_icebp(intr_info
))
4869 WARN_ON(!skip_emulated_instruction(vcpu
));
4871 kvm_queue_exception_p(vcpu
, DB_VECTOR
, dr6
);
4874 kvm_run
->debug
.arch
.dr6
= dr6
| DR6_ACTIVE_LOW
;
4875 kvm_run
->debug
.arch
.dr7
= vmcs_readl(GUEST_DR7
);
4879 * Update instruction length as we may reinject #BP from
4880 * user space while in guest debugging mode. Reading it for
4881 * #DB as well causes no harm, it is not used in that case.
4883 vmx
->vcpu
.arch
.event_exit_inst_len
=
4884 vmcs_read32(VM_EXIT_INSTRUCTION_LEN
);
4885 kvm_run
->exit_reason
= KVM_EXIT_DEBUG
;
4886 rip
= kvm_rip_read(vcpu
);
4887 kvm_run
->debug
.arch
.pc
= vmcs_readl(GUEST_CS_BASE
) + rip
;
4888 kvm_run
->debug
.arch
.exception
= ex_no
;
4891 if (guest_inject_ac(vcpu
)) {
4892 kvm_queue_exception_e(vcpu
, AC_VECTOR
, error_code
);
4897 * Handle split lock. Depending on detection mode this will
4898 * either warn and disable split lock detection for this
4899 * task or force SIGBUS on it.
4901 if (handle_guest_split_lock(kvm_rip_read(vcpu
)))
4905 kvm_run
->exit_reason
= KVM_EXIT_EXCEPTION
;
4906 kvm_run
->ex
.exception
= ex_no
;
4907 kvm_run
->ex
.error_code
= error_code
;
4913 static __always_inline
int handle_external_interrupt(struct kvm_vcpu
*vcpu
)
4915 ++vcpu
->stat
.irq_exits
;
4919 static int handle_triple_fault(struct kvm_vcpu
*vcpu
)
4921 vcpu
->run
->exit_reason
= KVM_EXIT_SHUTDOWN
;
4922 vcpu
->mmio_needed
= 0;
4926 static int handle_io(struct kvm_vcpu
*vcpu
)
4928 unsigned long exit_qualification
;
4929 int size
, in
, string
;
4932 exit_qualification
= vmx_get_exit_qual(vcpu
);
4933 string
= (exit_qualification
& 16) != 0;
4935 ++vcpu
->stat
.io_exits
;
4938 return kvm_emulate_instruction(vcpu
, 0);
4940 port
= exit_qualification
>> 16;
4941 size
= (exit_qualification
& 7) + 1;
4942 in
= (exit_qualification
& 8) != 0;
4944 return kvm_fast_pio(vcpu
, size
, port
, in
);
4948 vmx_patch_hypercall(struct kvm_vcpu
*vcpu
, unsigned char *hypercall
)
4951 * Patch in the VMCALL instruction:
4953 hypercall
[0] = 0x0f;
4954 hypercall
[1] = 0x01;
4955 hypercall
[2] = 0xc1;
4958 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4959 static int handle_set_cr0(struct kvm_vcpu
*vcpu
, unsigned long val
)
4961 if (is_guest_mode(vcpu
)) {
4962 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
4963 unsigned long orig_val
= val
;
4966 * We get here when L2 changed cr0 in a way that did not change
4967 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
4968 * but did change L0 shadowed bits. So we first calculate the
4969 * effective cr0 value that L1 would like to write into the
4970 * hardware. It consists of the L2-owned bits from the new
4971 * value combined with the L1-owned bits from L1's guest_cr0.
4973 val
= (val
& ~vmcs12
->cr0_guest_host_mask
) |
4974 (vmcs12
->guest_cr0
& vmcs12
->cr0_guest_host_mask
);
4976 if (!nested_guest_cr0_valid(vcpu
, val
))
4979 if (kvm_set_cr0(vcpu
, val
))
4981 vmcs_writel(CR0_READ_SHADOW
, orig_val
);
4984 if (to_vmx(vcpu
)->nested
.vmxon
&&
4985 !nested_host_cr0_valid(vcpu
, val
))
4988 return kvm_set_cr0(vcpu
, val
);
4992 static int handle_set_cr4(struct kvm_vcpu
*vcpu
, unsigned long val
)
4994 if (is_guest_mode(vcpu
)) {
4995 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
4996 unsigned long orig_val
= val
;
4998 /* analogously to handle_set_cr0 */
4999 val
= (val
& ~vmcs12
->cr4_guest_host_mask
) |
5000 (vmcs12
->guest_cr4
& vmcs12
->cr4_guest_host_mask
);
5001 if (kvm_set_cr4(vcpu
, val
))
5003 vmcs_writel(CR4_READ_SHADOW
, orig_val
);
5006 return kvm_set_cr4(vcpu
, val
);
5009 static int handle_desc(struct kvm_vcpu
*vcpu
)
5011 WARN_ON(!(vcpu
->arch
.cr4
& X86_CR4_UMIP
));
5012 return kvm_emulate_instruction(vcpu
, 0);
5015 static int handle_cr(struct kvm_vcpu
*vcpu
)
5017 unsigned long exit_qualification
, val
;
5023 exit_qualification
= vmx_get_exit_qual(vcpu
);
5024 cr
= exit_qualification
& 15;
5025 reg
= (exit_qualification
>> 8) & 15;
5026 switch ((exit_qualification
>> 4) & 3) {
5027 case 0: /* mov to cr */
5028 val
= kvm_register_readl(vcpu
, reg
);
5029 trace_kvm_cr_write(cr
, val
);
5032 err
= handle_set_cr0(vcpu
, val
);
5033 return kvm_complete_insn_gp(vcpu
, err
);
5035 WARN_ON_ONCE(enable_unrestricted_guest
);
5036 err
= kvm_set_cr3(vcpu
, val
);
5037 return kvm_complete_insn_gp(vcpu
, err
);
5039 err
= handle_set_cr4(vcpu
, val
);
5040 return kvm_complete_insn_gp(vcpu
, err
);
5042 u8 cr8_prev
= kvm_get_cr8(vcpu
);
5044 err
= kvm_set_cr8(vcpu
, cr8
);
5045 ret
= kvm_complete_insn_gp(vcpu
, err
);
5046 if (lapic_in_kernel(vcpu
))
5048 if (cr8_prev
<= cr8
)
5051 * TODO: we might be squashing a
5052 * KVM_GUESTDBG_SINGLESTEP-triggered
5053 * KVM_EXIT_DEBUG here.
5055 vcpu
->run
->exit_reason
= KVM_EXIT_SET_TPR
;
5061 WARN_ONCE(1, "Guest should always own CR0.TS");
5062 vmx_set_cr0(vcpu
, kvm_read_cr0_bits(vcpu
, ~X86_CR0_TS
));
5063 trace_kvm_cr_write(0, kvm_read_cr0(vcpu
));
5064 return kvm_skip_emulated_instruction(vcpu
);
5065 case 1: /*mov from cr*/
5068 WARN_ON_ONCE(enable_unrestricted_guest
);
5069 val
= kvm_read_cr3(vcpu
);
5070 kvm_register_write(vcpu
, reg
, val
);
5071 trace_kvm_cr_read(cr
, val
);
5072 return kvm_skip_emulated_instruction(vcpu
);
5074 val
= kvm_get_cr8(vcpu
);
5075 kvm_register_write(vcpu
, reg
, val
);
5076 trace_kvm_cr_read(cr
, val
);
5077 return kvm_skip_emulated_instruction(vcpu
);
5081 val
= (exit_qualification
>> LMSW_SOURCE_DATA_SHIFT
) & 0x0f;
5082 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu
) & ~0xful
) | val
);
5083 kvm_lmsw(vcpu
, val
);
5085 return kvm_skip_emulated_instruction(vcpu
);
5089 vcpu
->run
->exit_reason
= 0;
5090 vcpu_unimpl(vcpu
, "unhandled control register: op %d cr %d\n",
5091 (int)(exit_qualification
>> 4) & 3, cr
);
5095 static int handle_dr(struct kvm_vcpu
*vcpu
)
5097 unsigned long exit_qualification
;
5101 exit_qualification
= vmx_get_exit_qual(vcpu
);
5102 dr
= exit_qualification
& DEBUG_REG_ACCESS_NUM
;
5104 /* First, if DR does not exist, trigger UD */
5105 if (!kvm_require_dr(vcpu
, dr
))
5108 if (kvm_x86_ops
.get_cpl(vcpu
) > 0)
5111 dr7
= vmcs_readl(GUEST_DR7
);
5114 * As the vm-exit takes precedence over the debug trap, we
5115 * need to emulate the latter, either for the host or the
5116 * guest debugging itself.
5118 if (vcpu
->guest_debug
& KVM_GUESTDBG_USE_HW_BP
) {
5119 vcpu
->run
->debug
.arch
.dr6
= DR6_BD
| DR6_ACTIVE_LOW
;
5120 vcpu
->run
->debug
.arch
.dr7
= dr7
;
5121 vcpu
->run
->debug
.arch
.pc
= kvm_get_linear_rip(vcpu
);
5122 vcpu
->run
->debug
.arch
.exception
= DB_VECTOR
;
5123 vcpu
->run
->exit_reason
= KVM_EXIT_DEBUG
;
5126 kvm_queue_exception_p(vcpu
, DB_VECTOR
, DR6_BD
);
5131 if (vcpu
->guest_debug
== 0) {
5132 exec_controls_clearbit(to_vmx(vcpu
), CPU_BASED_MOV_DR_EXITING
);
5135 * No more DR vmexits; force a reload of the debug registers
5136 * and reenter on this instruction. The next vmexit will
5137 * retrieve the full state of the debug registers.
5139 vcpu
->arch
.switch_db_regs
|= KVM_DEBUGREG_WONT_EXIT
;
5143 reg
= DEBUG_REG_ACCESS_REG(exit_qualification
);
5144 if (exit_qualification
& TYPE_MOV_FROM_DR
) {
5147 kvm_get_dr(vcpu
, dr
, &val
);
5148 kvm_register_write(vcpu
, reg
, val
);
5151 err
= kvm_set_dr(vcpu
, dr
, kvm_register_readl(vcpu
, reg
));
5155 return kvm_complete_insn_gp(vcpu
, err
);
5158 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu
*vcpu
)
5160 get_debugreg(vcpu
->arch
.db
[0], 0);
5161 get_debugreg(vcpu
->arch
.db
[1], 1);
5162 get_debugreg(vcpu
->arch
.db
[2], 2);
5163 get_debugreg(vcpu
->arch
.db
[3], 3);
5164 get_debugreg(vcpu
->arch
.dr6
, 6);
5165 vcpu
->arch
.dr7
= vmcs_readl(GUEST_DR7
);
5167 vcpu
->arch
.switch_db_regs
&= ~KVM_DEBUGREG_WONT_EXIT
;
5168 exec_controls_setbit(to_vmx(vcpu
), CPU_BASED_MOV_DR_EXITING
);
5171 static void vmx_set_dr7(struct kvm_vcpu
*vcpu
, unsigned long val
)
5173 vmcs_writel(GUEST_DR7
, val
);
5176 static int handle_tpr_below_threshold(struct kvm_vcpu
*vcpu
)
5178 kvm_apic_update_ppr(vcpu
);
5182 static int handle_interrupt_window(struct kvm_vcpu
*vcpu
)
5184 exec_controls_clearbit(to_vmx(vcpu
), CPU_BASED_INTR_WINDOW_EXITING
);
5186 kvm_make_request(KVM_REQ_EVENT
, vcpu
);
5188 ++vcpu
->stat
.irq_window_exits
;
5192 static int handle_vmcall(struct kvm_vcpu
*vcpu
)
5194 return kvm_emulate_hypercall(vcpu
);
5197 static int handle_invd(struct kvm_vcpu
*vcpu
)
5199 /* Treat an INVD instruction as a NOP and just skip it. */
5200 return kvm_skip_emulated_instruction(vcpu
);
5203 static int handle_invlpg(struct kvm_vcpu
*vcpu
)
5205 unsigned long exit_qualification
= vmx_get_exit_qual(vcpu
);
5207 kvm_mmu_invlpg(vcpu
, exit_qualification
);
5208 return kvm_skip_emulated_instruction(vcpu
);
5211 static int handle_rdpmc(struct kvm_vcpu
*vcpu
)
5215 err
= kvm_rdpmc(vcpu
);
5216 return kvm_complete_insn_gp(vcpu
, err
);
5219 static int handle_wbinvd(struct kvm_vcpu
*vcpu
)
5221 return kvm_emulate_wbinvd(vcpu
);
5224 static int handle_xsetbv(struct kvm_vcpu
*vcpu
)
5226 u64 new_bv
= kvm_read_edx_eax(vcpu
);
5227 u32 index
= kvm_rcx_read(vcpu
);
5229 int err
= kvm_set_xcr(vcpu
, index
, new_bv
);
5230 return kvm_complete_insn_gp(vcpu
, err
);
5233 static int handle_apic_access(struct kvm_vcpu
*vcpu
)
5235 if (likely(fasteoi
)) {
5236 unsigned long exit_qualification
= vmx_get_exit_qual(vcpu
);
5237 int access_type
, offset
;
5239 access_type
= exit_qualification
& APIC_ACCESS_TYPE
;
5240 offset
= exit_qualification
& APIC_ACCESS_OFFSET
;
5242 * Sane guest uses MOV to write EOI, with written value
5243 * not cared. So make a short-circuit here by avoiding
5244 * heavy instruction emulation.
5246 if ((access_type
== TYPE_LINEAR_APIC_INST_WRITE
) &&
5247 (offset
== APIC_EOI
)) {
5248 kvm_lapic_set_eoi(vcpu
);
5249 return kvm_skip_emulated_instruction(vcpu
);
5252 return kvm_emulate_instruction(vcpu
, 0);
5255 static int handle_apic_eoi_induced(struct kvm_vcpu
*vcpu
)
5257 unsigned long exit_qualification
= vmx_get_exit_qual(vcpu
);
5258 int vector
= exit_qualification
& 0xff;
5260 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5261 kvm_apic_set_eoi_accelerated(vcpu
, vector
);
5265 static int handle_apic_write(struct kvm_vcpu
*vcpu
)
5267 unsigned long exit_qualification
= vmx_get_exit_qual(vcpu
);
5268 u32 offset
= exit_qualification
& 0xfff;
5270 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5271 kvm_apic_write_nodecode(vcpu
, offset
);
5275 static int handle_task_switch(struct kvm_vcpu
*vcpu
)
5277 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5278 unsigned long exit_qualification
;
5279 bool has_error_code
= false;
5282 int reason
, type
, idt_v
, idt_index
;
5284 idt_v
= (vmx
->idt_vectoring_info
& VECTORING_INFO_VALID_MASK
);
5285 idt_index
= (vmx
->idt_vectoring_info
& VECTORING_INFO_VECTOR_MASK
);
5286 type
= (vmx
->idt_vectoring_info
& VECTORING_INFO_TYPE_MASK
);
5288 exit_qualification
= vmx_get_exit_qual(vcpu
);
5290 reason
= (u32
)exit_qualification
>> 30;
5291 if (reason
== TASK_SWITCH_GATE
&& idt_v
) {
5293 case INTR_TYPE_NMI_INTR
:
5294 vcpu
->arch
.nmi_injected
= false;
5295 vmx_set_nmi_mask(vcpu
, true);
5297 case INTR_TYPE_EXT_INTR
:
5298 case INTR_TYPE_SOFT_INTR
:
5299 kvm_clear_interrupt_queue(vcpu
);
5301 case INTR_TYPE_HARD_EXCEPTION
:
5302 if (vmx
->idt_vectoring_info
&
5303 VECTORING_INFO_DELIVER_CODE_MASK
) {
5304 has_error_code
= true;
5306 vmcs_read32(IDT_VECTORING_ERROR_CODE
);
5309 case INTR_TYPE_SOFT_EXCEPTION
:
5310 kvm_clear_exception_queue(vcpu
);
5316 tss_selector
= exit_qualification
;
5318 if (!idt_v
|| (type
!= INTR_TYPE_HARD_EXCEPTION
&&
5319 type
!= INTR_TYPE_EXT_INTR
&&
5320 type
!= INTR_TYPE_NMI_INTR
))
5321 WARN_ON(!skip_emulated_instruction(vcpu
));
5324 * TODO: What about debug traps on tss switch?
5325 * Are we supposed to inject them and update dr6?
5327 return kvm_task_switch(vcpu
, tss_selector
,
5328 type
== INTR_TYPE_SOFT_INTR
? idt_index
: -1,
5329 reason
, has_error_code
, error_code
);
5332 static int handle_ept_violation(struct kvm_vcpu
*vcpu
)
5334 unsigned long exit_qualification
;
5338 exit_qualification
= vmx_get_exit_qual(vcpu
);
5341 * EPT violation happened while executing iret from NMI,
5342 * "blocked by NMI" bit has to be set before next VM entry.
5343 * There are errata that may cause this bit to not be set:
5346 if (!(to_vmx(vcpu
)->idt_vectoring_info
& VECTORING_INFO_VALID_MASK
) &&
5348 (exit_qualification
& INTR_INFO_UNBLOCK_NMI
))
5349 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO
, GUEST_INTR_STATE_NMI
);
5351 gpa
= vmcs_read64(GUEST_PHYSICAL_ADDRESS
);
5352 trace_kvm_page_fault(gpa
, exit_qualification
);
5354 /* Is it a read fault? */
5355 error_code
= (exit_qualification
& EPT_VIOLATION_ACC_READ
)
5356 ? PFERR_USER_MASK
: 0;
5357 /* Is it a write fault? */
5358 error_code
|= (exit_qualification
& EPT_VIOLATION_ACC_WRITE
)
5359 ? PFERR_WRITE_MASK
: 0;
5360 /* Is it a fetch fault? */
5361 error_code
|= (exit_qualification
& EPT_VIOLATION_ACC_INSTR
)
5362 ? PFERR_FETCH_MASK
: 0;
5363 /* ept page table entry is present? */
5364 error_code
|= (exit_qualification
&
5365 (EPT_VIOLATION_READABLE
| EPT_VIOLATION_WRITABLE
|
5366 EPT_VIOLATION_EXECUTABLE
))
5367 ? PFERR_PRESENT_MASK
: 0;
5369 error_code
|= (exit_qualification
& 0x100) != 0 ?
5370 PFERR_GUEST_FINAL_MASK
: PFERR_GUEST_PAGE_MASK
;
5372 vcpu
->arch
.exit_qualification
= exit_qualification
;
5375 * Check that the GPA doesn't exceed physical memory limits, as that is
5376 * a guest page fault. We have to emulate the instruction here, because
5377 * if the illegal address is that of a paging structure, then
5378 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5379 * would also use advanced VM-exit information for EPT violations to
5380 * reconstruct the page fault error code.
5382 if (unlikely(allow_smaller_maxphyaddr
&& kvm_vcpu_is_illegal_gpa(vcpu
, gpa
)))
5383 return kvm_emulate_instruction(vcpu
, 0);
5385 return kvm_mmu_page_fault(vcpu
, gpa
, error_code
, NULL
, 0);
5388 static int handle_ept_misconfig(struct kvm_vcpu
*vcpu
)
5393 * A nested guest cannot optimize MMIO vmexits, because we have an
5394 * nGPA here instead of the required GPA.
5396 gpa
= vmcs_read64(GUEST_PHYSICAL_ADDRESS
);
5397 if (!is_guest_mode(vcpu
) &&
5398 !kvm_io_bus_write(vcpu
, KVM_FAST_MMIO_BUS
, gpa
, 0, NULL
)) {
5399 trace_kvm_fast_mmio(gpa
);
5400 return kvm_skip_emulated_instruction(vcpu
);
5403 return kvm_mmu_page_fault(vcpu
, gpa
, PFERR_RSVD_MASK
, NULL
, 0);
5406 static int handle_nmi_window(struct kvm_vcpu
*vcpu
)
5408 WARN_ON_ONCE(!enable_vnmi
);
5409 exec_controls_clearbit(to_vmx(vcpu
), CPU_BASED_NMI_WINDOW_EXITING
);
5410 ++vcpu
->stat
.nmi_window_exits
;
5411 kvm_make_request(KVM_REQ_EVENT
, vcpu
);
5416 static int handle_invalid_guest_state(struct kvm_vcpu
*vcpu
)
5418 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5419 bool intr_window_requested
;
5420 unsigned count
= 130;
5422 intr_window_requested
= exec_controls_get(vmx
) &
5423 CPU_BASED_INTR_WINDOW_EXITING
;
5425 while (vmx
->emulation_required
&& count
-- != 0) {
5426 if (intr_window_requested
&& !vmx_interrupt_blocked(vcpu
))
5427 return handle_interrupt_window(&vmx
->vcpu
);
5429 if (kvm_test_request(KVM_REQ_EVENT
, vcpu
))
5432 if (!kvm_emulate_instruction(vcpu
, 0))
5435 if (vmx
->emulation_required
&& !vmx
->rmode
.vm86_active
&&
5436 vcpu
->arch
.exception
.pending
) {
5437 vcpu
->run
->exit_reason
= KVM_EXIT_INTERNAL_ERROR
;
5438 vcpu
->run
->internal
.suberror
=
5439 KVM_INTERNAL_ERROR_EMULATION
;
5440 vcpu
->run
->internal
.ndata
= 0;
5444 if (vcpu
->arch
.halt_request
) {
5445 vcpu
->arch
.halt_request
= 0;
5446 return kvm_vcpu_halt(vcpu
);
5450 * Note, return 1 and not 0, vcpu_run() will invoke
5451 * xfer_to_guest_mode() which will create a proper return
5454 if (__xfer_to_guest_mode_work_pending())
5461 static void grow_ple_window(struct kvm_vcpu
*vcpu
)
5463 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5464 unsigned int old
= vmx
->ple_window
;
5466 vmx
->ple_window
= __grow_ple_window(old
, ple_window
,
5470 if (vmx
->ple_window
!= old
) {
5471 vmx
->ple_window_dirty
= true;
5472 trace_kvm_ple_window_update(vcpu
->vcpu_id
,
5473 vmx
->ple_window
, old
);
5477 static void shrink_ple_window(struct kvm_vcpu
*vcpu
)
5479 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5480 unsigned int old
= vmx
->ple_window
;
5482 vmx
->ple_window
= __shrink_ple_window(old
, ple_window
,
5486 if (vmx
->ple_window
!= old
) {
5487 vmx
->ple_window_dirty
= true;
5488 trace_kvm_ple_window_update(vcpu
->vcpu_id
,
5489 vmx
->ple_window
, old
);
5493 static void vmx_enable_tdp(void)
5495 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK
,
5496 enable_ept_ad_bits
? VMX_EPT_ACCESS_BIT
: 0ull,
5497 enable_ept_ad_bits
? VMX_EPT_DIRTY_BIT
: 0ull,
5498 0ull, VMX_EPT_EXECUTABLE_MASK
,
5499 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK
,
5500 VMX_EPT_RWX_MASK
, 0ull);
5502 ept_set_mmio_spte_mask();
5506 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
5507 * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
5509 static int handle_pause(struct kvm_vcpu
*vcpu
)
5511 if (!kvm_pause_in_guest(vcpu
->kvm
))
5512 grow_ple_window(vcpu
);
5515 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5516 * VM-execution control is ignored if CPL > 0. OTOH, KVM
5517 * never set PAUSE_EXITING and just set PLE if supported,
5518 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
5520 kvm_vcpu_on_spin(vcpu
, true);
5521 return kvm_skip_emulated_instruction(vcpu
);
5524 static int handle_nop(struct kvm_vcpu
*vcpu
)
5526 return kvm_skip_emulated_instruction(vcpu
);
5529 static int handle_mwait(struct kvm_vcpu
*vcpu
)
5531 printk_once(KERN_WARNING
"kvm: MWAIT instruction emulated as NOP!\n");
5532 return handle_nop(vcpu
);
5535 static int handle_invalid_op(struct kvm_vcpu
*vcpu
)
5537 kvm_queue_exception(vcpu
, UD_VECTOR
);
5541 static int handle_monitor_trap(struct kvm_vcpu
*vcpu
)
5546 static int handle_monitor(struct kvm_vcpu
*vcpu
)
5548 printk_once(KERN_WARNING
"kvm: MONITOR instruction emulated as NOP!\n");
5549 return handle_nop(vcpu
);
5552 static int handle_invpcid(struct kvm_vcpu
*vcpu
)
5554 u32 vmx_instruction_info
;
5562 if (!guest_cpuid_has(vcpu
, X86_FEATURE_INVPCID
)) {
5563 kvm_queue_exception(vcpu
, UD_VECTOR
);
5567 vmx_instruction_info
= vmcs_read32(VMX_INSTRUCTION_INFO
);
5568 type
= kvm_register_readl(vcpu
, (vmx_instruction_info
>> 28) & 0xf);
5571 kvm_inject_gp(vcpu
, 0);
5575 /* According to the Intel instruction reference, the memory operand
5576 * is read even if it isn't needed (e.g., for type==all)
5578 if (get_vmx_mem_address(vcpu
, vmx_get_exit_qual(vcpu
),
5579 vmx_instruction_info
, false,
5580 sizeof(operand
), &gva
))
5583 return kvm_handle_invpcid(vcpu
, type
, gva
);
5586 static int handle_pml_full(struct kvm_vcpu
*vcpu
)
5588 unsigned long exit_qualification
;
5590 trace_kvm_pml_full(vcpu
->vcpu_id
);
5592 exit_qualification
= vmx_get_exit_qual(vcpu
);
5595 * PML buffer FULL happened while executing iret from NMI,
5596 * "blocked by NMI" bit has to be set before next VM entry.
5598 if (!(to_vmx(vcpu
)->idt_vectoring_info
& VECTORING_INFO_VALID_MASK
) &&
5600 (exit_qualification
& INTR_INFO_UNBLOCK_NMI
))
5601 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO
,
5602 GUEST_INTR_STATE_NMI
);
5605 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
5606 * here.., and there's no userspace involvement needed for PML.
5611 static fastpath_t
handle_fastpath_preemption_timer(struct kvm_vcpu
*vcpu
)
5613 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5615 if (!vmx
->req_immediate_exit
&&
5616 !unlikely(vmx
->loaded_vmcs
->hv_timer_soft_disabled
)) {
5617 kvm_lapic_expired_hv_timer(vcpu
);
5618 return EXIT_FASTPATH_REENTER_GUEST
;
5621 return EXIT_FASTPATH_NONE
;
5624 static int handle_preemption_timer(struct kvm_vcpu
*vcpu
)
5626 handle_fastpath_preemption_timer(vcpu
);
5631 * When nested=0, all VMX instruction VM Exits filter here. The handlers
5632 * are overwritten by nested_vmx_setup() when nested=1.
5634 static int handle_vmx_instruction(struct kvm_vcpu
*vcpu
)
5636 kvm_queue_exception(vcpu
, UD_VECTOR
);
5640 static int handle_encls(struct kvm_vcpu
*vcpu
)
5643 * SGX virtualization is not yet supported. There is no software
5644 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
5645 * to prevent the guest from executing ENCLS.
5647 kvm_queue_exception(vcpu
, UD_VECTOR
);
5651 static int handle_bus_lock_vmexit(struct kvm_vcpu
*vcpu
)
5653 vcpu
->run
->exit_reason
= KVM_EXIT_X86_BUS_LOCK
;
5654 vcpu
->run
->flags
|= KVM_RUN_X86_BUS_LOCK
;
5659 * The exit handlers return 1 if the exit was handled fully and guest execution
5660 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
5661 * to be done to userspace and return 0.
5663 static int (*kvm_vmx_exit_handlers
[])(struct kvm_vcpu
*vcpu
) = {
5664 [EXIT_REASON_EXCEPTION_NMI
] = handle_exception_nmi
,
5665 [EXIT_REASON_EXTERNAL_INTERRUPT
] = handle_external_interrupt
,
5666 [EXIT_REASON_TRIPLE_FAULT
] = handle_triple_fault
,
5667 [EXIT_REASON_NMI_WINDOW
] = handle_nmi_window
,
5668 [EXIT_REASON_IO_INSTRUCTION
] = handle_io
,
5669 [EXIT_REASON_CR_ACCESS
] = handle_cr
,
5670 [EXIT_REASON_DR_ACCESS
] = handle_dr
,
5671 [EXIT_REASON_CPUID
] = kvm_emulate_cpuid
,
5672 [EXIT_REASON_MSR_READ
] = kvm_emulate_rdmsr
,
5673 [EXIT_REASON_MSR_WRITE
] = kvm_emulate_wrmsr
,
5674 [EXIT_REASON_INTERRUPT_WINDOW
] = handle_interrupt_window
,
5675 [EXIT_REASON_HLT
] = kvm_emulate_halt
,
5676 [EXIT_REASON_INVD
] = handle_invd
,
5677 [EXIT_REASON_INVLPG
] = handle_invlpg
,
5678 [EXIT_REASON_RDPMC
] = handle_rdpmc
,
5679 [EXIT_REASON_VMCALL
] = handle_vmcall
,
5680 [EXIT_REASON_VMCLEAR
] = handle_vmx_instruction
,
5681 [EXIT_REASON_VMLAUNCH
] = handle_vmx_instruction
,
5682 [EXIT_REASON_VMPTRLD
] = handle_vmx_instruction
,
5683 [EXIT_REASON_VMPTRST
] = handle_vmx_instruction
,
5684 [EXIT_REASON_VMREAD
] = handle_vmx_instruction
,
5685 [EXIT_REASON_VMRESUME
] = handle_vmx_instruction
,
5686 [EXIT_REASON_VMWRITE
] = handle_vmx_instruction
,
5687 [EXIT_REASON_VMOFF
] = handle_vmx_instruction
,
5688 [EXIT_REASON_VMON
] = handle_vmx_instruction
,
5689 [EXIT_REASON_TPR_BELOW_THRESHOLD
] = handle_tpr_below_threshold
,
5690 [EXIT_REASON_APIC_ACCESS
] = handle_apic_access
,
5691 [EXIT_REASON_APIC_WRITE
] = handle_apic_write
,
5692 [EXIT_REASON_EOI_INDUCED
] = handle_apic_eoi_induced
,
5693 [EXIT_REASON_WBINVD
] = handle_wbinvd
,
5694 [EXIT_REASON_XSETBV
] = handle_xsetbv
,
5695 [EXIT_REASON_TASK_SWITCH
] = handle_task_switch
,
5696 [EXIT_REASON_MCE_DURING_VMENTRY
] = handle_machine_check
,
5697 [EXIT_REASON_GDTR_IDTR
] = handle_desc
,
5698 [EXIT_REASON_LDTR_TR
] = handle_desc
,
5699 [EXIT_REASON_EPT_VIOLATION
] = handle_ept_violation
,
5700 [EXIT_REASON_EPT_MISCONFIG
] = handle_ept_misconfig
,
5701 [EXIT_REASON_PAUSE_INSTRUCTION
] = handle_pause
,
5702 [EXIT_REASON_MWAIT_INSTRUCTION
] = handle_mwait
,
5703 [EXIT_REASON_MONITOR_TRAP_FLAG
] = handle_monitor_trap
,
5704 [EXIT_REASON_MONITOR_INSTRUCTION
] = handle_monitor
,
5705 [EXIT_REASON_INVEPT
] = handle_vmx_instruction
,
5706 [EXIT_REASON_INVVPID
] = handle_vmx_instruction
,
5707 [EXIT_REASON_RDRAND
] = handle_invalid_op
,
5708 [EXIT_REASON_RDSEED
] = handle_invalid_op
,
5709 [EXIT_REASON_PML_FULL
] = handle_pml_full
,
5710 [EXIT_REASON_INVPCID
] = handle_invpcid
,
5711 [EXIT_REASON_VMFUNC
] = handle_vmx_instruction
,
5712 [EXIT_REASON_PREEMPTION_TIMER
] = handle_preemption_timer
,
5713 [EXIT_REASON_ENCLS
] = handle_encls
,
5714 [EXIT_REASON_BUS_LOCK
] = handle_bus_lock_vmexit
,
5717 static const int kvm_vmx_max_exit_handlers
=
5718 ARRAY_SIZE(kvm_vmx_exit_handlers
);
5720 static void vmx_get_exit_info(struct kvm_vcpu
*vcpu
, u64
*info1
, u64
*info2
,
5721 u32
*intr_info
, u32
*error_code
)
5723 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5725 *info1
= vmx_get_exit_qual(vcpu
);
5726 if (!(vmx
->exit_reason
.failed_vmentry
)) {
5727 *info2
= vmx
->idt_vectoring_info
;
5728 *intr_info
= vmx_get_intr_info(vcpu
);
5729 if (is_exception_with_error_code(*intr_info
))
5730 *error_code
= vmcs_read32(VM_EXIT_INTR_ERROR_CODE
);
5740 static void vmx_destroy_pml_buffer(struct vcpu_vmx
*vmx
)
5743 __free_page(vmx
->pml_pg
);
5748 static void vmx_flush_pml_buffer(struct kvm_vcpu
*vcpu
)
5750 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5754 pml_idx
= vmcs_read16(GUEST_PML_INDEX
);
5756 /* Do nothing if PML buffer is empty */
5757 if (pml_idx
== (PML_ENTITY_NUM
- 1))
5760 /* PML index always points to next available PML buffer entity */
5761 if (pml_idx
>= PML_ENTITY_NUM
)
5766 pml_buf
= page_address(vmx
->pml_pg
);
5767 for (; pml_idx
< PML_ENTITY_NUM
; pml_idx
++) {
5770 gpa
= pml_buf
[pml_idx
];
5771 WARN_ON(gpa
& (PAGE_SIZE
- 1));
5772 kvm_vcpu_mark_page_dirty(vcpu
, gpa
>> PAGE_SHIFT
);
5775 /* reset PML index */
5776 vmcs_write16(GUEST_PML_INDEX
, PML_ENTITY_NUM
- 1);
5780 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
5781 * Called before reporting dirty_bitmap to userspace.
5783 static void kvm_flush_pml_buffers(struct kvm
*kvm
)
5786 struct kvm_vcpu
*vcpu
;
5788 * We only need to kick vcpu out of guest mode here, as PML buffer
5789 * is flushed at beginning of all VMEXITs, and it's obvious that only
5790 * vcpus running in guest are possible to have unflushed GPAs in PML
5793 kvm_for_each_vcpu(i
, vcpu
, kvm
)
5794 kvm_vcpu_kick(vcpu
);
5797 static void vmx_dump_sel(char *name
, uint32_t sel
)
5799 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
5800 name
, vmcs_read16(sel
),
5801 vmcs_read32(sel
+ GUEST_ES_AR_BYTES
- GUEST_ES_SELECTOR
),
5802 vmcs_read32(sel
+ GUEST_ES_LIMIT
- GUEST_ES_SELECTOR
),
5803 vmcs_readl(sel
+ GUEST_ES_BASE
- GUEST_ES_SELECTOR
));
5806 static void vmx_dump_dtsel(char *name
, uint32_t limit
)
5808 pr_err("%s limit=0x%08x, base=0x%016lx\n",
5809 name
, vmcs_read32(limit
),
5810 vmcs_readl(limit
+ GUEST_GDTR_BASE
- GUEST_GDTR_LIMIT
));
5813 void dump_vmcs(void)
5815 u32 vmentry_ctl
, vmexit_ctl
;
5816 u32 cpu_based_exec_ctrl
, pin_based_exec_ctrl
, secondary_exec_control
;
5820 if (!dump_invalid_vmcs
) {
5821 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
5825 vmentry_ctl
= vmcs_read32(VM_ENTRY_CONTROLS
);
5826 vmexit_ctl
= vmcs_read32(VM_EXIT_CONTROLS
);
5827 cpu_based_exec_ctrl
= vmcs_read32(CPU_BASED_VM_EXEC_CONTROL
);
5828 pin_based_exec_ctrl
= vmcs_read32(PIN_BASED_VM_EXEC_CONTROL
);
5829 cr4
= vmcs_readl(GUEST_CR4
);
5830 efer
= vmcs_read64(GUEST_IA32_EFER
);
5831 secondary_exec_control
= 0;
5832 if (cpu_has_secondary_exec_ctrls())
5833 secondary_exec_control
= vmcs_read32(SECONDARY_VM_EXEC_CONTROL
);
5835 pr_err("*** Guest State ***\n");
5836 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5837 vmcs_readl(GUEST_CR0
), vmcs_readl(CR0_READ_SHADOW
),
5838 vmcs_readl(CR0_GUEST_HOST_MASK
));
5839 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5840 cr4
, vmcs_readl(CR4_READ_SHADOW
), vmcs_readl(CR4_GUEST_HOST_MASK
));
5841 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3
));
5842 if ((secondary_exec_control
& SECONDARY_EXEC_ENABLE_EPT
) &&
5843 (cr4
& X86_CR4_PAE
) && !(efer
& EFER_LMA
))
5845 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
5846 vmcs_read64(GUEST_PDPTR0
), vmcs_read64(GUEST_PDPTR1
));
5847 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
5848 vmcs_read64(GUEST_PDPTR2
), vmcs_read64(GUEST_PDPTR3
));
5850 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
5851 vmcs_readl(GUEST_RSP
), vmcs_readl(GUEST_RIP
));
5852 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
5853 vmcs_readl(GUEST_RFLAGS
), vmcs_readl(GUEST_DR7
));
5854 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5855 vmcs_readl(GUEST_SYSENTER_ESP
),
5856 vmcs_read32(GUEST_SYSENTER_CS
), vmcs_readl(GUEST_SYSENTER_EIP
));
5857 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR
);
5858 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR
);
5859 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR
);
5860 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR
);
5861 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR
);
5862 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR
);
5863 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT
);
5864 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR
);
5865 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT
);
5866 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR
);
5867 if ((vmexit_ctl
& (VM_EXIT_SAVE_IA32_PAT
| VM_EXIT_SAVE_IA32_EFER
)) ||
5868 (vmentry_ctl
& (VM_ENTRY_LOAD_IA32_PAT
| VM_ENTRY_LOAD_IA32_EFER
)))
5869 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
5870 efer
, vmcs_read64(GUEST_IA32_PAT
));
5871 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
5872 vmcs_read64(GUEST_IA32_DEBUGCTL
),
5873 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS
));
5874 if (cpu_has_load_perf_global_ctrl() &&
5875 vmentry_ctl
& VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL
)
5876 pr_err("PerfGlobCtl = 0x%016llx\n",
5877 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL
));
5878 if (vmentry_ctl
& VM_ENTRY_LOAD_BNDCFGS
)
5879 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS
));
5880 pr_err("Interruptibility = %08x ActivityState = %08x\n",
5881 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
),
5882 vmcs_read32(GUEST_ACTIVITY_STATE
));
5883 if (secondary_exec_control
& SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY
)
5884 pr_err("InterruptStatus = %04x\n",
5885 vmcs_read16(GUEST_INTR_STATUS
));
5887 pr_err("*** Host State ***\n");
5888 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
5889 vmcs_readl(HOST_RIP
), vmcs_readl(HOST_RSP
));
5890 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
5891 vmcs_read16(HOST_CS_SELECTOR
), vmcs_read16(HOST_SS_SELECTOR
),
5892 vmcs_read16(HOST_DS_SELECTOR
), vmcs_read16(HOST_ES_SELECTOR
),
5893 vmcs_read16(HOST_FS_SELECTOR
), vmcs_read16(HOST_GS_SELECTOR
),
5894 vmcs_read16(HOST_TR_SELECTOR
));
5895 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
5896 vmcs_readl(HOST_FS_BASE
), vmcs_readl(HOST_GS_BASE
),
5897 vmcs_readl(HOST_TR_BASE
));
5898 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
5899 vmcs_readl(HOST_GDTR_BASE
), vmcs_readl(HOST_IDTR_BASE
));
5900 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
5901 vmcs_readl(HOST_CR0
), vmcs_readl(HOST_CR3
),
5902 vmcs_readl(HOST_CR4
));
5903 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5904 vmcs_readl(HOST_IA32_SYSENTER_ESP
),
5905 vmcs_read32(HOST_IA32_SYSENTER_CS
),
5906 vmcs_readl(HOST_IA32_SYSENTER_EIP
));
5907 if (vmexit_ctl
& (VM_EXIT_LOAD_IA32_PAT
| VM_EXIT_LOAD_IA32_EFER
))
5908 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
5909 vmcs_read64(HOST_IA32_EFER
),
5910 vmcs_read64(HOST_IA32_PAT
));
5911 if (cpu_has_load_perf_global_ctrl() &&
5912 vmexit_ctl
& VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL
)
5913 pr_err("PerfGlobCtl = 0x%016llx\n",
5914 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL
));
5916 pr_err("*** Control State ***\n");
5917 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
5918 pin_based_exec_ctrl
, cpu_based_exec_ctrl
, secondary_exec_control
);
5919 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl
, vmexit_ctl
);
5920 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
5921 vmcs_read32(EXCEPTION_BITMAP
),
5922 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK
),
5923 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH
));
5924 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
5925 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD
),
5926 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE
),
5927 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN
));
5928 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
5929 vmcs_read32(VM_EXIT_INTR_INFO
),
5930 vmcs_read32(VM_EXIT_INTR_ERROR_CODE
),
5931 vmcs_read32(VM_EXIT_INSTRUCTION_LEN
));
5932 pr_err(" reason=%08x qualification=%016lx\n",
5933 vmcs_read32(VM_EXIT_REASON
), vmcs_readl(EXIT_QUALIFICATION
));
5934 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
5935 vmcs_read32(IDT_VECTORING_INFO_FIELD
),
5936 vmcs_read32(IDT_VECTORING_ERROR_CODE
));
5937 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET
));
5938 if (secondary_exec_control
& SECONDARY_EXEC_TSC_SCALING
)
5939 pr_err("TSC Multiplier = 0x%016llx\n",
5940 vmcs_read64(TSC_MULTIPLIER
));
5941 if (cpu_based_exec_ctrl
& CPU_BASED_TPR_SHADOW
) {
5942 if (secondary_exec_control
& SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY
) {
5943 u16 status
= vmcs_read16(GUEST_INTR_STATUS
);
5944 pr_err("SVI|RVI = %02x|%02x ", status
>> 8, status
& 0xff);
5946 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD
));
5947 if (secondary_exec_control
& SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES
)
5948 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR
));
5949 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR
));
5951 if (pin_based_exec_ctrl
& PIN_BASED_POSTED_INTR
)
5952 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV
));
5953 if ((secondary_exec_control
& SECONDARY_EXEC_ENABLE_EPT
))
5954 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER
));
5955 if (secondary_exec_control
& SECONDARY_EXEC_PAUSE_LOOP_EXITING
)
5956 pr_err("PLE Gap=%08x Window=%08x\n",
5957 vmcs_read32(PLE_GAP
), vmcs_read32(PLE_WINDOW
));
5958 if (secondary_exec_control
& SECONDARY_EXEC_ENABLE_VPID
)
5959 pr_err("Virtual processor ID = 0x%04x\n",
5960 vmcs_read16(VIRTUAL_PROCESSOR_ID
));
5964 * The guest has exited. See if we can fix it or if we need userspace
5967 static int __vmx_handle_exit(struct kvm_vcpu
*vcpu
, fastpath_t exit_fastpath
)
5969 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5970 union vmx_exit_reason exit_reason
= vmx
->exit_reason
;
5971 u32 vectoring_info
= vmx
->idt_vectoring_info
;
5972 u16 exit_handler_index
;
5975 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
5976 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
5977 * querying dirty_bitmap, we only need to kick all vcpus out of guest
5978 * mode as if vcpus is in root mode, the PML buffer must has been
5982 vmx_flush_pml_buffer(vcpu
);
5985 * We should never reach this point with a pending nested VM-Enter, and
5986 * more specifically emulation of L2 due to invalid guest state (see
5987 * below) should never happen as that means we incorrectly allowed a
5988 * nested VM-Enter with an invalid vmcs12.
5990 WARN_ON_ONCE(vmx
->nested
.nested_run_pending
);
5992 /* If guest state is invalid, start emulating */
5993 if (vmx
->emulation_required
)
5994 return handle_invalid_guest_state(vcpu
);
5996 if (is_guest_mode(vcpu
)) {
5998 * The host physical addresses of some pages of guest memory
5999 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6000 * Page). The CPU may write to these pages via their host
6001 * physical address while L2 is running, bypassing any
6002 * address-translation-based dirty tracking (e.g. EPT write
6005 * Mark them dirty on every exit from L2 to prevent them from
6006 * getting out of sync with dirty tracking.
6008 nested_mark_vmcs12_pages_dirty(vcpu
);
6010 if (nested_vmx_reflect_vmexit(vcpu
))
6014 if (exit_reason
.failed_vmentry
) {
6016 vcpu
->run
->exit_reason
= KVM_EXIT_FAIL_ENTRY
;
6017 vcpu
->run
->fail_entry
.hardware_entry_failure_reason
6019 vcpu
->run
->fail_entry
.cpu
= vcpu
->arch
.last_vmentry_cpu
;
6023 if (unlikely(vmx
->fail
)) {
6025 vcpu
->run
->exit_reason
= KVM_EXIT_FAIL_ENTRY
;
6026 vcpu
->run
->fail_entry
.hardware_entry_failure_reason
6027 = vmcs_read32(VM_INSTRUCTION_ERROR
);
6028 vcpu
->run
->fail_entry
.cpu
= vcpu
->arch
.last_vmentry_cpu
;
6034 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
6035 * delivery event since it indicates guest is accessing MMIO.
6036 * The vm-exit can be triggered again after return to guest that
6037 * will cause infinite loop.
6039 if ((vectoring_info
& VECTORING_INFO_VALID_MASK
) &&
6040 (exit_reason
.basic
!= EXIT_REASON_EXCEPTION_NMI
&&
6041 exit_reason
.basic
!= EXIT_REASON_EPT_VIOLATION
&&
6042 exit_reason
.basic
!= EXIT_REASON_PML_FULL
&&
6043 exit_reason
.basic
!= EXIT_REASON_APIC_ACCESS
&&
6044 exit_reason
.basic
!= EXIT_REASON_TASK_SWITCH
)) {
6045 vcpu
->run
->exit_reason
= KVM_EXIT_INTERNAL_ERROR
;
6046 vcpu
->run
->internal
.suberror
= KVM_INTERNAL_ERROR_DELIVERY_EV
;
6047 vcpu
->run
->internal
.ndata
= 3;
6048 vcpu
->run
->internal
.data
[0] = vectoring_info
;
6049 vcpu
->run
->internal
.data
[1] = exit_reason
.full
;
6050 vcpu
->run
->internal
.data
[2] = vcpu
->arch
.exit_qualification
;
6051 if (exit_reason
.basic
== EXIT_REASON_EPT_MISCONFIG
) {
6052 vcpu
->run
->internal
.ndata
++;
6053 vcpu
->run
->internal
.data
[3] =
6054 vmcs_read64(GUEST_PHYSICAL_ADDRESS
);
6056 vcpu
->run
->internal
.data
[vcpu
->run
->internal
.ndata
++] =
6057 vcpu
->arch
.last_vmentry_cpu
;
6061 if (unlikely(!enable_vnmi
&&
6062 vmx
->loaded_vmcs
->soft_vnmi_blocked
)) {
6063 if (!vmx_interrupt_blocked(vcpu
)) {
6064 vmx
->loaded_vmcs
->soft_vnmi_blocked
= 0;
6065 } else if (vmx
->loaded_vmcs
->vnmi_blocked_time
> 1000000000LL &&
6066 vcpu
->arch
.nmi_pending
) {
6068 * This CPU don't support us in finding the end of an
6069 * NMI-blocked window if the guest runs with IRQs
6070 * disabled. So we pull the trigger after 1 s of
6071 * futile waiting, but inform the user about this.
6073 printk(KERN_WARNING
"%s: Breaking out of NMI-blocked "
6074 "state on VCPU %d after 1 s timeout\n",
6075 __func__
, vcpu
->vcpu_id
);
6076 vmx
->loaded_vmcs
->soft_vnmi_blocked
= 0;
6080 if (exit_fastpath
!= EXIT_FASTPATH_NONE
)
6083 if (exit_reason
.basic
>= kvm_vmx_max_exit_handlers
)
6084 goto unexpected_vmexit
;
6085 #ifdef CONFIG_RETPOLINE
6086 if (exit_reason
.basic
== EXIT_REASON_MSR_WRITE
)
6087 return kvm_emulate_wrmsr(vcpu
);
6088 else if (exit_reason
.basic
== EXIT_REASON_PREEMPTION_TIMER
)
6089 return handle_preemption_timer(vcpu
);
6090 else if (exit_reason
.basic
== EXIT_REASON_INTERRUPT_WINDOW
)
6091 return handle_interrupt_window(vcpu
);
6092 else if (exit_reason
.basic
== EXIT_REASON_EXTERNAL_INTERRUPT
)
6093 return handle_external_interrupt(vcpu
);
6094 else if (exit_reason
.basic
== EXIT_REASON_HLT
)
6095 return kvm_emulate_halt(vcpu
);
6096 else if (exit_reason
.basic
== EXIT_REASON_EPT_MISCONFIG
)
6097 return handle_ept_misconfig(vcpu
);
6100 exit_handler_index
= array_index_nospec((u16
)exit_reason
.basic
,
6101 kvm_vmx_max_exit_handlers
);
6102 if (!kvm_vmx_exit_handlers
[exit_handler_index
])
6103 goto unexpected_vmexit
;
6105 return kvm_vmx_exit_handlers
[exit_handler_index
](vcpu
);
6108 vcpu_unimpl(vcpu
, "vmx: unexpected exit reason 0x%x\n",
6111 vcpu
->run
->exit_reason
= KVM_EXIT_INTERNAL_ERROR
;
6112 vcpu
->run
->internal
.suberror
=
6113 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON
;
6114 vcpu
->run
->internal
.ndata
= 2;
6115 vcpu
->run
->internal
.data
[0] = exit_reason
.full
;
6116 vcpu
->run
->internal
.data
[1] = vcpu
->arch
.last_vmentry_cpu
;
static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
	int ret = __vmx_handle_exit(vcpu, exit_fastpath);

	/*
	 * Even when current exit reason is handled by KVM internally, we
	 * still need to exit to user space when bus lock detected to inform
	 * that there is a bus lock in guest.
	 */
	if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
		if (ret > 0)
			vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;

		vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
		return 0;
	}
	return ret;
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
 * flush it is required to read in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information but as all relevant affected CPUs have 32KiB L1D cache size
 * there is no point in doing so.
 */
static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		bool flush_l1d;

		/*
		 * Clear the per-vcpu flush bit, it gets set again
		 * either from vcpu_run() or from one of the unsafe
		 * VMEXIT handlers.
		 */
		flush_l1d = vcpu->arch.l1tf_flush_l1d;
		vcpu->arch.l1tf_flush_l1d = false;

		/*
		 * Clear the per-cpu flush bit, it gets set again from
		 * the interrupt handlers.
		 */
		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
		kvm_clear_cpu_l1tf_flush_l1d();

		if (!flush_l1d)
			return;
	}

	vcpu->stat.l1d_flush++;

	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl	%%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl	$4096, %%eax\n\t"
		"cmpl	%%eax, %[size]\n\t"
		"jne	.Lpopulate_tlb\n\t"
		"xorl	%%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl	%%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl	$64, %%eax\n\t"
		"cmpl	%%eax, %[size]\n\t"
		"jne	.Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}

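/*
 * Update the TPR threshold so the guest only exits on CR8 writes that drop
 * the priority below the highest pending interrupt (irr).  When L2 is
 * running and vmcs12 does not provide its own TPR shadow, the value is
 * stashed as L1's threshold instead of being written to the current VMCS.
 */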
static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	int tpr_threshold;

	if (is_guest_mode(vcpu) &&
	    nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
	if (is_guest_mode(vcpu))
		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
	else
		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}

void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 sec_exec_control;

	if (!lapic_in_kernel(vcpu))
		return;

	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;

	/* Postpone execution until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
		vmx->nested.change_vmcs01_virtual_apic_mode = true;
		return;
	}

	sec_exec_control = secondary_exec_controls_get(vmx);
	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);

	switch (kvm_get_apic_mode(vcpu)) {
	case LAPIC_MODE_INVALID:
		WARN_ONCE(true, "Invalid local APIC state");
	case LAPIC_MODE_DISABLED:
		break;
	case LAPIC_MODE_XAPIC:
		if (flexpriority_enabled) {
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
			kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

			/*
			 * Flush the TLB, reloading the APIC access page will
			 * only do so if its physical address has changed, but
			 * the guest may have inserted a non-APIC mapping into
			 * the TLB while the APIC access page was disabled.
			 */
			kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
		}
		break;
	case LAPIC_MODE_X2APIC:
		if (cpu_has_vmx_virtualize_x2apic_mode())
			sec_exec_control |=
				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		break;
	}
	secondary_exec_controls_set(vmx, sec_exec_control);

	vmx_update_msr_bitmap(vcpu);
}

static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
	struct page *page;

	/* Defer reload until vmcs01 is the current VMCS. */
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
		return;
	}

	if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
	    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return;

	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
	if (is_error_page(page))
		return;

	vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
	vmx_flush_tlb_current(vcpu);

	/*
	 * Do not pin apic access page in memory, the MMU notifier
	 * will call us again if it is migrated or swapped out.
	 */
	put_page(page);
}

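/*
 * Mirror the highest in-service vector from the in-kernel APIC into the
 * SVI byte of the guest interrupt status used by virtual-interrupt delivery.
 */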
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
	u16 status;
	u8 old;

	if (max_isr == -1)
		max_isr = 0;

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = status >> 8;
	if (max_isr != old) {
		status &= 0xff;
		status |= max_isr << 8;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

static void vmx_set_rvi(int vector)
{
	u16 status;
	u8 old;

	if (vector == -1)
		vector = 0;

	status = vmcs_read16(GUEST_INTR_STATUS);
	old = (u8)status & 0xff;
	if ((u8)vector != old) {
		status &= ~0xff;
		status |= (u8)vector;
		vmcs_write16(GUEST_INTR_STATUS, status);
	}
}

static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
	/*
	 * When running L2, updating RVI is only relevant when
	 * vmcs12 virtual-interrupt-delivery enabled.
	 * However, it can be enabled only when L1 also
	 * intercepts external-interrupts and in that case
	 * we should not update vmcs02 RVI but instead intercept
	 * interrupt. Therefore, do nothing when running L2.
	 */
	if (!is_guest_mode(vcpu))
		vmx_set_rvi(max_irr);
}

static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	bool max_irr_updated;

	WARN_ON(!vcpu->arch.apicv_active);
	if (pi_test_on(&vmx->pi_desc)) {
		pi_clear_on(&vmx->pi_desc);
		/*
		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
		 * But on x86 this is just a compiler barrier anyway.
		 */
		smp_mb__after_atomic();
		max_irr_updated =
			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);

		/*
		 * If we are running L2 and L1 has a new pending interrupt
		 * which can be injected, we should re-evaluate
		 * what should be done with this new L1 interrupt.
		 * If L1 intercepts external-interrupts, we should
		 * exit from L2 to L1. Otherwise, interrupt should be
		 * delivered directly to L2.
		 */
		if (is_guest_mode(vcpu) && max_irr_updated) {
			if (nested_exit_on_intr(vcpu))
				kvm_vcpu_exiting_guest_mode(vcpu);
			else
				kvm_make_request(KVM_REQ_EVENT, vcpu);
		}
	} else {
		max_irr = kvm_lapic_find_highest_irr(vcpu);
	}
	vmx_hwapic_irr_update(vcpu, max_irr);
	return max_irr;
}

static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
	if (!kvm_vcpu_apicv_active(vcpu))
		return;

	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}

static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	pi_clear_on(&vmx->pi_desc);
	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}

void vmx_do_interrupt_nmi_irqoff(unsigned long entry);

static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
	gate_desc *desc = (gate_desc *)host_idt_base + vector;

	kvm_before_interrupt(vcpu);
	vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
	kvm_after_interrupt(vcpu);
}

static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
	u32 intr_info = vmx_get_intr_info(&vmx->vcpu);

	/* if exit due to PF check for async PF */
	if (is_page_fault(intr_info))
		vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
	/* Handle machine checks before interrupts are enabled */
	else if (is_machine_check(intr_info))
		kvm_machine_check();
	/* We need to handle NMIs before interrupts are enabled */
	else if (is_nmi(intr_info))
		handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
}

static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
	u32 intr_info = vmx_get_intr_info(vcpu);

	if (WARN_ONCE(!is_external_intr(intr_info),
	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
		return;

	handle_interrupt_nmi_irqoff(vcpu, intr_info);
}

static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
		handle_external_interrupt_irqoff(vcpu);
	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
		handle_exception_nmi_irqoff(vmx);
}

/*
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation). Be sure to check the kvm parameter before using it.
 */
static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
	switch (index) {
	case MSR_IA32_SMBASE:
		/*
		 * We cannot do SMM unless we can run the guest in big
		 * real mode.
		 */
		return enable_unrestricted_guest || emulate_invalid_guest_state;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		return nested;
	case MSR_AMD64_VIRT_SPEC_CTRL:
		/* This is AMD only.  */
		return false;
	default:
		return true;
	}
}

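/*
 * Recompute NMI blocking state after a VM-exit.  With hardware vNMI the
 * "blocked by NMI" bit is re-set when an IRET faulted during NMI delivery;
 * without it, the time spent in the software-blocked window is accumulated.
 */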
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
	u32 exit_intr_info;
	bool unblock_nmi;
	u8 vector;
	bool idtv_info_valid;

	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	if (enable_vnmi) {
		if (vmx->loaded_vmcs->nmi_known_unmasked)
			return;

		exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
		 * a guest IRET fault.
		 * SDM 3: 23.2.2 (September 2008)
		 * Bit 12 is undefined in any of the following cases:
		 *  If the VM exit sets the valid bit in the IDT-vectoring
		 *   information field.
		 *  If the VM exit is due to a double fault.
		 */
		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
		    vector != DF_VECTOR && !idtv_info_valid)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmx->loaded_vmcs->nmi_known_unmasked =
				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
				  & GUEST_INTR_STATE_NMI);
	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->vnmi_blocked_time +=
			ktime_to_ns(ktime_sub(ktime_get(),
					      vmx->loaded_vmcs->entry_time));
}

static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
				      u32 idt_vectoring_info,
				      int instr_len_field,
				      int error_code_field)
{
	u8 vector;
	int type;
	bool idtv_info_valid;

	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!idtv_info_valid)
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

	switch (type) {
	case INTR_TYPE_NMI_INTR:
		vcpu->arch.nmi_injected = true;
		/*
		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if a NMI
		 * delivery faulted.
		 */
		vmx_set_nmi_mask(vcpu, false);
		break;
	case INTR_TYPE_SOFT_EXCEPTION:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_HARD_EXCEPTION:
		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
			u32 err = vmcs_read32(error_code_field);
			kvm_requeue_exception_e(vcpu, vector, err);
		} else
			kvm_requeue_exception(vcpu, vector);
		break;
	case INTR_TYPE_SOFT_INTR:
		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
		fallthrough;
	case INTR_TYPE_EXT_INTR:
		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
		break;
	default:
		break;
	}
}

static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
				  VM_EXIT_INSTRUCTION_LEN,
				  IDT_VECTORING_ERROR_CODE);
}

static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
	__vmx_complete_interrupts(vcpu,
				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
				  VM_ENTRY_INSTRUCTION_LEN,
				  VM_ENTRY_EXCEPTION_ERROR_CODE);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}

static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
{
	int i, nr_msrs;
	struct perf_guest_switch_msr *msrs;

	msrs = perf_guest_get_msrs(&nr_msrs);

	if (!msrs)
		return;

	for (i = 0; i < nr_msrs; i++)
		if (msrs[i].host == msrs[i].guest)
			clear_atomic_switch_msr(vmx, msrs[i].msr);
		else
			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
					msrs[i].host, false);
}

static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 tscl;
	u32 delta_tsc;

	if (vmx->req_immediate_exit) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (vmx->hv_deadline_tsc != -1) {
		tscl = rdtsc();
		if (vmx->hv_deadline_tsc > tscl)
			/* set_hv_timer ensures the delta fits in 32-bits */
			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
				cpu_preemption_timer_multi);
		else
			delta_tsc = 0;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
	}
}

void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
		vmx->loaded_vmcs->host_state.rsp = host_rsp;
		vmcs_writel(HOST_RSP, host_rsp);
	}
}

static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	switch (to_vmx(vcpu)->exit_reason.basic) {
	case EXIT_REASON_MSR_WRITE:
		return handle_fastpath_set_msr_irqoff(vcpu);
	case EXIT_REASON_PREEMPTION_TIMER:
		return handle_fastpath_preemption_timer(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}

static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
					struct vcpu_vmx *vmx)
{
	/*
	 * VMENTER enables interrupts (host state), but the kernel state is
	 * interrupts disabled when this is invoked. Also tell RCU about
	 * it. This is the same logic as for exit_to_user_mode().
	 *
	 * This ensures that e.g. latency analysis on the host observes
	 * guest mode as interrupt enabled.
	 *
	 * guest_enter_irqoff() informs context tracking about the
	 * transition to guest mode and if enabled adjusts RCU state
	 * accordingly.
	 */
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare(CALLER_ADDR0);
	instrumentation_end();

	guest_enter_irqoff();
	lockdep_hardirqs_on(CALLER_ADDR0);

	/* L1D Flush includes CPU buffer clear to mitigate MDS */
	if (static_branch_unlikely(&vmx_l1d_should_flush))
		vmx_l1d_flush(vcpu);
	else if (static_branch_unlikely(&mds_user_clear))
		mds_clear_cpu_buffers();

	if (vcpu->arch.cr2 != native_read_cr2())
		native_write_cr2(vcpu->arch.cr2);

	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				   vmx->loaded_vmcs->launched);

	vcpu->arch.cr2 = native_read_cr2();

	/*
	 * VMEXIT disables interrupts (host state), but tracing and lockdep
	 * have them in state 'on' as recorded before entering guest mode.
	 * Same as enter_from_user_mode().
	 *
	 * guest_exit_irqoff() restores host context and reinstates RCU if
	 * enabled and required.
	 *
	 * This needs to be done before the below as native_read_msr()
	 * contains a tracepoint and x86_spec_ctrl_restore_host() calls
	 * into world and some more.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	guest_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;

	/* Record the guest's net vcpu time for enforced NMI injections. */
	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked))
		vmx->loaded_vmcs->entry_time = ktime_get();

	/* Don't enter VMX if guest state is invalid, let the exit handler
	   start emulation until we arrive back to a valid state */
	if (vmx->emulation_required)
		return EXIT_FASTPATH_NONE;

	trace_kvm_entry(vcpu);

	if (vmx->ple_window_dirty) {
		vmx->ple_window_dirty = false;
		vmcs_write32(PLE_WINDOW, vmx->ple_window);
	}

	/*
	 * We did this in prepare_switch_to_guest, because it needs to
	 * be within srcu_read_lock.
	 */
	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);

	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/* When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions being set, but that's not correct for the guest debugging
	 * case. */
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		vmx_set_interrupt_shadow(vcpu, 0);

	kvm_load_guest_xsave_state(vcpu);

	pt_guest_enter(vmx);

	atomic_switch_perf_msrs(vmx);
	if (intel_pmu_lbr_is_enabled(vcpu))
		vmx_passthrough_lbr_msrs(vcpu);

	if (enable_preemption_timer)
		vmx_update_hv_timer(vcpu);

	kvm_wait_lapic_expire(vcpu);

	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);

	/* The actual VMENTER/EXIT is in the .noinstr.text section. */
	vmx_vcpu_enter_exit(vcpu, vmx);

	/*
	 * We do not use IBRS in the kernel. If this vCPU has used the
	 * SPEC_CTRL MSR it may have left it on; save the value and
	 * turn it off. This is much more efficient than blindly adding
	 * it to the atomic save/restore list. Especially as the former
	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
	 *
	 * For non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);

	/* All fields are clean at this point */
	if (static_branch_unlikely(&enable_evmcs)) {
		current_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

		current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
	}

	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
	if (vmx->host_debugctlmsr)
		update_debugctlmsr(vmx->host_debugctlmsr);

#ifndef CONFIG_X86_64
	/*
	 * The sysexit path does not restore ds/es, so we must set them to
	 * a reasonable value ourselves.
	 *
	 * We can't defer this to vmx_prepare_switch_to_host() since that
	 * function may be executed in interrupt context, which saves and
	 * restores segments around it, nullifying its effect.
	 */
	loadsegment(ds, __USER_DS);
	loadsegment(es, __USER_DS);
#endif

	vmx_register_cache_reset(vcpu);

	pt_guest_exit(vmx);

	kvm_load_host_xsave_state(vcpu);

	vmx->nested.nested_run_pending = 0;
	vmx->idt_vectoring_info = 0;

	if (unlikely(vmx->fail)) {
		vmx->exit_reason.full = 0xdead;
		return EXIT_FASTPATH_NONE;
	}

	vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
	if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
		kvm_machine_check();

	trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);

	if (unlikely(vmx->exit_reason.failed_vmentry))
		return EXIT_FASTPATH_NONE;

	vmx->loaded_vmcs->launched = 1;
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

	vmx_recover_nmi_blocking(vmx);
	vmx_complete_interrupts(vmx);

	if (is_guest_mode(vcpu))
		return EXIT_FASTPATH_NONE;

	return vmx_exit_handlers_fastpath(vcpu);
}

static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (enable_pml)
		vmx_destroy_pml_buffer(vmx);
	free_vpid(vmx->vpid);
	nested_vmx_free_vcpu(vcpu);
	free_loaded_vmcs(vmx->loaded_vmcs);
}

static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx;
	int i, cpu, err;

	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
	vmx = to_vmx(vcpu);

	err = -ENOMEM;

	vmx->vpid = allocate_vpid();

	/*
	 * If PML is turned on, failure on enabling PML just results in failure
	 * of creating the vcpu, therefore we can simplify PML logic (by
	 * avoiding dealing with cases, such as enabling PML partially on vcpus
	 * for the guest), etc.
	 */
	if (enable_pml) {
		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmx->pml_pg)
			goto free_vpid;
	}

	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);

	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
		u32 index = vmx_uret_msrs_list[i];
		u32 data_low, data_high;
		int j = vmx->nr_uret_msrs;

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;

		vmx->guest_uret_msrs[j].slot = i;
		vmx->guest_uret_msrs[j].data = 0;
		switch (index) {
		case MSR_IA32_TSX_CTRL:
			/*
			 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
			 * interception.  Keep the host value unchanged to avoid
			 * changing CPUID bits under the host kernel's feet.
			 *
			 * hle=0, rtm=0, tsx_ctrl=1 can be found with some
			 * combinations of new kernel and old userspace.  If
			 * those guests run on a tsx=off host, do allow guests
			 * to use TSX_CTRL, but do not change the value on the
			 * host so that TSX remains always disabled.
			 */
			if (boot_cpu_has(X86_FEATURE_RTM))
				vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
			else
				vmx->guest_uret_msrs[j].mask = 0;
			break;
		default:
			vmx->guest_uret_msrs[j].mask = -1ull;
			break;
		}
		++vmx->nr_uret_msrs;
	}

	err = alloc_loaded_vmcs(&vmx->vmcs01);
	if (err < 0)
		goto free_pml;

	/* The MSR bitmap starts with all ones */
	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
	if (kvm_cstate_in_guest(vcpu->kvm)) {
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
	}
	vmx->msr_bitmap_mode = 0;

	vmx->loaded_vmcs = &vmx->vmcs01;
	cpu = get_cpu();
	vmx_vcpu_load(vcpu, cpu);
	vcpu->cpu = cpu;
	init_vmcs(vmx);
	vmx_vcpu_put(vcpu);
	put_cpu();
	if (cpu_need_virtualize_apic_accesses(vcpu)) {
		err = alloc_apic_access_page(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (enable_ept && !enable_unrestricted_guest) {
		err = init_rmode_identity_map(vcpu->kvm);
		if (err)
			goto free_vmcs;
	}

	if (nested)
		memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
	else
		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));

	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;

	vcpu->arch.microcode_version = 0x100000000ULL;
	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;

	/*
	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
	 * or POSTED_INTR_WAKEUP_VECTOR.
	 */
	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
	vmx->pi_desc.sn = 1;

	vmx->ept_pointer = INVALID_PAGE;

	return 0;

free_vmcs:
	free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
	vmx_destroy_pml_buffer(vmx);
free_vpid:
	free_vpid(vmx->vpid);
	return err;
}

#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"

static int vmx_vm_init(struct kvm *kvm)
{
	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);

	if (!ple_gap)
		kvm->arch.pause_in_guest = true;

	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
		case L1TF_MITIGATION_FLUSH_NOWARN:
			/* 'I explicitly don't care' is set */
			break;
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
		case L1TF_MITIGATION_FULL:
			/*
			 * Warn upon starting the first VM in a potentially
			 * insecure environment.
			 */
			if (sched_smt_active())
				pr_warn_once(L1TF_MSG_SMT);
			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
				pr_warn_once(L1TF_MSG_L1D);
			break;
		case L1TF_MITIGATION_FULL_FORCE:
			/* Flush is enforced */
			break;
		}
	}
	kvm_apicv_init(kvm, enable_apicv);
	return 0;
}

static int __init vmx_check_processor_compat(void)
{
	struct vmcs_config vmcs_conf;
	struct vmx_capability vmx_cap;

	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
	    !this_cpu_has(X86_FEATURE_VMX)) {
		pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
		return -EIO;
	}

	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
		return -EIO;
	if (nested)
		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
				smp_processor_id());
		return -EIO;
	}
	return 0;
}

static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	u8 cache;
	u64 ipat = 0;

	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
	 * memory aliases with conflicting memory types and sometimes MCEs.
	 * We have to be careful as to what are honored and when.
	 *
	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
	 * UC.  The effective memory type is UC or WC depending on guest PAT.
	 * This was historically the source of MCEs and we want to be
	 * conservative.
	 *
	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
	 * EPT memory type is set to WB.  The effective memory type is forced
	 * WB.
	 *
	 * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
	 * EPT memory type is used to emulate guest CD/MTRR.
	 */

	if (is_mmio) {
		cache = MTRR_TYPE_UNCACHABLE;
		goto exit;
	}

	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
		ipat = VMX_EPT_IPAT_BIT;
		cache = MTRR_TYPE_WRBACK;
		goto exit;
	}

	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
		ipat = VMX_EPT_IPAT_BIT;
		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
			cache = MTRR_TYPE_WRBACK;
		else
			cache = MTRR_TYPE_UNCACHABLE;
		goto exit;
	}

	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);

exit:
	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}

static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
{
	/*
	 * These bits in the secondary execution controls field
	 * are dynamic, the others are mostly based on the hypervisor
	 * architecture and the guest's CPUID.  Do not touch the
	 * dynamic bits.
	 */
	u32 mask =
		SECONDARY_EXEC_SHADOW_VMCS |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC;

	u32 new_ctl = vmx->secondary_exec_control;
	u32 cur_ctl = secondary_exec_controls_get(vmx);

	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
 */
static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *entry;

	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;

#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
	if (entry && (entry->_reg & (_cpuid_mask)))			\
		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);		\
} while (0)

	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
	cr4_fixed1_update(X86_CR4_VME,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_PVI,        edx, feature_bit(VME));
	cr4_fixed1_update(X86_CR4_TSD,        edx, feature_bit(TSC));
	cr4_fixed1_update(X86_CR4_DE,         edx, feature_bit(DE));
	cr4_fixed1_update(X86_CR4_PSE,        edx, feature_bit(PSE));
	cr4_fixed1_update(X86_CR4_PAE,        edx, feature_bit(PAE));
	cr4_fixed1_update(X86_CR4_MCE,        edx, feature_bit(MCE));
	cr4_fixed1_update(X86_CR4_PGE,        edx, feature_bit(PGE));
	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, feature_bit(FXSR));
	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
	cr4_fixed1_update(X86_CR4_VMXE,       ecx, feature_bit(VMX));
	cr4_fixed1_update(X86_CR4_SMXE,       ecx, feature_bit(SMX));
	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, feature_bit(PCID));
	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, feature_bit(XSAVE));

	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, feature_bit(FSGSBASE));
	cr4_fixed1_update(X86_CR4_SMEP,       ebx, feature_bit(SMEP));
	cr4_fixed1_update(X86_CR4_SMAP,       ebx, feature_bit(SMAP));
	cr4_fixed1_update(X86_CR4_PKE,        ecx, feature_bit(PKU));
	cr4_fixed1_update(X86_CR4_UMIP,       ecx, feature_bit(UMIP));
	cr4_fixed1_update(X86_CR4_LA57,       ecx, feature_bit(LA57));

#undef cr4_fixed1_update
}

static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (kvm_mpx_supported()) {
		bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);

		if (mpx_enabled) {
			vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
			vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
		} else {
			vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
		}
	}
}

static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_cpuid_entry2 *best = NULL;
	int i;

	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		best = kvm_find_cpuid_entry(vcpu, 0x14, i);
		if (!best)
			return;
		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
	}

	/* Get the number of configurable Address Ranges for filtering */
	vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set, otherwise
	 * setting it will inject a #GP.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
	 * PSBFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn, BranchEn and
	 * MTCFreq can be set
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
				RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
							RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* unmask address range configure area */
	for (i = 0; i < vmx->pt_desc.addr_range; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}

static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
	vcpu->arch.xsaves_enabled = false;

	if (cpu_has_secondary_exec_ctrls()) {
		vmx_compute_secondary_exec_control(vmx);
		vmcs_set_secondary_exec_control(vmx);
	}

	if (nested_vmx_allowed(vcpu))
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
			  FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);

	if (nested_vmx_allowed(vcpu)) {
		nested_vmx_cr_fixed1_bits_update(vcpu);
		nested_vmx_entry_exit_ctls_update(vcpu);
	}

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);

	if (boot_cpu_has(X86_FEATURE_RTM)) {
		struct vmx_uret_msr *msr;
		msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
		if (msr) {
			bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
			vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
		}
	}

	set_cr4_guest_host_mask(vmx);

	/* Refresh #PF interception to account for MAXPHYADDR changes. */
	vmx_update_exception_bitmap(vcpu);
}

static __init void vmx_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

	/* CPUID 0x1 */
	if (nested)
		kvm_cpu_cap_set(X86_FEATURE_VMX);

	/* CPUID 0x7 */
	if (kvm_mpx_supported())
		kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
	if (cpu_has_vmx_invpcid())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
	if (vmx_pt_mode_is_host_guest())
		kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);

	if (vmx_umip_emulated())
		kvm_cpu_cap_set(X86_FEATURE_UMIP);

	/* CPUID 0xD.1 */
	supported_xss = 0;
	if (!cpu_has_vmx_xsaves())
		kvm_cpu_cap_clear(X86_FEATURE_XSAVES);

	/* CPUID 0x80000001 */
	if (!cpu_has_vmx_rdtscp())
		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);

	if (cpu_has_vmx_waitpkg())
		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}

static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
{
	to_vmx(vcpu)->req_immediate_exit = true;
}

static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
				  struct x86_instruction_info *info)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned short port;
	bool intercept;
	int size;

	if (info->intercept == x86_intercept_in ||
	    info->intercept == x86_intercept_ins) {
		port = info->src_val;
		size = info->dst_bytes;
	} else {
		port = info->dst_val;
		size = info->src_bytes;
	}

	/*
	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
	 * control.
	 *
	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
	 */
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		intercept = nested_cpu_has(vmcs12,
					   CPU_BASED_UNCOND_IO_EXITING);
	else
		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);

	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}

static int vmx_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	switch (info->intercept) {
	/*
	 * RDPID causes #UD if disabled through secondary execution controls.
	 * Because it is marked as EmulateOnUD, we need to intercept it here.
	 */
	case x86_intercept_rdtscp:
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
			exception->vector = UD_VECTOR;
			exception->error_code_valid = false;
			return X86EMUL_PROPAGATE_FAULT;
		}
		break;

	case x86_intercept_in:
	case x86_intercept_ins:
	case x86_intercept_out:
	case x86_intercept_outs:
		return vmx_check_intercept_io(vcpu, info);

	case x86_intercept_lgdt:
	case x86_intercept_lidt:
	case x86_intercept_lldt:
	case x86_intercept_ltr:
	case x86_intercept_sgdt:
	case x86_intercept_sidt:
	case x86_intercept_sldt:
	case x86_intercept_str:
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
			return X86EMUL_CONTINUE;

		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
		break;

	/* TODO: check more intercepts... */
	default:
		break;
	}

	return X86EMUL_UNHANDLEABLE;
}

#ifdef CONFIG_X86_64
/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
				  u64 divisor, u64 *result)
{
	u64 low = a << shift, high = a >> (64 - shift);

	/* To avoid the overflow on divq */
	if (high >= divisor)
		return 1;

	/* Low hold the result, high hold rem which is discarded */
	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
	    "rm" (divisor), "0" (low), "1" (high));
	*result = low;

	return 0;
}

static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
			    bool *expired)
{
	struct vcpu_vmx *vmx;
	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;

	vmx = to_vmx(vcpu);
	tscl = rdtsc();
	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
						    ktimer->timer_advance_ns);

	if (delta_tsc > lapic_timer_advance_cycles)
		delta_tsc -= lapic_timer_advance_cycles;
	else
		delta_tsc = 0;

	/* Convert to host delta tsc if tsc scaling is enabled */
	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
	    delta_tsc && u64_shl_div_u64(delta_tsc,
				kvm_tsc_scaling_ratio_frac_bits,
				vcpu->arch.tsc_scaling_ratio, &delta_tsc))
		return -ERANGE;

	/*
	 * If the delta tsc can't fit in the 32 bit after the multi shift,
	 * we can't use the preemption timer.
	 * It's possible that it fits on later vmentries, but checking
	 * on every vmentry is costly so we just use an hrtimer.
	 */
	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
		return -ERANGE;

	vmx->hv_deadline_tsc = tscl + delta_tsc;
	*expired = !delta_tsc;
	return 0;
}

static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
	to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif

static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);
}

static void vmx_slot_enable_log_dirty(struct kvm *kvm,
				      struct kvm_memory_slot *slot)
{
	if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}

static void vmx_slot_disable_log_dirty(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_set_dirty(kvm, slot);
}

static void vmx_flush_log_dirty(struct kvm *kvm)
{
	kvm_flush_pml_buffers(kvm);
}

static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
					   struct kvm_memory_slot *memslot,
					   gfn_t offset, unsigned long mask)
{
	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}

static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
	if (pi_pre_block(vcpu))
		return 1;

	if (kvm_lapic_hv_timer_in_use(vcpu))
		kvm_lapic_switch_to_sw_timer(vcpu);

	return 0;
}

static void vmx_post_block(struct kvm_vcpu *vcpu)
{
	if (kvm_x86_ops.set_hv_timer)
		kvm_lapic_switch_to_hv_timer(vcpu);

	pi_post_block(vcpu);
}

static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEAT_CTL_LMCE_ENABLED;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEAT_CTL_LMCE_ENABLED;
}

static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	/* we need a nested vmexit to enter SMM, postpone if run is pending */
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return -EBUSY;
	return !is_smm(vcpu);
}

static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
	if (vmx->nested.smm.guest_mode)
		nested_vmx_vmexit(vcpu, -1, 0, 0);

	vmx->nested.smm.vmxon = vmx->nested.vmxon;
	vmx->nested.vmxon = false;
	vmx_clear_hlt(vcpu);
	return 0;
}

static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;

	if (vmx->nested.smm.vmxon) {
		vmx->nested.vmxon = true;
		vmx->nested.smm.vmxon = false;
	}

	if (vmx->nested.smm.guest_mode) {
		ret = nested_vmx_enter_non_root_mode(vcpu, false);
		if (ret)
			return ret;

		vmx->nested.smm.guest_mode = false;
	}
	return 0;
}

static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
	/* RSM will cause a vmexit anyway.  */
}

static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}

static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;

		if (hrtimer_try_to_cancel(timer) == 1)
			hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
	}
}

static void hardware_unsetup(void)
{
	if (nested)
		nested_vmx_hardware_unsetup();

	free_kvm_area();
}

static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
	ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
			  BIT(APICV_INHIBIT_REASON_HYPERV);

	return supported & BIT(bit);
}

static int vmx_cpu_dirty_log_size(void)
{
	return enable_pml ? PML_ENTITY_NUM : 0;
}

static struct kvm_x86_ops vmx_x86_ops __initdata = {
	.hardware_unsetup = hardware_unsetup,

	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = report_flexpriority,
	.has_emulated_msr = vmx_has_emulated_msr,

	.vm_size = sizeof(struct kvm_vmx),
	.vm_init = vmx_vm_init,

	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_prepare_switch_to_guest,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	.update_exception_bitmap = vmx_update_exception_bitmap,
	.get_msr_feature = vmx_get_msr_feature,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.set_cr0 = vmx_set_cr0,
	.is_valid_cr4 = vmx_is_valid_cr4,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.set_dr7 = vmx_set_dr7,
	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush_all = vmx_flush_tlb_all,
	.tlb_flush_current = vmx_flush_tlb_current,
	.tlb_flush_gva = vmx_flush_tlb_gva,
	.tlb_flush_guest = vmx_flush_tlb_guest,

	.run = vmx_vcpu_run,
	.handle_exit = vmx_handle_exit,
	.skip_emulated_instruction = vmx_skip_emulated_instruction,
	.update_emulated_instruction = vmx_update_emulated_instruction,
	.set_interrupt_shadow = vmx_set_interrupt_shadow,
	.get_interrupt_shadow = vmx_get_interrupt_shadow,
	.patch_hypercall = vmx_patch_hypercall,
	.set_irq = vmx_inject_irq,
	.set_nmi = vmx_inject_nmi,
	.queue_exception = vmx_queue_exception,
	.cancel_injection = vmx_cancel_injection,
	.interrupt_allowed = vmx_interrupt_allowed,
	.nmi_allowed = vmx_nmi_allowed,
	.get_nmi_mask = vmx_get_nmi_mask,
	.set_nmi_mask = vmx_set_nmi_mask,
	.enable_nmi_window = vmx_enable_nmi_window,
	.enable_irq_window = vmx_enable_irq_window,
	.update_cr8_intercept = vmx_update_cr8_intercept,
	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
	.load_eoi_exitmap = vmx_load_eoi_exitmap,
	.apicv_post_state_restore = vmx_apicv_post_state_restore,
	.check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
	.hwapic_irr_update = vmx_hwapic_irr_update,
	.hwapic_isr_update = vmx_hwapic_isr_update,
	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
	.sync_pir_to_irr = vmx_sync_pir_to_irr,
	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,

	.set_tss_addr = vmx_set_tss_addr,
	.set_identity_map_addr = vmx_set_identity_map_addr,
	.get_mt_mask = vmx_get_mt_mask,

	.get_exit_info = vmx_get_exit_info,

	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,

	.load_mmu_pgd = vmx_load_mmu_pgd,

	.check_intercept = vmx_check_intercept,
	.handle_exit_irqoff = vmx_handle_exit_irqoff,

	.request_immediate_exit = vmx_request_immediate_exit,

	.sched_in = vmx_sched_in,

	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
	.flush_log_dirty = vmx_flush_log_dirty,
	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,

	.pre_block = vmx_pre_block,
	.post_block = vmx_post_block,

	.pmu_ops = &intel_pmu_ops,
	.nested_ops = &vmx_nested_ops,

	.update_pi_irte = pi_update_irte,

#ifdef CONFIG_X86_64
	.set_hv_timer = vmx_set_hv_timer,
	.cancel_hv_timer = vmx_cancel_hv_timer,
#endif

	.setup_mce = vmx_setup_mce,

	.smi_allowed = vmx_smi_allowed,
	.pre_enter_smm = vmx_pre_enter_smm,
	.pre_leave_smm = vmx_pre_leave_smm,
	.enable_smi_window = vmx_enable_smi_window,

	.can_emulate_instruction = vmx_can_emulate_instruction,
	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
	.migrate_timers = vmx_migrate_timers,

	.msr_filter_changed = vmx_msr_filter_changed,
	.complete_emulated_msr = kvm_complete_insn_gp,
	.cpu_dirty_log_size = vmx_cpu_dirty_log_size,

	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};

static __init int hardware_setup(void)
{
	unsigned long host_bndcfgs;
	struct desc_ptr dt;
	int r, i, ept_lpage_level;

	store_idt(&dt);
	host_idt_base = dt.address;

	for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
		kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);

	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
		return -EIO;

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	if (boot_cpu_has(X86_FEATURE_MPX)) {
		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
	}

	if (!cpu_has_vmx_mpx())
		supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
				    XFEATURE_MASK_BNDCSR);

	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
		enable_vpid = 0;

	if (!cpu_has_vmx_ept() ||
	    !cpu_has_vmx_ept_4levels() ||
	    !cpu_has_vmx_ept_mt_wb() ||
	    !cpu_has_vmx_invept_global())
		enable_ept = 0;

	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
		enable_ept_ad_bits = 0;

	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
		enable_unrestricted_guest = 0;

	if (!cpu_has_vmx_flexpriority())
		flexpriority_enabled = 0;

	if (!cpu_has_virtual_nmis())
		enable_vnmi = 0;

	/*
	 * set_apic_access_page_addr() is used to reload apic access
	 * page upon invalidation.  No need to do anything if not
	 * using the APIC_ACCESS_ADDR VMCS field.
	 */
	if (!flexpriority_enabled)
		vmx_x86_ops.set_apic_access_page_addr = NULL;

	if (!cpu_has_vmx_tpr_shadow())
		vmx_x86_ops.update_cr8_intercept = NULL;

#if IS_ENABLED(CONFIG_HYPERV)
	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
	    && enable_ept) {
		vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
		vmx_x86_ops.tlb_remote_flush_with_range =
				hv_remote_flush_tlb_with_range;
	}
#endif

	if (!cpu_has_vmx_ple()) {
		ple_gap = 0;
		ple_window = 0;
		ple_window_grow = 0;
		ple_window_max = 0;
		ple_window_shrink = 0;
	}

	if (!cpu_has_vmx_apicv()) {
		enable_apicv = 0;
		vmx_x86_ops.sync_pir_to_irr = NULL;
	}

	if (cpu_has_vmx_tsc_scaling()) {
		kvm_has_tsc_control = true;
		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 48;
	}

	kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	if (!enable_ept)
		ept_lpage_level = 0;
	else if (cpu_has_vmx_ept_1g_page())
		ept_lpage_level = PG_LEVEL_1G;
	else if (cpu_has_vmx_ept_2m_page())
		ept_lpage_level = PG_LEVEL_2M;
	else
		ept_lpage_level = PG_LEVEL_4K;
	kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);

	/*
	 * Only enable PML when hardware supports PML feature, and both EPT
	 * and EPT A/D bit features are enabled -- PML depends on them to work.
	 */
	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
		enable_pml = 0;

	if (!enable_pml) {
		vmx_x86_ops.slot_enable_log_dirty = NULL;
		vmx_x86_ops.slot_disable_log_dirty = NULL;
		vmx_x86_ops.flush_log_dirty = NULL;
		vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
		vmx_x86_ops.cpu_dirty_log_size = NULL;
	}

	if (!cpu_has_vmx_preemption_timer())
		enable_preemption_timer = false;

	if (enable_preemption_timer) {
		u64 use_timer_freq = 5000ULL * 1000 * 1000;
		u64 vmx_msr;

		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
		cpu_preemption_timer_multi =
			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;

		if (tsc_khz)
			use_timer_freq = (u64)tsc_khz * 1000;
		use_timer_freq >>= cpu_preemption_timer_multi;

		/*
		 * KVM "disables" the preemption timer by setting it to its max
		 * value.  Don't use the timer if it might cause spurious exits
		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
		 */
		if (use_timer_freq > 0xffffffffu / 10)
			enable_preemption_timer = false;
	}

	if (!enable_preemption_timer) {
		vmx_x86_ops.set_hv_timer = NULL;
		vmx_x86_ops.cancel_hv_timer = NULL;
		vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
	}

	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);

	kvm_mce_cap_supported |= MCG_LMCE_P;

	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
		return -EINVAL;
	if (!enable_ept || !cpu_has_vmx_intel_pt())
		pt_mode = PT_MODE_SYSTEM;

	if (nested) {
		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
					   vmx_capability.ept);

		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
		if (r)
			return r;
	}

	vmx_set_cpu_caps();

	r = alloc_kvm_area();
	if (r)
		nested_vmx_hardware_unsetup();
	return r;
}

static struct kvm_x86_init_ops vmx_init_ops __initdata = {
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_setup = hardware_setup,

	.runtime_ops = &vmx_x86_ops,
};

static void vmx_cleanup_l1d_flush(void)
{
	if (vmx_l1d_flush_pages) {
		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
		vmx_l1d_flush_pages = NULL;
	}
	/* Restore state so sysfs ignores VMX */
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}

static void vmx_exit(void)
{
#ifdef CONFIG_KEXEC_CORE
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();
#endif

	kvm_exit();

#if IS_ENABLED(CONFIG_HYPERV)
	if (static_branch_unlikely(&enable_evmcs)) {
		int cpu;
		struct hv_vp_assist_page *vp_ap;
		/*
		 * Reset everything to support using non-enlightened VMCS
		 * access later (e.g. when we reload the module with
		 * enlightened_vmcs=0)
		 */
		for_each_online_cpu(cpu) {
			vp_ap =	hv_get_vp_assist_page(cpu);

			if (!vp_ap)
				continue;

			vp_ap->nested_control.features.directhypercall = 0;
			vp_ap->current_nested_vmcs = 0;
			vp_ap->enlighten_vmentry = 0;
		}

		static_branch_disable(&enable_evmcs);
	}
#endif
	vmx_cleanup_l1d_flush();
}
module_exit(vmx_exit);

static int __init vmx_init(void)
{
	int r, cpu;

#if IS_ENABLED(CONFIG_HYPERV)
	/*
	 * Enlightened VMCS usage should be recommended and the host needs
	 * to support eVMCS v1 or above. We can also disable eVMCS support
	 * with module parameter.
	 */
	if (enlightened_vmcs &&
	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	    KVM_EVMCS_VERSION) {
		int cpu;

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&enable_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vmx_x86_ops.enable_direct_tlbflush
				= hv_enable_direct_tlbflush;

	} else {
		enlightened_vmcs = false;
	}
#endif

	r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
		     __alignof__(struct vcpu_vmx), THIS_MODULE);
	if (r)
		return r;

	/*
	 * Must be called after kvm_init() so enable_ept is properly set
	 * up. Hand the parameter mitigation value in which was stored in
	 * the pre module init parser. If no parameter was given, it will
	 * contain 'auto' which will be turned into the default 'cond'
	 * mitigation mode.
	 */
	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
	if (r) {
		vmx_exit();
		return r;
	}

	for_each_possible_cpu(cpu) {
		INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));

		pi_init_cpu(cpu);
	}

#ifdef CONFIG_KEXEC_CORE
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   crash_vmclear_local_loaded_vmcss);
#endif
	vmx_check_vmcs12_offsets();

	/*
	 * Shadow paging doesn't have a (further) performance penalty
	 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
	 * by default
	 */
	if (!enable_ept)
		allow_smaller_maxphyaddr = true;

	return 0;
}
module_init(vmx_init);