arch/x86/kvm/vmx/vmx.c
20c8ccb1 1// SPDX-License-Identifier: GPL-2.0-only
6aa8b732
AK
2/*
3 * Kernel-based Virtual Machine driver for Linux
4 *
5 * This module enables machines with Intel VT-x extensions to run virtual
6 * machines without emulation or binary translation.
7 *
8 * Copyright (C) 2006 Qumranet, Inc.
9611c187 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
6aa8b732
AK
10 *
11 * Authors:
12 * Avi Kivity <avi@qumranet.com>
13 * Yaniv Kamay <yaniv@qumranet.com>
6aa8b732
AK
14 */
15
199b118a
SC
16#include <linux/frame.h>
17#include <linux/highmem.h>
18#include <linux/hrtimer.h>
19#include <linux/kernel.h>
edf88417 20#include <linux/kvm_host.h>
6aa8b732 21#include <linux/module.h>
c7addb90 22#include <linux/moduleparam.h>
e9bda3b3 23#include <linux/mod_devicetable.h>
199b118a 24#include <linux/mm.h>
199b118a 25#include <linux/sched.h>
b284909a 26#include <linux/sched/smt.h>
5a0e3ad6 27#include <linux/slab.h>
cafd6659 28#include <linux/tboot.h>
199b118a 29#include <linux/trace_events.h>
e495606d 30
199b118a 31#include <asm/apic.h>
fd8ca6da 32#include <asm/asm.h>
28b835d6 33#include <asm/cpu.h>
199b118a 34#include <asm/debugreg.h>
3b3be0d1 35#include <asm/desc.h>
952f07ec 36#include <asm/fpu/internal.h>
199b118a 37#include <asm/io.h>
efc64404 38#include <asm/irq_remapping.h>
199b118a
SC
39#include <asm/kexec.h>
40#include <asm/perf_event.h>
41#include <asm/mce.h>
d6e41f11 42#include <asm/mmu_context.h>
773e8a04 43#include <asm/mshyperv.h>
199b118a
SC
44#include <asm/spec-ctrl.h>
45#include <asm/virtext.h>
46#include <asm/vmx.h>
6aa8b732 47
3077c191 48#include "capabilities.h"
199b118a 49#include "cpuid.h"
4cebd747 50#include "evmcs.h"
199b118a
SC
51#include "irq.h"
52#include "kvm_cache_regs.h"
53#include "lapic.h"
54#include "mmu.h"
55d2375e 55#include "nested.h"
89b0c9f5 56#include "ops.h"
25462f7f 57#include "pmu.h"
199b118a 58#include "trace.h"
cb1d474b 59#include "vmcs.h"
609363cf 60#include "vmcs12.h"
89b0c9f5 61#include "vmx.h"
199b118a 62#include "x86.h"
229456fc 63
6aa8b732
AK
64MODULE_AUTHOR("Qumranet");
65MODULE_LICENSE("GPL");
66
e9bda3b3
JT
67static const struct x86_cpu_id vmx_cpu_id[] = {
68 X86_FEATURE_MATCH(X86_FEATURE_VMX),
69 {}
70};
71MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
72
2c4fd91d 73bool __read_mostly enable_vpid = 1;
736caefe 74module_param_named(vpid, enable_vpid, bool, 0444);
2384d2b3 75
d02fcf50
PB
76static bool __read_mostly enable_vnmi = 1;
77module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
78
2c4fd91d 79bool __read_mostly flexpriority_enabled = 1;
736caefe 80module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
4c9fc8ef 81
2c4fd91d 82bool __read_mostly enable_ept = 1;
736caefe 83module_param_named(ept, enable_ept, bool, S_IRUGO);
d56f546d 84
2c4fd91d 85bool __read_mostly enable_unrestricted_guest = 1;
3a624e29
NK
86module_param_named(unrestricted_guest,
87 enable_unrestricted_guest, bool, S_IRUGO);
88
2c4fd91d 89bool __read_mostly enable_ept_ad_bits = 1;
83c3a331
XH
90module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
91
a27685c3 92static bool __read_mostly emulate_invalid_guest_state = true;
c1f8bc04 93module_param(emulate_invalid_guest_state, bool, S_IRUGO);
04fa4d32 94
476bc001 95static bool __read_mostly fasteoi = 1;
58fbbf26
KT
96module_param(fasteoi, bool, S_IRUGO);
97
a4443267 98bool __read_mostly enable_apicv = 1;
01e439be 99module_param(enable_apicv, bool, S_IRUGO);
83d4c286 100
801d3424
NHE
101/*
102 * If nested=1, nested virtualization is supported, i.e., guests may use
103 * VMX and be hypervisors for their own guests. If nested=0, guests may not
104 * use VMX instructions.
105 */
1e58e5e5 106static bool __read_mostly nested = 1;
801d3424
NHE
107module_param(nested, bool, S_IRUGO);
108
2c4fd91d 109bool __read_mostly enable_pml = 1;
843e4330
KH
110module_param_named(pml, enable_pml, bool, S_IRUGO);
111
6f2f8453
PB
112static bool __read_mostly dump_invalid_vmcs = 0;
113module_param(dump_invalid_vmcs, bool, 0644);
114
904e14fb
PB
115#define MSR_BITMAP_MODE_X2APIC 1
116#define MSR_BITMAP_MODE_X2APIC_APICV 2
904e14fb 117
64903d61
HZ
118#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
119
64672c95
YJ
120/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
121static int __read_mostly cpu_preemption_timer_multi;
122static bool __read_mostly enable_preemption_timer = 1;
123#ifdef CONFIG_X86_64
124module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
125#endif
126
3de6347b 127#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
1706bd0c
SC
128#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
129#define KVM_VM_CR0_ALWAYS_ON \
130 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
131 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
4c38609a
AK
132#define KVM_CR4_GUEST_OWNED_BITS \
133 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
fd8cb433 134 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
4c38609a 135
5dc1f044 136#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
cdc0e244
AK
137#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
138#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
139
78ac8b47
AK
140#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
141
bf8c55d8
CP
142#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
143 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
144 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
145 RTIT_STATUS_BYTECNT))
146
147#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
148 (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
149
4b8d54f9
ZE
150/*
151 * These two parameters are used to configure the controls for Pause-Loop Exiting:
152 * ple_gap: upper bound on the amount of time between two successive
153 * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
00c25bce 154 * According to tests, this time is usually less than 128 cycles.
4b8d54f9
ZE
155 * ple_window: upper bound on the amount of time a guest is allowed to execute
156 * in a PAUSE loop. Tests indicate that most spinlocks are held for
157 * less than 2^12 cycles.
158 * Time is measured based on a counter that runs at the same rate as the TSC,
159 * refer to SDM volume 3b, sections 21.6.13 & 22.1.3.
160 */
c8e88717 161static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
a87c99e6 162module_param(ple_gap, uint, 0444);
b4a2d31d 163
7fbc85a5
BM
164static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
165module_param(ple_window, uint, 0444);
4b8d54f9 166
b4a2d31d 167/* Default doubles per-vcpu window every exit. */
c8e88717 168static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
7fbc85a5 169module_param(ple_window_grow, uint, 0444);
b4a2d31d
RK
170
171/* Default resets per-vcpu window every exit to ple_window. */
c8e88717 172static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
7fbc85a5 173module_param(ple_window_shrink, uint, 0444);
b4a2d31d
RK
174
175/* Default is to compute the maximum so we can never overflow. */
7fbc85a5
BM
176static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
177module_param(ple_window_max, uint, 0444);
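/*
 * Illustrative summary (not part of the original source): per the comments
 * above, the per-vCPU window starts at ple_window, is scaled up by
 * ple_window_grow (doubling by default) but never beyond ple_window_max,
 * and the default ple_window_shrink resets it straight back to ple_window
 * rather than scaling it down gradually.
 */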
b4a2d31d 178
f99e3daf
CP
179/* Default is SYSTEM mode (0), 1 for host-guest mode */
180int __read_mostly pt_mode = PT_MODE_SYSTEM;
181module_param(pt_mode, int, S_IRUGO);
182
a399477e 183static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
427362a1 184static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
dd4bfa73 185static DEFINE_MUTEX(vmx_l1d_flush_mutex);
a399477e 186
7db92e16
TG
187/* Storage for pre module init parameter parsing */
188static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
a399477e
KRW
189
190static const struct {
191 const char *option;
0027ff2a 192 bool for_parse;
a399477e 193} vmentry_l1d_param[] = {
0027ff2a
PB
194 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
195 [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
196 [VMENTER_L1D_FLUSH_COND] = {"cond", true},
197 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
198 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
199 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
a399477e
KRW
200};
201
7db92e16
TG
202#define L1D_CACHE_ORDER 4
203static void *vmx_l1d_flush_pages;
204
205static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
a399477e 206{
7db92e16 207 struct page *page;
288d152c 208 unsigned int i;
a399477e 209
19a36d32
WL
210 if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
211 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
212 return 0;
213 }
214
7db92e16
TG
215 if (!enable_ept) {
216 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
217 return 0;
a399477e
KRW
218 }
219
d806afa4
YW
220 if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
221 u64 msr;
222
223 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
224 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
225 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
226 return 0;
227 }
228 }
8e0b2b91 229
d90a7a0e
JK
230 /* If set to auto, use the default L1TF mitigation method */
231 if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
232 switch (l1tf_mitigation) {
233 case L1TF_MITIGATION_OFF:
234 l1tf = VMENTER_L1D_FLUSH_NEVER;
235 break;
236 case L1TF_MITIGATION_FLUSH_NOWARN:
237 case L1TF_MITIGATION_FLUSH:
238 case L1TF_MITIGATION_FLUSH_NOSMT:
239 l1tf = VMENTER_L1D_FLUSH_COND;
240 break;
241 case L1TF_MITIGATION_FULL:
242 case L1TF_MITIGATION_FULL_FORCE:
243 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
244 break;
245 }
246 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
247 l1tf = VMENTER_L1D_FLUSH_ALWAYS;
248 }
249
7db92e16
TG
250 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
251 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
41836839
BG
252 /*
253 * This allocation for vmx_l1d_flush_pages is not tied to a VM
254 * lifetime and so should not be charged to a memcg.
255 */
7db92e16
TG
256 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
257 if (!page)
258 return -ENOMEM;
259 vmx_l1d_flush_pages = page_address(page);
288d152c
NS
260
261 /*
262 * Initialize each page with a different pattern in
263 * order to protect against KSM in the nested
264 * virtualization case.
265 */
266 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
267 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
268 PAGE_SIZE);
269 }
7db92e16
TG
270 }
271
272 l1tf_vmx_mitigation = l1tf;
273
895ae47f
TG
274 if (l1tf != VMENTER_L1D_FLUSH_NEVER)
275 static_branch_enable(&vmx_l1d_should_flush);
276 else
277 static_branch_disable(&vmx_l1d_should_flush);
4c6523ec 278
427362a1
NS
279 if (l1tf == VMENTER_L1D_FLUSH_COND)
280 static_branch_enable(&vmx_l1d_flush_cond);
895ae47f 281 else
427362a1 282 static_branch_disable(&vmx_l1d_flush_cond);
7db92e16
TG
283 return 0;
284}
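/*
 * Sizing note (illustrative, not part of the original source): with
 * L1D_CACHE_ORDER == 4 the fallback buffer is 1 << 4 == 16 pages, i.e.
 * 64KB, large enough to displace a typical 32KB L1D when the software
 * flush loop walks it; CPUs advertising X86_FEATURE_FLUSH_L1D use the
 * dedicated flush MSR instead and never allocate these pages.
 */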
285
286static int vmentry_l1d_flush_parse(const char *s)
287{
288 unsigned int i;
289
290 if (s) {
291 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
0027ff2a
PB
292 if (vmentry_l1d_param[i].for_parse &&
293 sysfs_streq(s, vmentry_l1d_param[i].option))
294 return i;
7db92e16
TG
295 }
296 }
a399477e
KRW
297 return -EINVAL;
298}
299
7db92e16
TG
300static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
301{
dd4bfa73 302 int l1tf, ret;
7db92e16 303
7db92e16
TG
304 l1tf = vmentry_l1d_flush_parse(s);
305 if (l1tf < 0)
306 return l1tf;
307
0027ff2a
PB
308 if (!boot_cpu_has(X86_BUG_L1TF))
309 return 0;
310
7db92e16
TG
311 /*
312 * Has vmx_init() run already? If not then this is the pre init
313 * parameter parsing. In that case just store the value and let
314 * vmx_init() do the proper setup after enable_ept has been
315 * established.
316 */
317 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
318 vmentry_l1d_flush_param = l1tf;
319 return 0;
320 }
321
dd4bfa73
TG
322 mutex_lock(&vmx_l1d_flush_mutex);
323 ret = vmx_setup_l1d_flush(l1tf);
324 mutex_unlock(&vmx_l1d_flush_mutex);
325 return ret;
7db92e16
TG
326}
327
a399477e
KRW
328static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
329{
0027ff2a
PB
330 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
331 return sprintf(s, "???\n");
332
7db92e16 333 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
a399477e
KRW
334}
335
336static const struct kernel_param_ops vmentry_l1d_flush_ops = {
337 .set = vmentry_l1d_flush_set,
338 .get = vmentry_l1d_flush_get,
339};
895ae47f 340module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
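/*
 * Usage sketch (illustrative; assumes the usual kvm_intel module name):
 * because the parameter is registered with custom ops and mode 0644, the
 * mitigation can be changed at runtime, e.g.:
 *
 *	echo cond > /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *	cat /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *
 * Accepted values are the for_parse entries above: auto, never, cond
 * and always.
 */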
a399477e 341
d99e4152
GN
342static bool guest_state_valid(struct kvm_vcpu *vcpu);
343static u32 vmx_segment_access_rights(struct kvm_segment *var);
1e4329ee 344static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
15d45071 345 u32 msr, int type);
75880a01 346
453eafbe
SC
347void vmx_vmexit(void);
348
52a9fcbc
SC
349#define vmx_insn_failed(fmt...) \
350do { \
351 WARN_ONCE(1, fmt); \
352 pr_warn_ratelimited(fmt); \
353} while (0)
354
6e202097
SC
355asmlinkage void vmread_error(unsigned long field, bool fault)
356{
357 if (fault)
358 kvm_spurious_fault();
359 else
360 vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
361}
362
52a9fcbc
SC
363noinline void vmwrite_error(unsigned long field, unsigned long value)
364{
365 vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
366 field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
367}
368
369noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
370{
371 vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
372}
373
374noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
375{
376 vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
377}
378
379noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
380{
381 vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
382 ext, vpid, gva);
383}
384
385noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
386{
387 vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
388 ext, eptp, gpa);
389}
390
6aa8b732 391static DEFINE_PER_CPU(struct vmcs *, vmxarea);
75edce8a 392DEFINE_PER_CPU(struct vmcs *, current_vmcs);
d462b819
NHE
393/*
394 * We maintain a per-CPU linked list of VMCSs loaded on that CPU. This is needed
395 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
396 */
397static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
6aa8b732 398
bf9f6ac8
FW
399/*
400 * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
401 * can find which vCPU should be woken up.
402 */
403static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
404static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
405
2384d2b3
SY
406static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
407static DEFINE_SPINLOCK(vmx_vpid_lock);
408
3077c191
SC
409struct vmcs_config vmcs_config;
410struct vmx_capability vmx_capability;
d56f546d 411
6aa8b732
AK
412#define VMX_SEGMENT_FIELD(seg) \
413 [VCPU_SREG_##seg] = { \
414 .selector = GUEST_##seg##_SELECTOR, \
415 .base = GUEST_##seg##_BASE, \
416 .limit = GUEST_##seg##_LIMIT, \
417 .ar_bytes = GUEST_##seg##_AR_BYTES, \
418 }
419
772e0318 420static const struct kvm_vmx_segment_field {
6aa8b732
AK
421 unsigned selector;
422 unsigned base;
423 unsigned limit;
424 unsigned ar_bytes;
425} kvm_vmx_segment_fields[] = {
426 VMX_SEGMENT_FIELD(CS),
427 VMX_SEGMENT_FIELD(DS),
428 VMX_SEGMENT_FIELD(ES),
429 VMX_SEGMENT_FIELD(FS),
430 VMX_SEGMENT_FIELD(GS),
431 VMX_SEGMENT_FIELD(SS),
432 VMX_SEGMENT_FIELD(TR),
433 VMX_SEGMENT_FIELD(LDTR),
434};
435
2342080c 436static unsigned long host_idt_base;
26bb0981 437
4d56c8a7 438/*
898a811f
JM
439 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
440 * will emulate SYSCALL in legacy mode if the vendor string in guest
441 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
442 * support this emulation, IA32_STAR must always be included in
443 * vmx_msr_index[], even in i386 builds.
4d56c8a7 444 */
cf3646eb 445const u32 vmx_msr_index[] = {
05b3e0c2 446#ifdef CONFIG_X86_64
44ea2b17 447 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
6aa8b732 448#endif
8c06585d 449 MSR_EFER, MSR_TSC_AUX, MSR_STAR,
c11f83e0 450 MSR_IA32_TSX_CTRL,
6aa8b732 451};
6aa8b732 452
773e8a04
VK
453#if IS_ENABLED(CONFIG_HYPERV)
454static bool __read_mostly enlightened_vmcs = true;
455module_param(enlightened_vmcs, bool, 0444);
456
877ad952
TL
457/* check_ept_pointer_match() should be called under protection of ept_pointer_lock. */
458static void check_ept_pointer_match(struct kvm *kvm)
459{
460 struct kvm_vcpu *vcpu;
461 u64 tmp_eptp = INVALID_PAGE;
462 int i;
463
464 kvm_for_each_vcpu(i, vcpu, kvm) {
465 if (!VALID_PAGE(tmp_eptp)) {
466 tmp_eptp = to_vmx(vcpu)->ept_pointer;
467 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
468 to_kvm_vmx(kvm)->ept_pointers_match
469 = EPT_POINTERS_MISMATCH;
470 return;
471 }
472 }
473
474 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
475}
476
8997f657 477static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
1f3a3e46
LT
478 void *data)
479{
480 struct kvm_tlb_range *range = data;
481
482 return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
483 range->pages);
484}
485
486static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
487 struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
488{
489 u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
490
491 /*
492 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
493 * of the base of EPT PML4 table, strip off EPT configuration
494 * information.
495 */
496 if (range)
497 return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
498 kvm_fill_hv_flush_list_func, (void *)range);
499 else
500 return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
501}
502
503static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
504 struct kvm_tlb_range *range)
877ad952 505{
a5c214da 506 struct kvm_vcpu *vcpu;
b7c1c226 507 int ret = 0, i;
877ad952
TL
508
509 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
510
511 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
512 check_ept_pointer_match(kvm);
513
514 if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
53963a70 515 kvm_for_each_vcpu(i, vcpu, kvm) {
1f3a3e46
LT
516 /* If ept_pointer is invalid pointer, bypass flush request. */
517 if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
518 ret |= __hv_remote_flush_tlb_with_range(
519 kvm, vcpu, range);
53963a70 520 }
a5c214da 521 } else {
1f3a3e46
LT
522 ret = __hv_remote_flush_tlb_with_range(kvm,
523 kvm_get_vcpu(kvm, 0), range);
877ad952 524 }
877ad952 525
877ad952
TL
526 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
527 return ret;
528}
1f3a3e46
LT
529static int hv_remote_flush_tlb(struct kvm *kvm)
530{
531 return hv_remote_flush_tlb_with_range(kvm, NULL);
532}
533
6f6a657c
VK
534static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
535{
536 struct hv_enlightened_vmcs *evmcs;
537 struct hv_partition_assist_pg **p_hv_pa_pg =
538 &vcpu->kvm->arch.hyperv.hv_pa_pg;
539 /*
540 * Synthetic VM-exit is not enabled in the current code, so all
541 * eVMCSs in a single VM share the same assist page.
542 */
cab01850 543 if (!*p_hv_pa_pg)
6f6a657c 544 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
cab01850
VK
545
546 if (!*p_hv_pa_pg)
547 return -ENOMEM;
6f6a657c
VK
548
549 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
550
551 evmcs->partition_assist_page =
552 __pa(*p_hv_pa_pg);
cab01850 553 evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
6f6a657c
VK
554 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
555
6f6a657c
VK
556 return 0;
557}
558
773e8a04
VK
559#endif /* IS_ENABLED(CONFIG_HYPERV) */
560
64672c95
YJ
561/*
562 * Comment format: document - errata name - stepping - processor name.
563 * See
564 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
565 */
566static u32 vmx_preemption_cpu_tfms[] = {
567/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
5680x000206E6,
569/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
570/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
571/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
5720x00020652,
573/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
5740x00020655,
575/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
576/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
577/*
578 * 320767.pdf - AAP86 - B1 -
579 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
580 */
5810x000106E5,
582/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
5830x000106A0,
584/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
5850x000106A1,
586/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
5870x000106A4,
588 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
589 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
590 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
5910x000106A5,
3d82c565
WH
592 /* Xeon E3-1220 V2 */
5930x000306A8,
64672c95
YJ
594};
595
596static inline bool cpu_has_broken_vmx_preemption_timer(void)
597{
598 u32 eax = cpuid_eax(0x00000001), i;
599
600 /* Clear the reserved bits */
601 eax &= ~(0x3U << 14 | 0xfU << 28);
03f6a22a 602 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
64672c95
YJ
603 if (eax == vmx_preemption_cpu_tfms[i])
604 return true;
605
606 return false;
607}
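/*
 * Decoding example (illustrative): the table entries are raw CPUID.1.EAX
 * signatures with the reserved bits (15:14 and 31:28) masked off by
 * cpu_has_broken_vmx_preemption_timer(). 0x000206E6, for instance,
 * decodes to family 6, model 0x2E, stepping 6, i.e. the Xeon 7500
 * (Nehalem-EX) parts named in the comment above.
 */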
608
35754c98 609static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
f78e0e2e 610{
35754c98 611 return flexpriority_enabled && lapic_in_kernel(vcpu);
f78e0e2e
SY
612}
613
04547156
SY
614static inline bool report_flexpriority(void)
615{
616 return flexpriority_enabled;
617}
618
97b7ead3 619static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
7725f0ba
AK
620{
621 int i;
622
a2fa3e9f 623 for (i = 0; i < vmx->nmsrs; ++i)
26bb0981 624 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
a75beee6
ED
625 return i;
626 return -1;
627}
628
97b7ead3 629struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
a75beee6
ED
630{
631 int i;
632
8b9cf98c 633 i = __find_msr_index(vmx, msr);
a75beee6 634 if (i >= 0)
a2fa3e9f 635 return &vmx->guest_msrs[i];
8b6d44c7 636 return NULL;
7725f0ba
AK
637}
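/*
 * Example (illustrative): callers resolve a shared-MSR slot by MSR number,
 * e.g.
 *
 *	struct shared_msr_entry *efer = find_msr_entry(vmx, MSR_EFER);
 *
 * and get NULL back when the MSR is not present in vmx_msr_index[].
 */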
638
b07a5c53
PB
639static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
640{
641 int ret = 0;
642
643 u64 old_msr_data = msr->data;
644 msr->data = data;
645 if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
646 preempt_disable();
647 ret = kvm_set_shared_msr(msr->index, msr->data,
648 msr->mask);
649 preempt_enable();
650 if (ret)
651 msr->data = old_msr_data;
652 }
653 return ret;
654}
655
7c97fcb3
SC
656void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
657{
658 vmcs_clear(loaded_vmcs->vmcs);
659 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
660 vmcs_clear(loaded_vmcs->shadow_vmcs);
661 loaded_vmcs->cpu = -1;
662 loaded_vmcs->launched = 0;
663}
664
2965faa5 665#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
666/*
667 * This bitmap indicates, per cpu, whether the crash-time vmclear
668 * operation is enabled. All cpus are disabled by
669 * default.
670 */
671static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
672
673static inline void crash_enable_local_vmclear(int cpu)
674{
675 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
676}
677
678static inline void crash_disable_local_vmclear(int cpu)
679{
680 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
681}
682
683static inline int crash_local_vmclear_enabled(int cpu)
684{
685 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
686}
687
688static void crash_vmclear_local_loaded_vmcss(void)
689{
690 int cpu = raw_smp_processor_id();
691 struct loaded_vmcs *v;
692
693 if (!crash_local_vmclear_enabled(cpu))
694 return;
695
696 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
697 loaded_vmcss_on_cpu_link)
698 vmcs_clear(v->vmcs);
699}
700#else
701static inline void crash_enable_local_vmclear(int cpu) { }
702static inline void crash_disable_local_vmclear(int cpu) { }
2965faa5 703#endif /* CONFIG_KEXEC_CORE */
8f536b76 704
d462b819 705static void __loaded_vmcs_clear(void *arg)
6aa8b732 706{
d462b819 707 struct loaded_vmcs *loaded_vmcs = arg;
d3b2c338 708 int cpu = raw_smp_processor_id();
6aa8b732 709
d462b819
NHE
710 if (loaded_vmcs->cpu != cpu)
711 return; /* vcpu migration can race with cpu offline */
712 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
6aa8b732 713 per_cpu(current_vmcs, cpu) = NULL;
8f536b76 714 crash_disable_local_vmclear(cpu);
d462b819 715 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
5a560f8b
XG
716
717 /*
718 * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
719 * happens before setting loaded_vmcs->cpu to -1, which is done in
720 * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
721 * and then add the vmcs to the per-cpu list before it is deleted.
722 */
723 smp_wmb();
724
d462b819 725 loaded_vmcs_init(loaded_vmcs);
8f536b76 726 crash_enable_local_vmclear(cpu);
6aa8b732
AK
727}
728
89b0c9f5 729void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
8d0be2b3 730{
e6c7d321
XG
731 int cpu = loaded_vmcs->cpu;
732
733 if (cpu != -1)
734 smp_call_function_single(cpu,
735 __loaded_vmcs_clear, loaded_vmcs, 1);
8d0be2b3
AK
736}
737
2fb92db1
AK
738static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
739 unsigned field)
740{
741 bool ret;
742 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
743
cb3c1e2f
SC
744 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
745 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
2fb92db1
AK
746 vmx->segment_cache.bitmask = 0;
747 }
748 ret = vmx->segment_cache.bitmask & mask;
749 vmx->segment_cache.bitmask |= mask;
750 return ret;
751}
752
753static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
754{
755 u16 *p = &vmx->segment_cache.seg[seg].selector;
756
757 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
758 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
759 return *p;
760}
761
762static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
763{
764 ulong *p = &vmx->segment_cache.seg[seg].base;
765
766 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
767 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
768 return *p;
769}
770
771static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
772{
773 u32 *p = &vmx->segment_cache.seg[seg].limit;
774
775 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
776 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
777 return *p;
778}
779
780static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
781{
782 u32 *p = &vmx->segment_cache.seg[seg].ar;
783
784 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
785 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
786 return *p;
787}
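/*
 * Note (illustrative): the segment cache keeps one "already read" bit per
 * (segment, field) pair, i.e. bit = seg * SEG_FIELD_NR + field, so e.g. a
 * second vmx_read_guest_seg_base(vmx, VCPU_SREG_CS) returns the cached
 * base without another VMREAD until VCPU_EXREG_SEGMENTS is invalidated.
 */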
788
97b7ead3 789void update_exception_bitmap(struct kvm_vcpu *vcpu)
abd3f2d6
AK
790{
791 u32 eb;
792
fd7373cc 793 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
bd7e5b08 794 (1u << DB_VECTOR) | (1u << AC_VECTOR);
9e869480
LA
795 /*
796 * Guest access to VMware backdoor ports could legitimately
797 * trigger #GP because of TSS I/O permission bitmap.
798 * We intercept those #GP and allow access to them anyway
799 * as VMware does.
800 */
801 if (enable_vmware_backdoor)
802 eb |= (1u << GP_VECTOR);
fd7373cc
JK
803 if ((vcpu->guest_debug &
804 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
805 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
806 eb |= 1u << BP_VECTOR;
7ffd92c5 807 if (to_vmx(vcpu)->rmode.vm86_active)
abd3f2d6 808 eb = ~0;
089d034e 809 if (enable_ept)
49f933d4 810 eb &= ~(1u << PF_VECTOR);
36cf24e0
NHE
811
812 /* When we are running a nested L2 guest and L1 specified for it a
813 * certain exception bitmap, we must trap the same exceptions and pass
814 * them to L1. When running L2, we will only handle the exceptions
815 * specified above if L1 did not want them.
816 */
817 if (is_guest_mode(vcpu))
818 eb |= get_vmcs12(vcpu)->exception_bitmap;
819
abd3f2d6
AK
820 vmcs_write32(EXCEPTION_BITMAP, eb);
821}
822
d28b387f
KA
823/*
824 * Check whether a write to the MSR is intercepted in the currently loaded MSR bitmap.
825 */
826static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
827{
828 unsigned long *msr_bitmap;
829 int f = sizeof(unsigned long);
830
831 if (!cpu_has_vmx_msr_bitmap())
832 return true;
833
834 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
835
836 if (msr <= 0x1fff) {
837 return !!test_bit(msr, msr_bitmap + 0x800 / f);
838 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
839 msr &= 0x1fff;
840 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
841 }
842
843 return true;
844}
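/*
 * Layout reminder (illustrative, per the VMX MSR-bitmap format): the 4K
 * bitmap holds four 1K regions -- read bits for MSRs 0x0-0x1fff at offset
 * 0x000, read bits for 0xc0000000-0xc0001fff at 0x400, and the matching
 * write bits at 0x800 and 0xc00 -- which is why the write-intercept check
 * above indexes at 0x800/f and 0xc00/f.
 */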
845
2961e876
GN
846static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
847 unsigned long entry, unsigned long exit)
8bf00a52 848{
2961e876
GN
849 vm_entry_controls_clearbit(vmx, entry);
850 vm_exit_controls_clearbit(vmx, exit);
8bf00a52
GN
851}
852
662f1d1d 853int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
ca83b4a7
KRW
854{
855 unsigned int i;
856
857 for (i = 0; i < m->nr; ++i) {
858 if (m->val[i].index == msr)
859 return i;
860 }
861 return -ENOENT;
862}
863
61d2ef2c
AK
864static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
865{
ca83b4a7 866 int i;
61d2ef2c
AK
867 struct msr_autoload *m = &vmx->msr_autoload;
868
8bf00a52
GN
869 switch (msr) {
870 case MSR_EFER:
c73da3fc 871 if (cpu_has_load_ia32_efer()) {
2961e876
GN
872 clear_atomic_switch_msr_special(vmx,
873 VM_ENTRY_LOAD_IA32_EFER,
8bf00a52
GN
874 VM_EXIT_LOAD_IA32_EFER);
875 return;
876 }
877 break;
878 case MSR_CORE_PERF_GLOBAL_CTRL:
c73da3fc 879 if (cpu_has_load_perf_global_ctrl()) {
2961e876 880 clear_atomic_switch_msr_special(vmx,
8bf00a52
GN
881 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
882 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
883 return;
884 }
885 break;
110312c8 886 }
ef0fbcac 887 i = vmx_find_msr_index(&m->guest, msr);
ca83b4a7 888 if (i < 0)
31907093 889 goto skip_guest;
33966dd6 890 --m->guest.nr;
33966dd6 891 m->guest.val[i] = m->guest.val[m->guest.nr];
33966dd6 892 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
110312c8 893
31907093 894skip_guest:
ef0fbcac 895 i = vmx_find_msr_index(&m->host, msr);
31907093 896 if (i < 0)
61d2ef2c 897 return;
31907093
KRW
898
899 --m->host.nr;
900 m->host.val[i] = m->host.val[m->host.nr];
33966dd6 901 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
61d2ef2c
AK
902}
903
2961e876
GN
904static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
905 unsigned long entry, unsigned long exit,
906 unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
907 u64 guest_val, u64 host_val)
8bf00a52
GN
908{
909 vmcs_write64(guest_val_vmcs, guest_val);
5a5e8a15
SC
910 if (host_val_vmcs != HOST_IA32_EFER)
911 vmcs_write64(host_val_vmcs, host_val);
2961e876
GN
912 vm_entry_controls_setbit(vmx, entry);
913 vm_exit_controls_setbit(vmx, exit);
8bf00a52
GN
914}
915
61d2ef2c 916static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
989e3992 917 u64 guest_val, u64 host_val, bool entry_only)
61d2ef2c 918{
989e3992 919 int i, j = 0;
61d2ef2c
AK
920 struct msr_autoload *m = &vmx->msr_autoload;
921
8bf00a52
GN
922 switch (msr) {
923 case MSR_EFER:
c73da3fc 924 if (cpu_has_load_ia32_efer()) {
2961e876
GN
925 add_atomic_switch_msr_special(vmx,
926 VM_ENTRY_LOAD_IA32_EFER,
8bf00a52
GN
927 VM_EXIT_LOAD_IA32_EFER,
928 GUEST_IA32_EFER,
929 HOST_IA32_EFER,
930 guest_val, host_val);
931 return;
932 }
933 break;
934 case MSR_CORE_PERF_GLOBAL_CTRL:
c73da3fc 935 if (cpu_has_load_perf_global_ctrl()) {
2961e876 936 add_atomic_switch_msr_special(vmx,
8bf00a52
GN
937 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
938 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
939 GUEST_IA32_PERF_GLOBAL_CTRL,
940 HOST_IA32_PERF_GLOBAL_CTRL,
941 guest_val, host_val);
942 return;
943 }
944 break;
7099e2e1
RK
945 case MSR_IA32_PEBS_ENABLE:
946 /* PEBS needs a quiescent period after being disabled (to write
947 * a record). Disabling PEBS through VMX MSR swapping doesn't
948 * provide that period, so a CPU could write host's record into
949 * guest's memory.
950 */
951 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
110312c8
AK
952 }
953
ef0fbcac 954 i = vmx_find_msr_index(&m->guest, msr);
989e3992 955 if (!entry_only)
ef0fbcac 956 j = vmx_find_msr_index(&m->host, msr);
61d2ef2c 957
7cfe0526
AL
958 if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
959 (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
60266204 960 printk_once(KERN_WARNING "Not enough msr switch entries. "
e7fc6f93
GN
961 "Can't add msr %x\n", msr);
962 return;
61d2ef2c 963 }
31907093 964 if (i < 0) {
ca83b4a7 965 i = m->guest.nr++;
33966dd6 966 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
31907093 967 }
989e3992
KRW
968 m->guest.val[i].index = msr;
969 m->guest.val[i].value = guest_val;
970
971 if (entry_only)
972 return;
61d2ef2c 973
31907093
KRW
974 if (j < 0) {
975 j = m->host.nr++;
33966dd6 976 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
61d2ef2c 977 }
31907093
KRW
978 m->host.val[j].index = msr;
979 m->host.val[j].value = host_val;
61d2ef2c
AK
980}
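/*
 * Pairing example (illustrative): update_transition_efer() below calls
 *
 *	add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer, false);
 *
 * when EFER must be switched atomically; for EFER and PERF_GLOBAL_CTRL the
 * helper transparently uses the dedicated VM-entry/VM-exit load controls
 * when available, otherwise it appends to the autoload lists so the CPU
 * loads the guest value at VM-entry and the host value at VM-exit.
 * Passing entry_only == true skips the host (VM-exit) list, and
 * clear_atomic_switch_msr() removes the pairing again.
 */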
981
92c0d900 982static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
2cc51560 983{
844a5fe2
PB
984 u64 guest_efer = vmx->vcpu.arch.efer;
985 u64 ignore_bits = 0;
986
9167ab79
PB
987 /* Shadow paging assumes NX to be available. */
988 if (!enable_ept)
989 guest_efer |= EFER_NX;
3a34a881 990
51c6cf66 991 /*
844a5fe2 992 * LMA and LME handled by hardware; SCE meaningless outside long mode.
51c6cf66 993 */
844a5fe2 994 ignore_bits |= EFER_SCE;
51c6cf66
AK
995#ifdef CONFIG_X86_64
996 ignore_bits |= EFER_LMA | EFER_LME;
997 /* SCE is meaningful only in long mode on Intel */
998 if (guest_efer & EFER_LMA)
999 ignore_bits &= ~(u64)EFER_SCE;
1000#endif
84ad33ef 1001
f6577a5f
AL
1002 /*
1003 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1004 * On CPUs that support "load IA32_EFER", always switch EFER
1005 * atomically, since it's faster than switching it manually.
1006 */
c73da3fc 1007 if (cpu_has_load_ia32_efer() ||
f6577a5f 1008 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
84ad33ef
AK
1009 if (!(guest_efer & EFER_LMA))
1010 guest_efer &= ~EFER_LME;
54b98bff
AL
1011 if (guest_efer != host_efer)
1012 add_atomic_switch_msr(vmx, MSR_EFER,
989e3992 1013 guest_efer, host_efer, false);
02343cf2
SC
1014 else
1015 clear_atomic_switch_msr(vmx, MSR_EFER);
84ad33ef 1016 return false;
844a5fe2 1017 } else {
02343cf2
SC
1018 clear_atomic_switch_msr(vmx, MSR_EFER);
1019
844a5fe2
PB
1020 guest_efer &= ~ignore_bits;
1021 guest_efer |= host_efer & ignore_bits;
1022
1023 vmx->guest_msrs[efer_offset].data = guest_efer;
1024 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
84ad33ef 1025
844a5fe2
PB
1026 return true;
1027 }
51c6cf66
AK
1028}
1029
e28baead
AL
1030#ifdef CONFIG_X86_32
1031/*
1032 * On 32-bit kernels, VM exits still load the FS and GS bases from the
1033 * VMCS rather than the segment table. KVM uses this helper to figure
1034 * out the current bases to poke them into the VMCS before entry.
1035 */
2d49ec72
GN
1036static unsigned long segment_base(u16 selector)
1037{
8c2e41f7 1038 struct desc_struct *table;
2d49ec72
GN
1039 unsigned long v;
1040
8c2e41f7 1041 if (!(selector & ~SEGMENT_RPL_MASK))
2d49ec72
GN
1042 return 0;
1043
45fc8757 1044 table = get_current_gdt_ro();
2d49ec72 1045
8c2e41f7 1046 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
2d49ec72
GN
1047 u16 ldt_selector = kvm_read_ldt();
1048
8c2e41f7 1049 if (!(ldt_selector & ~SEGMENT_RPL_MASK))
2d49ec72
GN
1050 return 0;
1051
8c2e41f7 1052 table = (struct desc_struct *)segment_base(ldt_selector);
2d49ec72 1053 }
8c2e41f7 1054 v = get_desc_base(&table[selector >> 3]);
2d49ec72
GN
1055 return v;
1056}
e28baead 1057#endif
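/*
 * Worked example (illustrative): for a selector of 0x73 the RPL bits
 * (SEGMENT_RPL_MASK, 0x3) are ignored, the TI bit (SEGMENT_TI_MASK)
 * selects the GDT here since it is clear, and the descriptor index is
 * 0x73 >> 3 == 14, whose base field get_desc_base() extracts.
 */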
2d49ec72 1058
e348ac7c
SC
1059static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1060{
2ef7619d 1061 return vmx_pt_mode_is_host_guest() &&
e348ac7c
SC
1062 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1063}
1064
2ef444f1
CP
1065static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1066{
1067 u32 i;
1068
1069 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1070 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1071 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1072 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1073 for (i = 0; i < addr_range; i++) {
1074 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1075 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1076 }
1077}
1078
1079static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1080{
1081 u32 i;
1082
1083 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1084 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1085 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1086 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1087 for (i = 0; i < addr_range; i++) {
1088 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1089 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1090 }
1091}
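/*
 * Note (illustrative): the address-range MSRs come in consecutive A/B
 * pairs (ADDR0_A, ADDR0_B, ADDR1_A, ADDR1_B, ...), so range i lives at
 * MSR_IA32_RTIT_ADDR0_A + 2 * i and MSR_IA32_RTIT_ADDR0_B + 2 * i, which
 * is the stride pt_load_msr() and pt_save_msr() rely on above.
 */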
1092
1093static void pt_guest_enter(struct vcpu_vmx *vmx)
1094{
2ef7619d 1095 if (vmx_pt_mode_is_system())
2ef444f1
CP
1096 return;
1097
2ef444f1 1098 /*
b08c2896
CP
1099 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1100 * Save host state before VM entry.
2ef444f1 1101 */
b08c2896 1102 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
2ef444f1
CP
1103 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1104 wrmsrl(MSR_IA32_RTIT_CTL, 0);
1105 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1106 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1107 }
1108}
1109
1110static void pt_guest_exit(struct vcpu_vmx *vmx)
1111{
2ef7619d 1112 if (vmx_pt_mode_is_system())
2ef444f1
CP
1113 return;
1114
1115 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1116 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1117 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1118 }
1119
1120 /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1121 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1122}
1123
13b964a2
SC
1124void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1125 unsigned long fs_base, unsigned long gs_base)
1126{
1127 if (unlikely(fs_sel != host->fs_sel)) {
1128 if (!(fs_sel & 7))
1129 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1130 else
1131 vmcs_write16(HOST_FS_SELECTOR, 0);
1132 host->fs_sel = fs_sel;
1133 }
1134 if (unlikely(gs_sel != host->gs_sel)) {
1135 if (!(gs_sel & 7))
1136 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1137 else
1138 vmcs_write16(HOST_GS_SELECTOR, 0);
1139 host->gs_sel = gs_sel;
1140 }
1141 if (unlikely(fs_base != host->fs_base)) {
1142 vmcs_writel(HOST_FS_BASE, fs_base);
1143 host->fs_base = fs_base;
1144 }
1145 if (unlikely(gs_base != host->gs_base)) {
1146 vmcs_writel(HOST_GS_BASE, gs_base);
1147 host->gs_base = gs_base;
1148 }
1149}
1150
97b7ead3 1151void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
33ed6329 1152{
04d2cc77 1153 struct vcpu_vmx *vmx = to_vmx(vcpu);
d7ee039e 1154 struct vmcs_host_state *host_state;
51e8a8cc 1155#ifdef CONFIG_X86_64
35060ed6 1156 int cpu = raw_smp_processor_id();
51e8a8cc 1157#endif
e368b875
SC
1158 unsigned long fs_base, gs_base;
1159 u16 fs_sel, gs_sel;
26bb0981 1160 int i;
04d2cc77 1161
d264ee0c
SC
1162 vmx->req_immediate_exit = false;
1163
f48b4711
LA
1164 /*
1165 * Note that guest MSRs to be saved/restored can also be changed
1166 * when guest state is loaded. This happens when guest transitions
1167 * to/from long-mode by setting MSR_EFER.LMA.
1168 */
b464f57e
PB
1169 if (!vmx->guest_msrs_ready) {
1170 vmx->guest_msrs_ready = true;
f48b4711
LA
1171 for (i = 0; i < vmx->save_nmsrs; ++i)
1172 kvm_set_shared_msr(vmx->guest_msrs[i].index,
1173 vmx->guest_msrs[i].data,
1174 vmx->guest_msrs[i].mask);
1175
1176 }
c9dfd3fb 1177
1178 if (vmx->nested.need_vmcs12_to_shadow_sync)
1179 nested_sync_vmcs12_to_shadow(vcpu);
1180
b464f57e 1181 if (vmx->guest_state_loaded)
33ed6329
AK
1182 return;
1183
b464f57e 1184 host_state = &vmx->loaded_vmcs->host_state;
bd9966de 1185
33ed6329
AK
1186 /*
1187 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1188 * allow segment selectors with cpl > 0 or ti == 1.
1189 */
d7ee039e 1190 host_state->ldt_sel = kvm_read_ldt();
42b933b5
VK
1191
1192#ifdef CONFIG_X86_64
d7ee039e
SC
1193 savesegment(ds, host_state->ds_sel);
1194 savesegment(es, host_state->es_sel);
e368b875
SC
1195
1196 gs_base = cpu_kernelmode_gs_base(cpu);
b062b794
VK
1197 if (likely(is_64bit_mm(current->mm))) {
1198 save_fsgs_for_kvm();
e368b875
SC
1199 fs_sel = current->thread.fsindex;
1200 gs_sel = current->thread.gsindex;
b062b794 1201 fs_base = current->thread.fsbase;
e368b875 1202 vmx->msr_host_kernel_gs_base = current->thread.gsbase;
b062b794 1203 } else {
e368b875
SC
1204 savesegment(fs, fs_sel);
1205 savesegment(gs, gs_sel);
b062b794 1206 fs_base = read_msr(MSR_FS_BASE);
e368b875 1207 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
33ed6329 1208 }
b2da15ac 1209
4679b61f 1210 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
4fde8d57 1211#else
e368b875
SC
1212 savesegment(fs, fs_sel);
1213 savesegment(gs, gs_sel);
1214 fs_base = segment_base(fs_sel);
1215 gs_base = segment_base(gs_sel);
707c0874 1216#endif
e368b875 1217
13b964a2 1218 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
b464f57e 1219 vmx->guest_state_loaded = true;
33ed6329
AK
1220}
1221
6d6095bd 1222static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
33ed6329 1223{
d7ee039e
SC
1224 struct vmcs_host_state *host_state;
1225
b464f57e 1226 if (!vmx->guest_state_loaded)
33ed6329
AK
1227 return;
1228
b464f57e 1229 host_state = &vmx->loaded_vmcs->host_state;
bd9966de 1230
e1beb1d3 1231 ++vmx->vcpu.stat.host_state_reload;
bd9966de 1232
c8770e7b 1233#ifdef CONFIG_X86_64
4679b61f 1234 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
c8770e7b 1235#endif
d7ee039e
SC
1236 if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
1237 kvm_load_ldt(host_state->ldt_sel);
33ed6329 1238#ifdef CONFIG_X86_64
d7ee039e 1239 load_gs_index(host_state->gs_sel);
9581d442 1240#else
d7ee039e 1241 loadsegment(gs, host_state->gs_sel);
33ed6329 1242#endif
33ed6329 1243 }
d7ee039e
SC
1244 if (host_state->fs_sel & 7)
1245 loadsegment(fs, host_state->fs_sel);
b2da15ac 1246#ifdef CONFIG_X86_64
d7ee039e
SC
1247 if (unlikely(host_state->ds_sel | host_state->es_sel)) {
1248 loadsegment(ds, host_state->ds_sel);
1249 loadsegment(es, host_state->es_sel);
b2da15ac 1250 }
b2da15ac 1251#endif
b7ffc44d 1252 invalidate_tss_limit();
44ea2b17 1253#ifdef CONFIG_X86_64
c8770e7b 1254 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
44ea2b17 1255#endif
45fc8757 1256 load_fixmap_gdt(raw_smp_processor_id());
b464f57e
PB
1257 vmx->guest_state_loaded = false;
1258 vmx->guest_msrs_ready = false;
33ed6329
AK
1259}
1260
678e315e
SC
1261#ifdef CONFIG_X86_64
1262static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
a9b21b62 1263{
4679b61f 1264 preempt_disable();
b464f57e 1265 if (vmx->guest_state_loaded)
4679b61f
PB
1266 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1267 preempt_enable();
678e315e 1268 return vmx->msr_guest_kernel_gs_base;
a9b21b62
AK
1269}
1270
678e315e
SC
1271static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1272{
4679b61f 1273 preempt_disable();
b464f57e 1274 if (vmx->guest_state_loaded)
4679b61f
PB
1275 wrmsrl(MSR_KERNEL_GS_BASE, data);
1276 preempt_enable();
678e315e
SC
1277 vmx->msr_guest_kernel_gs_base = data;
1278}
1279#endif
1280
28b835d6
FW
1281static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
1282{
1283 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1284 struct pi_desc old, new;
1285 unsigned int dest;
1286
31afb2ea
PB
1287 /*
1288 * In case of hot-plug or hot-unplug, we may have to undo
1289 * vmx_vcpu_pi_put even if there is no assigned device. And we
1290 * always keep PI.NDST up to date for simplicity: it makes the
1291 * code easier, and CPU migration is not a fast path.
1292 */
1293 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
28b835d6
FW
1294 return;
1295
132194ff
JM
1296 /*
1297 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
1298 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
1299 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
1300 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
1301 * correctly.
1302 */
1303 if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
1304 pi_clear_sn(pi_desc);
1305 goto after_clear_sn;
1306 }
1307
31afb2ea 1308 /* The full case. */
28b835d6
FW
1309 do {
1310 old.control = new.control = pi_desc->control;
1311
31afb2ea 1312 dest = cpu_physical_id(cpu);
28b835d6 1313
31afb2ea
PB
1314 if (x2apic_enabled())
1315 new.ndst = dest;
1316 else
1317 new.ndst = (dest << 8) & 0xFF00;
28b835d6 1318
28b835d6 1319 new.sn = 0;
c0a1666b
PB
1320 } while (cmpxchg64(&pi_desc->control, old.control,
1321 new.control) != old.control);
c112b5f5 1322
132194ff
JM
1323after_clear_sn:
1324
c112b5f5
LK
1325 /*
1326 * Clear SN before reading the bitmap. The VT-d firmware
1327 * writes the bitmap and reads SN atomically (5.2.3 in the
1328 * spec), so it doesn't really have a memory barrier that
1329 * pairs with this, but we cannot do that and we need one.
1330 */
1331 smp_mb__after_atomic();
1332
29881b6e 1333 if (!pi_is_pir_empty(pi_desc))
c112b5f5 1334 pi_set_on(pi_desc);
28b835d6 1335}
1be0e61c 1336
8ef863e6 1337void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
6aa8b732 1338{
a2fa3e9f 1339 struct vcpu_vmx *vmx = to_vmx(vcpu);
b80c76ec 1340 bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
6aa8b732 1341
b80c76ec 1342 if (!already_loaded) {
fe0e80be 1343 loaded_vmcs_clear(vmx->loaded_vmcs);
92fe13be 1344 local_irq_disable();
8f536b76 1345 crash_disable_local_vmclear(cpu);
5a560f8b
XG
1346
1347 /*
1348 * Read loaded_vmcs->cpu should be before fetching
1349 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1350 * See the comments in __loaded_vmcs_clear().
1351 */
1352 smp_rmb();
1353
d462b819
NHE
1354 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1355 &per_cpu(loaded_vmcss_on_cpu, cpu));
8f536b76 1356 crash_enable_local_vmclear(cpu);
92fe13be 1357 local_irq_enable();
b80c76ec
JM
1358 }
1359
1360 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1361 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1362 vmcs_load(vmx->loaded_vmcs->vmcs);
15d45071 1363 indirect_branch_prediction_barrier();
b80c76ec
JM
1364 }
1365
1366 if (!already_loaded) {
59c58ceb 1367 void *gdt = get_current_gdt_ro();
b80c76ec
JM
1368 unsigned long sysenter_esp;
1369
1370 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
92fe13be 1371
6aa8b732
AK
1372 /*
1373 * Linux uses per-cpu TSS and GDT, so set these when switching
e0c23063 1374 * processors. See 22.2.4.
6aa8b732 1375 */
e0c23063 1376 vmcs_writel(HOST_TR_BASE,
72f5e08d 1377 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
59c58ceb 1378 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
6aa8b732
AK
1379
1380 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1381 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
ff2c3a18 1382
d462b819 1383 vmx->loaded_vmcs->cpu = cpu;
6aa8b732 1384 }
28b835d6 1385
2680d6da
OH
1386 /* Setup TSC multiplier */
1387 if (kvm_has_tsc_control &&
c95ba92a
PF
1388 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
1389 decache_tsc_multiplier(vmx);
8ef863e6
SC
1390}
1391
1392/*
1393 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1394 * vcpu mutex is already taken.
1395 */
1396void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1397{
1398 struct vcpu_vmx *vmx = to_vmx(vcpu);
1399
1400 vmx_vcpu_load_vmcs(vcpu, cpu);
2680d6da 1401
28b835d6 1402 vmx_vcpu_pi_load(vcpu, cpu);
8ef863e6 1403
1be0e61c 1404 vmx->host_pkru = read_pkru();
74c55931 1405 vmx->host_debugctlmsr = get_debugctlmsr();
28b835d6
FW
1406}
1407
1408static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
1409{
1410 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
1411
1412 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
a0052191
YZ
1413 !irq_remapping_cap(IRQ_POSTING_CAP) ||
1414 !kvm_vcpu_apicv_active(vcpu))
28b835d6
FW
1415 return;
1416
1417 /* Set SN when the vCPU is preempted */
1418 if (vcpu->preempted)
1419 pi_set_sn(pi_desc);
6aa8b732
AK
1420}
1421
13b964a2 1422static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
6aa8b732 1423{
28b835d6
FW
1424 vmx_vcpu_pi_put(vcpu);
1425
6d6095bd 1426 vmx_prepare_switch_to_host(to_vmx(vcpu));
6aa8b732
AK
1427}
1428
f244deed
WL
1429static bool emulation_required(struct kvm_vcpu *vcpu)
1430{
1431 return emulate_invalid_guest_state && !guest_state_valid(vcpu);
1432}
1433
97b7ead3 1434unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
6aa8b732 1435{
e7bddc52 1436 struct vcpu_vmx *vmx = to_vmx(vcpu);
78ac8b47 1437 unsigned long rflags, save_rflags;
345dcaa8 1438
cb3c1e2f
SC
1439 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
1440 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
6de12732 1441 rflags = vmcs_readl(GUEST_RFLAGS);
e7bddc52 1442 if (vmx->rmode.vm86_active) {
6de12732 1443 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
e7bddc52 1444 save_rflags = vmx->rmode.save_rflags;
6de12732
AK
1445 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1446 }
e7bddc52 1447 vmx->rflags = rflags;
78ac8b47 1448 }
e7bddc52 1449 return vmx->rflags;
6aa8b732
AK
1450}
1451
97b7ead3 1452void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
6aa8b732 1453{
e7bddc52 1454 struct vcpu_vmx *vmx = to_vmx(vcpu);
491c1ad1 1455 unsigned long old_rflags;
f244deed 1456
491c1ad1 1457 if (enable_unrestricted_guest) {
cb3c1e2f 1458 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
491c1ad1
SC
1459 vmx->rflags = rflags;
1460 vmcs_writel(GUEST_RFLAGS, rflags);
1461 return;
1462 }
1463
1464 old_rflags = vmx_get_rflags(vcpu);
e7bddc52
SC
1465 vmx->rflags = rflags;
1466 if (vmx->rmode.vm86_active) {
1467 vmx->rmode.save_rflags = rflags;
053de044 1468 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
78ac8b47 1469 }
6aa8b732 1470 vmcs_writel(GUEST_RFLAGS, rflags);
f244deed 1471
e7bddc52
SC
1472 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1473 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
1474}
1475
97b7ead3 1476u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2809f5d2
GC
1477{
1478 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1479 int ret = 0;
1480
1481 if (interruptibility & GUEST_INTR_STATE_STI)
48005f64 1482 ret |= KVM_X86_SHADOW_INT_STI;
2809f5d2 1483 if (interruptibility & GUEST_INTR_STATE_MOV_SS)
48005f64 1484 ret |= KVM_X86_SHADOW_INT_MOV_SS;
2809f5d2 1485
37ccdcbe 1486 return ret;
2809f5d2
GC
1487}
1488
97b7ead3 1489void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2809f5d2
GC
1490{
1491 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
1492 u32 interruptibility = interruptibility_old;
1493
1494 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
1495
48005f64 1496 if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2809f5d2 1497 interruptibility |= GUEST_INTR_STATE_MOV_SS;
48005f64 1498 else if (mask & KVM_X86_SHADOW_INT_STI)
2809f5d2
GC
1499 interruptibility |= GUEST_INTR_STATE_STI;
1500
1501 if ((interruptibility != interruptibility_old))
1502 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
1503}
1504
bf8c55d8
CP
1505static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1506{
1507 struct vcpu_vmx *vmx = to_vmx(vcpu);
1508 unsigned long value;
1509
1510 /*
1511 * Any MSR write that attempts to change bits marked reserved will
1512 * case a #GP fault.
1513 */
1514 if (data & vmx->pt_desc.ctl_bitmask)
1515 return 1;
1516
1517 /*
1518 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1519 * result in a #GP unless the same write also clears TraceEn.
1520 */
1521 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1522 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1523 return 1;
1524
1525 /*
1526 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
1527 * and FabricEn causes a #GP if
1528 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1529 */
1530 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1531 !(data & RTIT_CTL_FABRIC_EN) &&
1532 !intel_pt_validate_cap(vmx->pt_desc.caps,
1533 PT_CAP_single_range_output))
1534 return 1;
1535
1536 /*
1537 * MTCFreq, CycThresh and PSBFreq encodings check: any MSR write that
1538 * utilizes encodings marked reserved will cause a #GP fault.
1539 */
1540 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
1541 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
1542 !test_bit((data & RTIT_CTL_MTC_RANGE) >>
1543 RTIT_CTL_MTC_RANGE_OFFSET, &value))
1544 return 1;
1545 value = intel_pt_validate_cap(vmx->pt_desc.caps,
1546 PT_CAP_cycle_thresholds);
1547 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1548 !test_bit((data & RTIT_CTL_CYC_THRESH) >>
1549 RTIT_CTL_CYC_THRESH_OFFSET, &value))
1550 return 1;
1551 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
1552 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
1553 !test_bit((data & RTIT_CTL_PSB_FREQ) >>
1554 RTIT_CTL_PSB_FREQ_OFFSET, &value))
1555 return 1;
1556
1557 /*
1558 * If an ADDRx_CFG encoding is reserved or greater than 2, the write will
1559 * cause a #GP fault.
1560 */
1561 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1562 if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
1563 return 1;
1564 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1565 if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
1566 return 1;
1567 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1568 if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
1569 return 1;
1570 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1571 if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
1572 return 1;
1573
1574 return 0;
1575}
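/*
 * Decoding sketch (illustrative, per the Intel PT documentation): each
 * ADDRx_CFG field is pulled out as
 *
 *	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
 *
 * and only the encodings 0 (range unused), 1 (address filtering) and
 * 2 (TraceStop) are accepted, and only for as many ranges as the CPU
 * reports in pt_desc.addr_range.
 */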
1576
1957aa63 1577static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
6aa8b732
AK
1578{
1579 unsigned long rip;
6aa8b732 1580
1957aa63
SC
1581 /*
1582 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1583 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1584 * set when EPT misconfig occurs. In practice, real hardware updates
1585 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1586 * (namely Hyper-V) don't set it due to it being undefined behavior,
1587 * i.e. we end up advancing IP with some random value.
1588 */
1589 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
1590 to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
1591 rip = kvm_rip_read(vcpu);
1592 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1593 kvm_rip_write(vcpu, rip);
1594 } else {
1595 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1596 return 0;
1597 }
6aa8b732 1598
2809f5d2
GC
1599 /* skipping an emulated instruction also counts */
1600 vmx_set_interrupt_shadow(vcpu, 0);
f8ea7c60 1601
60fc3d02 1602 return 1;
f8ea7c60
VK
1603}
1604
5ef8acbd
OU
1605
1606/*
1607 * Recognizes a pending MTF VM-exit and records the nested state for later
1608 * delivery.
1609 */
1610static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1611{
1612 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1613 struct vcpu_vmx *vmx = to_vmx(vcpu);
1614
1615 if (!is_guest_mode(vcpu))
1616 return;
1617
1618 /*
1619 * Per the SDM, MTF takes priority over debug-trap exceptions besides
1620 * T-bit traps. As instruction emulation is completed (i.e. at the
1621 * instruction boundary), any #DB exception pending delivery must be a
1622 * debug-trap. Record the pending MTF state to be delivered in
1623 * vmx_check_nested_events().
1624 */
1625 if (nested_cpu_has_mtf(vmcs12) &&
1626 (!vcpu->arch.exception.pending ||
1627 vcpu->arch.exception.nr == DB_VECTOR))
1628 vmx->nested.mtf_pending = true;
1629 else
1630 vmx->nested.mtf_pending = false;
1631}
1632
1633static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
1634{
1635 vmx_update_emulated_instruction(vcpu);
1636 return skip_emulated_instruction(vcpu);
1637}
1638
caa057a2
WL
1639static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1640{
1641 /*
1642 * Ensure that we clear the HLT state in the VMCS. We don't need to
1643 * explicitly skip the instruction because if the HLT state is set,
1644 * then the instruction is already executing and RIP has already been
1645 * advanced.
1646 */
1647 if (kvm_hlt_in_guest(vcpu->kvm) &&
1648 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1649 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1650}
1651
cfcd20e5 1652static void vmx_queue_exception(struct kvm_vcpu *vcpu)
298101da 1653{
77ab6db0 1654 struct vcpu_vmx *vmx = to_vmx(vcpu);
cfcd20e5
WL
1655 unsigned nr = vcpu->arch.exception.nr;
1656 bool has_error_code = vcpu->arch.exception.has_error_code;
cfcd20e5 1657 u32 error_code = vcpu->arch.exception.error_code;
8ab2d2e2 1658 u32 intr_info = nr | INTR_INFO_VALID_MASK;
77ab6db0 1659
da998b46
JM
1660 kvm_deliver_exception_payload(vcpu);
1661
8ab2d2e2 1662 if (has_error_code) {
77ab6db0 1663 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
8ab2d2e2
JK
1664 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
1665 }
77ab6db0 1666
7ffd92c5 1667 if (vmx->rmode.vm86_active) {
71f9833b
SH
1668 int inc_eip = 0;
1669 if (kvm_exception_is_soft(nr))
1670 inc_eip = vcpu->arch.event_exit_inst_len;
9497e1f2 1671 kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
77ab6db0
JK
1672 return;
1673 }
1674
add5ff7a
SC
1675 WARN_ON_ONCE(vmx->emulation_required);
1676
66fd3f7f
GN
1677 if (kvm_exception_is_soft(nr)) {
1678 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
1679 vmx->vcpu.arch.event_exit_inst_len);
8ab2d2e2
JK
1680 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
1681 } else
1682 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1683
1684 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
caa057a2
WL
1685
1686 vmx_clear_hlt(vcpu);
298101da
AK
1687}
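/*
 * For illustration, assuming the usual encodings of the VM-entry
 * interruption-information field used above: vector in bits 7:0, type in
 * bits 10:8, deliver-error-code in bit 11, valid in bit 31. A hardware #GP
 * with an error code is therefore injected as
 * 13 | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_DELIVER_CODE_MASK |
 * INTR_INFO_VALID_MASK == 0x80000b0d.
 */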
1688
a75beee6
ED
1689/*
1690 * Swap MSR entry in host/guest MSR entry array.
1691 */
8b9cf98c 1692static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
a75beee6 1693{
26bb0981 1694 struct shared_msr_entry tmp;
a2fa3e9f
GH
1695
1696 tmp = vmx->guest_msrs[to];
1697 vmx->guest_msrs[to] = vmx->guest_msrs[from];
1698 vmx->guest_msrs[from] = tmp;
a75beee6
ED
1699}
1700
e38aea3e
AK
1701/*
1702 * Set up the vmcs to automatically save and restore system
1703 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
1704 * mode, as fiddling with msrs is very expensive.
1705 */
8b9cf98c 1706static void setup_msrs(struct vcpu_vmx *vmx)
e38aea3e 1707{
26bb0981 1708 int save_nmsrs, index;
e38aea3e 1709
a75beee6
ED
1710 save_nmsrs = 0;
1711#ifdef CONFIG_X86_64
84c8c5b8
JM
1712 /*
1713 * The SYSCALL MSRs are only needed on long mode guests, and only
1714 * when EFER.SCE is set.
1715 */
1716 if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
1717 index = __find_msr_index(vmx, MSR_STAR);
a75beee6 1718 if (index >= 0)
8b9cf98c
RR
1719 move_msr_up(vmx, index, save_nmsrs++);
1720 index = __find_msr_index(vmx, MSR_LSTAR);
a75beee6 1721 if (index >= 0)
8b9cf98c 1722 move_msr_up(vmx, index, save_nmsrs++);
84c8c5b8
JM
1723 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
1724 if (index >= 0)
8b9cf98c 1725 move_msr_up(vmx, index, save_nmsrs++);
a75beee6
ED
1726 }
1727#endif
92c0d900
AK
1728 index = __find_msr_index(vmx, MSR_EFER);
1729 if (index >= 0 && update_transition_efer(vmx, index))
26bb0981 1730 move_msr_up(vmx, index, save_nmsrs++);
0023ef39
JM
1731 index = __find_msr_index(vmx, MSR_TSC_AUX);
1732 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
1733 move_msr_up(vmx, index, save_nmsrs++);
c11f83e0
PB
1734 index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
1735 if (index >= 0)
1736 move_msr_up(vmx, index, save_nmsrs++);
e38aea3e 1737
26bb0981 1738 vmx->save_nmsrs = save_nmsrs;
b464f57e 1739 vmx->guest_msrs_ready = false;
5897297b 1740
8d14695f 1741 if (cpu_has_vmx_msr_bitmap())
904e14fb 1742 vmx_update_msr_bitmap(&vmx->vcpu);
e38aea3e
AK
1743}
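/*
 * Roughly: move_msr_up() compacts the MSRs that matter for the current
 * guest configuration to the front of vmx->guest_msrs[], so only the first
 * save_nmsrs entries are switched for the guest; clearing guest_msrs_ready
 * makes the next switch to the guest reload them.
 */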
1744
e79f245d 1745static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
6aa8b732 1746{
e79f245d 1747 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6aa8b732 1748
e79f245d 1749 if (is_guest_mode(vcpu) &&
5e3d394f 1750 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
e79f245d
KA
1751 return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
1752
1753 return vcpu->arch.tsc_offset;
6aa8b732
AK
1754}
1755
326e7425 1756static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
6aa8b732 1757{
45c3af97
PB
1758 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1759 u64 g_tsc_offset = 0;
1760
1761 /*
1762 * We're here if L1 chose not to trap WRMSR to TSC. According
1763 * to the spec, this should set L1's TSC; The offset that L1
1764 * set for L2 remains unchanged, and still needs to be added
1765 * to the newly set TSC to get L2's TSC.
1766 */
1767 if (is_guest_mode(vcpu) &&
5e3d394f 1768 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
45c3af97 1769 g_tsc_offset = vmcs12->tsc_offset;
326e7425 1770
45c3af97
PB
1771 trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1772 vcpu->arch.tsc_offset - g_tsc_offset,
1773 offset);
1774 vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
1775 return offset + g_tsc_offset;
6aa8b732
AK
1776}
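/*
 * Worked example (illustrative numbers): while L2 runs,
 * vcpu->arch.tsc_offset holds the combined offset. With an L1 offset of 100
 * and vmcs12->tsc_offset == 50, vmx_read_l1_tsc_offset() returns
 * 150 - 50 = 100; writing a new L1 offset of 200 programs the hardware
 * TSC_OFFSET to 200 + 50 = 250, so L2 still sees L1's TSC shifted by the 50
 * that L1 asked for.
 */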
1777
801d3424
NHE
1778/*
1779 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
1780 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
1781 * all guests if the "nested" module option is off, and can also be disabled
1782 * for a single guest by disabling its VMX cpuid bit.
1783 */
7c97fcb3 1784bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
801d3424 1785{
d6321d49 1786 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
801d3424
NHE
1787}
1788
55d2375e
SC
1789static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
1790 uint64_t val)
62cc6b9d 1791{
55d2375e 1792 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
62cc6b9d 1793
55d2375e 1794 return !(val & ~valid_bits);
62cc6b9d
DM
1795}
1796
55d2375e 1797static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
62cc6b9d 1798{
55d2375e
SC
1799 switch (msr->index) {
1800 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1801 if (!nested)
1802 return 1;
1803 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
1804 default:
1805 return 1;
1806 }
62cc6b9d
DM
1807}
1808
55d2375e
SC
1809/*
1810 * Reads an msr value (of 'msr_index') into 'pdata'.
1811 * Returns 0 on success, non-0 otherwise.
1812 * Assumes vcpu_load() was already called.
1813 */
1814static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
62cc6b9d 1815{
55d2375e
SC
1816 struct vcpu_vmx *vmx = to_vmx(vcpu);
1817 struct shared_msr_entry *msr;
bf8c55d8 1818 u32 index;
62cc6b9d 1819
55d2375e
SC
1820 switch (msr_info->index) {
1821#ifdef CONFIG_X86_64
1822 case MSR_FS_BASE:
1823 msr_info->data = vmcs_readl(GUEST_FS_BASE);
62cc6b9d 1824 break;
55d2375e
SC
1825 case MSR_GS_BASE:
1826 msr_info->data = vmcs_readl(GUEST_GS_BASE);
62cc6b9d 1827 break;
55d2375e
SC
1828 case MSR_KERNEL_GS_BASE:
1829 msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
62cc6b9d 1830 break;
55d2375e
SC
1831#endif
1832 case MSR_EFER:
1833 return kvm_get_msr_common(vcpu, msr_info);
c11f83e0
PB
1834 case MSR_IA32_TSX_CTRL:
1835 if (!msr_info->host_initiated &&
1836 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
1837 return 1;
1838 goto find_shared_msr;
6e3ba4ab
TX
1839 case MSR_IA32_UMWAIT_CONTROL:
1840 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
1841 return 1;
1842
1843 msr_info->data = vmx->msr_ia32_umwait_control;
1844 break;
55d2375e
SC
1845 case MSR_IA32_SPEC_CTRL:
1846 if (!msr_info->host_initiated &&
1847 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
1848 return 1;
1849
1850 msr_info->data = to_vmx(vcpu)->spec_ctrl;
62cc6b9d 1851 break;
6aa8b732 1852 case MSR_IA32_SYSENTER_CS:
609e36d3 1853 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
6aa8b732
AK
1854 break;
1855 case MSR_IA32_SYSENTER_EIP:
609e36d3 1856 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
6aa8b732
AK
1857 break;
1858 case MSR_IA32_SYSENTER_ESP:
609e36d3 1859 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
6aa8b732 1860 break;
0dd376e7 1861 case MSR_IA32_BNDCFGS:
691bd434 1862 if (!kvm_mpx_supported() ||
d6321d49
RK
1863 (!msr_info->host_initiated &&
1864 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 1865 return 1;
609e36d3 1866 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
0dd376e7 1867 break;
c45dcc71
AR
1868 case MSR_IA32_MCG_EXT_CTL:
1869 if (!msr_info->host_initiated &&
a6cb099a 1870 !(vmx->msr_ia32_feature_control &
32ad73db 1871 FEAT_CTL_LMCE_ENABLED))
cae50139 1872 return 1;
c45dcc71
AR
1873 msr_info->data = vcpu->arch.mcg_ext_ctl;
1874 break;
32ad73db 1875 case MSR_IA32_FEAT_CTL:
a6cb099a 1876 msr_info->data = vmx->msr_ia32_feature_control;
cae50139
JK
1877 break;
1878 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
1879 if (!nested_vmx_allowed(vcpu))
1880 return 1;
31de3d25
VK
1881 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
1882 &msr_info->data))
1883 return 1;
1884 /*
1885 * Enlightened VMCS v1 doesn't have certain fields, but buggy
1886 * Hyper-V versions are still trying to use corresponding
1887 * features when they are exposed. Filter out the essential
1888 * minimum.
1889 */
1890 if (!msr_info->host_initiated &&
1891 vmx->nested.enlightened_vmcs_enabled)
1892 nested_evmcs_filter_control_msr(msr_info->index,
1893 &msr_info->data);
1894 break;
bf8c55d8 1895 case MSR_IA32_RTIT_CTL:
2ef7619d 1896 if (!vmx_pt_mode_is_host_guest())
bf8c55d8
CP
1897 return 1;
1898 msr_info->data = vmx->pt_desc.guest.ctl;
1899 break;
1900 case MSR_IA32_RTIT_STATUS:
2ef7619d 1901 if (!vmx_pt_mode_is_host_guest())
bf8c55d8
CP
1902 return 1;
1903 msr_info->data = vmx->pt_desc.guest.status;
1904 break;
1905 case MSR_IA32_RTIT_CR3_MATCH:
2ef7619d 1906 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
1907 !intel_pt_validate_cap(vmx->pt_desc.caps,
1908 PT_CAP_cr3_filtering))
1909 return 1;
1910 msr_info->data = vmx->pt_desc.guest.cr3_match;
1911 break;
1912 case MSR_IA32_RTIT_OUTPUT_BASE:
2ef7619d 1913 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
1914 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1915 PT_CAP_topa_output) &&
1916 !intel_pt_validate_cap(vmx->pt_desc.caps,
1917 PT_CAP_single_range_output)))
1918 return 1;
1919 msr_info->data = vmx->pt_desc.guest.output_base;
1920 break;
1921 case MSR_IA32_RTIT_OUTPUT_MASK:
2ef7619d 1922 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
1923 (!intel_pt_validate_cap(vmx->pt_desc.caps,
1924 PT_CAP_topa_output) &&
1925 !intel_pt_validate_cap(vmx->pt_desc.caps,
1926 PT_CAP_single_range_output)))
1927 return 1;
1928 msr_info->data = vmx->pt_desc.guest.output_mask;
1929 break;
1930 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
1931 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2ef7619d 1932 if (!vmx_pt_mode_is_host_guest() ||
bf8c55d8
CP
1933 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
1934 PT_CAP_num_address_ranges)))
1935 return 1;
1936 if (index % 2)
1937 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
1938 else
1939 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
1940 break;
4e47c7a6 1941 case MSR_TSC_AUX:
d6321d49
RK
1942 if (!msr_info->host_initiated &&
1943 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4e47c7a6 1944 return 1;
c11f83e0 1945 goto find_shared_msr;
6aa8b732 1946 default:
c11f83e0 1947 find_shared_msr:
a6cb099a 1948 msr = find_msr_entry(vmx, msr_info->index);
3bab1f5d 1949 if (msr) {
609e36d3 1950 msr_info->data = msr->data;
3bab1f5d 1951 break;
6aa8b732 1952 }
609e36d3 1953 return kvm_get_msr_common(vcpu, msr_info);
6aa8b732
AK
1954 }
1955
6aa8b732
AK
1956 return 0;
1957}
1958
1959/*
311497e0 1960 * Writes msr value into the appropriate "register".
6aa8b732
AK
1961 * Returns 0 on success, non-0 otherwise.
1962 * Assumes vcpu_load() was already called.
1963 */
8fe8ab46 1964static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
6aa8b732 1965{
a2fa3e9f 1966 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981 1967 struct shared_msr_entry *msr;
2cc51560 1968 int ret = 0;
8fe8ab46
WA
1969 u32 msr_index = msr_info->index;
1970 u64 data = msr_info->data;
bf8c55d8 1971 u32 index;
2cc51560 1972
6aa8b732 1973 switch (msr_index) {
3bab1f5d 1974 case MSR_EFER:
8fe8ab46 1975 ret = kvm_set_msr_common(vcpu, msr_info);
2cc51560 1976 break;
16175a79 1977#ifdef CONFIG_X86_64
6aa8b732 1978 case MSR_FS_BASE:
2fb92db1 1979 vmx_segment_cache_clear(vmx);
6aa8b732
AK
1980 vmcs_writel(GUEST_FS_BASE, data);
1981 break;
1982 case MSR_GS_BASE:
2fb92db1 1983 vmx_segment_cache_clear(vmx);
6aa8b732
AK
1984 vmcs_writel(GUEST_GS_BASE, data);
1985 break;
44ea2b17 1986 case MSR_KERNEL_GS_BASE:
678e315e 1987 vmx_write_guest_kernel_gs_base(vmx, data);
44ea2b17 1988 break;
6aa8b732
AK
1989#endif
1990 case MSR_IA32_SYSENTER_CS:
de70d279
SC
1991 if (is_guest_mode(vcpu))
1992 get_vmcs12(vcpu)->guest_sysenter_cs = data;
6aa8b732
AK
1993 vmcs_write32(GUEST_SYSENTER_CS, data);
1994 break;
1995 case MSR_IA32_SYSENTER_EIP:
de70d279
SC
1996 if (is_guest_mode(vcpu))
1997 get_vmcs12(vcpu)->guest_sysenter_eip = data;
f5b42c33 1998 vmcs_writel(GUEST_SYSENTER_EIP, data);
6aa8b732
AK
1999 break;
2000 case MSR_IA32_SYSENTER_ESP:
de70d279
SC
2001 if (is_guest_mode(vcpu))
2002 get_vmcs12(vcpu)->guest_sysenter_esp = data;
f5b42c33 2003 vmcs_writel(GUEST_SYSENTER_ESP, data);
6aa8b732 2004 break;
699a1ac2
SC
2005 case MSR_IA32_DEBUGCTLMSR:
2006 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2007 VM_EXIT_SAVE_DEBUG_CONTROLS)
2008 get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2009
2010 ret = kvm_set_msr_common(vcpu, msr_info);
2011 break;
2012
0dd376e7 2013 case MSR_IA32_BNDCFGS:
691bd434 2014 if (!kvm_mpx_supported() ||
d6321d49
RK
2015 (!msr_info->host_initiated &&
2016 !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
93c4adc7 2017 return 1;
fd8cb433 2018 if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
4531662d 2019 (data & MSR_IA32_BNDCFGS_RSVD))
93c4adc7 2020 return 1;
0dd376e7
LJ
2021 vmcs_write64(GUEST_BNDCFGS, data);
2022 break;
6e3ba4ab
TX
2023 case MSR_IA32_UMWAIT_CONTROL:
2024 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2025 return 1;
2026
2027	 /* Reserved bit 1 and the upper 32 bits [63:32] must be zero */
2028 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2029 return 1;
2030
2031 vmx->msr_ia32_umwait_control = data;
2032 break;
d28b387f
KA
2033 case MSR_IA32_SPEC_CTRL:
2034 if (!msr_info->host_initiated &&
d28b387f
KA
2035 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2036 return 1;
2037
6441fa61 2038 if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
d28b387f
KA
2039 return 1;
2040
2041 vmx->spec_ctrl = data;
d28b387f
KA
2042 if (!data)
2043 break;
2044
2045 /*
2046 * For non-nested:
2047 * When it's written (to non-zero) for the first time, pass
2048 * it through.
2049 *
2050 * For nested:
2051 * The handling of the MSR bitmap for L2 guests is done in
4d516fe7 2052 * nested_vmx_prepare_msr_bitmap. We should not touch the
d28b387f
KA
2053 * vmcs02.msr_bitmap here since it gets completely overwritten
2054 * in the merging. We update the vmcs01 here for L1 as well
2055 * since it will end up touching the MSR anyway now.
2056 */
2057 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2058 MSR_IA32_SPEC_CTRL,
2059 MSR_TYPE_RW);
2060 break;
c11f83e0
PB
2061 case MSR_IA32_TSX_CTRL:
2062 if (!msr_info->host_initiated &&
2063 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2064 return 1;
2065 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2066 return 1;
2067 goto find_shared_msr;
15d45071
AR
2068 case MSR_IA32_PRED_CMD:
2069 if (!msr_info->host_initiated &&
15d45071
AR
2070 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2071 return 1;
2072
2073 if (data & ~PRED_CMD_IBPB)
2074 return 1;
6441fa61
PB
2075 if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
2076 return 1;
15d45071
AR
2077 if (!data)
2078 break;
2079
2080 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2081
2082 /*
2083 * For non-nested:
2084 * When it's written (to non-zero) for the first time, pass
2085 * it through.
2086 *
2087 * For nested:
2088 * The handling of the MSR bitmap for L2 guests is done in
4d516fe7 2089 * nested_vmx_prepare_msr_bitmap. We should not touch the
15d45071
AR
2090 * vmcs02.msr_bitmap here since it gets completely overwritten
2091 * in the merging.
2092 */
2093 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2094 MSR_TYPE_W);
2095 break;
468d472f 2096 case MSR_IA32_CR_PAT:
d28f4290
SC
2097 if (!kvm_pat_valid(data))
2098 return 1;
2099
142e4be7
SC
2100 if (is_guest_mode(vcpu) &&
2101 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2102 get_vmcs12(vcpu)->guest_ia32_pat = data;
2103
468d472f
SY
2104 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2105 vmcs_write64(GUEST_IA32_PAT, data);
2106 vcpu->arch.pat = data;
2107 break;
2108 }
8fe8ab46 2109 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 2110 break;
ba904635
WA
2111 case MSR_IA32_TSC_ADJUST:
2112 ret = kvm_set_msr_common(vcpu, msr_info);
4e47c7a6 2113 break;
c45dcc71
AR
2114 case MSR_IA32_MCG_EXT_CTL:
2115 if ((!msr_info->host_initiated &&
2116 !(to_vmx(vcpu)->msr_ia32_feature_control &
32ad73db 2117 FEAT_CTL_LMCE_ENABLED)) ||
c45dcc71
AR
2118 (data & ~MCG_EXT_CTL_LMCE_EN))
2119 return 1;
2120 vcpu->arch.mcg_ext_ctl = data;
2121 break;
32ad73db 2122 case MSR_IA32_FEAT_CTL:
37e4c997 2123 if (!vmx_feature_control_msr_valid(vcpu, data) ||
3b84080b 2124 (to_vmx(vcpu)->msr_ia32_feature_control &
32ad73db 2125 FEAT_CTL_LOCKED && !msr_info->host_initiated))
cae50139 2126 return 1;
3b84080b 2127 vmx->msr_ia32_feature_control = data;
cae50139
JK
2128 if (msr_info->host_initiated && data == 0)
2129 vmx_leave_nested(vcpu);
2130 break;
2131 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
62cc6b9d
DM
2132 if (!msr_info->host_initiated)
2133 return 1; /* they are read-only */
2134 if (!nested_vmx_allowed(vcpu))
2135 return 1;
2136 return vmx_set_vmx_msr(vcpu, msr_index, data);
bf8c55d8 2137 case MSR_IA32_RTIT_CTL:
2ef7619d 2138 if (!vmx_pt_mode_is_host_guest() ||
ee85dec2
LK
2139 vmx_rtit_ctl_check(vcpu, data) ||
2140 vmx->nested.vmxon)
bf8c55d8
CP
2141 return 1;
2142 vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2143 vmx->pt_desc.guest.ctl = data;
b08c2896 2144 pt_update_intercept_for_msr(vmx);
bf8c55d8
CP
2145 break;
2146 case MSR_IA32_RTIT_STATUS:
e348ac7c
SC
2147 if (!pt_can_write_msr(vmx))
2148 return 1;
2149 if (data & MSR_IA32_RTIT_STATUS_MASK)
bf8c55d8
CP
2150 return 1;
2151 vmx->pt_desc.guest.status = data;
2152 break;
2153 case MSR_IA32_RTIT_CR3_MATCH:
e348ac7c
SC
2154 if (!pt_can_write_msr(vmx))
2155 return 1;
2156 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2157 PT_CAP_cr3_filtering))
bf8c55d8
CP
2158 return 1;
2159 vmx->pt_desc.guest.cr3_match = data;
2160 break;
2161 case MSR_IA32_RTIT_OUTPUT_BASE:
e348ac7c
SC
2162 if (!pt_can_write_msr(vmx))
2163 return 1;
2164 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2165 PT_CAP_topa_output) &&
2166 !intel_pt_validate_cap(vmx->pt_desc.caps,
2167 PT_CAP_single_range_output))
2168 return 1;
2169 if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
bf8c55d8
CP
2170 return 1;
2171 vmx->pt_desc.guest.output_base = data;
2172 break;
2173 case MSR_IA32_RTIT_OUTPUT_MASK:
e348ac7c
SC
2174 if (!pt_can_write_msr(vmx))
2175 return 1;
2176 if (!intel_pt_validate_cap(vmx->pt_desc.caps,
2177 PT_CAP_topa_output) &&
2178 !intel_pt_validate_cap(vmx->pt_desc.caps,
2179 PT_CAP_single_range_output))
bf8c55d8
CP
2180 return 1;
2181 vmx->pt_desc.guest.output_mask = data;
2182 break;
2183 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
e348ac7c
SC
2184 if (!pt_can_write_msr(vmx))
2185 return 1;
bf8c55d8 2186 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
e348ac7c
SC
2187 if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2188 PT_CAP_num_address_ranges))
bf8c55d8 2189 return 1;
fe6ed369 2190 if (is_noncanonical_address(data, vcpu))
bf8c55d8
CP
2191 return 1;
2192 if (index % 2)
2193 vmx->pt_desc.guest.addr_b[index / 2] = data;
2194 else
2195 vmx->pt_desc.guest.addr_a[index / 2] = data;
2196 break;
4e47c7a6 2197 case MSR_TSC_AUX:
d6321d49
RK
2198 if (!msr_info->host_initiated &&
2199 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
4e47c7a6
SY
2200 return 1;
2201	 /* Check reserved bits: the upper 32 bits [63:32] must be zero */
2202 if ((data >> 32) != 0)
2203 return 1;
c11f83e0
PB
2204 goto find_shared_msr;
2205
6aa8b732 2206 default:
c11f83e0 2207 find_shared_msr:
8b9cf98c 2208 msr = find_msr_entry(vmx, msr_index);
b07a5c53
PB
2209 if (msr)
2210 ret = vmx_set_guest_msr(vmx, msr, data);
2211 else
2212 ret = kvm_set_msr_common(vcpu, msr_info);
6aa8b732
AK
2213 }
2214
2cc51560 2215 return ret;
6aa8b732
AK
2216}
2217
5fdbf976 2218static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
6aa8b732 2219{
cb3c1e2f
SC
2220 kvm_register_mark_available(vcpu, reg);
2221
5fdbf976
MT
2222 switch (reg) {
2223 case VCPU_REGS_RSP:
2224 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2225 break;
2226 case VCPU_REGS_RIP:
2227 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2228 break;
6de4f3ad
AK
2229 case VCPU_EXREG_PDPTR:
2230 if (enable_ept)
2231 ept_save_pdptrs(vcpu);
2232 break;
34059c25
SC
2233 case VCPU_EXREG_CR3:
2234 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
2235 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2236 break;
5fdbf976 2237 default:
34059c25 2238 WARN_ON_ONCE(1);
5fdbf976
MT
2239 break;
2240 }
6aa8b732
AK
2241}
2242
6aa8b732
AK
2243static __init int cpu_has_kvm_support(void)
2244{
6210e37b 2245 return cpu_has_vmx();
6aa8b732
AK
2246}
2247
2248static __init int vmx_disabled_by_bios(void)
2249{
a4d0b2fd
SC
2250 return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
2251 !boot_cpu_has(X86_FEATURE_VMX);
6aa8b732
AK
2252}
2253
7725b894
DX
2254static void kvm_cpu_vmxon(u64 addr)
2255{
fe0e80be 2256 cr4_set_bits(X86_CR4_VMXE);
1c5ac21a
AS
2257 intel_pt_handle_vmx(1);
2258
4b1e5478 2259 asm volatile ("vmxon %0" : : "m"(addr));
7725b894
DX
2260}
2261
13a34e06 2262static int hardware_enable(void)
6aa8b732
AK
2263{
2264 int cpu = raw_smp_processor_id();
2265 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
6aa8b732 2266
1e02ce4c 2267 if (cr4_read_shadow() & X86_CR4_VMXE)
10474ae8
AG
2268 return -EBUSY;
2269
773e8a04
VK
2270 /*
2271 * This can happen if we hot-added a CPU but failed to allocate
2272 * VP assist page for it.
2273 */
2274 if (static_branch_unlikely(&enable_evmcs) &&
2275 !hv_get_vp_assist_page(cpu))
2276 return -EFAULT;
2277
d462b819 2278 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
bf9f6ac8
FW
2279 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
2280 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
8f536b76
ZY
2281
2282 /*
2283 * Now we can enable the vmclear operation in kdump
2284 * since the loaded_vmcss_on_cpu list on this cpu
2285 * has been initialized.
2286 *
2287	 * Though the cpu is not in VMX operation yet, there is
2288	 * no problem in enabling the vmclear operation, since
2289	 * the loaded_vmcss_on_cpu list is empty!
2290 */
2291 crash_enable_local_vmclear(cpu);
2292
fe0e80be 2293 kvm_cpu_vmxon(phys_addr);
fdf288bf
DH
2294 if (enable_ept)
2295 ept_sync_global();
10474ae8
AG
2296
2297 return 0;
6aa8b732
AK
2298}
2299
d462b819 2300static void vmclear_local_loaded_vmcss(void)
543e4243
AK
2301{
2302 int cpu = raw_smp_processor_id();
d462b819 2303 struct loaded_vmcs *v, *n;
543e4243 2304
d462b819
NHE
2305 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2306 loaded_vmcss_on_cpu_link)
2307 __loaded_vmcs_clear(v);
543e4243
AK
2308}
2309
710ff4a8
EH
2310
2311/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2312 * tricks.
2313 */
2314static void kvm_cpu_vmxoff(void)
6aa8b732 2315{
4b1e5478 2316 asm volatile (__ex("vmxoff"));
1c5ac21a
AS
2317
2318 intel_pt_handle_vmx(0);
fe0e80be 2319 cr4_clear_bits(X86_CR4_VMXE);
6aa8b732
AK
2320}
2321
13a34e06 2322static void hardware_disable(void)
710ff4a8 2323{
fe0e80be
DH
2324 vmclear_local_loaded_vmcss();
2325 kvm_cpu_vmxoff();
710ff4a8
EH
2326}
2327
1c3d14fe 2328static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
d77c26fc 2329 u32 msr, u32 *result)
1c3d14fe
YS
2330{
2331 u32 vmx_msr_low, vmx_msr_high;
2332 u32 ctl = ctl_min | ctl_opt;
2333
2334 rdmsr(msr, vmx_msr_low, vmx_msr_high);
2335
2336 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2337 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
2338
2339 /* Ensure minimum (required) set of control bits are supported. */
2340 if (ctl_min & ~ctl)
002c7f7c 2341 return -EIO;
1c3d14fe
YS
2342
2343 *result = ctl;
2344 return 0;
2345}
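/*
 * Worked example with hypothetical values: ctl_min = 0x2, ctl_opt = 0x4,
 * rdmsr() low = 0x1, high = 0x3. Then ctl starts as 0x6, becomes 0x2 after
 * masking with the allowed-1 (high) word and 0x3 after OR-ing the
 * must-be-1 (low) word: the optional bit 2 is silently dropped, ctl_min is
 * still satisfied, and 0x3 is returned. If a ctl_min bit had been
 * unsupported, the function would return -EIO instead.
 */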
2346
7caaa711
SC
2347static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2348 struct vmx_capability *vmx_cap)
6aa8b732
AK
2349{
2350 u32 vmx_msr_low, vmx_msr_high;
d56f546d 2351 u32 min, opt, min2, opt2;
1c3d14fe
YS
2352 u32 _pin_based_exec_control = 0;
2353 u32 _cpu_based_exec_control = 0;
f78e0e2e 2354 u32 _cpu_based_2nd_exec_control = 0;
1c3d14fe
YS
2355 u32 _vmexit_control = 0;
2356 u32 _vmentry_control = 0;
2357
1389309c 2358 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
10166744 2359 min = CPU_BASED_HLT_EXITING |
1c3d14fe
YS
2360#ifdef CONFIG_X86_64
2361 CPU_BASED_CR8_LOAD_EXITING |
2362 CPU_BASED_CR8_STORE_EXITING |
2363#endif
d56f546d
SY
2364 CPU_BASED_CR3_LOAD_EXITING |
2365 CPU_BASED_CR3_STORE_EXITING |
8eb73e2d 2366 CPU_BASED_UNCOND_IO_EXITING |
1c3d14fe 2367 CPU_BASED_MOV_DR_EXITING |
5e3d394f 2368 CPU_BASED_USE_TSC_OFFSETTING |
4d5422ce
WL
2369 CPU_BASED_MWAIT_EXITING |
2370 CPU_BASED_MONITOR_EXITING |
fee84b07
AK
2371 CPU_BASED_INVLPG_EXITING |
2372 CPU_BASED_RDPMC_EXITING;
443381a8 2373
f78e0e2e 2374 opt = CPU_BASED_TPR_SHADOW |
25c5f225 2375 CPU_BASED_USE_MSR_BITMAPS |
f78e0e2e 2376 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1c3d14fe
YS
2377 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
2378 &_cpu_based_exec_control) < 0)
002c7f7c 2379 return -EIO;
6e5d865c
YS
2380#ifdef CONFIG_X86_64
2381 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2382 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
2383 ~CPU_BASED_CR8_STORE_EXITING;
2384#endif
f78e0e2e 2385 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
d56f546d
SY
2386 min2 = 0;
2387 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
8d14695f 2388 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2384d2b3 2389 SECONDARY_EXEC_WBINVD_EXITING |
d56f546d 2390 SECONDARY_EXEC_ENABLE_VPID |
3a624e29 2391 SECONDARY_EXEC_ENABLE_EPT |
4b8d54f9 2392 SECONDARY_EXEC_UNRESTRICTED_GUEST |
4e47c7a6 2393 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
0367f205 2394 SECONDARY_EXEC_DESC |
ad756a16 2395 SECONDARY_EXEC_RDTSCP |
83d4c286 2396 SECONDARY_EXEC_ENABLE_INVPCID |
c7c9c56c 2397 SECONDARY_EXEC_APIC_REGISTER_VIRT |
abc4fc58 2398 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
20300099 2399 SECONDARY_EXEC_SHADOW_VMCS |
843e4330 2400 SECONDARY_EXEC_XSAVES |
736fdf72
DH
2401 SECONDARY_EXEC_RDSEED_EXITING |
2402 SECONDARY_EXEC_RDRAND_EXITING |
8b3e34e4 2403 SECONDARY_EXEC_ENABLE_PML |
2a499e49 2404 SECONDARY_EXEC_TSC_SCALING |
e69e72fa 2405 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
f99e3daf
CP
2406 SECONDARY_EXEC_PT_USE_GPA |
2407 SECONDARY_EXEC_PT_CONCEAL_VMX |
0b665d30
SC
2408 SECONDARY_EXEC_ENABLE_VMFUNC |
2409 SECONDARY_EXEC_ENCLS_EXITING;
d56f546d
SY
2410 if (adjust_vmx_controls(min2, opt2,
2411 MSR_IA32_VMX_PROCBASED_CTLS2,
f78e0e2e
SY
2412 &_cpu_based_2nd_exec_control) < 0)
2413 return -EIO;
2414 }
2415#ifndef CONFIG_X86_64
2416 if (!(_cpu_based_2nd_exec_control &
2417 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2418 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2419#endif
83d4c286
YZ
2420
2421 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2422 _cpu_based_2nd_exec_control &= ~(
8d14695f 2423 SECONDARY_EXEC_APIC_REGISTER_VIRT |
c7c9c56c
YZ
2424 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2425 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
83d4c286 2426
61f1dd90 2427 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
7caaa711 2428 &vmx_cap->ept, &vmx_cap->vpid);
61f1dd90 2429
d56f546d 2430 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
a7052897
MT
2431 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
2432	 is enabled */
5fff7d27
GN
2433 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
2434 CPU_BASED_CR3_STORE_EXITING |
2435 CPU_BASED_INVLPG_EXITING);
7caaa711
SC
2436 } else if (vmx_cap->ept) {
2437 vmx_cap->ept = 0;
61f1dd90
WL
2438 pr_warn_once("EPT CAP should not exist if not support "
2439 "1-setting enable EPT VM-execution control\n");
2440 }
2441 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
7caaa711
SC
2442 vmx_cap->vpid) {
2443 vmx_cap->vpid = 0;
61f1dd90
WL
2444 pr_warn_once("VPID CAP should not exist if not support "
2445 "1-setting enable VPID VM-execution control\n");
d56f546d 2446 }
1c3d14fe 2447
91fa0f8e 2448 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
1c3d14fe
YS
2449#ifdef CONFIG_X86_64
2450 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
2451#endif
c73da3fc 2452 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
c73da3fc
SC
2453 VM_EXIT_LOAD_IA32_PAT |
2454 VM_EXIT_LOAD_IA32_EFER |
f99e3daf
CP
2455 VM_EXIT_CLEAR_BNDCFGS |
2456 VM_EXIT_PT_CONCEAL_PIP |
2457 VM_EXIT_CLEAR_IA32_RTIT_CTL;
1c3d14fe
YS
2458 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
2459 &_vmexit_control) < 0)
002c7f7c 2460 return -EIO;
1c3d14fe 2461
8a1b4392
PB
2462 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
2463 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
2464 PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be
YZ
2465 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
2466 &_pin_based_exec_control) < 0)
2467 return -EIO;
2468
1c17c3e6
PB
2469 if (cpu_has_broken_vmx_preemption_timer())
2470 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
01e439be 2471 if (!(_cpu_based_2nd_exec_control &
91fa0f8e 2472 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
01e439be
YZ
2473 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2474
c845f9c6 2475 min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
c73da3fc
SC
2476 opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
2477 VM_ENTRY_LOAD_IA32_PAT |
2478 VM_ENTRY_LOAD_IA32_EFER |
f99e3daf
CP
2479 VM_ENTRY_LOAD_BNDCFGS |
2480 VM_ENTRY_PT_CONCEAL_PIP |
2481 VM_ENTRY_LOAD_IA32_RTIT_CTL;
1c3d14fe
YS
2482 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
2483 &_vmentry_control) < 0)
002c7f7c 2484 return -EIO;
6aa8b732 2485
c73da3fc
SC
2486 /*
2487 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2488 * can't be used due to an errata where VM Exit may incorrectly clear
2489 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
2490 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2491 */
2492 if (boot_cpu_data.x86 == 0x6) {
2493 switch (boot_cpu_data.x86_model) {
2494 case 26: /* AAK155 */
2495 case 30: /* AAP115 */
2496 case 37: /* AAT100 */
2497 case 44: /* BC86,AAY89,BD102 */
2498 case 46: /* BA97 */
85ba2b16 2499 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
c73da3fc
SC
2500 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
2501 pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
2502 "does not work properly. Using workaround\n");
2503 break;
2504 default:
2505 break;
2506 }
2507 }
2508
2509
c68876fd 2510 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1c3d14fe
YS
2511
2512 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
2513 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
002c7f7c 2514 return -EIO;
1c3d14fe
YS
2515
2516#ifdef CONFIG_X86_64
2517 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
2518 if (vmx_msr_high & (1u<<16))
002c7f7c 2519 return -EIO;
1c3d14fe
YS
2520#endif
2521
2522 /* Require Write-Back (WB) memory type for VMCS accesses. */
2523 if (((vmx_msr_high >> 18) & 15) != 6)
002c7f7c 2524 return -EIO;
1c3d14fe 2525
002c7f7c 2526 vmcs_conf->size = vmx_msr_high & 0x1fff;
16cb0255 2527 vmcs_conf->order = get_order(vmcs_conf->size);
9ac7e3e8 2528 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
773e8a04 2529
2307af1c 2530 vmcs_conf->revision_id = vmx_msr_low;
1c3d14fe 2531
002c7f7c
YS
2532 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2533 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
f78e0e2e 2534 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
002c7f7c
YS
2535 vmcs_conf->vmexit_ctrl = _vmexit_control;
2536 vmcs_conf->vmentry_ctrl = _vmentry_control;
1c3d14fe 2537
773e8a04
VK
2538 if (static_branch_unlikely(&enable_evmcs))
2539 evmcs_sanitize_exec_ctrls(vmcs_conf);
2540
1c3d14fe 2541 return 0;
c68876fd 2542}
6aa8b732 2543
41836839 2544struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
6aa8b732
AK
2545{
2546 int node = cpu_to_node(cpu);
2547 struct page *pages;
2548 struct vmcs *vmcs;
2549
41836839 2550 pages = __alloc_pages_node(node, flags, vmcs_config.order);
6aa8b732
AK
2551 if (!pages)
2552 return NULL;
2553 vmcs = page_address(pages);
1c3d14fe 2554 memset(vmcs, 0, vmcs_config.size);
2307af1c
LA
2555
2556 /* KVM supports Enlightened VMCS v1 only */
2557 if (static_branch_unlikely(&enable_evmcs))
392b2f25 2558 vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2307af1c 2559 else
392b2f25 2560 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 2561
491a6038
LA
2562 if (shadow)
2563 vmcs->hdr.shadow_vmcs = 1;
6aa8b732
AK
2564 return vmcs;
2565}
2566
89b0c9f5 2567void free_vmcs(struct vmcs *vmcs)
6aa8b732 2568{
1c3d14fe 2569 free_pages((unsigned long)vmcs, vmcs_config.order);
6aa8b732
AK
2570}
2571
d462b819
NHE
2572/*
2573 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2574 */
89b0c9f5 2575void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
d462b819
NHE
2576{
2577 if (!loaded_vmcs->vmcs)
2578 return;
2579 loaded_vmcs_clear(loaded_vmcs);
2580 free_vmcs(loaded_vmcs->vmcs);
2581 loaded_vmcs->vmcs = NULL;
904e14fb
PB
2582 if (loaded_vmcs->msr_bitmap)
2583 free_page((unsigned long)loaded_vmcs->msr_bitmap);
355f4fb1 2584 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
d462b819
NHE
2585}
2586
89b0c9f5 2587int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
f21f165e 2588{
491a6038 2589 loaded_vmcs->vmcs = alloc_vmcs(false);
f21f165e
PB
2590 if (!loaded_vmcs->vmcs)
2591 return -ENOMEM;
2592
2593 loaded_vmcs->shadow_vmcs = NULL;
804939ea 2594 loaded_vmcs->hv_timer_soft_disabled = false;
f21f165e 2595 loaded_vmcs_init(loaded_vmcs);
904e14fb
PB
2596
2597 if (cpu_has_vmx_msr_bitmap()) {
41836839
BG
2598 loaded_vmcs->msr_bitmap = (unsigned long *)
2599 __get_free_page(GFP_KERNEL_ACCOUNT);
904e14fb
PB
2600 if (!loaded_vmcs->msr_bitmap)
2601 goto out_vmcs;
2602 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
ceef7d10 2603
1f008e11
AB
2604 if (IS_ENABLED(CONFIG_HYPERV) &&
2605 static_branch_unlikely(&enable_evmcs) &&
ceef7d10
VK
2606 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
2607 struct hv_enlightened_vmcs *evmcs =
2608 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
2609
2610 evmcs->hv_enlightenments_control.msr_bitmap = 1;
2611 }
904e14fb 2612 }
d7ee039e
SC
2613
2614 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3af80fec
SC
2615 memset(&loaded_vmcs->controls_shadow, 0,
2616 sizeof(struct vmcs_controls_shadow));
d7ee039e 2617
f21f165e 2618 return 0;
904e14fb
PB
2619
2620out_vmcs:
2621 free_loaded_vmcs(loaded_vmcs);
2622 return -ENOMEM;
f21f165e
PB
2623}
2624
39959588 2625static void free_kvm_area(void)
6aa8b732
AK
2626{
2627 int cpu;
2628
3230bb47 2629 for_each_possible_cpu(cpu) {
6aa8b732 2630 free_vmcs(per_cpu(vmxarea, cpu));
3230bb47
ZA
2631 per_cpu(vmxarea, cpu) = NULL;
2632 }
6aa8b732
AK
2633}
2634
6aa8b732
AK
2635static __init int alloc_kvm_area(void)
2636{
2637 int cpu;
2638
3230bb47 2639 for_each_possible_cpu(cpu) {
6aa8b732
AK
2640 struct vmcs *vmcs;
2641
41836839 2642 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
6aa8b732
AK
2643 if (!vmcs) {
2644 free_kvm_area();
2645 return -ENOMEM;
2646 }
2647
2307af1c
LA
2648 /*
2649 * When eVMCS is enabled, alloc_vmcs_cpu() sets
2650 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
2651 * revision_id reported by MSR_IA32_VMX_BASIC.
2652 *
312a4661 2653 * However, even though not explicitly documented by
2307af1c
LA
2654 * TLFS, VMXArea passed as VMXON argument should
2655 * still be marked with revision_id reported by
2656 * physical CPU.
2657 */
2658 if (static_branch_unlikely(&enable_evmcs))
392b2f25 2659 vmcs->hdr.revision_id = vmcs_config.revision_id;
2307af1c 2660
6aa8b732
AK
2661 per_cpu(vmxarea, cpu) = vmcs;
2662 }
2663 return 0;
2664}
2665
91b0aa2c 2666static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
d99e4152 2667 struct kvm_segment *save)
6aa8b732 2668{
d99e4152
GN
2669 if (!emulate_invalid_guest_state) {
2670 /*
2671 * CS and SS RPL should be equal during guest entry according
2672 * to VMX spec, but in reality it is not always so. Since vcpu
2673 * is in the middle of the transition from real mode to
2674 * protected mode it is safe to assume that RPL 0 is a good
2675 * default value.
2676 */
2677 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
b32a9918
NA
2678 save->selector &= ~SEGMENT_RPL_MASK;
2679 save->dpl = save->selector & SEGMENT_RPL_MASK;
d99e4152 2680 save->s = 1;
6aa8b732 2681 }
d99e4152 2682 vmx_set_segment(vcpu, save, seg);
6aa8b732
AK
2683}
2684
2685static void enter_pmode(struct kvm_vcpu *vcpu)
2686{
2687 unsigned long flags;
a89a8fb9 2688 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732 2689
d99e4152
GN
2690 /*
2691	 * Update the real-mode segment cache. It may not be up to date if a segment
2692	 * register was written while the vcpu was in guest mode.
2693 */
2694 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2695 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2696 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2697 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
2698 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2699 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
2700
7ffd92c5 2701 vmx->rmode.vm86_active = 0;
6aa8b732 2702
f5f7b2fe 2703 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
6aa8b732
AK
2704
2705 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47
AK
2706 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2707 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
6aa8b732
AK
2708 vmcs_writel(GUEST_RFLAGS, flags);
2709
66aee91a
RR
2710 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
2711 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
6aa8b732
AK
2712
2713 update_exception_bitmap(vcpu);
2714
91b0aa2c
GN
2715 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2716 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2717 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2718 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2719 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
2720 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
6aa8b732
AK
2721}
2722
f5f7b2fe 2723static void fix_rmode_seg(int seg, struct kvm_segment *save)
6aa8b732 2724{
772e0318 2725 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
d99e4152
GN
2726 struct kvm_segment var = *save;
2727
2728 var.dpl = 0x3;
2729 if (seg == VCPU_SREG_CS)
2730 var.type = 0x3;
2731
2732 if (!emulate_invalid_guest_state) {
2733 var.selector = var.base >> 4;
2734 var.base = var.base & 0xffff0;
2735 var.limit = 0xffff;
2736 var.g = 0;
2737 var.db = 0;
2738 var.present = 1;
2739 var.s = 1;
2740 var.l = 0;
2741 var.unusable = 0;
2742 var.type = 0x3;
2743 var.avl = 0;
2744 if (save->base & 0xf)
2745 printk_once(KERN_WARNING "kvm: segment base is not "
2746 "paragraph aligned when entering "
2747 "protected mode (seg=%d)", seg);
2748 }
6aa8b732 2749
d99e4152 2750 vmcs_write16(sf->selector, var.selector);
96794e4e 2751 vmcs_writel(sf->base, var.base);
d99e4152
GN
2752 vmcs_write32(sf->limit, var.limit);
2753 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
6aa8b732
AK
2754}
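/*
 * For illustration: real-mode addressing reconstructs the base as
 * selector << 4, so a base of 0xb8010 round-trips cleanly as selector
 * 0xb801, while a base of 0xb8015 loses its low nibble (selector 0xb801,
 * base masked to 0xb8010). That loss is what the paragraph-alignment
 * warning above reports.
 */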
2755
2756static void enter_rmode(struct kvm_vcpu *vcpu)
2757{
2758 unsigned long flags;
a89a8fb9 2759 struct vcpu_vmx *vmx = to_vmx(vcpu);
40bbb9d0 2760 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
6aa8b732 2761
f5f7b2fe
AK
2762 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
2763 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
2764 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
2765 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
2766 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
c6ad1153
GN
2767 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
2768 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
f5f7b2fe 2769
7ffd92c5 2770 vmx->rmode.vm86_active = 1;
6aa8b732 2771
776e58ea
GN
2772 /*
2773 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
4918c6ca 2774 * vcpu. Warn the user that an update is overdue.
776e58ea 2775 */
40bbb9d0 2776 if (!kvm_vmx->tss_addr)
776e58ea
GN
2777 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
2778 "called before entering vcpu\n");
776e58ea 2779
2fb92db1
AK
2780 vmx_segment_cache_clear(vmx);
2781
40bbb9d0 2782 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
6aa8b732 2783 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
6aa8b732
AK
2784 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2785
2786 flags = vmcs_readl(GUEST_RFLAGS);
78ac8b47 2787 vmx->rmode.save_rflags = flags;
6aa8b732 2788
053de044 2789 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
6aa8b732
AK
2790
2791 vmcs_writel(GUEST_RFLAGS, flags);
66aee91a 2792 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
6aa8b732
AK
2793 update_exception_bitmap(vcpu);
2794
d99e4152
GN
2795 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
2796 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
2797 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
2798 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
2799 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
2800 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
b246dd5d 2801
8668a3c4 2802 kvm_mmu_reset_context(vcpu);
6aa8b732
AK
2803}
2804
97b7ead3 2805void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
401d10de
AS
2806{
2807 struct vcpu_vmx *vmx = to_vmx(vcpu);
26bb0981
AK
2808 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
2809
2810 if (!msr)
2811 return;
401d10de 2812
f6801dff 2813 vcpu->arch.efer = efer;
401d10de 2814 if (efer & EFER_LMA) {
2961e876 2815 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
401d10de
AS
2816 msr->data = efer;
2817 } else {
2961e876 2818 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
401d10de
AS
2819
2820 msr->data = efer & ~EFER_LME;
2821 }
2822 setup_msrs(vmx);
2823}
2824
05b3e0c2 2825#ifdef CONFIG_X86_64
6aa8b732
AK
2826
2827static void enter_lmode(struct kvm_vcpu *vcpu)
2828{
2829 u32 guest_tr_ar;
2830
2fb92db1
AK
2831 vmx_segment_cache_clear(to_vmx(vcpu));
2832
6aa8b732 2833 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
4d283ec9 2834 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
bd80158a
JK
2835 pr_debug_ratelimited("%s: tss fixup for long mode. \n",
2836 __func__);
6aa8b732 2837 vmcs_write32(GUEST_TR_AR_BYTES,
4d283ec9
AL
2838 (guest_tr_ar & ~VMX_AR_TYPE_MASK)
2839 | VMX_AR_TYPE_BUSY_64_TSS);
6aa8b732 2840 }
da38f438 2841 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
6aa8b732
AK
2842}
2843
2844static void exit_lmode(struct kvm_vcpu *vcpu)
2845{
2961e876 2846 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
da38f438 2847 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
6aa8b732
AK
2848}
2849
2850#endif
2851
faff8758
JS
2852static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
2853{
2854 int vpid = to_vmx(vcpu)->vpid;
2855
2856 if (!vpid_sync_vcpu_addr(vpid, addr))
2857 vpid_sync_context(vpid);
2858
2859 /*
2860 * If VPIDs are not supported or enabled, then the above is a no-op.
2861 * But we don't really need a TLB flush in that case anyway, because
2862 * each VM entry/exit includes an implicit flush when VPID is 0.
2863 */
2864}
2865
e8467fda
AK
2866static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
2867{
2868 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2869
2870 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
2871 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
2872}
2873
25c4c276 2874static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
399badf3 2875{
fc78f519
AK
2876 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2877
2878 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
2879 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
399badf3
AK
2880}
2881
1439442c
SY
2882static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
2883{
d0d538b9
GN
2884 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2885
cb3c1e2f 2886 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
6de4f3ad
AK
2887 return;
2888
bf03d4f9 2889 if (is_pae_paging(vcpu)) {
d0d538b9
GN
2890 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
2891 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
2892 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
2893 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
1439442c
SY
2894 }
2895}
2896
97b7ead3 2897void ept_save_pdptrs(struct kvm_vcpu *vcpu)
8f5d549f 2898{
d0d538b9
GN
2899 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
2900
bf03d4f9 2901 if (is_pae_paging(vcpu)) {
d0d538b9
GN
2902 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
2903 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
2904 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
2905 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
8f5d549f 2906 }
6de4f3ad 2907
cb3c1e2f 2908 kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
8f5d549f
AK
2909}
2910
1439442c
SY
2911static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
2912 unsigned long cr0,
2913 struct kvm_vcpu *vcpu)
2914{
2183f564
SC
2915 struct vcpu_vmx *vmx = to_vmx(vcpu);
2916
cb3c1e2f 2917 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
34059c25 2918 vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
1439442c
SY
2919 if (!(cr0 & X86_CR0_PG)) {
2920 /* From paging/starting to nonpaging */
2183f564
SC
2921 exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
2922 CPU_BASED_CR3_STORE_EXITING);
1439442c 2923 vcpu->arch.cr0 = cr0;
fc78f519 2924 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c
SY
2925 } else if (!is_paging(vcpu)) {
2926 /* From nonpaging to paging */
2183f564
SC
2927 exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
2928 CPU_BASED_CR3_STORE_EXITING);
1439442c 2929 vcpu->arch.cr0 = cr0;
fc78f519 2930 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1439442c 2931 }
95eb84a7
SY
2932
2933 if (!(cr0 & X86_CR0_WP))
2934 *hw_cr0 &= ~X86_CR0_WP;
1439442c
SY
2935}
2936
97b7ead3 2937void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
6aa8b732 2938{
7ffd92c5 2939 struct vcpu_vmx *vmx = to_vmx(vcpu);
3a624e29
NK
2940 unsigned long hw_cr0;
2941
3de6347b 2942 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3a624e29 2943 if (enable_unrestricted_guest)
5037878e 2944 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
218e763f 2945 else {
5037878e 2946 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
1439442c 2947
218e763f
GN
2948 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
2949 enter_pmode(vcpu);
6aa8b732 2950
218e763f
GN
2951 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
2952 enter_rmode(vcpu);
2953 }
6aa8b732 2954
05b3e0c2 2955#ifdef CONFIG_X86_64
f6801dff 2956 if (vcpu->arch.efer & EFER_LME) {
707d92fa 2957 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
6aa8b732 2958 enter_lmode(vcpu);
707d92fa 2959 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
6aa8b732
AK
2960 exit_lmode(vcpu);
2961 }
2962#endif
2963
b4d18517 2964 if (enable_ept && !enable_unrestricted_guest)
1439442c
SY
2965 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
2966
6aa8b732 2967 vmcs_writel(CR0_READ_SHADOW, cr0);
1439442c 2968 vmcs_writel(GUEST_CR0, hw_cr0);
ad312c7c 2969 vcpu->arch.cr0 = cr0;
14168786
GN
2970
2971 /* depends on vcpu->arch.cr0 to be set to a new value */
2972 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
2973}
2974
855feb67
YZ
2975static int get_ept_level(struct kvm_vcpu *vcpu)
2976{
148d735e 2977 if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
ac69dfaa 2978 return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
855feb67
YZ
2979 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
2980 return 5;
2981 return 4;
2982}
2983
89b0c9f5 2984u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
1439442c 2985{
855feb67
YZ
2986 u64 eptp = VMX_EPTP_MT_WB;
2987
2988 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
1439442c 2989
995f00a6
PF
2990 if (enable_ept_ad_bits &&
2991 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
bb97a016 2992 eptp |= VMX_EPTP_AD_ENABLE_BIT;
1439442c
SY
2993 eptp |= (root_hpa & PAGE_MASK);
2994
2995 return eptp;
2996}
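/*
 * Resulting layout, assuming the usual VMX_EPTP_* encodings used above:
 * bits 2:0 hold the memory type (6 = WB), bits 5:3 the page-walk length
 * minus one (3 for 4-level, 4 for 5-level), bit 6 the A/D-enable flag, and
 * the upper bits the root table's physical address. E.g. a 4-level,
 * A/D-enabled EPTP over root_hpa 0x123456000 is
 * 0x123456000 | 0x40 | 0x18 | 0x6 = 0x12345605e.
 */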
2997
727a7e27 2998void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3)
6aa8b732 2999{
877ad952 3000 struct kvm *kvm = vcpu->kvm;
04f11ef4 3001 bool update_guest_cr3 = true;
1439442c
SY
3002 unsigned long guest_cr3;
3003 u64 eptp;
3004
3005 guest_cr3 = cr3;
089d034e 3006 if (enable_ept) {
995f00a6 3007 eptp = construct_eptp(vcpu, cr3);
1439442c 3008 vmcs_write64(EPT_POINTER, eptp);
877ad952
TL
3009
3010 if (kvm_x86_ops->tlb_remote_flush) {
3011 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3012 to_vmx(vcpu)->ept_pointer = eptp;
3013 to_kvm_vmx(kvm)->ept_pointers_match
3014 = EPT_POINTERS_CHECK;
3015 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3016 }
3017
04f11ef4
SC
3018 /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
3019 if (is_guest_mode(vcpu))
3020 update_guest_cr3 = false;
b17b7436 3021 else if (!enable_unrestricted_guest && !is_paging(vcpu))
877ad952 3022 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
b17b7436
SC
3023 else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3024 guest_cr3 = vcpu->arch.cr3;
3025 else /* vmcs01.GUEST_CR3 is already up-to-date. */
3026 update_guest_cr3 = false;
7c93be44 3027 ept_load_pdptrs(vcpu);
1439442c
SY
3028 }
3029
04f11ef4
SC
3030 if (update_guest_cr3)
3031 vmcs_writel(GUEST_CR3, guest_cr3);
6aa8b732
AK
3032}
3033
97b7ead3 3034int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
6aa8b732 3035{
fe7f895d 3036 struct vcpu_vmx *vmx = to_vmx(vcpu);
085e68ee
BS
3037 /*
3038 * Pass through host's Machine Check Enable value to hw_cr4, which
3039 * is in force while we are in guest mode. Do not let guests control
3040 * this bit, even if host CR4.MCE == 0.
3041 */
5dc1f044
SC
3042 unsigned long hw_cr4;
3043
3044 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3045 if (enable_unrestricted_guest)
3046 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
fe7f895d 3047 else if (vmx->rmode.vm86_active)
5dc1f044
SC
3048 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3049 else
3050 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
1439442c 3051
64f7a115
SC
3052 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3053 if (cr4 & X86_CR4_UMIP) {
fe7f895d 3054 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
64f7a115
SC
3055 hw_cr4 &= ~X86_CR4_UMIP;
3056 } else if (!is_guest_mode(vcpu) ||
fe7f895d
SC
3057 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3058 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3059 }
64f7a115 3060 }
0367f205 3061
5e1746d6
NHE
3062 if (cr4 & X86_CR4_VMXE) {
3063 /*
3064 * To use VMXON (and later other VMX instructions), a guest
3065 * must first be able to turn on cr4.VMXE (see handle_vmon()).
3066 * So basically the check on whether to allow nested VMX
5bea5123
PB
3067 * is here. We operate under the default treatment of SMM,
3068 * so VMX cannot be enabled under SMM.
5e1746d6 3069 */
5bea5123 3070 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
5e1746d6 3071 return 1;
1a0d74e6 3072 }
3899152c 3073
fe7f895d 3074 if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
5e1746d6
NHE
3075 return 1;
3076
ad312c7c 3077 vcpu->arch.cr4 = cr4;
5dc1f044
SC
3078
3079 if (!enable_unrestricted_guest) {
3080 if (enable_ept) {
3081 if (!is_paging(vcpu)) {
3082 hw_cr4 &= ~X86_CR4_PAE;
3083 hw_cr4 |= X86_CR4_PSE;
3084 } else if (!(cr4 & X86_CR4_PAE)) {
3085 hw_cr4 &= ~X86_CR4_PAE;
3086 }
bc23008b 3087 }
1439442c 3088
656ec4a4 3089 /*
ddba2628
HH
3090 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3091 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3092 * to be manually disabled when guest switches to non-paging
3093 * mode.
3094 *
3095 * If !enable_unrestricted_guest, the CPU is always running
3096 * with CR0.PG=1 and CR4 needs to be modified.
3097 * If enable_unrestricted_guest, the CPU automatically
3098 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
656ec4a4 3099 */
5dc1f044
SC
3100 if (!is_paging(vcpu))
3101 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3102 }
656ec4a4 3103
1439442c
SY
3104 vmcs_writel(CR4_READ_SHADOW, cr4);
3105 vmcs_writel(GUEST_CR4, hw_cr4);
5e1746d6 3106 return 0;
6aa8b732
AK
3107}
3108
97b7ead3 3109void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
6aa8b732 3110{
a9179499 3111 struct vcpu_vmx *vmx = to_vmx(vcpu);
6aa8b732
AK
3112 u32 ar;
3113
c6ad1153 3114 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
f5f7b2fe 3115 *var = vmx->rmode.segs[seg];
a9179499 3116 if (seg == VCPU_SREG_TR
2fb92db1 3117 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
f5f7b2fe 3118 return;
1390a28b
AK
3119 var->base = vmx_read_guest_seg_base(vmx, seg);
3120 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3121 return;
a9179499 3122 }
2fb92db1
AK
3123 var->base = vmx_read_guest_seg_base(vmx, seg);
3124 var->limit = vmx_read_guest_seg_limit(vmx, seg);
3125 var->selector = vmx_read_guest_seg_selector(vmx, seg);
3126 ar = vmx_read_guest_seg_ar(vmx, seg);
03617c18 3127 var->unusable = (ar >> 16) & 1;
6aa8b732
AK
3128 var->type = ar & 15;
3129 var->s = (ar >> 4) & 1;
3130 var->dpl = (ar >> 5) & 3;
03617c18
GN
3131 /*
3132	 * Some userspaces do not preserve the unusable property. Since a usable
3133	 * segment has to be present according to the VMX spec, we can use the
3134	 * present property to amend this userspace bug by making an unusable
3135	 * segment always nonpresent. vmx_segment_access_rights() already marks
3136	 * a nonpresent segment as unusable.
3137 */
3138 var->present = !var->unusable;
6aa8b732
AK
3139 var->avl = (ar >> 12) & 1;
3140 var->l = (ar >> 13) & 1;
3141 var->db = (ar >> 14) & 1;
3142 var->g = (ar >> 15) & 1;
6aa8b732
AK
3143}
3144
a9179499
AK
3145static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3146{
a9179499
AK
3147 struct kvm_segment s;
3148
3149 if (to_vmx(vcpu)->rmode.vm86_active) {
3150 vmx_get_segment(vcpu, &s, seg);
3151 return s.base;
3152 }
2fb92db1 3153 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
a9179499
AK
3154}
3155
97b7ead3 3156int vmx_get_cpl(struct kvm_vcpu *vcpu)
2e4d2653 3157{
b09408d0
MT
3158 struct vcpu_vmx *vmx = to_vmx(vcpu);
3159
ae9fedc7 3160 if (unlikely(vmx->rmode.vm86_active))
2e4d2653 3161 return 0;
ae9fedc7
PB
3162 else {
3163 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
4d283ec9 3164 return VMX_AR_DPL(ar);
69c73028 3165 }
69c73028
AK
3166}
3167
653e3108 3168static u32 vmx_segment_access_rights(struct kvm_segment *var)
6aa8b732 3169{
6aa8b732
AK
3170 u32 ar;
3171
f0495f9b 3172 if (var->unusable || !var->present)
6aa8b732
AK
3173 ar = 1 << 16;
3174 else {
3175 ar = var->type & 15;
3176 ar |= (var->s & 1) << 4;
3177 ar |= (var->dpl & 3) << 5;
3178 ar |= (var->present & 1) << 7;
3179 ar |= (var->avl & 1) << 12;
3180 ar |= (var->l & 1) << 13;
3181 ar |= (var->db & 1) << 14;
3182 ar |= (var->g & 1) << 15;
3183 }
653e3108
AK
3184
3185 return ar;
3186}
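/*
 * For illustration: a flat, present 32-bit data segment (type 0x3, s = 1,
 * dpl = 0, present = 1, db = 1, g = 1) packs to
 * 0x3 | 0x10 | 0x80 | 0x4000 | 0x8000 = 0xc093, while any unusable or
 * non-present segment collapses to 1 << 16.
 */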
3187
97b7ead3 3188void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
653e3108 3189{
7ffd92c5 3190 struct vcpu_vmx *vmx = to_vmx(vcpu);
772e0318 3191 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
653e3108 3192
2fb92db1
AK
3193 vmx_segment_cache_clear(vmx);
3194
1ecd50a9
GN
3195 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3196 vmx->rmode.segs[seg] = *var;
3197 if (seg == VCPU_SREG_TR)
3198 vmcs_write16(sf->selector, var->selector);
3199 else if (var->s)
3200 fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
d99e4152 3201 goto out;
653e3108 3202 }
1ecd50a9 3203
653e3108
AK
3204 vmcs_writel(sf->base, var->base);
3205 vmcs_write32(sf->limit, var->limit);
3206 vmcs_write16(sf->selector, var->selector);
3a624e29
NK
3207
3208 /*
3209 * Fix the "Accessed" bit in AR field of segment registers for older
3210 * qemu binaries.
3211 * IA32 arch specifies that at the time of processor reset the
3212 * "Accessed" bit in the AR field of segment registers is 1. And qemu
0fa06071 3213 * is setting it to 0 in the userland code. This causes invalid guest
3a624e29
NK
3214 * state vmexit when "unrestricted guest" mode is turned on.
3215 * Fix for this setup issue in cpu_reset is being pushed in the qemu
3216 * tree. Newer qemu binaries with that qemu fix would not need this
3217 * kvm hack.
3218 */
3219 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
f924d66d 3220 var->type |= 0x1; /* Accessed */
3a624e29 3221
f924d66d 3222 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
d99e4152
GN
3223
3224out:
98eb2f8b 3225 vmx->emulation_required = emulation_required(vcpu);
6aa8b732
AK
3226}
3227
6aa8b732
AK
3228static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3229{
2fb92db1 3230 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
6aa8b732
AK
3231
3232 *db = (ar >> 14) & 1;
3233 *l = (ar >> 13) & 1;
3234}
3235
89a27f4d 3236static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3237{
89a27f4d
GN
3238 dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3239 dt->address = vmcs_readl(GUEST_IDTR_BASE);
6aa8b732
AK
3240}
3241
89a27f4d 3242static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3243{
89a27f4d
GN
3244 vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3245 vmcs_writel(GUEST_IDTR_BASE, dt->address);
6aa8b732
AK
3246}
3247
89a27f4d 3248static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3249{
89a27f4d
GN
3250 dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3251 dt->address = vmcs_readl(GUEST_GDTR_BASE);
6aa8b732
AK
3252}
3253
89a27f4d 3254static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
6aa8b732 3255{
89a27f4d
GN
3256 vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3257 vmcs_writel(GUEST_GDTR_BASE, dt->address);
6aa8b732
AK
3258}
3259
648dfaa7
MG
3260static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3261{
3262 struct kvm_segment var;
3263 u32 ar;
3264
3265 vmx_get_segment(vcpu, &var, seg);
07f42f5f 3266 var.dpl = 0x3;
0647f4aa
GN
3267 if (seg == VCPU_SREG_CS)
3268 var.type = 0x3;
648dfaa7
MG
3269 ar = vmx_segment_access_rights(&var);
3270
3271 if (var.base != (var.selector << 4))
3272 return false;
89efbed0 3273 if (var.limit != 0xffff)
648dfaa7 3274 return false;
07f42f5f 3275 if (ar != 0xf3)
648dfaa7
MG
3276 return false;
3277
3278 return true;
3279}
3280
3281static bool code_segment_valid(struct kvm_vcpu *vcpu)
3282{
3283 struct kvm_segment cs;
3284 unsigned int cs_rpl;
3285
3286 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
b32a9918 3287 cs_rpl = cs.selector & SEGMENT_RPL_MASK;
648dfaa7 3288
1872a3f4
AK
3289 if (cs.unusable)
3290 return false;
4d283ec9 3291 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
648dfaa7
MG
3292 return false;
3293 if (!cs.s)
3294 return false;
4d283ec9 3295 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
648dfaa7
MG
3296 if (cs.dpl > cs_rpl)
3297 return false;
1872a3f4 3298 } else {
648dfaa7
MG
3299 if (cs.dpl != cs_rpl)
3300 return false;
3301 }
3302 if (!cs.present)
3303 return false;
3304
3305 /* TODO: Add a reserved-field check; this will require a new member in the kvm_segment_field structure. */
3306 return true;
3307}
3308
3309static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3310{
3311 struct kvm_segment ss;
3312 unsigned int ss_rpl;
3313
3314 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
b32a9918 3315 ss_rpl = ss.selector & SEGMENT_RPL_MASK;
648dfaa7 3316
1872a3f4
AK
3317 if (ss.unusable)
3318 return true;
3319 if (ss.type != 3 && ss.type != 7)
648dfaa7
MG
3320 return false;
3321 if (!ss.s)
3322 return false;
3323 if (ss.dpl != ss_rpl) /* DPL != RPL */
3324 return false;
3325 if (!ss.present)
3326 return false;
3327
3328 return true;
3329}
3330
3331static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3332{
3333 struct kvm_segment var;
3334 unsigned int rpl;
3335
3336 vmx_get_segment(vcpu, &var, seg);
b32a9918 3337 rpl = var.selector & SEGMENT_RPL_MASK;
648dfaa7 3338
1872a3f4
AK
3339 if (var.unusable)
3340 return true;
648dfaa7
MG
3341 if (!var.s)
3342 return false;
3343 if (!var.present)
3344 return false;
4d283ec9 3345 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
648dfaa7
MG
3346 if (var.dpl < rpl) /* DPL < RPL */
3347 return false;
3348 }
3349
3350 /* TODO: Add other members to kvm_segment_field to allow checking for other access
3351 * rights flags
3352 */
3353 return true;
3354}
3355
3356static bool tr_valid(struct kvm_vcpu *vcpu)
3357{
3358 struct kvm_segment tr;
3359
3360 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3361
1872a3f4
AK
3362 if (tr.unusable)
3363 return false;
b32a9918 3364 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7 3365 return false;
1872a3f4 3366 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
648dfaa7
MG
3367 return false;
3368 if (!tr.present)
3369 return false;
3370
3371 return true;
3372}
3373
3374static bool ldtr_valid(struct kvm_vcpu *vcpu)
3375{
3376 struct kvm_segment ldtr;
3377
3378 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3379
1872a3f4
AK
3380 if (ldtr.unusable)
3381 return true;
b32a9918 3382 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
648dfaa7
MG
3383 return false;
3384 if (ldtr.type != 2)
3385 return false;
3386 if (!ldtr.present)
3387 return false;
3388
3389 return true;
3390}
3391
3392static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3393{
3394 struct kvm_segment cs, ss;
3395
3396 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3397 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3398
b32a9918
NA
3399 return ((cs.selector & SEGMENT_RPL_MASK) ==
3400 (ss.selector & SEGMENT_RPL_MASK));
648dfaa7
MG
3401}
3402
3403/*
3404 * Check whether the guest state is valid. Returns true if valid,
3405 * false if not.
3406 * We assume that registers are always usable.
3407 */
3408static bool guest_state_valid(struct kvm_vcpu *vcpu)
3409{
c5e97c80
GN
3410 if (enable_unrestricted_guest)
3411 return true;
3412
648dfaa7 3413 /* real mode guest state checks */
f13882d8 3414 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
648dfaa7
MG
3415 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
3416 return false;
3417 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
3418 return false;
3419 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
3420 return false;
3421 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
3422 return false;
3423 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
3424 return false;
3425 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
3426 return false;
3427 } else {
3428 /* protected mode guest state checks */
3429 if (!cs_ss_rpl_check(vcpu))
3430 return false;
3431 if (!code_segment_valid(vcpu))
3432 return false;
3433 if (!stack_segment_valid(vcpu))
3434 return false;
3435 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
3436 return false;
3437 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
3438 return false;
3439 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
3440 return false;
3441 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
3442 return false;
3443 if (!tr_valid(vcpu))
3444 return false;
3445 if (!ldtr_valid(vcpu))
3446 return false;
3447 }
3448 /* TODO:
3449 * - Add checks on RIP
3450 * - Add checks on RFLAGS
3451 */
3452
3453 return true;
3454}
3455
d77c26fc 3456static int init_rmode_tss(struct kvm *kvm)
6aa8b732 3457{
40dcaa9f 3458 gfn_t fn;
195aefde 3459 u16 data = 0;
1f755a82 3460 int idx, r;
6aa8b732 3461
40dcaa9f 3462 idx = srcu_read_lock(&kvm->srcu);
40bbb9d0 3463 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
195aefde
IE
3464 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3465 if (r < 0)
10589a46 3466 goto out;
195aefde 3467 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
464d17c8
SY
3468 r = kvm_write_guest_page(kvm, fn++, &data,
3469 TSS_IOPB_BASE_OFFSET, sizeof(u16));
195aefde 3470 if (r < 0)
10589a46 3471 goto out;
195aefde
IE
3472 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
3473 if (r < 0)
10589a46 3474 goto out;
195aefde
IE
3475 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
3476 if (r < 0)
10589a46 3477 goto out;
195aefde 3478 data = ~0;
10589a46
MT
3479 r = kvm_write_guest_page(kvm, fn, &data,
3480 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
3481 sizeof(u8));
10589a46 3482out:
40dcaa9f 3483 srcu_read_unlock(&kvm->srcu, idx);
1f755a82 3484 return r;
6aa8b732
AK
3485}
3486
b7ebfb05
SY
3487static int init_rmode_identity_map(struct kvm *kvm)
3488{
40bbb9d0 3489 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
2a5755bb 3490 int i, r = 0;
ba049e93 3491 kvm_pfn_t identity_map_pfn;
b7ebfb05
SY
3492 u32 tmp;
3493
40bbb9d0 3494 /* Protect kvm_vmx->ept_identity_pagetable_done. */
a255d479
TC
3495 mutex_lock(&kvm->slots_lock);
3496
40bbb9d0 3497 if (likely(kvm_vmx->ept_identity_pagetable_done))
2a5755bb 3498 goto out;
a255d479 3499
40bbb9d0
SC
3500 if (!kvm_vmx->ept_identity_map_addr)
3501 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3502 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
a255d479 3503
d8a6e365 3504 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
40bbb9d0 3505 kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
f51770ed 3506 if (r < 0)
2a5755bb 3507 goto out;
a255d479 3508
b7ebfb05
SY
3509 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
3510 if (r < 0)
3511 goto out;
3512 /* Set up identity-mapping pagetable for EPT in real mode */
3513 for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
3514 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
3515 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
3516 r = kvm_write_guest_page(kvm, identity_map_pfn,
3517 &tmp, i * sizeof(tmp), sizeof(tmp));
3518 if (r < 0)
3519 goto out;
3520 }
40bbb9d0 3521 kvm_vmx->ept_identity_pagetable_done = true;
f51770ed 3522
b7ebfb05 3523out:
a255d479 3524 mutex_unlock(&kvm->slots_lock);
f51770ed 3525 return r;
b7ebfb05
SY
3526}
3527
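To visualize what the loop above builds: a single 4 KiB page holding 1024 PSE page-directory entries, where entry i identity-maps the 4 MiB region starting at (i << 22). The sketch below is stand-alone user-space C; the flag values are the standard x86 PDE bits, restated here for illustration rather than taken from kernel headers.

#include <stdint.h>
#include <stdio.h>

#define EX_PRESENT  0x001u
#define EX_RW       0x002u
#define EX_USER     0x004u
#define EX_ACCESSED 0x020u
#define EX_DIRTY    0x040u
#define EX_PSE      0x080u

int main(void)
{
	for (unsigned i = 0; i < 4; i++) {	/* first few of the 1024 entries */
		uint32_t pde = (i << 22) | EX_PRESENT | EX_RW | EX_USER |
			       EX_ACCESSED | EX_DIRTY | EX_PSE;
		printf("entry %u: 0x%08x -> maps 0x%08x..0x%08x\n",
		       i, pde, i << 22, ((i + 1) << 22) - 1);
	}
	return 0;
}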
6aa8b732
AK
3528static void seg_setup(int seg)
3529{
772e0318 3530 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3a624e29 3531 unsigned int ar;
6aa8b732
AK
3532
3533 vmcs_write16(sf->selector, 0);
3534 vmcs_writel(sf->base, 0);
3535 vmcs_write32(sf->limit, 0xffff);
d54d07b2
GN
3536 ar = 0x93;
3537 if (seg == VCPU_SREG_CS)
3538 ar |= 0x08; /* code segment */
3a624e29
NK
3539
3540 vmcs_write32(sf->ar_bytes, ar);
6aa8b732
AK
3541}
3542
f78e0e2e
SY
3543static int alloc_apic_access_page(struct kvm *kvm)
3544{
4484141a 3545 struct page *page;
f78e0e2e
SY
3546 int r = 0;
3547
79fac95e 3548 mutex_lock(&kvm->slots_lock);
c24ae0dc 3549 if (kvm->arch.apic_access_page_done)
f78e0e2e 3550 goto out;
1d8007bd
PB
3551 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
3552 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
f78e0e2e
SY
3553 if (r)
3554 goto out;
72dc67a6 3555
73a6d941 3556 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4484141a
XG
3557 if (is_error_page(page)) {
3558 r = -EFAULT;
3559 goto out;
3560 }
3561
c24ae0dc
TC
3562 /*
3563 * Do not pin the page in memory, so that memory hot-unplug
3564 * is able to migrate it.
3565 */
3566 put_page(page);
3567 kvm->arch.apic_access_page_done = true;
f78e0e2e 3568out:
79fac95e 3569 mutex_unlock(&kvm->slots_lock);
f78e0e2e
SY
3570 return r;
3571}
3572
97b7ead3 3573int allocate_vpid(void)
2384d2b3
SY
3574{
3575 int vpid;
3576
919818ab 3577 if (!enable_vpid)
991e7a0e 3578 return 0;
2384d2b3
SY
3579 spin_lock(&vmx_vpid_lock);
3580 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
991e7a0e 3581 if (vpid < VMX_NR_VPIDS)
2384d2b3 3582 __set_bit(vpid, vmx_vpid_bitmap);
991e7a0e
WL
3583 else
3584 vpid = 0;
2384d2b3 3585 spin_unlock(&vmx_vpid_lock);
991e7a0e 3586 return vpid;
2384d2b3
SY
3587}
3588
97b7ead3 3589void free_vpid(int vpid)
cdbecfc3 3590{
991e7a0e 3591 if (!enable_vpid || vpid == 0)
cdbecfc3
LJ
3592 return;
3593 spin_lock(&vmx_vpid_lock);
991e7a0e 3594 __clear_bit(vpid, vmx_vpid_bitmap);
cdbecfc3
LJ
3595 spin_unlock(&vmx_vpid_lock);
3596}
3597
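allocate_vpid()/free_vpid() implement a simple bitmap allocator in which VPID 0 is reserved for the host, so an exhausted pool simply yields 0 and that vCPU runs without a dedicated VPID (losing the ability to keep guest TLB entries tagged across VM exits). Below is a stand-alone sketch of the same pattern — user-space C, no locking, hypothetical names, and assuming a 64-bit unsigned long:

#include <stdio.h>

#define NR_VPIDS 65536	/* the architectural 16-bit VPID space */

static unsigned long vpid_bitmap[NR_VPIDS / 64] = { 1 };	/* bit 0 set: vpid 0 reserved */

static int toy_alloc_vpid(void)
{
	for (int v = 1; v < NR_VPIDS; v++)
		if (!(vpid_bitmap[v / 64] & (1ul << (v % 64)))) {
			vpid_bitmap[v / 64] |= 1ul << (v % 64);
			return v;
		}
	return 0;	/* pool exhausted: fall back to "no VPID" */
}

static void toy_free_vpid(int vpid)
{
	if (vpid)
		vpid_bitmap[vpid / 64] &= ~(1ul << (vpid % 64));
}

int main(void)
{
	int a = toy_alloc_vpid(), b = toy_alloc_vpid();

	printf("%d %d\n", a, b);	/* prints: 1 2 */
	toy_free_vpid(a);
	toy_free_vpid(b);
	return 0;
}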
1e4329ee 3598static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
904e14fb 3599 u32 msr, int type)
25c5f225 3600{
3e7c73e9 3601 int f = sizeof(unsigned long);
25c5f225
SY
3602
3603 if (!cpu_has_vmx_msr_bitmap())
3604 return;
3605
ceef7d10
VK
3606 if (static_branch_unlikely(&enable_evmcs))
3607 evmcs_touch_msr_bitmap();
3608
25c5f225
SY
3609 /*
3610 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3611 * have the write-low and read-high bitmap offsets the wrong way round.
3612 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3613 */
25c5f225 3614 if (msr <= 0x1fff) {
8d14695f
YZ
3615 if (type & MSR_TYPE_R)
3616 /* read-low */
3617 __clear_bit(msr, msr_bitmap + 0x000 / f);
3618
3619 if (type & MSR_TYPE_W)
3620 /* write-low */
3621 __clear_bit(msr, msr_bitmap + 0x800 / f);
3622
25c5f225
SY
3623 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3624 msr &= 0x1fff;
8d14695f
YZ
3625 if (type & MSR_TYPE_R)
3626 /* read-high */
3627 __clear_bit(msr, msr_bitmap + 0x400 / f);
3628
3629 if (type & MSR_TYPE_W)
3630 /* write-high */
3631 __clear_bit(msr, msr_bitmap + 0xc00 / f);
3632
3633 }
3634}
3635
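For reference, the layout assumed by the offsets above is a single 4 KiB bitmap page split into four 1 KiB quarters: read-low (0x000), read-high (0x400), write-low (0x800) and write-high (0xc00), where "low" covers MSRs 0x00000000-0x00001fff and "high" covers 0xc0000000-0xc0001fff. A small stand-alone sketch (hypothetical helper, not a kernel function) that locates an MSR's intercept bit:

#include <stdint.h>
#include <stdio.h>

/* Returns the byte offset of the intercept bit for an MSR inside the
 * 4 KiB VMX MSR bitmap, or -1 for MSRs the bitmap cannot control. */
static int msr_bitmap_byte(uint32_t msr, int write, unsigned *bit)
{
	unsigned base;

	if (msr <= 0x1fff)
		base = write ? 0x800 : 0x000;
	else if (msr >= 0xc0000000 && msr <= 0xc0001fff)
		base = write ? 0xc00 : 0x400;
	else
		return -1;

	msr &= 0x1fff;
	*bit = msr % 8;
	return base + msr / 8;
}

int main(void)
{
	unsigned bit;
	int off = msr_bitmap_byte(0xc0000080 /* EFER */, 1, &bit);

	printf("EFER write intercept: byte 0x%x, bit %u\n", (unsigned)off, bit);
	return 0;
}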
1e4329ee 3636static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
904e14fb
PB
3637 u32 msr, int type)
3638{
3639 int f = sizeof(unsigned long);
3640
3641 if (!cpu_has_vmx_msr_bitmap())
3642 return;
3643
ceef7d10
VK
3644 if (static_branch_unlikely(&enable_evmcs))
3645 evmcs_touch_msr_bitmap();
3646
904e14fb
PB
3647 /*
3648 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
3649 * have the write-low and read-high bitmap offsets the wrong way round.
3650 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
3651 */
3652 if (msr <= 0x1fff) {
3653 if (type & MSR_TYPE_R)
3654 /* read-low */
3655 __set_bit(msr, msr_bitmap + 0x000 / f);
3656
3657 if (type & MSR_TYPE_W)
3658 /* write-low */
3659 __set_bit(msr, msr_bitmap + 0x800 / f);
3660
3661 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3662 msr &= 0x1fff;
3663 if (type & MSR_TYPE_R)
3664 /* read-high */
3665 __set_bit(msr, msr_bitmap + 0x400 / f);
3666
3667 if (type & MSR_TYPE_W)
3668 /* write-high */
3669 __set_bit(msr, msr_bitmap + 0xc00 / f);
3670
3671 }
3672}
3673
1e4329ee 3674static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
904e14fb
PB
3675 u32 msr, int type, bool value)
3676{
3677 if (value)
3678 vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
3679 else
3680 vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
3681}
3682
904e14fb 3683static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
5897297b 3684{
904e14fb
PB
3685 u8 mode = 0;
3686
3687 if (cpu_has_secondary_exec_ctrls() &&
fe7f895d 3688 (secondary_exec_controls_get(to_vmx(vcpu)) &
904e14fb
PB
3689 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3690 mode |= MSR_BITMAP_MODE_X2APIC;
3691 if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
3692 mode |= MSR_BITMAP_MODE_X2APIC_APICV;
3693 }
3694
904e14fb 3695 return mode;
8d14695f
YZ
3696}
3697
904e14fb
PB
3698static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
3699 u8 mode)
8d14695f 3700{
904e14fb
PB
3701 int msr;
3702
3703 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
3704 unsigned word = msr / BITS_PER_LONG;
3705 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
3706 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
3707 }
3708
3709 if (mode & MSR_BITMAP_MODE_X2APIC) {
3710 /*
3711 * TPR reads and writes can be virtualized even if virtual interrupt
3712 * delivery is not in use.
3713 */
3714 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
3715 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
3716 vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
3717 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
3718 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
3719 }
f6e90f9e 3720 }
5897297b
AK
3721}
3722
97b7ead3 3723void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
904e14fb
PB
3724{
3725 struct vcpu_vmx *vmx = to_vmx(vcpu);
3726 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3727 u8 mode = vmx_msr_bitmap_mode(vcpu);
3728 u8 changed = mode ^ vmx->msr_bitmap_mode;
3729
3730 if (!changed)
3731 return;
3732
904e14fb
PB
3733 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
3734 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
3735
3736 vmx->msr_bitmap_mode = mode;
3737}
3738
b08c2896
CP
3739void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
3740{
3741 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3742 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
3743 u32 i;
3744
3745 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
3746 MSR_TYPE_RW, flag);
3747 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
3748 MSR_TYPE_RW, flag);
3749 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
3750 MSR_TYPE_RW, flag);
3751 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
3752 MSR_TYPE_RW, flag);
3753 for (i = 0; i < vmx->pt_desc.addr_range; i++) {
3754 vmx_set_intercept_for_msr(msr_bitmap,
3755 MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
3756 vmx_set_intercept_for_msr(msr_bitmap,
3757 MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
3758 }
3759}
3760
e6c67d8c
LA
3761static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
3762{
3763 struct vcpu_vmx *vmx = to_vmx(vcpu);
3764 void *vapic_page;
3765 u32 vppr;
3766 int rvi;
3767
3768 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
3769 !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
96c66e87 3770 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
e6c67d8c
LA
3771 return false;
3772
7e712684 3773 rvi = vmx_get_rvi();
e6c67d8c 3774
96c66e87 3775 vapic_page = vmx->nested.virtual_apic_map.hva;
e6c67d8c 3776 vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
e6c67d8c
LA
3777
3778 return ((rvi & 0xf0) > (vppr & 0xf0));
3779}
3780
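The final comparison works on priority classes: the upper nibble of RVI (the highest-priority pending vector) must be strictly above the upper nibble of the virtual PPR for the interrupt to be deliverable. A tiny stand-alone illustration with made-up values:

#include <stdio.h>

static int apic_class(unsigned v)
{
	return (v >> 4) & 0xf;	/* priority class = high nibble */
}

int main(void)
{
	unsigned rvi = 0x61, vppr = 0x50;

	/* Equivalent to (rvi & 0xf0) > (vppr & 0xf0); prints 1 here. */
	printf("deliverable: %d\n", apic_class(rvi) > apic_class(vppr));
	return 0;
}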
06a5524f
WV
3781static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
3782 bool nested)
21bc8dc5
RK
3783{
3784#ifdef CONFIG_SMP
06a5524f
WV
3785 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
3786
21bc8dc5 3787 if (vcpu->mode == IN_GUEST_MODE) {
28b835d6 3788 /*
5753743f
HZ
3789 * The vector of the interrupt to be delivered to the vcpu has
3790 * already been set in the PIR before this function is called.
3791 *
3792 * Following cases will be reached in this block, and
3793 * we always send a notification event in all cases as
3794 * explained below.
3795 *
3796 * Case 1: the vcpu stays in non-root mode. Sending a
3797 * notification event posts the interrupt to the vcpu.
3798 *
3799 * Case 2: the vcpu exits to root mode and is still
3800 * runnable. PIR will be synced to vIRR before the
3801 * next vcpu entry. Sending a notification event in
3802 * this case has no effect, as the vcpu is no longer
3803 * in non-root mode.
28b835d6 3804 *
5753743f
HZ
3805 * Case 3: the vcpu exits to root mode and is blocked.
3806 * vcpu_block() has already synced PIR to vIRR and
3807 * never blocks the vcpu if vIRR is non-empty. Therefore,
3808 * a blocked vcpu here is not waiting for any requested
3809 * interrupts in PIR, and sending a notification event
3810 * that has no effect is safe here.
28b835d6 3811 */
28b835d6 3812
06a5524f 3813 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
21bc8dc5
RK
3814 return true;
3815 }
3816#endif
3817 return false;
3818}
3819
705699a1
WV
3820static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
3821 int vector)
3822{
3823 struct vcpu_vmx *vmx = to_vmx(vcpu);
3824
3825 if (is_guest_mode(vcpu) &&
3826 vector == vmx->nested.posted_intr_nv) {
705699a1
WV
3827 /*
3828 * If the posted interrupt is not recognized by hardware,
3829 * it will be delivered on the next vmentry.
3830 */
3831 vmx->nested.pi_pending = true;
3832 kvm_make_request(KVM_REQ_EVENT, vcpu);
6b697711
LA
3833 /* the PIR and ON have been set by L1. */
3834 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
3835 kvm_vcpu_kick(vcpu);
705699a1
WV
3836 return 0;
3837 }
3838 return -1;
3839}
a20ed54d
YZ
3840/*
3841 * Send an interrupt to the vcpu via the posted-interrupt mechanism.
3842 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
3843 * notification and the hardware will sync PIR to vIRR atomically.
3844 * 2. If the target vcpu is not running (root mode), kick it so it picks up
3845 * the interrupt from PIR on the next vmentry.
3846 */
91a5f413 3847static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
a20ed54d
YZ
3848{
3849 struct vcpu_vmx *vmx = to_vmx(vcpu);
3850 int r;
3851
705699a1
WV
3852 r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
3853 if (!r)
91a5f413
VK
3854 return 0;
3855
3856 if (!vcpu->arch.apicv_active)
3857 return -1;
705699a1 3858
a20ed54d 3859 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
91a5f413 3860 return 0;
a20ed54d 3861
b95234c8
PB
3862 /* If a previous notification has sent the IPI, nothing to do. */
3863 if (pi_test_and_set_on(&vmx->pi_desc))
91a5f413 3864 return 0;
b95234c8 3865
06a5524f 3866 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
a20ed54d 3867 kvm_vcpu_kick(vcpu);
91a5f413
VK
3868
3869 return 0;
a20ed54d
YZ
3870}
3871
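A rough, user-space model of the delivery path above may help: posting means atomically setting the vector's bit in the 256-bit PIR and then setting the outstanding-notification (ON) flag; only the poster that actually transitions ON from 0 to 1 needs to send the notification IPI (or kick the vcpu). The descriptor below is deliberately simplified — the real pi_desc also carries SN and the notification destination — and the names are hypothetical:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_pi_desc {
	_Atomic uint64_t pir[4];	/* 256-bit posted-interrupt request */
	_Atomic uint32_t on;		/* outstanding-notification flag    */
};

/* Returns true when the caller should send the notification event. */
bool toy_post_interrupt(struct toy_pi_desc *pi, int vector)
{
	uint64_t mask = 1ull << (vector & 63);

	/* Analogue of pi_test_and_set_pir(): record the pending vector. */
	if (atomic_fetch_or(&pi->pir[(vector >> 6) & 3], mask) & mask)
		return false;		/* already pending, nothing to do */

	/* Analogue of pi_test_and_set_on(): only the first poster after
	 * ON was clear needs to send the notification. */
	return atomic_exchange(&pi->on, 1) == 0;
}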
a3a8ff8e
NHE
3872/*
3873 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
3874 * will not change in the lifetime of the guest.
3875 * Note that host-state that does change is set elsewhere. E.g., host-state
3876 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
3877 */
97b7ead3 3878void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
a3a8ff8e
NHE
3879{
3880 u32 low32, high32;
3881 unsigned long tmpl;
d6e41f11 3882 unsigned long cr0, cr3, cr4;
a3a8ff8e 3883
04ac88ab
AL
3884 cr0 = read_cr0();
3885 WARN_ON(cr0 & X86_CR0_TS);
3886 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
d6e41f11
AL
3887
3888 /*
3889 * Save the most likely value for this task's CR3 in the VMCS.
3890 * We can't use __get_current_cr3_fast() because we're not atomic.
3891 */
6c690ee1 3892 cr3 = __read_cr3();
d6e41f11 3893 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
d7ee039e 3894 vmx->loaded_vmcs->host_state.cr3 = cr3;
a3a8ff8e 3895
d974baa3 3896 /* Save the most likely value for this task's CR4 in the VMCS. */
1e02ce4c 3897 cr4 = cr4_read_shadow();
d974baa3 3898 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
d7ee039e 3899 vmx->loaded_vmcs->host_state.cr4 = cr4;
d974baa3 3900
a3a8ff8e 3901 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
b2da15ac
AK
3902#ifdef CONFIG_X86_64
3903 /*
3904 * Load null selectors, so we can avoid reloading them in
6d6095bd
SC
3905 * vmx_prepare_switch_to_host(), in case userspace uses
3906 * the null selectors too (the expected case).
b2da15ac
AK
3907 */
3908 vmcs_write16(HOST_DS_SELECTOR, 0);
3909 vmcs_write16(HOST_ES_SELECTOR, 0);
3910#else
a3a8ff8e
NHE
3911 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3912 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
b2da15ac 3913#endif
a3a8ff8e
NHE
3914 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
3915 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
3916
2342080c 3917 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
a3a8ff8e 3918
453eafbe 3919 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
a3a8ff8e
NHE
3920
3921 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
3922 vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
3923 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
3924 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
3925
3926 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
3927 rdmsr(MSR_IA32_CR_PAT, low32, high32);
3928 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
3929 }
5a5e8a15 3930
c73da3fc 3931 if (cpu_has_load_ia32_efer())
5a5e8a15 3932 vmcs_write64(HOST_IA32_EFER, host_efer);
a3a8ff8e
NHE
3933}
3934
97b7ead3 3935void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
bf8179a0
NHE
3936{
3937 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
3938 if (enable_ept)
3939 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
fe3ef05c
NHE
3940 if (is_guest_mode(&vmx->vcpu))
3941 vmx->vcpu.arch.cr4_guest_owned_bits &=
3942 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
bf8179a0
NHE
3943 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
3944}
3945
c075c3e4 3946u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
01e439be
YZ
3947{
3948 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
3949
d62caabb 3950 if (!kvm_vcpu_apicv_active(&vmx->vcpu))
01e439be 3951 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
d02fcf50
PB
3952
3953 if (!enable_vnmi)
3954 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
3955
804939ea
SC
3956 if (!enable_preemption_timer)
3957 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3958
01e439be
YZ
3959 return pin_based_exec_ctrl;
3960}
3961
d62caabb
AS
3962static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
3963{
3964 struct vcpu_vmx *vmx = to_vmx(vcpu);
3965
c5f2c766 3966 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
3ce424e4
RK
3967 if (cpu_has_secondary_exec_ctrls()) {
3968 if (kvm_vcpu_apicv_active(vcpu))
fe7f895d 3969 secondary_exec_controls_setbit(vmx,
3ce424e4
RK
3970 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3971 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3972 else
fe7f895d 3973 secondary_exec_controls_clearbit(vmx,
3ce424e4
RK
3974 SECONDARY_EXEC_APIC_REGISTER_VIRT |
3975 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3976 }
3977
3978 if (cpu_has_vmx_msr_bitmap())
904e14fb 3979 vmx_update_msr_bitmap(vcpu);
d62caabb
AS
3980}
3981
89b0c9f5
SC
3982u32 vmx_exec_control(struct vcpu_vmx *vmx)
3983{
3984 u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
3985
3986 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
3987 exec_control &= ~CPU_BASED_MOV_DR_EXITING;
3988
3989 if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
3990 exec_control &= ~CPU_BASED_TPR_SHADOW;
3991#ifdef CONFIG_X86_64
3992 exec_control |= CPU_BASED_CR8_STORE_EXITING |
3993 CPU_BASED_CR8_LOAD_EXITING;
3994#endif
3995 }
3996 if (!enable_ept)
3997 exec_control |= CPU_BASED_CR3_STORE_EXITING |
3998 CPU_BASED_CR3_LOAD_EXITING |
3999 CPU_BASED_INVLPG_EXITING;
4000 if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4001 exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4002 CPU_BASED_MONITOR_EXITING);
4003 if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4004 exec_control &= ~CPU_BASED_HLT_EXITING;
4005 return exec_control;
4006}
4007
4008
80154d77 4009static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
bf8179a0 4010{
80154d77
PB
4011 struct kvm_vcpu *vcpu = &vmx->vcpu;
4012
bf8179a0 4013 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
0367f205 4014
2ef7619d 4015 if (vmx_pt_mode_is_system())
f99e3daf 4016 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
80154d77 4017 if (!cpu_need_virtualize_apic_accesses(vcpu))
bf8179a0
NHE
4018 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4019 if (vmx->vpid == 0)
4020 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4021 if (!enable_ept) {
4022 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4023 enable_unrestricted_guest = 0;
4024 }
4025 if (!enable_unrestricted_guest)
4026 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
b31c114b 4027 if (kvm_pause_in_guest(vmx->vcpu.kvm))
bf8179a0 4028 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
80154d77 4029 if (!kvm_vcpu_apicv_active(vcpu))
c7c9c56c
YZ
4030 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4031 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
8d14695f 4032 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
0367f205
PB
4033
4034 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4035 * in vmx_set_cr4. */
4036 exec_control &= ~SECONDARY_EXEC_DESC;
4037
abc4fc58
AG
4038 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4039 (handle_vmptrld).
4040 We can NOT enable shadow_vmcs here because we do not yet have
4041 a current VMCS12.
4042 */
4043 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
a3eaa864
KH
4044
4045 if (!enable_pml)
4046 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
843e4330 4047
3db13480
PB
4048 if (vmx_xsaves_supported()) {
4049 /* Exposing XSAVES only when XSAVE is exposed */
4050 bool xsaves_enabled =
96be4e06 4051 boot_cpu_has(X86_FEATURE_XSAVE) &&
3db13480
PB
4052 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4053 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4054
7204160e
AL
4055 vcpu->arch.xsaves_enabled = xsaves_enabled;
4056
3db13480
PB
4057 if (!xsaves_enabled)
4058 exec_control &= ~SECONDARY_EXEC_XSAVES;
4059
4060 if (nested) {
4061 if (xsaves_enabled)
6677f3da 4062 vmx->nested.msrs.secondary_ctls_high |=
3db13480
PB
4063 SECONDARY_EXEC_XSAVES;
4064 else
6677f3da 4065 vmx->nested.msrs.secondary_ctls_high &=
3db13480
PB
4066 ~SECONDARY_EXEC_XSAVES;
4067 }
4068 }
4069
a7a200eb 4070 if (cpu_has_vmx_rdtscp()) {
80154d77
PB
4071 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
4072 if (!rdtscp_enabled)
4073 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4074
4075 if (nested) {
4076 if (rdtscp_enabled)
6677f3da 4077 vmx->nested.msrs.secondary_ctls_high |=
80154d77
PB
4078 SECONDARY_EXEC_RDTSCP;
4079 else
6677f3da 4080 vmx->nested.msrs.secondary_ctls_high &=
80154d77
PB
4081 ~SECONDARY_EXEC_RDTSCP;
4082 }
4083 }
4084
5ffec6f9 4085 if (cpu_has_vmx_invpcid()) {
80154d77
PB
4086 /* Exposing INVPCID only when PCID is exposed */
4087 bool invpcid_enabled =
4088 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
4089 guest_cpuid_has(vcpu, X86_FEATURE_PCID);
4090
4091 if (!invpcid_enabled) {
4092 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
4093 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
4094 }
4095
4096 if (nested) {
4097 if (invpcid_enabled)
6677f3da 4098 vmx->nested.msrs.secondary_ctls_high |=
80154d77
PB
4099 SECONDARY_EXEC_ENABLE_INVPCID;
4100 else
6677f3da 4101 vmx->nested.msrs.secondary_ctls_high &=
80154d77
PB
4102 ~SECONDARY_EXEC_ENABLE_INVPCID;
4103 }
4104 }
4105
45ec368c
JM
4106 if (vmx_rdrand_supported()) {
4107 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
4108 if (rdrand_enabled)
736fdf72 4109 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
4110
4111 if (nested) {
4112 if (rdrand_enabled)
6677f3da 4113 vmx->nested.msrs.secondary_ctls_high |=
736fdf72 4114 SECONDARY_EXEC_RDRAND_EXITING;
45ec368c 4115 else
6677f3da 4116 vmx->nested.msrs.secondary_ctls_high &=
736fdf72 4117 ~SECONDARY_EXEC_RDRAND_EXITING;
45ec368c
JM
4118 }
4119 }
4120
75f4fc8d
JM
4121 if (vmx_rdseed_supported()) {
4122 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
4123 if (rdseed_enabled)
736fdf72 4124 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
4125
4126 if (nested) {
4127 if (rdseed_enabled)
6677f3da 4128 vmx->nested.msrs.secondary_ctls_high |=
736fdf72 4129 SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d 4130 else
6677f3da 4131 vmx->nested.msrs.secondary_ctls_high &=
736fdf72 4132 ~SECONDARY_EXEC_RDSEED_EXITING;
75f4fc8d
JM
4133 }
4134 }
4135
e69e72fa
TX
4136 if (vmx_waitpkg_supported()) {
4137 bool waitpkg_enabled =
4138 guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
4139
4140 if (!waitpkg_enabled)
4141 exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4142
4143 if (nested) {
4144 if (waitpkg_enabled)
4145 vmx->nested.msrs.secondary_ctls_high |=
4146 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4147 else
4148 vmx->nested.msrs.secondary_ctls_high &=
4149 ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4150 }
4151 }
4152
80154d77 4153 vmx->secondary_exec_control = exec_control;
bf8179a0
NHE
4154}
4155
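Each feature block above repeats the same two steps: clear the control when the guest cannot use the feature, and mirror the decision into the nested VMX capability MSR. Purely as an illustration (a hypothetical helper, not code this file actually defines), the positive-logic cases such as XSAVES, RDTSCP, INVPCID and WAITPKG could be expressed once as below; the RDRAND/RDSEED cases would invert the first step because their controls are *_EXITING bits.

/* Hypothetical refactoring sketch, shown only to highlight the pattern. */
static void adjust_secondary_exec_control(struct vcpu_vmx *vmx,
					  u32 *exec_control, u32 control,
					  bool guest_has_feature)
{
	if (!guest_has_feature)
		*exec_control &= ~control;

	if (nested) {
		if (guest_has_feature)
			vmx->nested.msrs.secondary_ctls_high |= control;
		else
			vmx->nested.msrs.secondary_ctls_high &= ~control;
	}
}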
ce88decf
XG
4156static void ept_set_mmio_spte_mask(void)
4157{
4158 /*
4159 * EPT Misconfigurations can be generated if the value of bits 2:0
4160 * of an EPT paging-structure entry is 110b (write/execute).
ce88decf 4161 */
dcdca5fe 4162 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
4af77151 4163 VMX_EPT_MISCONFIG_WX_VALUE, 0);
ce88decf
XG
4164}
4165
f53cd63c 4166#define VMX_XSS_EXIT_BITMAP 0
6aa8b732 4167
944c3464 4168/*
1b84292b
XL
4169 * Note that the Guest-State Area of the VMCS is initialized in
4170 * vmx_vcpu_reset().
944c3464 4171 */
1b84292b 4172static void init_vmcs(struct vcpu_vmx *vmx)
944c3464 4173{
944c3464 4174 if (nested)
1b84292b 4175 nested_vmx_set_vmcs_shadowing_bitmap();
944c3464 4176
25c5f225 4177 if (cpu_has_vmx_msr_bitmap())
904e14fb 4178 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
25c5f225 4179
6aa8b732
AK
4180 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4181
6aa8b732 4182 /* Control */
3af80fec 4183 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
6e5d865c 4184
3af80fec 4185 exec_controls_set(vmx, vmx_exec_control(vmx));
6aa8b732 4186
dfa169bb 4187 if (cpu_has_secondary_exec_ctrls()) {
80154d77 4188 vmx_compute_secondary_exec_control(vmx);
3af80fec 4189 secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
dfa169bb 4190 }
f78e0e2e 4191
d62caabb 4192 if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
c7c9c56c
YZ
4193 vmcs_write64(EOI_EXIT_BITMAP0, 0);
4194 vmcs_write64(EOI_EXIT_BITMAP1, 0);
4195 vmcs_write64(EOI_EXIT_BITMAP2, 0);
4196 vmcs_write64(EOI_EXIT_BITMAP3, 0);
4197
4198 vmcs_write16(GUEST_INTR_STATUS, 0);
01e439be 4199
0bcf261c 4200 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
01e439be 4201 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
c7c9c56c
YZ
4202 }
4203
b31c114b 4204 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
4b8d54f9 4205 vmcs_write32(PLE_GAP, ple_gap);
a7653ecd
RK
4206 vmx->ple_window = ple_window;
4207 vmx->ple_window_dirty = true;
4b8d54f9
ZE
4208 }
4209
c3707958
XG
4210 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4211 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
6aa8b732
AK
4212 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
4213
9581d442
AK
4214 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
4215 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
a547c6db 4216 vmx_set_constant_host_state(vmx);
6aa8b732
AK
4217 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4218 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
6aa8b732 4219
2a499e49
BD
4220 if (cpu_has_vmx_vmfunc())
4221 vmcs_write64(VM_FUNCTION_CONTROL, 0);
4222
2cc51560
ED
4223 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4224 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
33966dd6 4225 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2cc51560 4226 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
33966dd6 4227 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
6aa8b732 4228
74545705
RK
4229 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4230 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
468d472f 4231
3af80fec 4232 vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
6aa8b732
AK
4233
4234 /* 22.2.1, 20.8.1 */
3af80fec 4235 vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
1c3d14fe 4236
bd7e5b08
PB
4237 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
4238 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
4239
bf8179a0 4240 set_cr4_guest_host_mask(vmx);
e00c8cf2 4241
35fbe0d4
XL
4242 if (vmx->vpid != 0)
4243 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4244
f53cd63c
WL
4245 if (vmx_xsaves_supported())
4246 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4247
4e59516a 4248 if (enable_pml) {
4e59516a
PF
4249 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
4250 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
4251 }
0b665d30
SC
4252
4253 if (cpu_has_vmx_encls_vmexit())
4254 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2ef444f1 4255
2ef7619d 4256 if (vmx_pt_mode_is_host_guest()) {
2ef444f1
CP
4257 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
4258 /* Bits 6:0 are forced to 1; writes are ignored. */
4259 vmx->pt_desc.guest.output_mask = 0x7F;
4260 vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
4261 }
e00c8cf2
AK
4262}
4263
d28bc9dd 4264static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
e00c8cf2
AK
4265{
4266 struct vcpu_vmx *vmx = to_vmx(vcpu);
58cb628d 4267 struct msr_data apic_base_msr;
d28bc9dd 4268 u64 cr0;
e00c8cf2 4269
7ffd92c5 4270 vmx->rmode.vm86_active = 0;
d28b387f 4271 vmx->spec_ctrl = 0;
e00c8cf2 4272
6e3ba4ab
TX
4273 vmx->msr_ia32_umwait_control = 0;
4274
ad312c7c 4275 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
95c06540 4276 vmx->hv_deadline_tsc = -1;
d28bc9dd
NA
4277 kvm_set_cr8(vcpu, 0);
4278
4279 if (!init_event) {
4280 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
4281 MSR_IA32_APICBASE_ENABLE;
4282 if (kvm_vcpu_is_reset_bsp(vcpu))
4283 apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4284 apic_base_msr.host_initiated = true;
4285 kvm_set_apic_base(vcpu, &apic_base_msr);
4286 }
e00c8cf2 4287
2fb92db1
AK
4288 vmx_segment_cache_clear(vmx);
4289
5706be0d 4290 seg_setup(VCPU_SREG_CS);
66450a21 4291 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
f3531054 4292 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
e00c8cf2
AK
4293
4294 seg_setup(VCPU_SREG_DS);
4295 seg_setup(VCPU_SREG_ES);
4296 seg_setup(VCPU_SREG_FS);
4297 seg_setup(VCPU_SREG_GS);
4298 seg_setup(VCPU_SREG_SS);
4299
4300 vmcs_write16(GUEST_TR_SELECTOR, 0);
4301 vmcs_writel(GUEST_TR_BASE, 0);
4302 vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4303 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4304
4305 vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4306 vmcs_writel(GUEST_LDTR_BASE, 0);
4307 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4308 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4309
d28bc9dd
NA
4310 if (!init_event) {
4311 vmcs_write32(GUEST_SYSENTER_CS, 0);
4312 vmcs_writel(GUEST_SYSENTER_ESP, 0);
4313 vmcs_writel(GUEST_SYSENTER_EIP, 0);
4314 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4315 }
e00c8cf2 4316
c37c2873 4317 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
66450a21 4318 kvm_rip_write(vcpu, 0xfff0);
e00c8cf2 4319
e00c8cf2
AK
4320 vmcs_writel(GUEST_GDTR_BASE, 0);
4321 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4322
4323 vmcs_writel(GUEST_IDTR_BASE, 0);
4324 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4325
443381a8 4326 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
e00c8cf2 4327 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
f3531054 4328 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
a554d207
WL
4329 if (kvm_mpx_supported())
4330 vmcs_write64(GUEST_BNDCFGS, 0);
e00c8cf2 4331
e00c8cf2
AK
4332 setup_msrs(vmx);
4333
6aa8b732
AK
4334 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
4335
d28bc9dd 4336 if (cpu_has_vmx_tpr_shadow() && !init_event) {
f78e0e2e 4337 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
35754c98 4338 if (cpu_need_tpr_shadow(vcpu))
f78e0e2e 4339 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
d28bc9dd 4340 __pa(vcpu->arch.apic->regs));
f78e0e2e
SY
4341 vmcs_write32(TPR_THRESHOLD, 0);
4342 }
4343
a73896cb 4344 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6aa8b732 4345
d28bc9dd 4346 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
d28bc9dd 4347 vmx->vcpu.arch.cr0 = cr0;
f2463247 4348 vmx_set_cr0(vcpu, cr0); /* enter rmode */
d28bc9dd 4349 vmx_set_cr4(vcpu, 0);
5690891b 4350 vmx_set_efer(vcpu, 0);
bd7e5b08 4351
d28bc9dd 4352 update_exception_bitmap(vcpu);
6aa8b732 4353
dd5f5341 4354 vpid_sync_context(vmx->vpid);
caa057a2
WL
4355 if (init_event)
4356 vmx_clear_hlt(vcpu);
6aa8b732
AK
4357}
4358
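A small sanity check on the reset state programmed above: with CS.selector 0xf000, CS.base 0xffff0000 and RIP 0xfff0, the first guest instruction is fetched from the architectural reset vector. The stand-alone arithmetic, just as an illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t cs_base = 0xffff0000u, rip = 0xfff0u;

	printf("reset vector: 0x%08x\n", cs_base + rip);	/* 0xfffffff0 */
	return 0;
}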
55d2375e 4359static void enable_irq_window(struct kvm_vcpu *vcpu)
3b86cd99 4360{
9dadc2f9 4361 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
3b86cd99
JK
4362}
4363
c9a7953f 4364static void enable_nmi_window(struct kvm_vcpu *vcpu)
3b86cd99 4365{
d02fcf50 4366 if (!enable_vnmi ||
8a1b4392 4367 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
c9a7953f
JK
4368 enable_irq_window(vcpu);
4369 return;
4370 }
3b86cd99 4371
4e2a0bc5 4372 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
3b86cd99
JK
4373}
4374
66fd3f7f 4375static void vmx_inject_irq(struct kvm_vcpu *vcpu)
85f455f7 4376{
9c8cba37 4377 struct vcpu_vmx *vmx = to_vmx(vcpu);
66fd3f7f
GN
4378 uint32_t intr;
4379 int irq = vcpu->arch.interrupt.nr;
9c8cba37 4380
229456fc 4381 trace_kvm_inj_virq(irq);
2714d1d3 4382
fa89a817 4383 ++vcpu->stat.irq_injections;
7ffd92c5 4384 if (vmx->rmode.vm86_active) {
71f9833b
SH
4385 int inc_eip = 0;
4386 if (vcpu->arch.interrupt.soft)
4387 inc_eip = vcpu->arch.event_exit_inst_len;
9497e1f2 4388 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
85f455f7
ED
4389 return;
4390 }
66fd3f7f
GN
4391 intr = irq | INTR_INFO_VALID_MASK;
4392 if (vcpu->arch.interrupt.soft) {
4393 intr |= INTR_TYPE_SOFT_INTR;
4394 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4395 vmx->vcpu.arch.event_exit_inst_len);
4396 } else
4397 intr |= INTR_TYPE_EXT_INTR;
4398 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
caa057a2
WL
4399
4400 vmx_clear_hlt(vcpu);
85f455f7
ED
4401}
4402
f08864b4
SY
4403static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4404{
66a5a347
JK
4405 struct vcpu_vmx *vmx = to_vmx(vcpu);
4406
d02fcf50 4407 if (!enable_vnmi) {
8a1b4392
PB
4408 /*
4409 * Tracking the NMI-blocked state in software is built upon
4410 * finding the next open IRQ window. This, in turn, depends on
4411 * well-behaving guests: They have to keep IRQs disabled at
4412 * least as long as the NMI handler runs. Otherwise we may
4413 * cause NMI nesting, maybe breaking the guest. But as this is
4414 * highly unlikely, we can live with the residual risk.
4415 */
4416 vmx->loaded_vmcs->soft_vnmi_blocked = 1;
4417 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4418 }
4419
4c4a6f79
PB
4420 ++vcpu->stat.nmi_injections;
4421 vmx->loaded_vmcs->nmi_known_unmasked = false;
3b86cd99 4422
7ffd92c5 4423 if (vmx->rmode.vm86_active) {
9497e1f2 4424 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
66a5a347
JK
4425 return;
4426 }
c5a6d5f7 4427
f08864b4
SY
4428 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4429 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
caa057a2
WL
4430
4431 vmx_clear_hlt(vcpu);
f08864b4
SY
4432}
4433
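Both injection paths above write the VM-entry interruption-information field, whose layout is: vector in bits 7:0, event type in bits 10:8 (0 external interrupt, 2 NMI, 4 software interrupt), deliver-error-code in bit 11, and valid in bit 31. A stand-alone packing sketch with a hypothetical helper name:

#include <stdint.h>
#include <stdio.h>

static uint32_t intr_info(uint8_t vector, unsigned type, int valid)
{
	return vector | (type << 8) | ((uint32_t)valid << 31);
}

int main(void)
{
	/* External interrupt, vector 0x20, valid. */
	printf("0x%08x\n", intr_info(0x20, 0, 1));	/* 0x80000020 */
	/* NMI: vector 2, type 2, valid. */
	printf("0x%08x\n", intr_info(2, 2, 1));		/* 0x80000202 */
	return 0;
}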
97b7ead3 4434bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
3cfc3092 4435{
4c4a6f79
PB
4436 struct vcpu_vmx *vmx = to_vmx(vcpu);
4437 bool masked;
4438
d02fcf50 4439 if (!enable_vnmi)
8a1b4392 4440 return vmx->loaded_vmcs->soft_vnmi_blocked;
4c4a6f79 4441 if (vmx->loaded_vmcs->nmi_known_unmasked)
9d58b931 4442 return false;
4c4a6f79
PB
4443 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4444 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4445 return masked;
3cfc3092
JK
4446}
4447
97b7ead3 4448void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3cfc3092
JK
4449{
4450 struct vcpu_vmx *vmx = to_vmx(vcpu);
4451
d02fcf50 4452 if (!enable_vnmi) {
8a1b4392
PB
4453 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
4454 vmx->loaded_vmcs->soft_vnmi_blocked = masked;
4455 vmx->loaded_vmcs->vnmi_blocked_time = 0;
4456 }
4457 } else {
4458 vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4459 if (masked)
4460 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4461 GUEST_INTR_STATE_NMI);
4462 else
4463 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4464 GUEST_INTR_STATE_NMI);
4465 }
3cfc3092
JK
4466}
4467
2505dc9f
JK
4468static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4469{
b6b8a145
JK
4470 if (to_vmx(vcpu)->nested.nested_run_pending)
4471 return 0;
ea8ceb83 4472
d02fcf50 4473 if (!enable_vnmi &&
8a1b4392
PB
4474 to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
4475 return 0;
4476
2505dc9f
JK
4477 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4478 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4479 | GUEST_INTR_STATE_NMI));
4480}
4481
78646121
GN
4482static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4483{
a1c77abb
SC
4484 if (to_vmx(vcpu)->nested.nested_run_pending)
4485 return false;
4486
4487 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
4488 return true;
4489
4490 return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
c4282df9
GN
4491 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4492 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
78646121
GN
4493}
4494
cbc94022
IE
4495static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4496{
4497 int ret;
cbc94022 4498
f7eaeb0a
SC
4499 if (enable_unrestricted_guest)
4500 return 0;
4501
6a3c623b
PX
4502 mutex_lock(&kvm->slots_lock);
4503 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
4504 PAGE_SIZE * 3);
4505 mutex_unlock(&kvm->slots_lock);
4506
cbc94022
IE
4507 if (ret)
4508 return ret;
40bbb9d0 4509 to_kvm_vmx(kvm)->tss_addr = addr;
1f755a82 4510 return init_rmode_tss(kvm);
cbc94022
IE
4511}
4512
2ac52ab8
SC
4513static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
4514{
40bbb9d0 4515 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
2ac52ab8
SC
4516 return 0;
4517}
4518
0ca1b4f4 4519static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
6aa8b732 4520{
77ab6db0 4521 switch (vec) {
77ab6db0 4522 case BP_VECTOR:
c573cd22
JK
4523 /*
4524 * Update instruction length as we may reinject the exception
4525 * from user space while in guest debugging mode.
4526 */
4527 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
4528 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
d0bfb940 4529 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
0ca1b4f4
GN
4530 return false;
4531 /* fall through */
4532 case DB_VECTOR:
4533 if (vcpu->guest_debug &
4534 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
4535 return false;
d0bfb940
JK
4536 /* fall through */
4537 case DE_VECTOR:
77ab6db0
JK
4538 case OF_VECTOR:
4539 case BR_VECTOR:
4540 case UD_VECTOR:
4541 case DF_VECTOR:
4542 case SS_VECTOR:
4543 case GP_VECTOR:
4544 case MF_VECTOR:
0ca1b4f4 4545 return true;
77ab6db0 4546 }
0ca1b4f4
GN
4547 return false;
4548}
4549
4550static int handle_rmode_exception(struct kvm_vcpu *vcpu,
4551 int vec, u32 err_code)
4552{
4553 /*
4554 * An instruction with the address-size override prefix (opcode 0x67)
4555 * causes an #SS fault with error code 0 in VM86 mode.
4556 */
4557 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
60fc3d02 4558 if (kvm_emulate_instruction(vcpu, 0)) {
0ca1b4f4
GN
4559 if (vcpu->arch.halt_request) {
4560 vcpu->arch.halt_request = 0;
5cb56059 4561 return kvm_vcpu_halt(vcpu);
0ca1b4f4
GN
4562 }
4563 return 1;
4564 }
4565 return 0;
4566 }
4567
4568 /*
4569 * Forward all other exceptions that are valid in real mode.
4570 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
4571 * the required debugging infrastructure rework.
4572 */
4573 kvm_queue_exception(vcpu, vec);
4574 return 1;
6aa8b732
AK
4575}
4576
a0861c02
AK
4577/*
4578 * Trigger machine check on the host. We assume all the MSRs are already set up
4579 * by the CPU and that we still run on the same CPU as the MCE occurred on.
4580 * We pass a fake environment to the machine check handler because we want
4581 * the guest to be always treated like user space, no matter what context
4582 * it used internally.
4583 */
4584static void kvm_machine_check(void)
4585{
4586#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
4587 struct pt_regs regs = {
4588 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
4589 .flags = X86_EFLAGS_IF,
4590 };
4591
4592 do_machine_check(&regs, 0);
4593#endif
4594}
4595
851ba692 4596static int handle_machine_check(struct kvm_vcpu *vcpu)
a0861c02 4597{
95b5a48c 4598 /* handled by vmx_vcpu_run() */
a0861c02
AK
4599 return 1;
4600}
4601
95b5a48c 4602static int handle_exception_nmi(struct kvm_vcpu *vcpu)
6aa8b732 4603{
1155f76a 4604 struct vcpu_vmx *vmx = to_vmx(vcpu);
851ba692 4605 struct kvm_run *kvm_run = vcpu->run;
d0bfb940 4606 u32 intr_info, ex_no, error_code;
42dbaa5a 4607 unsigned long cr2, rip, dr6;
6aa8b732 4608 u32 vect_info;
6aa8b732 4609
1155f76a 4610 vect_info = vmx->idt_vectoring_info;
88786475 4611 intr_info = vmx->exit_intr_info;
6aa8b732 4612
2ea72039 4613 if (is_machine_check(intr_info) || is_nmi(intr_info))
95b5a48c 4614 return 1; /* handled by handle_exception_nmi_irqoff() */
2ab455cc 4615
082d06ed
WL
4616 if (is_invalid_opcode(intr_info))
4617 return handle_ud(vcpu);
7aa81cc0 4618
6aa8b732 4619 error_code = 0;
2e11384c 4620 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
6aa8b732 4621 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
bf4ca23e 4622
9e869480
LA
4623 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
4624 WARN_ON_ONCE(!enable_vmware_backdoor);
a6c6ed1e
SC
4625
4626 /*
4627 * VMware backdoor emulation on #GP interception only handles
4628 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
4629 * error code on #GP.
4630 */
4631 if (error_code) {
4632 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
4633 return 1;
4634 }
60fc3d02 4635 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
9e869480
LA
4636 }
4637
bf4ca23e
XG
4638 /*
4639 * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
4640 * MMIO, so it is better to report an internal error.
4641 * See the comments in vmx_handle_exit.
4642 */
4643 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4644 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4645 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4646 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
80f0e95d 4647 vcpu->run->internal.ndata = 3;
bf4ca23e
XG
4648 vcpu->run->internal.data[0] = vect_info;
4649 vcpu->run->internal.data[1] = intr_info;
80f0e95d 4650 vcpu->run->internal.data[2] = error_code;
bf4ca23e
XG
4651 return 0;
4652 }
4653
6aa8b732
AK
4654 if (is_page_fault(intr_info)) {
4655 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1261bfa3
WL
4656 /* EPT won't cause page fault directly */
4657 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
d0006530 4658 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
6aa8b732
AK
4659 }
4660
d0bfb940 4661 ex_no = intr_info & INTR_INFO_VECTOR_MASK;
0ca1b4f4
GN
4662
4663 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
4664 return handle_rmode_exception(vcpu, ex_no, error_code);
4665
42dbaa5a 4666 switch (ex_no) {
54a20552
EN
4667 case AC_VECTOR:
4668 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
4669 return 1;
42dbaa5a
JK
4670 case DB_VECTOR:
4671 dr6 = vmcs_readl(EXIT_QUALIFICATION);
4672 if (!(vcpu->guest_debug &
4673 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
1fc5d194 4674 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
6f43ed01 4675 vcpu->arch.dr6 |= dr6 | DR6_RTM;
32d43cd3 4676 if (is_icebp(intr_info))
1957aa63 4677 WARN_ON(!skip_emulated_instruction(vcpu));
fd2a445a 4678
42dbaa5a
JK
4679 kvm_queue_exception(vcpu, DB_VECTOR);
4680 return 1;
4681 }
4682 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
4683 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
4684 /* fall through */
4685 case BP_VECTOR:
c573cd22
JK
4686 /*
4687 * Update instruction length as we may reinject #BP from
4688 * user space while in guest debugging mode. Reading it for
4689 * #DB as well causes no harm, it is not used in that case.
4690 */
4691 vmx->vcpu.arch.event_exit_inst_len =
4692 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
6aa8b732 4693 kvm_run->exit_reason = KVM_EXIT_DEBUG;
0a434bb2 4694 rip = kvm_rip_read(vcpu);
d0bfb940
JK
4695 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
4696 kvm_run->debug.arch.exception = ex_no;
42dbaa5a
JK
4697 break;
4698 default:
d0bfb940
JK
4699 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
4700 kvm_run->ex.exception = ex_no;
4701 kvm_run->ex.error_code = error_code;
42dbaa5a 4702 break;
6aa8b732 4703 }
6aa8b732
AK
4704 return 0;
4705}
4706
f399e60c 4707static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
6aa8b732 4708{
1165f5fe 4709 ++vcpu->stat.irq_exits;
6aa8b732
AK
4710 return 1;
4711}
4712
851ba692 4713static int handle_triple_fault(struct kvm_vcpu *vcpu)
988ad74f 4714{
851ba692 4715 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
bbeac283 4716 vcpu->mmio_needed = 0;
988ad74f
AK
4717 return 0;
4718}
6aa8b732 4719
851ba692 4720static int handle_io(struct kvm_vcpu *vcpu)
6aa8b732 4721{
bfdaab09 4722 unsigned long exit_qualification;
dca7f128 4723 int size, in, string;
039576c0 4724 unsigned port;
6aa8b732 4725
bfdaab09 4726 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
039576c0 4727 string = (exit_qualification & 16) != 0;
e70669ab 4728
cf8f70bf 4729 ++vcpu->stat.io_exits;
e70669ab 4730
432baf60 4731 if (string)
60fc3d02 4732 return kvm_emulate_instruction(vcpu, 0);
e70669ab 4733
cf8f70bf
GN
4734 port = exit_qualification >> 16;
4735 size = (exit_qualification & 7) + 1;
432baf60 4736 in = (exit_qualification & 8) != 0;
cf8f70bf 4737
dca7f128 4738 return kvm_fast_pio(vcpu, size, port, in);
6aa8b732
AK
4739}
4740
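For reference, the exit-qualification bits decoded in handle_io() are: bits 2:0 access size minus one, bit 3 direction (1 = IN), bit 4 string instruction, and bits 31:16 the port number. A stand-alone decode sketch with a made-up qualification value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t qual = 0x00710008ull;	/* made-up example: "in al, 0x71" */

	unsigned size   = (qual & 7) + 1;
	int      in     = (qual & 8) != 0;
	int      string = (qual & 16) != 0;
	unsigned port   = (qual >> 16) & 0xffff;

	printf("%s port 0x%x, %u byte(s), string=%d\n",
	       in ? "in" : "out", port, size, string);
	return 0;
}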
102d8325
IM
4741static void
4742vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4743{
4744 /*
4745 * Patch in the VMCALL instruction:
4746 */
4747 hypercall[0] = 0x0f;
4748 hypercall[1] = 0x01;
4749 hypercall[2] = 0xc1;
102d8325
IM
4750}
4751
0fa06071 4752/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
eeadf9e7
NHE
4753static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4754{
eeadf9e7 4755 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
4756 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4757 unsigned long orig_val = val;
4758
eeadf9e7
NHE
4759 /*
4760 * We get here when L2 changed cr0 in a way that did not change
4761 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
1a0d74e6
JK
4762 * but did change L0 shadowed bits. So we first calculate the
4763 * effective cr0 value that L1 would like to write into the
4764 * hardware. It consists of the L2-owned bits from the new
4765 * value combined with the L1-owned bits from L1's guest_cr0.
eeadf9e7 4766 */
1a0d74e6
JK
4767 val = (val & ~vmcs12->cr0_guest_host_mask) |
4768 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4769
3899152c 4770 if (!nested_guest_cr0_valid(vcpu, val))
eeadf9e7 4771 return 1;
1a0d74e6
JK
4772
4773 if (kvm_set_cr0(vcpu, val))
4774 return 1;
4775 vmcs_writel(CR0_READ_SHADOW, orig_val);
eeadf9e7 4776 return 0;
1a0d74e6
JK
4777 } else {
4778 if (to_vmx(vcpu)->nested.vmxon &&
3899152c 4779 !nested_host_cr0_valid(vcpu, val))
1a0d74e6 4780 return 1;
3899152c 4781
eeadf9e7 4782 return kvm_set_cr0(vcpu, val);
1a0d74e6 4783 }
eeadf9e7
NHE
4784}
4785
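A concrete (made-up) example of the shadow-mask arithmetic used in handle_set_cr0() and, analogously, handle_set_cr4(): bits set in the guest/host mask are owned by L1 and taken from L1's guest_cr0, while the remaining bits come from the value L2 just tried to load.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask      = 0x00000020;	/* say L1 shadows only CR0.NE */
	uint64_t guest_cr0 = 0x80000033;	/* L1's notion of CR0         */
	uint64_t new_val   = 0x80000011;	/* value L2 tried to load     */

	uint64_t effective = (new_val & ~mask) | (guest_cr0 & mask);

	printf("effective CR0 = 0x%llx\n",
	       (unsigned long long)effective);	/* 0x80000031 */
	return 0;
}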
4786static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
4787{
4788 if (is_guest_mode(vcpu)) {
1a0d74e6
JK
4789 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4790 unsigned long orig_val = val;
4791
4792 /* analogously to handle_set_cr0 */
4793 val = (val & ~vmcs12->cr4_guest_host_mask) |
4794 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
4795 if (kvm_set_cr4(vcpu, val))
eeadf9e7 4796 return 1;
1a0d74e6 4797 vmcs_writel(CR4_READ_SHADOW, orig_val);
eeadf9e7
NHE
4798 return 0;
4799 } else
4800 return kvm_set_cr4(vcpu, val);
4801}
4802
0367f205
PB
4803static int handle_desc(struct kvm_vcpu *vcpu)
4804{
4805 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
60fc3d02 4806 return kvm_emulate_instruction(vcpu, 0);
0367f205
PB
4807}
4808
851ba692 4809static int handle_cr(struct kvm_vcpu *vcpu)
6aa8b732 4810{
229456fc 4811 unsigned long exit_qualification, val;
6aa8b732
AK
4812 int cr;
4813 int reg;
49a9b07e 4814 int err;
6affcbed 4815 int ret;
6aa8b732 4816
bfdaab09 4817 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6aa8b732
AK
4818 cr = exit_qualification & 15;
4819 reg = (exit_qualification >> 8) & 15;
4820 switch ((exit_qualification >> 4) & 3) {
4821 case 0: /* mov to cr */
1e32c079 4822 val = kvm_register_readl(vcpu, reg);
229456fc 4823 trace_kvm_cr_write(cr, val);
6aa8b732
AK
4824 switch (cr) {
4825 case 0:
eeadf9e7 4826 err = handle_set_cr0(vcpu, val);
6affcbed 4827 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 4828 case 3:
e1de91cc 4829 WARN_ON_ONCE(enable_unrestricted_guest);
2390218b 4830 err = kvm_set_cr3(vcpu, val);
6affcbed 4831 return kvm_complete_insn_gp(vcpu, err);
6aa8b732 4832 case 4:
eeadf9e7 4833 err = handle_set_cr4(vcpu, val);
6affcbed 4834 return kvm_complete_insn_gp(vcpu, err);
0a5fff19
GN
4835 case 8: {
4836 u8 cr8_prev = kvm_get_cr8(vcpu);
1e32c079 4837 u8 cr8 = (u8)val;
eea1cff9 4838 err = kvm_set_cr8(vcpu, cr8);
6affcbed 4839 ret = kvm_complete_insn_gp(vcpu, err);
35754c98 4840 if (lapic_in_kernel(vcpu))
6affcbed 4841 return ret;
0a5fff19 4842 if (cr8_prev <= cr8)
6affcbed
KH
4843 return ret;
4844 /*
4845 * TODO: we might be squashing a
4846 * KVM_GUESTDBG_SINGLESTEP-triggered
4847 * KVM_EXIT_DEBUG here.
4848 */
851ba692 4849 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
0a5fff19
GN
4850 return 0;
4851 }
4b8073e4 4852 }
6aa8b732 4853 break;
25c4c276 4854 case 2: /* clts */
bd7e5b08
PB
4855 WARN_ONCE(1, "Guest should always own CR0.TS");
4856 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
4d4ec087 4857 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
6affcbed 4858 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
4859 case 1: /*mov from cr*/
4860 switch (cr) {
4861 case 3:
e1de91cc 4862 WARN_ON_ONCE(enable_unrestricted_guest);
9f8fe504
AK
4863 val = kvm_read_cr3(vcpu);
4864 kvm_register_write(vcpu, reg, val);
4865 trace_kvm_cr_read(cr, val);
6affcbed 4866 return kvm_skip_emulated_instruction(vcpu);
6aa8b732 4867 case 8:
229456fc
MT
4868 val = kvm_get_cr8(vcpu);
4869 kvm_register_write(vcpu, reg, val);
4870 trace_kvm_cr_read(cr, val);
6affcbed 4871 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
4872 }
4873 break;
4874 case 3: /* lmsw */
a1f83a74 4875 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
4d4ec087 4876 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
a1f83a74 4877 kvm_lmsw(vcpu, val);
6aa8b732 4878
6affcbed 4879 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
4880 default:
4881 break;
4882 }
851ba692 4883 vcpu->run->exit_reason = 0;
a737f256 4884 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
6aa8b732
AK
4885 (int)(exit_qualification >> 4) & 3, cr);
4886 return 0;
4887}
4888
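The fields pulled out of the exit qualification at the top of handle_cr() follow the VMX control-register-access layout: bits 3:0 give the CR number, bits 5:4 the access type (0 MOV to CR, 1 MOV from CR, 2 CLTS, 3 LMSW), bits 11:8 the general-purpose register, and bits 31:16 the LMSW source data. A small stand-alone decode sketch with a made-up value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t qual = 0x0000000000000304ull;	/* made-up: MOV to CR4 from register 3 */

	unsigned cr   = qual & 15;
	unsigned type = (qual >> 4) & 3;
	unsigned reg  = (qual >> 8) & 15;

	printf("cr%u, access type %u, register %u\n", cr, type, reg);
	return 0;
}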
851ba692 4889static int handle_dr(struct kvm_vcpu *vcpu)
6aa8b732 4890{
bfdaab09 4891 unsigned long exit_qualification;
16f8a6f9
NA
4892 int dr, dr7, reg;
4893
4894 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4895 dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
4896
4897 /* First, if DR does not exist, trigger UD */
4898 if (!kvm_require_dr(vcpu, dr))
4899 return 1;
6aa8b732 4900
f2483415 4901 /* Do not handle if the CPL > 0; it will trigger a GP on re-entry. */
0a79b009
AK
4902 if (!kvm_require_cpl(vcpu, 0))
4903 return 1;
16f8a6f9
NA
4904 dr7 = vmcs_readl(GUEST_DR7);
4905 if (dr7 & DR7_GD) {
42dbaa5a
JK
4906 /*
4907 * As the vm-exit takes precedence over the debug trap, we
4908 * need to emulate the latter, either for the host or the
4909 * guest debugging itself.
4910 */
4911 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
851ba692 4912 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
16f8a6f9 4913 vcpu->run->debug.arch.dr7 = dr7;
82b32774 4914 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
851ba692
AK
4915 vcpu->run->debug.arch.exception = DB_VECTOR;
4916 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
42dbaa5a
JK
4917 return 0;
4918 } else {
1fc5d194 4919 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
6f43ed01 4920 vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
42dbaa5a
JK
4921 kvm_queue_exception(vcpu, DB_VECTOR);
4922 return 1;
4923 }
4924 }
4925
81908bf4 4926 if (vcpu->guest_debug == 0) {
2183f564 4927 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
4928
4929 /*
4930 * No more DR vmexits; force a reload of the debug registers
4931 * and reenter on this instruction. The next vmexit will
4932 * retrieve the full state of the debug registers.
4933 */
4934 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
4935 return 1;
4936 }
4937
42dbaa5a
JK
4938 reg = DEBUG_REG_ACCESS_REG(exit_qualification);
4939 if (exit_qualification & TYPE_MOV_FROM_DR) {
020df079 4940 unsigned long val;
4c4d563b
JK
4941
4942 if (kvm_get_dr(vcpu, dr, &val))
4943 return 1;
4944 kvm_register_write(vcpu, reg, val);
020df079 4945 } else
5777392e 4946 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
4c4d563b
JK
4947 return 1;
4948
6affcbed 4949 return kvm_skip_emulated_instruction(vcpu);
6aa8b732
AK
4950}
4951
73aaf249
JK
4952static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
4953{
4954 return vcpu->arch.dr6;
4955}
4956
4957static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
4958{
4959}
4960
81908bf4
PB
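/*
 * Called when MOV DR exiting was temporarily disabled (KVM_DEBUGREG_WONT_EXIT):
 * read back the debug registers the guest may have modified and re-enable
 * MOV DR exits.
 */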
4961static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
4962{
81908bf4
PB
4963 get_debugreg(vcpu->arch.db[0], 0);
4964 get_debugreg(vcpu->arch.db[1], 1);
4965 get_debugreg(vcpu->arch.db[2], 2);
4966 get_debugreg(vcpu->arch.db[3], 3);
4967 get_debugreg(vcpu->arch.dr6, 6);
4968 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
4969
4970 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2183f564 4971 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
81908bf4
PB
4972}
4973
020df079
GN
4974static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
4975{
4976 vmcs_writel(GUEST_DR7, val);
4977}
4978
851ba692 4979static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6e5d865c 4980{
eb90f341 4981 kvm_apic_update_ppr(vcpu);
6e5d865c
YS
4982 return 1;
4983}
4984
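/*
 * An interrupt-window exit means the guest can now accept an event that could
 * not be injected earlier: drop the window-exiting request and let
 * KVM_REQ_EVENT re-run the injection logic.
 */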
851ba692 4985static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6aa8b732 4986{
9dadc2f9 4987 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
2714d1d3 4988
3842d135
AK
4989 kvm_make_request(KVM_REQ_EVENT, vcpu);
4990
a26bf12a 4991 ++vcpu->stat.irq_window_exits;
6aa8b732
AK
4992 return 1;
4993}
4994
851ba692 4995static int handle_vmcall(struct kvm_vcpu *vcpu)
c21415e8 4996{
0d9c055e 4997 return kvm_emulate_hypercall(vcpu);
c21415e8
IM
4998}
4999
ec25d5e6
GN
5000static int handle_invd(struct kvm_vcpu *vcpu)
5001{
60fc3d02 5002 return kvm_emulate_instruction(vcpu, 0);
ec25d5e6
GN
5003}
5004
851ba692 5005static int handle_invlpg(struct kvm_vcpu *vcpu)
a7052897 5006{
f9c617f6 5007 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
a7052897
MT
5008
5009 kvm_mmu_invlpg(vcpu, exit_qualification);
6affcbed 5010 return kvm_skip_emulated_instruction(vcpu);
a7052897
MT
5011}
5012
fee84b07
AK
5013static int handle_rdpmc(struct kvm_vcpu *vcpu)
5014{
5015 int err;
5016
5017 err = kvm_rdpmc(vcpu);
6affcbed 5018 return kvm_complete_insn_gp(vcpu, err);
fee84b07
AK
5019}
5020
851ba692 5021static int handle_wbinvd(struct kvm_vcpu *vcpu)
e5edaa01 5022{
6affcbed 5023 return kvm_emulate_wbinvd(vcpu);
e5edaa01
ED
5024}
5025
2acf923e
DC
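/* XSETBV: the new XCR value is in EDX:EAX, the XCR index in ECX. */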
5026static int handle_xsetbv(struct kvm_vcpu *vcpu)
5027{
5028 u64 new_bv = kvm_read_edx_eax(vcpu);
de3cd117 5029 u32 index = kvm_rcx_read(vcpu);
2acf923e
DC
5030
5031 if (kvm_set_xcr(vcpu, index, new_bv) == 0)
6affcbed 5032 return kvm_skip_emulated_instruction(vcpu);
2acf923e
DC
5033 return 1;
5034}
5035
851ba692 5036static int handle_apic_access(struct kvm_vcpu *vcpu)
f78e0e2e 5037{
58fbbf26
KT
5038 if (likely(fasteoi)) {
5039 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5040 int access_type, offset;
5041
5042 access_type = exit_qualification & APIC_ACCESS_TYPE;
5043 offset = exit_qualification & APIC_ACCESS_OFFSET;
		/*
		 * A sane guest uses MOV to write the EOI register, and the
		 * written value is ignored.  Short-circuit that common case
		 * here to avoid heavy instruction emulation.
		 */
5049 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5050 (offset == APIC_EOI)) {
5051 kvm_lapic_set_eoi(vcpu);
6affcbed 5052 return kvm_skip_emulated_instruction(vcpu);
58fbbf26
KT
5053 }
5054 }
60fc3d02 5055 return kvm_emulate_instruction(vcpu, 0);
f78e0e2e
SY
5056}
5057
c7c9c56c
YZ
5058static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5059{
5060 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5061 int vector = exit_qualification & 0xff;
5062
5063 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5064 kvm_apic_set_eoi_accelerated(vcpu, vector);
5065 return 1;
5066}
5067
83d4c286
YZ
5068static int handle_apic_write(struct kvm_vcpu *vcpu)
5069{
5070 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5071 u32 offset = exit_qualification & 0xfff;
5072
5073 /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5074 kvm_apic_write_nodecode(vcpu, offset);
5075 return 1;
5076}
5077
851ba692 5078static int handle_task_switch(struct kvm_vcpu *vcpu)
37817f29 5079{
60637aac 5080 struct vcpu_vmx *vmx = to_vmx(vcpu);
37817f29 5081 unsigned long exit_qualification;
e269fb21
JK
5082 bool has_error_code = false;
5083 u32 error_code = 0;
37817f29 5084 u16 tss_selector;
7f3d35fd 5085 int reason, type, idt_v, idt_index;
64a7ec06
GN
5086
5087 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
7f3d35fd 5088 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
64a7ec06 5089 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
37817f29
IE
5090
5091 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5092
5093 reason = (u32)exit_qualification >> 30;
64a7ec06
GN
5094 if (reason == TASK_SWITCH_GATE && idt_v) {
5095 switch (type) {
5096 case INTR_TYPE_NMI_INTR:
5097 vcpu->arch.nmi_injected = false;
654f06fc 5098 vmx_set_nmi_mask(vcpu, true);
64a7ec06
GN
5099 break;
5100 case INTR_TYPE_EXT_INTR:
66fd3f7f 5101 case INTR_TYPE_SOFT_INTR:
64a7ec06
GN
5102 kvm_clear_interrupt_queue(vcpu);
5103 break;
5104 case INTR_TYPE_HARD_EXCEPTION:
e269fb21
JK
5105 if (vmx->idt_vectoring_info &
5106 VECTORING_INFO_DELIVER_CODE_MASK) {
5107 has_error_code = true;
5108 error_code =
5109 vmcs_read32(IDT_VECTORING_ERROR_CODE);
5110 }
5111 /* fall through */
64a7ec06
GN
5112 case INTR_TYPE_SOFT_EXCEPTION:
5113 kvm_clear_exception_queue(vcpu);
5114 break;
5115 default:
5116 break;
5117 }
60637aac 5118 }
37817f29
IE
5119 tss_selector = exit_qualification;
5120
64a7ec06
GN
5121 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5122 type != INTR_TYPE_EXT_INTR &&
5123 type != INTR_TYPE_NMI_INTR))
1957aa63 5124 WARN_ON(!skip_emulated_instruction(vcpu));
64a7ec06 5125
42dbaa5a
JK
5126 /*
5127 * TODO: What about debug traps on tss switch?
5128 * Are we supposed to inject them and update dr6?
5129 */
1051778f
SC
5130 return kvm_task_switch(vcpu, tss_selector,
5131 type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
60fc3d02 5132 reason, has_error_code, error_code);
37817f29
IE
5133}
5134
851ba692 5135static int handle_ept_violation(struct kvm_vcpu *vcpu)
1439442c 5136{
f9c617f6 5137 unsigned long exit_qualification;
1439442c 5138 gpa_t gpa;
eebed243 5139 u64 error_code;
1439442c 5140
f9c617f6 5141 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
1439442c 5142
0be9c7a8
GN
	/*
	 * If the EPT violation occurred while executing IRET from an NMI, the
	 * "blocked by NMI" bit has to be set before the next VM entry.  There
	 * are errata that may cause this bit to not be set: AAK134, BY25.
	 */
bcd1c294 5149 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
d02fcf50 5150 enable_vnmi &&
bcd1c294 5151 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
0be9c7a8
GN
5152 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5153
1439442c 5154 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
229456fc 5155 trace_kvm_page_fault(gpa, exit_qualification);
4f5982a5 5156
27959a44 5157 /* Is it a read fault? */
ab22a473 5158 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
27959a44
JS
5159 ? PFERR_USER_MASK : 0;
5160 /* Is it a write fault? */
ab22a473 5161 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
27959a44
JS
5162 ? PFERR_WRITE_MASK : 0;
5163 /* Is it a fetch fault? */
ab22a473 5164 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
27959a44
JS
5165 ? PFERR_FETCH_MASK : 0;
5166 /* ept page table entry is present? */
5167 error_code |= (exit_qualification &
5168 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5169 EPT_VIOLATION_EXECUTABLE))
5170 ? PFERR_PRESENT_MASK : 0;
4f5982a5 5171
eebed243
PB
5172 error_code |= (exit_qualification & 0x100) != 0 ?
5173 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
25d92081 5174
25d92081 5175 vcpu->arch.exit_qualification = exit_qualification;
4f5982a5 5176 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
1439442c
SY
5177}
5178
851ba692 5179static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
68f89400 5180{
68f89400
MT
5181 gpa_t gpa;
5182
9034e6e8
PB
5183 /*
5184 * A nested guest cannot optimize MMIO vmexits, because we have an
5185 * nGPA here instead of the required GPA.
5186 */
68f89400 5187 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
9034e6e8
PB
5188 if (!is_guest_mode(vcpu) &&
5189 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
931c33b1 5190 trace_kvm_fast_mmio(gpa);
1957aa63 5191 return kvm_skip_emulated_instruction(vcpu);
68c3b4d1 5192 }
68f89400 5193
c75d0edc 5194 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
68f89400
MT
5195}
5196
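/*
 * The NMI window has opened: stop requesting NMI-window exits and let
 * KVM_REQ_EVENT inject the pending NMI.
 */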
851ba692 5197static int handle_nmi_window(struct kvm_vcpu *vcpu)
f08864b4 5198{
d02fcf50 5199 WARN_ON_ONCE(!enable_vnmi);
4e2a0bc5 5200 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
f08864b4 5201 ++vcpu->stat.nmi_window_exits;
3842d135 5202 kvm_make_request(KVM_REQ_EVENT, vcpu);
f08864b4
SY
5203
5204 return 1;
5205}
5206
80ced186 5207static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
ea953ef0 5208{
8b3079a5 5209 struct vcpu_vmx *vmx = to_vmx(vcpu);
49e9d557 5210 bool intr_window_requested;
b8405c18 5211 unsigned count = 130;
49e9d557 5212
2bb8cafe
SC
5213 /*
5214 * We should never reach the point where we are emulating L2
5215 * due to invalid guest state as that means we incorrectly
5216 * allowed a nested VMEntry with an invalid vmcs12.
5217 */
5218 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
5219
2183f564 5220 intr_window_requested = exec_controls_get(vmx) &
9dadc2f9 5221 CPU_BASED_INTR_WINDOW_EXITING;
ea953ef0 5222
98eb2f8b 5223 while (vmx->emulation_required && count-- != 0) {
bdea48e3 5224 if (intr_window_requested && vmx_interrupt_allowed(vcpu))
49e9d557
AK
5225 return handle_interrupt_window(&vmx->vcpu);
5226
72875d8a 5227 if (kvm_test_request(KVM_REQ_EVENT, vcpu))
de87dcdd
AK
5228 return 1;
5229
60fc3d02 5230 if (!kvm_emulate_instruction(vcpu, 0))
8fff2710 5231 return 0;
1d5a4d9b 5232
add5ff7a 5233 if (vmx->emulation_required && !vmx->rmode.vm86_active &&
8fff2710
SC
5234 vcpu->arch.exception.pending) {
5235 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5236 vcpu->run->internal.suberror =
5237 KVM_INTERNAL_ERROR_EMULATION;
5238 vcpu->run->internal.ndata = 0;
5239 return 0;
5240 }
ea953ef0 5241
8d76c49e
GN
5242 if (vcpu->arch.halt_request) {
5243 vcpu->arch.halt_request = 0;
8fff2710 5244 return kvm_vcpu_halt(vcpu);
8d76c49e
GN
5245 }
5246
8fff2710
SC
5247 /*
5248 * Note, return 1 and not 0, vcpu_run() is responsible for
5249 * morphing the pending signal into the proper return code.
5250 */
ea953ef0 5251 if (signal_pending(current))
8fff2710
SC
5252 return 1;
5253
ea953ef0
MG
5254 if (need_resched())
5255 schedule();
5256 }
5257
8fff2710 5258 return 1;
b4a2d31d
RK
5259}
5260
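/*
 * Dynamic PLE window sizing: each PAUSE-loop exit grows the window so a vCPU
 * that legitimately spins for a long time stops exiting repeatedly, and the
 * window is shrunk again later so short, contended spin loops keep triggering
 * exits.
 */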
5261static void grow_ple_window(struct kvm_vcpu *vcpu)
5262{
5263 struct vcpu_vmx *vmx = to_vmx(vcpu);
c5c5d6fa 5264 unsigned int old = vmx->ple_window;
b4a2d31d 5265
c8e88717
BM
5266 vmx->ple_window = __grow_ple_window(old, ple_window,
5267 ple_window_grow,
5268 ple_window_max);
b4a2d31d 5269
4f75bcc3 5270 if (vmx->ple_window != old) {
b4a2d31d 5271 vmx->ple_window_dirty = true;
4f75bcc3
PX
5272 trace_kvm_ple_window_update(vcpu->vcpu_id,
5273 vmx->ple_window, old);
5274 }
b4a2d31d
RK
5275}
5276
5277static void shrink_ple_window(struct kvm_vcpu *vcpu)
5278{
5279 struct vcpu_vmx *vmx = to_vmx(vcpu);
c5c5d6fa 5280 unsigned int old = vmx->ple_window;
b4a2d31d 5281
c8e88717
BM
5282 vmx->ple_window = __shrink_ple_window(old, ple_window,
5283 ple_window_shrink,
5284 ple_window);
b4a2d31d 5285
4f75bcc3 5286 if (vmx->ple_window != old) {
b4a2d31d 5287 vmx->ple_window_dirty = true;
4f75bcc3
PX
5288 trace_kvm_ple_window_update(vcpu->vcpu_id,
5289 vmx->ple_window, old);
5290 }
b4a2d31d
RK
5291}
5292
bf9f6ac8
FW
5293/*
5294 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
5295 */
5296static void wakeup_handler(void)
5297{
5298 struct kvm_vcpu *vcpu;
5299 int cpu = smp_processor_id();
5300
5301 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
5302 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
5303 blocked_vcpu_list) {
5304 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
5305
5306 if (pi_test_on(pi_desc) == 1)
5307 kvm_vcpu_kick(vcpu);
5308 }
5309 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
5310}
5311
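/*
 * Tell the common MMU code which SPTE bits EPT uses for accessed/dirty
 * tracking, execute-only support and the RWX permission mask.
 */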
e01bca2f 5312static void vmx_enable_tdp(void)
f160c7b7
JS
5313{
5314 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
5315 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
5316 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
5317 0ull, VMX_EPT_EXECUTABLE_MASK,
5318 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
d0ec49d4 5319 VMX_EPT_RWX_MASK, 0ull);
f160c7b7
JS
5320
5321 ept_set_mmio_spte_mask();
f160c7b7
JS
5322}
5323
4b8d54f9
ZE
/*
 * The guest indicated that it is busy-waiting on a spinlock.  Plain PAUSE
 * exiting is never enabled, so this handler is only reached on CPUs with
 * PAUSE-loop exiting (PLE).
 */
9fb41ba8 5328static int handle_pause(struct kvm_vcpu *vcpu)
4b8d54f9 5329{
b31c114b 5330 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d
RK
5331 grow_ple_window(vcpu);
5332
de63ad4c
LM
	/*
	 * Intel SDM Vol. 3, ch. 25.1.3: the "PAUSE-loop exiting"
	 * VM-execution control is ignored if CPL > 0.  KVM never sets
	 * PAUSE_EXITING and only sets PLE when supported, so the vCPU
	 * must be at CPL 0 if it gets a PAUSE exit.
	 */
5339 kvm_vcpu_on_spin(vcpu, true);
6affcbed 5340 return kvm_skip_emulated_instruction(vcpu);
4b8d54f9
ZE
5341}
5342
87c00572 5343static int handle_nop(struct kvm_vcpu *vcpu)
59708670 5344{
6affcbed 5345 return kvm_skip_emulated_instruction(vcpu);
59708670
SY
5346}
5347
87c00572
GS
5348static int handle_mwait(struct kvm_vcpu *vcpu)
5349{
5350 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
5351 return handle_nop(vcpu);
5352}
5353
45ec368c
JM
5354static int handle_invalid_op(struct kvm_vcpu *vcpu)
5355{
5356 kvm_queue_exception(vcpu, UD_VECTOR);
5357 return 1;
5358}
5359
5f3d45e7
MD
5360static int handle_monitor_trap(struct kvm_vcpu *vcpu)
5361{
5362 return 1;
5363}
5364
87c00572
GS
5365static int handle_monitor(struct kvm_vcpu *vcpu)
5366{
5367 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
5368 return handle_nop(vcpu);
5369}
5370
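/*
 * Emulate INVPCID for the guest by syncing or freeing the MMU roots that
 * match the requested PCID (or all of them for the global flavors).
 */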
55d2375e 5371static int handle_invpcid(struct kvm_vcpu *vcpu)
19677e32 5372{
55d2375e
SC
5373 u32 vmx_instruction_info;
5374 unsigned long type;
5375 bool pcid_enabled;
5376 gva_t gva;
5377 struct x86_exception e;
5378 unsigned i;
5379 unsigned long roots_to_free = 0;
5380 struct {
5381 u64 pcid;
5382 u64 gla;
5383 } operand;
f9eb4af6 5384
55d2375e 5385 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
19677e32
BD
5386 kvm_queue_exception(vcpu, UD_VECTOR);
5387 return 1;
5388 }
5389
55d2375e
SC
5390 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5391 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5392
5393 if (type > 3) {
5394 kvm_inject_gp(vcpu, 0);
f9eb4af6
EK
5395 return 1;
5396 }
5397
55d2375e
SC
	/*
	 * According to the Intel instruction reference, the memory operand
	 * is read even if it isn't needed (e.g., for type==all).
	 */
3573e22c 5401 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
fdb28619
EK
5402 vmx_instruction_info, false,
5403 sizeof(operand), &gva))
3573e22c
BD
5404 return 1;
5405
55d2375e 5406 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
3573e22c
BD
5407 kvm_inject_page_fault(vcpu, &e);
5408 return 1;
5409 }
5410
55d2375e
SC
5411 if (operand.pcid >> 12 != 0) {
5412 kvm_inject_gp(vcpu, 0);
5413 return 1;
abfc52c6 5414 }
e29acc55 5415
55d2375e 5416 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
e29acc55 5417
55d2375e
SC
5418 switch (type) {
5419 case INVPCID_TYPE_INDIV_ADDR:
5420 if ((!pcid_enabled && (operand.pcid != 0)) ||
5421 is_noncanonical_address(operand.gla, vcpu)) {
5422 kvm_inject_gp(vcpu, 0);
5423 return 1;
5424 }
5425 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
5426 return kvm_skip_emulated_instruction(vcpu);
61ada748 5427
55d2375e
SC
5428 case INVPCID_TYPE_SINGLE_CTXT:
5429 if (!pcid_enabled && (operand.pcid != 0)) {
5430 kvm_inject_gp(vcpu, 0);
5431 return 1;
5432 }
e29acc55 5433
55d2375e
SC
5434 if (kvm_get_active_pcid(vcpu) == operand.pcid) {
5435 kvm_mmu_sync_roots(vcpu);
5436 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
5437 }
e29acc55 5438
55d2375e
SC
5439 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5440 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
5441 == operand.pcid)
5442 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
63aff655 5443
55d2375e
SC
5444 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
5445 /*
5446 * If neither the current cr3 nor any of the prev_roots use the
5447 * given PCID, then nothing needs to be done here because a
5448 * resync will happen anyway before switching to any other CR3.
5449 */
e29acc55 5450
55d2375e 5451 return kvm_skip_emulated_instruction(vcpu);
61ada748 5452
55d2375e
SC
5453 case INVPCID_TYPE_ALL_NON_GLOBAL:
5454 /*
5455 * Currently, KVM doesn't mark global entries in the shadow
5456 * page tables, so a non-global flush just degenerates to a
5457 * global flush. If needed, we could optimize this later by
5458 * keeping track of global entries in shadow page tables.
5459 */
e29acc55 5460
55d2375e
SC
5461 /* fall-through */
5462 case INVPCID_TYPE_ALL_INCL_GLOBAL:
5463 kvm_mmu_unload(vcpu);
5464 return kvm_skip_emulated_instruction(vcpu);
e29acc55 5465
55d2375e
SC
5466 default:
5467 BUG(); /* We have already checked above that type <= 3 */
5468 }
e29acc55
JM
5469}
5470
55d2375e 5471static int handle_pml_full(struct kvm_vcpu *vcpu)
ec378aee 5472{
55d2375e 5473 unsigned long exit_qualification;
b3897a49 5474
55d2375e 5475 trace_kvm_pml_full(vcpu->vcpu_id);
b3897a49 5476
55d2375e 5477 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cbf71279
RK
5478
	/*
	 * If the PML-buffer-full exit occurred while executing IRET from an
	 * NMI, the "blocked by NMI" bit has to be set before the next VM
	 * entry.
	 */
55d2375e
SC
5483 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5484 enable_vnmi &&
5485 (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5486 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5487 GUEST_INTR_STATE_NMI);
e49fcb8b 5488
55d2375e
SC
	/*
	 * The PML buffer was already flushed at the beginning of the VM-exit;
	 * nothing more to do here, and no userspace involvement is needed.
	 */
ec378aee
NHE
5493 return 1;
5494}
5495
55d2375e 5496static int handle_preemption_timer(struct kvm_vcpu *vcpu)
8ca44e88 5497{
804939ea
SC
5498 struct vcpu_vmx *vmx = to_vmx(vcpu);
5499
5500 if (!vmx->req_immediate_exit &&
5501 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
55d2375e 5502 kvm_lapic_expired_hv_timer(vcpu);
804939ea 5503
55d2375e 5504 return 1;
8ca44e88
DM
5505}
5506
55d2375e
SC
/*
 * When nested=0, all VMX instruction VM-exits land here.  The handlers
 * are overwritten by nested_vmx_setup() when nested=1.
 */
5511static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
b8bbab92 5512{
55d2375e
SC
5513 kvm_queue_exception(vcpu, UD_VECTOR);
5514 return 1;
b8bbab92
VK
5515}
5516
55d2375e 5517static int handle_encls(struct kvm_vcpu *vcpu)
e7953d7f 5518{
55d2375e
SC
5519 /*
5520 * SGX virtualization is not yet supported. There is no software
5521 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
5522 * to prevent the guest from executing ENCLS.
5523 */
5524 kvm_queue_exception(vcpu, UD_VECTOR);
5525 return 1;
e7953d7f
AG
5526}
5527
ec378aee 5528/*
55d2375e
SC
5529 * The exit handlers return 1 if the exit was handled fully and guest execution
5530 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
5531 * to be done to userspace and return 0.
ec378aee 5532 */
55d2375e 5533static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
95b5a48c 5534 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
55d2375e
SC
5535 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
5536 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
5537 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
5538 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
5539 [EXIT_REASON_CR_ACCESS] = handle_cr,
5540 [EXIT_REASON_DR_ACCESS] = handle_dr,
f399e60c
AA
5541 [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
5542 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
5543 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
9dadc2f9 5544 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
f399e60c 5545 [EXIT_REASON_HLT] = kvm_emulate_halt,
55d2375e
SC
5546 [EXIT_REASON_INVD] = handle_invd,
5547 [EXIT_REASON_INVLPG] = handle_invlpg,
5548 [EXIT_REASON_RDPMC] = handle_rdpmc,
5549 [EXIT_REASON_VMCALL] = handle_vmcall,
5550 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
5551 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
5552 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
5553 [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
5554 [EXIT_REASON_VMREAD] = handle_vmx_instruction,
5555 [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
5556 [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
5557 [EXIT_REASON_VMOFF] = handle_vmx_instruction,
5558 [EXIT_REASON_VMON] = handle_vmx_instruction,
5559 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
5560 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
5561 [EXIT_REASON_APIC_WRITE] = handle_apic_write,
5562 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
5563 [EXIT_REASON_WBINVD] = handle_wbinvd,
5564 [EXIT_REASON_XSETBV] = handle_xsetbv,
5565 [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
5566 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
5567 [EXIT_REASON_GDTR_IDTR] = handle_desc,
5568 [EXIT_REASON_LDTR_TR] = handle_desc,
5569 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
5570 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
5571 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
5572 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
5573 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
5574 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
5575 [EXIT_REASON_INVEPT] = handle_vmx_instruction,
5576 [EXIT_REASON_INVVPID] = handle_vmx_instruction,
5577 [EXIT_REASON_RDRAND] = handle_invalid_op,
5578 [EXIT_REASON_RDSEED] = handle_invalid_op,
55d2375e
SC
5579 [EXIT_REASON_PML_FULL] = handle_pml_full,
5580 [EXIT_REASON_INVPCID] = handle_invpcid,
5581 [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
5582 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
5583 [EXIT_REASON_ENCLS] = handle_encls,
5584};
b8bbab92 5585
55d2375e
SC
5586static const int kvm_vmx_max_exit_handlers =
5587 ARRAY_SIZE(kvm_vmx_exit_handlers);
ec378aee 5588
55d2375e 5589static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
ec378aee 5590{
55d2375e
SC
5591 *info1 = vmcs_readl(EXIT_QUALIFICATION);
5592 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
ec378aee
NHE
5593}
5594
55d2375e 5595static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
27d6c865 5596{
55d2375e
SC
5597 if (vmx->pml_pg) {
5598 __free_page(vmx->pml_pg);
5599 vmx->pml_pg = NULL;
b8bbab92 5600 }
27d6c865
NHE
5601}
5602
55d2375e 5603static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
cd232ad0 5604{
55d2375e
SC
5605 struct vcpu_vmx *vmx = to_vmx(vcpu);
5606 u64 *pml_buf;
5607 u16 pml_idx;
cd232ad0 5608
55d2375e 5609 pml_idx = vmcs_read16(GUEST_PML_INDEX);
cd232ad0 5610
55d2375e
SC
5611 /* Do nothing if PML buffer is empty */
5612 if (pml_idx == (PML_ENTITY_NUM - 1))
5613 return;
cd232ad0 5614
55d2375e
SC
5615 /* PML index always points to next available PML buffer entity */
5616 if (pml_idx >= PML_ENTITY_NUM)
5617 pml_idx = 0;
5618 else
5619 pml_idx++;
945679e3 5620
55d2375e
SC
5621 pml_buf = page_address(vmx->pml_pg);
5622 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
5623 u64 gpa;
945679e3 5624
55d2375e
SC
5625 gpa = pml_buf[pml_idx];
5626 WARN_ON(gpa & (PAGE_SIZE - 1));
5627 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
945679e3
VK
5628 }
5629
55d2375e
SC
5630 /* reset PML index */
5631 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
945679e3
VK
5632}
5633
f4160e45 5634/*
55d2375e
SC
5635 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
5636 * Called before reporting dirty_bitmap to userspace.
f4160e45 5637 */
55d2375e 5638static void kvm_flush_pml_buffers(struct kvm *kvm)
49f705c5 5639{
55d2375e
SC
5640 int i;
5641 struct kvm_vcpu *vcpu;
	/*
	 * We only need to kick each vCPU out of guest mode here: the PML
	 * buffer is flushed at the beginning of every VM-exit, so only vCPUs
	 * currently running in guest mode can have unflushed GPAs in their
	 * PML buffers.
	 */
55d2375e
SC
5648 kvm_for_each_vcpu(i, vcpu, kvm)
5649 kvm_vcpu_kick(vcpu);
49f705c5
NHE
5650}
5651
55d2375e 5652static void vmx_dump_sel(char *name, uint32_t sel)
49f705c5 5653{
55d2375e
SC
5654 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
5655 name, vmcs_read16(sel),
5656 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
5657 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
5658 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
49f705c5
NHE
5659}
5660
55d2375e 5661static void vmx_dump_dtsel(char *name, uint32_t limit)
a8bc284e 5662{
55d2375e
SC
5663 pr_err("%s limit=0x%08x, base=0x%016lx\n",
5664 name, vmcs_read32(limit),
5665 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
a8bc284e
JM
5666}
5667
69090810 5668void dump_vmcs(void)
63846663 5669{
6f2f8453
PB
5670 u32 vmentry_ctl, vmexit_ctl;
5671 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
5672 unsigned long cr4;
5673 u64 efer;
55d2375e 5674 int i, n;
63846663 5675
6f2f8453
PB
5676 if (!dump_invalid_vmcs) {
5677 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
5678 return;
5679 }
5680
5681 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
5682 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
5683 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5684 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
5685 cr4 = vmcs_readl(GUEST_CR4);
5686 efer = vmcs_read64(GUEST_IA32_EFER);
5687 secondary_exec_control = 0;
55d2375e
SC
5688 if (cpu_has_secondary_exec_ctrls())
5689 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
14c07ad8 5690
55d2375e
SC
5691 pr_err("*** Guest State ***\n");
5692 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5693 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
5694 vmcs_readl(CR0_GUEST_HOST_MASK));
5695 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
5696 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
5697 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
5698 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
5699 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
5700 {
5701 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
5702 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
5703 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
5704 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
e9ac033e 5705 }
55d2375e
SC
5706 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
5707 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
5708 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
5709 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
5710 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5711 vmcs_readl(GUEST_SYSENTER_ESP),
5712 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
5713 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
5714 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
5715 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
5716 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
5717 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
5718 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
5719 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
5720 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
5721 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
5722 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
5723 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
5724 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
5725 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
5726 efer, vmcs_read64(GUEST_IA32_PAT));
5727 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
5728 vmcs_read64(GUEST_IA32_DEBUGCTL),
5729 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
5730 if (cpu_has_load_perf_global_ctrl() &&
5731 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
5732 pr_err("PerfGlobCtl = 0x%016llx\n",
5733 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
5734 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
5735 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
5736 pr_err("Interruptibility = %08x ActivityState = %08x\n",
5737 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
5738 vmcs_read32(GUEST_ACTIVITY_STATE));
5739 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
5740 pr_err("InterruptStatus = %04x\n",
5741 vmcs_read16(GUEST_INTR_STATUS));
ff651cb6 5742
55d2375e
SC
5743 pr_err("*** Host State ***\n");
5744 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
5745 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
5746 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
5747 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
5748 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
5749 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
5750 vmcs_read16(HOST_TR_SELECTOR));
5751 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
5752 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
5753 vmcs_readl(HOST_TR_BASE));
5754 pr_err("GDTBase=%016lx IDTBase=%016lx\n",
5755 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
5756 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
5757 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
5758 vmcs_readl(HOST_CR4));
5759 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
5760 vmcs_readl(HOST_IA32_SYSENTER_ESP),
5761 vmcs_read32(HOST_IA32_SYSENTER_CS),
5762 vmcs_readl(HOST_IA32_SYSENTER_EIP));
5763 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
5764 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
5765 vmcs_read64(HOST_IA32_EFER),
5766 vmcs_read64(HOST_IA32_PAT));
5767 if (cpu_has_load_perf_global_ctrl() &&
5768 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
5769 pr_err("PerfGlobCtl = 0x%016llx\n",
5770 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
ff651cb6 5771
55d2375e
SC
5772 pr_err("*** Control State ***\n");
5773 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
5774 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
5775 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
5776 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
5777 vmcs_read32(EXCEPTION_BITMAP),
5778 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
5779 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
5780 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
5781 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
5782 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
5783 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
5784 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
5785 vmcs_read32(VM_EXIT_INTR_INFO),
5786 vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5787 vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
5788 pr_err(" reason=%08x qualification=%016lx\n",
5789 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
5790 pr_err("IDTVectoring: info=%08x errcode=%08x\n",
5791 vmcs_read32(IDT_VECTORING_INFO_FIELD),
5792 vmcs_read32(IDT_VECTORING_ERROR_CODE));
5793 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
5794 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
5795 pr_err("TSC Multiplier = 0x%016llx\n",
5796 vmcs_read64(TSC_MULTIPLIER));
9d609649
PB
5797 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
5798 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
5799 u16 status = vmcs_read16(GUEST_INTR_STATUS);
5800 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
5801 }
d6a85c32 5802 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
9d609649
PB
5803 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
5804 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
d6a85c32 5805 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
9d609649 5806 }
55d2375e
SC
5807 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
5808 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
5809 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
5810 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
5811 n = vmcs_read32(CR3_TARGET_COUNT);
5812 for (i = 0; i + 1 < n; i += 4)
5813 pr_err("CR3 target%u=%016lx target%u=%016lx\n",
5814 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
5815 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
5816 if (i < n)
5817 pr_err("CR3 target%u=%016lx\n",
5818 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
5819 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
5820 pr_err("PLE Gap=%08x Window=%08x\n",
5821 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
5822 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
5823 pr_err("Virtual processor ID = 0x%04x\n",
5824 vmcs_read16(VIRTUAL_PROCESSOR_ID));
ff651cb6
WV
5825}
5826
55d2375e
SC
5827/*
5828 * The guest has exited. See if we can fix it or if we need userspace
5829 * assistance.
5830 */
1e9e2622
WL
5831static int vmx_handle_exit(struct kvm_vcpu *vcpu,
5832 enum exit_fastpath_completion exit_fastpath)
ff651cb6 5833{
55d2375e
SC
5834 struct vcpu_vmx *vmx = to_vmx(vcpu);
5835 u32 exit_reason = vmx->exit_reason;
5836 u32 vectoring_info = vmx->idt_vectoring_info;
ff651cb6 5837
55d2375e 5838 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
ff651cb6 5839
55d2375e
SC
	/*
	 * Flush the logged GPAs out of the PML buffer so that dirty_bitmap is
	 * kept up to date.  A further benefit: kvm_vm_ioctl_get_dirty_log()
	 * only needs to kick vCPUs out of guest mode before querying
	 * dirty_bitmap, because a vCPU in root mode has necessarily flushed
	 * its PML buffer already.
	 */
	if (enable_pml)
		vmx_flush_pml_buffer(vcpu);
1dc35dac 5849
55d2375e
SC
5850 /* If guest state is invalid, start emulating */
5851 if (vmx->emulation_required)
5852 return handle_invalid_guest_state(vcpu);
1dc35dac 5853
55d2375e
SC
5854 if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
5855 return nested_vmx_reflect_vmexit(vcpu, exit_reason);
9ed38ffa 5856
55d2375e
SC
5857 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
5858 dump_vmcs();
5859 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
5860 vcpu->run->fail_entry.hardware_entry_failure_reason
5861 = exit_reason;
5862 return 0;
9ed38ffa
LP
5863 }
5864
55d2375e 5865 if (unlikely(vmx->fail)) {
3b20e03a 5866 dump_vmcs();
55d2375e
SC
5867 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
5868 vcpu->run->fail_entry.hardware_entry_failure_reason
5869 = vmcs_read32(VM_INSTRUCTION_ERROR);
5870 return 0;
5871 }
50c28f21 5872
55d2375e
SC
	/*
	 * Note: do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused
	 * by event delivery, since it indicates that the guest is accessing
	 * MMIO.  The VM-exit would be triggered again after returning to the
	 * guest, causing an infinite loop.
	 */
5880 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
5881 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
5882 exit_reason != EXIT_REASON_EPT_VIOLATION &&
5883 exit_reason != EXIT_REASON_PML_FULL &&
5884 exit_reason != EXIT_REASON_TASK_SWITCH)) {
5885 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5886 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
5887 vcpu->run->internal.ndata = 3;
5888 vcpu->run->internal.data[0] = vectoring_info;
5889 vcpu->run->internal.data[1] = exit_reason;
5890 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
5891 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
5892 vcpu->run->internal.ndata++;
5893 vcpu->run->internal.data[3] =
5894 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5895 }
5896 return 0;
5897 }
50c28f21 5898
55d2375e
SC
5899 if (unlikely(!enable_vnmi &&
5900 vmx->loaded_vmcs->soft_vnmi_blocked)) {
5901 if (vmx_interrupt_allowed(vcpu)) {
5902 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
5903 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
5904 vcpu->arch.nmi_pending) {
			/*
			 * This CPU gives us no help in finding the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled.  So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about it.
			 */
5911 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
5912 "state on VCPU %d after 1 s timeout\n",
5913 __func__, vcpu->vcpu_id);
5914 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
5915 }
5916 }
50c28f21 5917
1e9e2622
WL
5918 if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
5919 kvm_skip_emulated_instruction(vcpu);
5920 return 1;
c926f2f7
MP
5921 }
5922
5923 if (exit_reason >= kvm_vmx_max_exit_handlers)
5924 goto unexpected_vmexit;
4289d272 5925#ifdef CONFIG_RETPOLINE
c926f2f7
MP
5926 if (exit_reason == EXIT_REASON_MSR_WRITE)
5927 return kvm_emulate_wrmsr(vcpu);
5928 else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
5929 return handle_preemption_timer(vcpu);
5930 else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
5931 return handle_interrupt_window(vcpu);
5932 else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
5933 return handle_external_interrupt(vcpu);
5934 else if (exit_reason == EXIT_REASON_HLT)
5935 return kvm_emulate_halt(vcpu);
5936 else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
5937 return handle_ept_misconfig(vcpu);
4289d272 5938#endif
c926f2f7
MP
5939
5940 exit_reason = array_index_nospec(exit_reason,
5941 kvm_vmx_max_exit_handlers);
5942 if (!kvm_vmx_exit_handlers[exit_reason])
5943 goto unexpected_vmexit;
5944
5945 return kvm_vmx_exit_handlers[exit_reason](vcpu);
5946
5947unexpected_vmexit:
5948 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
5949 dump_vmcs();
5950 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5951 vcpu->run->internal.suberror =
7396d337 5952 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
c926f2f7
MP
5953 vcpu->run->internal.ndata = 1;
5954 vcpu->run->internal.data[0] = exit_reason;
5955 return 0;
9ed38ffa
LP
5956}
5957
/*
 * Software-based L1D cache flush, used when the microcode providing the
 * cache-control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 * flushing it requires reading 64 KiB because the replacement algorithm is
 * not exactly LRU.  This could be sized at runtime via topology information,
 * but as all relevant affected CPUs have a 32 KiB L1D cache there is no
 * point in doing so.
 */
55d2375e 5968static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
fe3ef05c 5969{
55d2375e 5970 int size = PAGE_SIZE << L1D_CACHE_ORDER;
25a2e4fe
PB
5971
	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'.
	 */
55d2375e
SC
5976 if (static_branch_likely(&vmx_l1d_flush_cond)) {
5977 bool flush_l1d;
25a2e4fe 5978
55d2375e
SC
5979 /*
5980 * Clear the per-vcpu flush bit, it gets set again
5981 * either from vcpu_run() or from one of the unsafe
5982 * VMEXIT handlers.
5983 */
5984 flush_l1d = vcpu->arch.l1tf_flush_l1d;
5985 vcpu->arch.l1tf_flush_l1d = false;
25a2e4fe 5986
55d2375e
SC
5987 /*
5988 * Clear the per-cpu flush bit, it gets set again from
5989 * the interrupt handlers.
5990 */
5991 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
5992 kvm_clear_cpu_l1tf_flush_l1d();
25a2e4fe 5993
55d2375e
SC
5994 if (!flush_l1d)
5995 return;
5996 }
09abe320 5997
55d2375e 5998 vcpu->stat.l1d_flush++;
25a2e4fe 5999
55d2375e
SC
6000 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6001 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6002 return;
6003 }
25a2e4fe 6004
55d2375e
SC
6005 asm volatile(
6006 /* First ensure the pages are in the TLB */
6007 "xorl %%eax, %%eax\n"
6008 ".Lpopulate_tlb:\n\t"
6009 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6010 "addl $4096, %%eax\n\t"
6011 "cmpl %%eax, %[size]\n\t"
6012 "jne .Lpopulate_tlb\n\t"
6013 "xorl %%eax, %%eax\n\t"
6014 "cpuid\n\t"
6015 /* Now fill the cache */
6016 "xorl %%eax, %%eax\n"
6017 ".Lfill_cache:\n"
6018 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6019 "addl $64, %%eax\n\t"
6020 "cmpl %%eax, %[size]\n\t"
6021 "jne .Lfill_cache\n\t"
6022 "lfence\n"
6023 :: [flush_pages] "r" (vmx_l1d_flush_pages),
6024 [size] "r" (size)
6025 : "eax", "ebx", "ecx", "edx");
09abe320 6026}
25a2e4fe 6027
55d2375e 6028static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
09abe320 6029{
55d2375e 6030 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
132f4f7e 6031 int tpr_threshold;
09abe320 6032
55d2375e
SC
6033 if (is_guest_mode(vcpu) &&
6034 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6035 return;
25a2e4fe 6036
132f4f7e 6037 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
02d496cf
LA
6038 if (is_guest_mode(vcpu))
6039 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6040 else
6041 vmcs_write32(TPR_THRESHOLD, tpr_threshold);
8665c3f9
PB
6042}
6043
55d2375e 6044void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
8665c3f9 6045{
fe7f895d 6046 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 6047 u32 sec_exec_control;
8665c3f9 6048
55d2375e
SC
6049 if (!lapic_in_kernel(vcpu))
6050 return;
9314006d 6051
55d2375e
SC
6052 if (!flexpriority_enabled &&
6053 !cpu_has_vmx_virtualize_x2apic_mode())
6054 return;
705699a1 6055
55d2375e
SC
6056 /* Postpone execution until vmcs01 is the current VMCS. */
6057 if (is_guest_mode(vcpu)) {
fe7f895d 6058 vmx->nested.change_vmcs01_virtual_apic_mode = true;
55d2375e 6059 return;
6beb7bd5 6060 }
fe3ef05c 6061
fe7f895d 6062 sec_exec_control = secondary_exec_controls_get(vmx);
55d2375e
SC
6063 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6064 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
09abe320 6065
55d2375e
SC
6066 switch (kvm_get_apic_mode(vcpu)) {
6067 case LAPIC_MODE_INVALID:
6068 WARN_ONCE(true, "Invalid local APIC state");
6069 case LAPIC_MODE_DISABLED:
6070 break;
6071 case LAPIC_MODE_XAPIC:
6072 if (flexpriority_enabled) {
6073 sec_exec_control |=
6074 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6075 vmx_flush_tlb(vcpu, true);
6076 }
6077 break;
6078 case LAPIC_MODE_X2APIC:
6079 if (cpu_has_vmx_virtualize_x2apic_mode())
6080 sec_exec_control |=
6081 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6082 break;
09abe320 6083 }
fe7f895d 6084 secondary_exec_controls_set(vmx, sec_exec_control);
09abe320 6085
55d2375e
SC
6086 vmx_update_msr_bitmap(vcpu);
6087}
0238ea91 6088
55d2375e
SC
6089static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
6090{
6091 if (!is_guest_mode(vcpu)) {
6092 vmcs_write64(APIC_ACCESS_ADDR, hpa);
6093 vmx_flush_tlb(vcpu, true);
6094 }
6095}
fe3ef05c 6096
55d2375e
SC
6097static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6098{
6099 u16 status;
6100 u8 old;
32c7acf0 6101
55d2375e
SC
6102 if (max_isr == -1)
6103 max_isr = 0;
608406e2 6104
55d2375e
SC
6105 status = vmcs_read16(GUEST_INTR_STATUS);
6106 old = status >> 8;
6107 if (max_isr != old) {
6108 status &= 0xff;
6109 status |= max_isr << 8;
6110 vmcs_write16(GUEST_INTR_STATUS, status);
6111 }
6112}
6beb7bd5 6113
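/* Update RVI, the low byte of the guest interrupt status VMCS field. */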
55d2375e
SC
6114static void vmx_set_rvi(int vector)
6115{
6116 u16 status;
6117 u8 old;
0b665d30 6118
55d2375e
SC
6119 if (vector == -1)
6120 vector = 0;
fe3ef05c 6121
55d2375e
SC
6122 status = vmcs_read16(GUEST_INTR_STATUS);
6123 old = (u8)status & 0xff;
6124 if ((u8)vector != old) {
6125 status &= ~0xff;
6126 status |= (u8)vector;
6127 vmcs_write16(GUEST_INTR_STATUS, status);
09abe320 6128 }
55d2375e 6129}
09abe320 6130
55d2375e
SC
6131static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6132{
	/*
	 * When running L2, updating RVI is only relevant if vmcs12 has
	 * virtual-interrupt delivery enabled.  However, that feature can only
	 * be enabled when L1 also intercepts external interrupts, in which
	 * case we must not update vmcs02's RVI but instead intercept the
	 * interrupt.  Therefore, do nothing when running L2.
	 */
55d2375e
SC
6141 if (!is_guest_mode(vcpu))
6142 vmx_set_rvi(max_irr);
6143}
fe3ef05c 6144
55d2375e
SC
6145static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6146{
6147 struct vcpu_vmx *vmx = to_vmx(vcpu);
6148 int max_irr;
6149 bool max_irr_updated;
a7c0b07d 6150
55d2375e
SC
6151 WARN_ON(!vcpu->arch.apicv_active);
6152 if (pi_test_on(&vmx->pi_desc)) {
6153 pi_clear_on(&vmx->pi_desc);
6154 /*
d9ff2744 6155 * IOMMU can write to PID.ON, so the barrier matters even on UP.
55d2375e
SC
6156 * But on x86 this is just a compiler barrier anyway.
6157 */
6158 smp_mb__after_atomic();
6159 max_irr_updated =
6160 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
c4ebd629
VK
6161
6162 /*
55d2375e
SC
6163 * If we are running L2 and L1 has a new pending interrupt
6164 * which can be injected, we should re-evaluate
6165 * what should be done with this new L1 interrupt.
6166 * If L1 intercepts external-interrupts, we should
6167 * exit from L2 to L1. Otherwise, interrupt should be
6168 * delivered directly to L2.
c4ebd629 6169 */
55d2375e
SC
6170 if (is_guest_mode(vcpu) && max_irr_updated) {
6171 if (nested_exit_on_intr(vcpu))
6172 kvm_vcpu_exiting_guest_mode(vcpu);
6173 else
6174 kvm_make_request(KVM_REQ_EVENT, vcpu);
c4ebd629 6175 }
55d2375e
SC
6176 } else {
6177 max_irr = kvm_lapic_find_highest_irr(vcpu);
a7c0b07d 6178 }
55d2375e
SC
6179 vmx_hwapic_irr_update(vcpu, max_irr);
6180 return max_irr;
6181}
a7c0b07d 6182
17e433b5
WL
6183static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
6184{
9482ae45
JM
6185 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6186
6187 return pi_test_on(pi_desc) ||
29881b6e 6188 (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
17e433b5
WL
6189}
6190
55d2375e
SC
6191static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6192{
6193 if (!kvm_vcpu_apicv_active(vcpu))
6194 return;
25a2e4fe 6195
55d2375e
SC
6196 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6197 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6198 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6199 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
8665c3f9
PB
6200}
6201
55d2375e 6202static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
8665c3f9
PB
6203{
6204 struct vcpu_vmx *vmx = to_vmx(vcpu);
9d1887ef 6205
55d2375e
SC
6206 pi_clear_on(&vmx->pi_desc);
6207 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
6208}
8665c3f9 6209
95b5a48c 6210static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
55d2375e 6211{
beb8d93b 6212 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
fe3ef05c 6213
55d2375e 6214 /* if exit due to PF check for async PF */
d71f5e03 6215 if (is_page_fault(vmx->exit_intr_info)) {
55d2375e 6216 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
55d2375e 6217 /* Handle machine checks before interrupts are enabled */
d71f5e03 6218 } else if (is_machine_check(vmx->exit_intr_info)) {
55d2375e 6219 kvm_machine_check();
55d2375e 6220 /* We need to handle NMIs before interrupts are enabled */
d71f5e03 6221 } else if (is_nmi(vmx->exit_intr_info)) {
55d2375e
SC
6222 kvm_before_interrupt(&vmx->vcpu);
6223 asm("int $2");
6224 kvm_after_interrupt(&vmx->vcpu);
fe3ef05c 6225 }
55d2375e 6226}
fe3ef05c 6227
95b5a48c 6228static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
55d2375e 6229{
49def500
SC
6230 unsigned int vector;
6231 unsigned long entry;
55d2375e 6232#ifdef CONFIG_X86_64
49def500 6233 unsigned long tmp;
55d2375e 6234#endif
49def500
SC
6235 gate_desc *desc;
6236 u32 intr_info;
fe3ef05c 6237
49def500
SC
6238 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6239 if (WARN_ONCE(!is_external_intr(intr_info),
6240 "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
6241 return;
6242
6243 vector = intr_info & INTR_INFO_VECTOR_MASK;
2342080c 6244 desc = (gate_desc *)host_idt_base + vector;
49def500
SC
6245 entry = gate_offset(desc);
6246
165072b0
SC
6247 kvm_before_interrupt(vcpu);
6248
49def500 6249 asm volatile(
55d2375e 6250#ifdef CONFIG_X86_64
49def500
SC
6251 "mov %%" _ASM_SP ", %[sp]\n\t"
6252 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
6253 "push $%c[ss]\n\t"
6254 "push %[sp]\n\t"
55d2375e 6255#endif
49def500
SC
6256 "pushf\n\t"
6257 __ASM_SIZE(push) " $%c[cs]\n\t"
6258 CALL_NOSPEC
6259 :
55d2375e 6260#ifdef CONFIG_X86_64
49def500 6261 [sp]"=&r"(tmp),
55d2375e 6262#endif
49def500
SC
6263 ASM_CALL_CONSTRAINT
6264 :
6265 THUNK_TARGET(entry),
6266 [ss]"i"(__KERNEL_DS),
6267 [cs]"i"(__KERNEL_CS)
6268 );
165072b0
SC
6269
6270 kvm_after_interrupt(vcpu);
55d2375e 6271}
95b5a48c
SC
6272STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
6273
1e9e2622
WL
6274static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
6275 enum exit_fastpath_completion *exit_fastpath)
95b5a48c
SC
6276{
6277 struct vcpu_vmx *vmx = to_vmx(vcpu);
6278
6279 if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
6280 handle_external_interrupt_irqoff(vcpu);
6281 else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
6282 handle_exception_nmi_irqoff(vmx);
1e9e2622
WL
6283 else if (!is_guest_mode(vcpu) &&
6284 vmx->exit_reason == EXIT_REASON_MSR_WRITE)
6285 *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
95b5a48c 6286}
5a6a9748 6287
55d2375e
SC
6288static bool vmx_has_emulated_msr(int index)
6289{
6290 switch (index) {
6291 case MSR_IA32_SMBASE:
6292 /*
6293 * We cannot do SMM unless we can run the guest in big
6294 * real mode.
6295 */
6296 return enable_unrestricted_guest || emulate_invalid_guest_state;
95c5c7c7
PB
6297 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
6298 return nested;
55d2375e
SC
6299 case MSR_AMD64_VIRT_SPEC_CTRL:
6300 /* This is AMD only. */
6301 return false;
6302 default:
6303 return true;
3184a995 6304 }
55d2375e 6305}
2bb8cafe 6306
55d2375e
SC
6307static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
6308{
6309 u32 exit_intr_info;
6310 bool unblock_nmi;
6311 u8 vector;
6312 bool idtv_info_valid;
7ca29de2 6313
55d2375e 6314 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
feaf0c7d 6315
55d2375e
SC
6316 if (enable_vnmi) {
6317 if (vmx->loaded_vmcs->nmi_known_unmasked)
6318 return;
6319 /*
6320 * Can't use vmx->exit_intr_info since we're not sure what
6321 * the exit reason is.
6322 */
6323 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
6324 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
6325 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
6326 /*
6327 * SDM 3: 27.7.1.2 (September 2008)
6328 * Re-set bit "block by NMI" before VM entry if vmexit caused by
6329 * a guest IRET fault.
6330 * SDM 3: 23.2.2 (September 2008)
6331 * Bit 12 is undefined in any of the following cases:
6332 * If the VM exit sets the valid bit in the IDT-vectoring
6333 * information field.
6334 * If the VM exit is due to a double fault.
6335 */
6336 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
6337 vector != DF_VECTOR && !idtv_info_valid)
6338 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6339 GUEST_INTR_STATE_NMI);
6340 else
6341 vmx->loaded_vmcs->nmi_known_unmasked =
6342 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
6343 & GUEST_INTR_STATE_NMI);
6344 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
6345 vmx->loaded_vmcs->vnmi_blocked_time +=
6346 ktime_to_ns(ktime_sub(ktime_get(),
6347 vmx->loaded_vmcs->entry_time));
fe3ef05c
NHE
6348}
6349
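/*
 * Re-queue an event whose delivery was interrupted by a VM-exit (as recorded
 * in the IDT-vectoring info) so that it is injected again on the next VM entry.
 */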
55d2375e
SC
6350static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
6351 u32 idt_vectoring_info,
6352 int instr_len_field,
6353 int error_code_field)
0c7f650e 6354{
55d2375e
SC
6355 u8 vector;
6356 int type;
6357 bool idtv_info_valid;
0c7f650e 6358
55d2375e 6359 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
0c7f650e 6360
55d2375e
SC
6361 vcpu->arch.nmi_injected = false;
6362 kvm_clear_exception_queue(vcpu);
6363 kvm_clear_interrupt_queue(vcpu);
27c42a1b 6364
55d2375e
SC
6365 if (!idtv_info_valid)
6366 return;
c7c2c709 6367
55d2375e 6368 kvm_make_request(KVM_REQ_EVENT, vcpu);
ca0bde28 6369
55d2375e
SC
6370 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
6371 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
64a919f7 6372
55d2375e
SC
6373 switch (type) {
6374 case INTR_TYPE_NMI_INTR:
6375 vcpu->arch.nmi_injected = true;
6376 /*
6377 * SDM 3: 27.7.1.2 (September 2008)
6378 * Clear bit "block by NMI" before VM entry if a NMI
6379 * delivery faulted.
6380 */
6381 vmx_set_nmi_mask(vcpu, false);
6382 break;
6383 case INTR_TYPE_SOFT_EXCEPTION:
6384 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6385 /* fall through */
6386 case INTR_TYPE_HARD_EXCEPTION:
6387 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
6388 u32 err = vmcs_read32(error_code_field);
6389 kvm_requeue_exception_e(vcpu, vector, err);
6390 } else
6391 kvm_requeue_exception(vcpu, vector);
6392 break;
6393 case INTR_TYPE_SOFT_INTR:
6394 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
6395 /* fall through */
6396 case INTR_TYPE_EXT_INTR:
6397 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
6398 break;
6399 default:
6400 break;
0447378a 6401 }
ca0bde28
JM
6402}
6403
55d2375e 6404static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
f145d90d 6405{
55d2375e
SC
6406 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
6407 VM_EXIT_INSTRUCTION_LEN,
6408 IDT_VECTORING_ERROR_CODE);
f145d90d
LA
6409}
6410
55d2375e 6411static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
ca0bde28 6412{
55d2375e
SC
6413 __vmx_complete_interrupts(vcpu,
6414 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6415 VM_ENTRY_INSTRUCTION_LEN,
6416 VM_ENTRY_EXCEPTION_ERROR_CODE);
f1b026a3 6417
55d2375e 6418 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
ca0bde28
JM
6419}
6420
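/*
 * Ask perf which MSRs need separate guest/host values and program them via
 * the atomic VM-entry/VM-exit MSR switch lists, skipping MSRs whose guest
 * and host values are identical.
 */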
55d2375e 6421static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
52017608 6422{
55d2375e
SC
6423 int i, nr_msrs;
6424 struct perf_guest_switch_msr *msrs;
7c177938 6425
55d2375e 6426 msrs = perf_guest_get_msrs(&nr_msrs);
384bb783 6427
55d2375e
SC
6428 if (!msrs)
6429 return;
f1b026a3 6430
55d2375e
SC
6431 for (i = 0; i < nr_msrs; i++)
6432 if (msrs[i].host == msrs[i].guest)
6433 clear_atomic_switch_msr(vmx, msrs[i].msr);
6434 else
6435 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
6436 msrs[i].host, false);
ca0bde28 6437}
52017608 6438
6e3ba4ab
TX
6439static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
6440{
6441 u32 host_umwait_control;
6442
6443 if (!vmx_has_waitpkg(vmx))
6444 return;
6445
6446 host_umwait_control = get_umwait_control_msr();
6447
6448 if (vmx->msr_ia32_umwait_control != host_umwait_control)
6449 add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
6450 vmx->msr_ia32_umwait_control,
6451 host_umwait_control, false);
6452 else
6453 clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
6454}
6455
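/*
 * Program the VMX preemption timer: zero to force an immediate exit, the
 * scaled TSC delta when a timer deadline is armed, or the maximum value
 * (with hv_timer_soft_disabled set) so that a stale expiration is ignored.
 */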
55d2375e 6456static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
858e25c0
JM
6457{
6458 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
6459 u64 tscl;
6460 u32 delta_tsc;
52017608 6461
55d2375e 6462 if (vmx->req_immediate_exit) {
804939ea
SC
6463 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
6464 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6465 } else if (vmx->hv_deadline_tsc != -1) {
55d2375e
SC
6466 tscl = rdtsc();
6467 if (vmx->hv_deadline_tsc > tscl)
6468 /* set_hv_timer ensures the delta fits in 32-bits */
6469 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
6470 cpu_preemption_timer_multi);
6471 else
6472 delta_tsc = 0;
858e25c0 6473
804939ea
SC
6474 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
6475 vmx->loaded_vmcs->hv_timer_soft_disabled = false;
6476 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
6477 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
6478 vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7f7f1ba3 6479 }
858e25c0
JM
6480}
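/*
 * Worked example (assuming a preemption timer rate shift of 5 read from
 * MSR_IA32_VMX_MISC): if the deadline is 3,200,000 TSC ticks away, the
 * value programmed above is 3,200,000 >> 5 = 100,000. The hardware
 * decrements VMX_PREEMPTION_TIMER_VALUE once every 2^5 = 32 TSC ticks,
 * so the timer fires after roughly the requested 3,200,000 guest ticks.
 */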
6481
c09b03eb 6482void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
ca0bde28 6483{
c09b03eb
SC
6484 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
6485 vmx->loaded_vmcs->host_state.rsp = host_rsp;
6486 vmcs_writel(HOST_RSP, host_rsp);
6487 }
5ad6ece8 6488}
5f3d5799 6489
fc2ba5a2 6490bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
5ad6ece8
SC
6491
6492static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
6493{
6494 struct vcpu_vmx *vmx = to_vmx(vcpu);
6495 unsigned long cr3, cr4;
6496
6497 /* Record the guest's net vcpu time for enforced NMI injections. */
6498 if (unlikely(!enable_vnmi &&
6499 vmx->loaded_vmcs->soft_vnmi_blocked))
6500 vmx->loaded_vmcs->entry_time = ktime_get();
6501
6502 /* Don't enter VMX if guest state is invalid; let the exit handler
6503 start emulation until we get back to a valid state */
6504 if (vmx->emulation_required)
6505 return;
6506
6507 if (vmx->ple_window_dirty) {
6508 vmx->ple_window_dirty = false;
6509 vmcs_write32(PLE_WINDOW, vmx->ple_window);
6510 }
6511
c9dfd3fb 6512 /*
6513 * The vmcs12-to-shadow sync was already done in prepare_switch_to_guest(),
6514 * because it needs to run within srcu_read_lock.
6515 */
6516 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
5ad6ece8 6517
cb3c1e2f 6518 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
5ad6ece8 6519 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
cb3c1e2f 6520 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
5ad6ece8
SC
6521 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
6522
6523 cr3 = __get_current_cr3_fast();
6524 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
6525 vmcs_writel(HOST_CR3, cr3);
6526 vmx->loaded_vmcs->host_state.cr3 = cr3;
6527 }
6528
6529 cr4 = cr4_read_shadow();
6530 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
6531 vmcs_writel(HOST_CR4, cr4);
6532 vmx->loaded_vmcs->host_state.cr4 = cr4;
6533 }
6534
6535 /* When single-stepping over STI and MOV SS, we must clear the
6536 * corresponding interruptibility bits in the guest state. Otherwise
6537 * vmentry fails as it then expects bit 14 (BS) of the pending debug
6538 * exceptions field to be set, but that's not correct for the guest
6539 * debugging case. */
6540 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6541 vmx_set_interrupt_shadow(vcpu, 0);
6542
139a12cf 6543 kvm_load_guest_xsave_state(vcpu);
1811d979 6544
5ad6ece8
SC
6545 if (static_cpu_has(X86_FEATURE_PKU) &&
6546 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
6547 vcpu->arch.pkru != vmx->host_pkru)
6548 __write_pkru(vcpu->arch.pkru);
6549
6550 pt_guest_enter(vmx);
6551
6552 atomic_switch_perf_msrs(vmx);
6e3ba4ab 6553 atomic_switch_umwait_control_msr(vmx);
5ad6ece8 6554
804939ea
SC
6555 if (enable_preemption_timer)
6556 vmx_update_hv_timer(vcpu);
5ad6ece8 6557
b6c4bc65
WL
6558 if (lapic_in_kernel(vcpu) &&
6559 vcpu->arch.apic->lapic_timer.timer_advance_ns)
6560 kvm_wait_lapic_expire(vcpu);
6561
5ad6ece8
SC
6562 /*
6563 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
6564 * it's non-zero. Since vmentry is serialising on affected CPUs, there
6565 * is no need to worry about the conditional branch over the wrmsr
6566 * being speculatively taken.
6567 */
6568 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
6569
fa4bff16 6570 /* L1D Flush includes CPU buffer clear to mitigate MDS */
c823dd5c
SC
6571 if (static_branch_unlikely(&vmx_l1d_should_flush))
6572 vmx_l1d_flush(vcpu);
fa4bff16
LT
6573 else if (static_branch_unlikely(&mds_user_clear))
6574 mds_clear_cpu_buffers();
c823dd5c
SC
6575
6576 if (vcpu->arch.cr2 != read_cr2())
6577 write_cr2(vcpu->arch.cr2);
6578
fc2ba5a2
SC
6579 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
6580 vmx->loaded_vmcs->launched);
c823dd5c
SC
6581
6582 vcpu->arch.cr2 = read_cr2();
b6b8a145 6583
55d2375e
SC
6584 /*
6585 * We do not use IBRS in the kernel. If this vCPU has used the
6586 * SPEC_CTRL MSR it may have left it on; save the value and
6587 * turn it off. This is much more efficient than blindly adding
6588 * it to the atomic save/restore list, especially as the former
6589 * (saving guest MSRs on vmexit) doesn't even exist in KVM.
6590 *
6591 * For non-nested case:
6592 * If the L01 MSR bitmap does not intercept the MSR, then we need to
6593 * save it.
6594 *
6595 * For nested case:
6596 * If the L02 MSR bitmap does not intercept the MSR, then we need to
6597 * save it.
6598 */
6599 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
6600 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
b6b8a145 6601
55d2375e 6602 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
d264ee0c 6603
55d2375e
SC
6604 /* All fields are clean at this point */
6605 if (static_branch_unlikely(&enable_evmcs))
6606 current_evmcs->hv_clean_fields |=
6607 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
f4124500 6608
6f6a657c
VK
6609 if (static_branch_unlikely(&enable_evmcs))
6610 current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
6611
55d2375e
SC
6612 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
6613 if (vmx->host_debugctlmsr)
6614 update_debugctlmsr(vmx->host_debugctlmsr);
f4124500 6615
55d2375e
SC
6616#ifndef CONFIG_X86_64
6617 /*
6618 * The sysexit path does not restore ds/es, so we must set them to
6619 * a reasonable value ourselves.
6620 *
6621 * We can't defer this to vmx_prepare_switch_to_host() since that
6622 * function may be executed in interrupt context, which saves and
6623 * restores segments around it, nullifying its effect.
6624 */
6625 loadsegment(ds, __USER_DS);
6626 loadsegment(es, __USER_DS);
6627#endif
4704d0be 6628
55d2375e
SC
6629 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
6630 | (1 << VCPU_EXREG_RFLAGS)
6631 | (1 << VCPU_EXREG_PDPTR)
6632 | (1 << VCPU_EXREG_SEGMENTS)
6633 | (1 << VCPU_EXREG_CR3));
6634 vcpu->arch.regs_dirty = 0;
7854cbca 6635
2ef444f1
CP
6636 pt_guest_exit(vmx);
6637
3633cfc3 6638 /*
55d2375e
SC
6639 * Eager FPU is enabled when PKEY is supported, and the host CR4 has
6640 * already been restored, so it is safe to read the guest PKRU from
6641 * the current XSAVE area.
3633cfc3 6642 */
55d2375e
SC
6643 if (static_cpu_has(X86_FEATURE_PKU) &&
6644 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
c806e887 6645 vcpu->arch.pkru = rdpkru();
55d2375e
SC
6646 if (vcpu->arch.pkru != vmx->host_pkru)
6647 __write_pkru(vmx->host_pkru);
3633cfc3
NHE
6648 }
6649
139a12cf 6650 kvm_load_host_xsave_state(vcpu);
1811d979 6651
55d2375e
SC
6652 vmx->nested.nested_run_pending = 0;
6653 vmx->idt_vectoring_info = 0;
119a9c01 6654
55d2375e 6655 vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
beb8d93b
SC
6656 if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
6657 kvm_machine_check();
6658
55d2375e
SC
6659 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
6660 return;
608406e2 6661
55d2375e
SC
6662 vmx->loaded_vmcs->launched = 1;
6663 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
c18911a2 6664
55d2375e
SC
6665 vmx_recover_nmi_blocking(vmx);
6666 vmx_complete_interrupts(vmx);
6667}
2996fca0 6668
55d2375e 6669static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
cf8b84f4 6670{
55d2375e 6671 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be 6672
55d2375e
SC
6673 if (enable_pml)
6674 vmx_destroy_pml_buffer(vmx);
6675 free_vpid(vmx->vpid);
55d2375e
SC
6676 nested_vmx_free_vcpu(vcpu);
6677 free_loaded_vmcs(vmx->loaded_vmcs);
55d2375e 6678}
4704d0be 6679
987b2594 6680static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
55d2375e 6681{
41836839 6682 struct vcpu_vmx *vmx;
55d2375e 6683 unsigned long *msr_bitmap;
34109c04 6684 int i, cpu, err;
4704d0be 6685
a9dd6f09
SC
6686 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
6687 vmx = to_vmx(vcpu);
d9a710e5 6688
55d2375e 6689 err = -ENOMEM;
b666a4b6 6690
55d2375e 6691 vmx->vpid = allocate_vpid();
7cdc2d62 6692
5f3d5799 6693 /*
55d2375e
SC
6694 * If PML is turned on, failure to enable PML just results in failure
6695 * to create the vcpu, so we can simplify the PML logic (e.g. by
6696 * avoiding cases such as PML being enabled on only some of the
67b0ae43 6697 * guest's vcpus).
5f3d5799 6698 */
55d2375e 6699 if (enable_pml) {
41836839 6700 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
55d2375e 6701 if (!vmx->pml_pg)
987b2594 6702 goto free_vpid;
55d2375e 6703 }
4704d0be 6704
7d73710d 6705 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
4704d0be 6706
4be53410
XL
6707 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
6708 u32 index = vmx_msr_index[i];
6709 u32 data_low, data_high;
6710 int j = vmx->nmsrs;
6711
6712 if (rdmsr_safe(index, &data_low, &data_high) < 0)
6713 continue;
6714 if (wrmsr_safe(index, data_low, data_high) < 0)
6715 continue;
46f4f0aa 6716
4be53410
XL
6717 vmx->guest_msrs[j].index = i;
6718 vmx->guest_msrs[j].data = 0;
46f4f0aa
PB
6719 switch (index) {
6720 case MSR_IA32_TSX_CTRL:
6721 /*
6722 * No need to pass TSX_CTRL_CPUID_CLEAR through, so
6723 * let's avoid changing CPUID bits under the host
6724 * kernel's feet.
6725 */
6726 vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
6727 break;
6728 default:
6729 vmx->guest_msrs[j].mask = -1ull;
6730 break;
6731 }
4be53410
XL
6732 ++vmx->nmsrs;
6733 }
6734
55d2375e
SC
6735 err = alloc_loaded_vmcs(&vmx->vmcs01);
6736 if (err < 0)
7d73710d 6737 goto free_pml;
cb61de2f 6738
55d2375e 6739 msr_bitmap = vmx->vmcs01.msr_bitmap;
788fc1e9 6740 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
55d2375e
SC
6741 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
6742 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
6743 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
6744 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
6745 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
6746 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
987b2594 6747 if (kvm_cstate_in_guest(vcpu->kvm)) {
b5170063
WL
6748 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
6749 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
6750 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
6751 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
6752 }
55d2375e 6753 vmx->msr_bitmap_mode = 0;
4704d0be 6754
55d2375e
SC
6755 vmx->loaded_vmcs = &vmx->vmcs01;
6756 cpu = get_cpu();
34109c04
SC
6757 vmx_vcpu_load(vcpu, cpu);
6758 vcpu->cpu = cpu;
1b84292b 6759 init_vmcs(vmx);
34109c04 6760 vmx_vcpu_put(vcpu);
55d2375e 6761 put_cpu();
34109c04 6762 if (cpu_need_virtualize_apic_accesses(vcpu)) {
987b2594 6763 err = alloc_apic_access_page(vcpu->kvm);
55d2375e
SC
6764 if (err)
6765 goto free_vmcs;
6766 }
6767
6768 if (enable_ept && !enable_unrestricted_guest) {
987b2594 6769 err = init_rmode_identity_map(vcpu->kvm);
55d2375e
SC
6770 if (err)
6771 goto free_vmcs;
6772 }
4704d0be 6773
55d2375e
SC
6774 if (nested)
6775 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
a4443267 6776 vmx_capability.ept);
55d2375e
SC
6777 else
6778 memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
bd18bffc 6779
55d2375e
SC
6780 vmx->nested.posted_intr_nv = -1;
6781 vmx->nested.current_vmptr = -1ull;
bd18bffc 6782
bab0c318 6783 vcpu->arch.microcode_version = 0x100000000ULL;
32ad73db 6784 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
feaf0c7d 6785
6f1e03bc 6786 /*
55d2375e
SC
6787 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
6788 * or POSTED_INTR_WAKEUP_VECTOR.
6f1e03bc 6789 */
55d2375e
SC
6790 vmx->pi_desc.nv = POSTED_INTR_VECTOR;
6791 vmx->pi_desc.sn = 1;
4704d0be 6792
53963a70
LT
6793 vmx->ept_pointer = INVALID_PAGE;
6794
a9dd6f09 6795 return 0;
4704d0be 6796
55d2375e
SC
6797free_vmcs:
6798 free_loaded_vmcs(vmx->loaded_vmcs);
55d2375e
SC
6799free_pml:
6800 vmx_destroy_pml_buffer(vmx);
987b2594 6801free_vpid:
55d2375e 6802 free_vpid(vmx->vpid);
a9dd6f09 6803 return err;
55d2375e 6804}
36be0b9d 6805
65fd4cb6
TG
6806#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
6807#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
21feb4eb 6808
55d2375e
SC
6809static int vmx_vm_init(struct kvm *kvm)
6810{
6811 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
ff651cb6 6812
55d2375e
SC
6813 if (!ple_gap)
6814 kvm->arch.pause_in_guest = true;
3af18d9c 6815
55d2375e
SC
6816 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
6817 switch (l1tf_mitigation) {
6818 case L1TF_MITIGATION_OFF:
6819 case L1TF_MITIGATION_FLUSH_NOWARN:
6820 /* 'I explicitly don't care' is set */
6821 break;
6822 case L1TF_MITIGATION_FLUSH:
6823 case L1TF_MITIGATION_FLUSH_NOSMT:
6824 case L1TF_MITIGATION_FULL:
6825 /*
6826 * Warn upon starting the first VM in a potentially
6827 * insecure environment.
6828 */
b284909a 6829 if (sched_smt_active())
55d2375e
SC
6830 pr_warn_once(L1TF_MSG_SMT);
6831 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
6832 pr_warn_once(L1TF_MSG_L1D);
6833 break;
6834 case L1TF_MITIGATION_FULL_FORCE:
6835 /* Flush is enforced */
6836 break;
6837 }
6838 }
4e19c36f 6839 kvm_apicv_init(kvm, enable_apicv);
55d2375e 6840 return 0;
4704d0be
NHE
6841}
6842
f257d6dc 6843static int __init vmx_check_processor_compat(void)
bd18bffc 6844{
55d2375e
SC
6845 struct vmcs_config vmcs_conf;
6846 struct vmx_capability vmx_cap;
bd18bffc 6847
ff10e22e
SC
6848 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
6849 !this_cpu_has(X86_FEATURE_VMX)) {
6850 pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
6851 return -EIO;
6852 }
6853
55d2375e 6854 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
f257d6dc 6855 return -EIO;
55d2375e 6856 if (nested)
a4443267 6857 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
55d2375e
SC
6858 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
6859 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
6860 smp_processor_id());
f257d6dc 6861 return -EIO;
bd18bffc 6862 }
f257d6dc 6863 return 0;
bd18bffc
SC
6864}
6865
55d2375e 6866static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
bd18bffc 6867{
55d2375e
SC
6868 u8 cache;
6869 u64 ipat = 0;
bd18bffc 6870
222f06e7
CW
6871 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
6872 * memory aliases with conflicting memory types and sometimes MCEs.
6873 * We have to be careful as to what is honored and when.
6874 *
6875 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
6876 * UC. The effective memory type is UC or WC depending on guest PAT.
6877 * This was historically the source of MCEs and we want to be
6878 * conservative.
6879 *
6880 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
6881 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
6882 * EPT memory type is set to WB. The effective memory type is forced
6883 * WB.
6884 *
6885 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored. The
6886 * EPT memory type is used to emulate guest CD/MTRR.
bd18bffc 6887 */
222f06e7 6888
55d2375e
SC
6889 if (is_mmio) {
6890 cache = MTRR_TYPE_UNCACHABLE;
6891 goto exit;
6892 }
bd18bffc 6893
55d2375e
SC
6894 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
6895 ipat = VMX_EPT_IPAT_BIT;
6896 cache = MTRR_TYPE_WRBACK;
6897 goto exit;
6898 }
bd18bffc 6899
55d2375e
SC
6900 if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
6901 ipat = VMX_EPT_IPAT_BIT;
6902 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
6903 cache = MTRR_TYPE_WRBACK;
6904 else
6905 cache = MTRR_TYPE_UNCACHABLE;
6906 goto exit;
6907 }
bd18bffc 6908
55d2375e 6909 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
bd18bffc 6910
55d2375e
SC
6911exit:
6912 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
6913}
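/*
 * Illustrative encodings for the return value above (EPT memory type in
 * bits 5:3, "ignore PAT" in bit 6):
 *   MMIO:                    (MTRR_TYPE_UNCACHABLE << 3)        = 0x00
 *   no noncoherent DMA:      (MTRR_TYPE_WRBACK << 3) | IPAT     = 0x70
 *   CR0.CD, quirk disabled:  (MTRR_TYPE_UNCACHABLE << 3) | IPAT = 0x40
 */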
bd18bffc 6914
fe7f895d 6915static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
55d2375e 6916{
bd18bffc 6917 /*
55d2375e
SC
6918 * These bits in the secondary execution controls field
6919 * are dynamic; the others are mostly based on the hypervisor
6920 * architecture and the guest's CPUID. Do not touch the
6921 * dynamic bits.
bd18bffc 6922 */
55d2375e
SC
6923 u32 mask =
6924 SECONDARY_EXEC_SHADOW_VMCS |
6925 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6926 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6927 SECONDARY_EXEC_DESC;
bd18bffc 6928
fe7f895d
SC
6929 u32 new_ctl = vmx->secondary_exec_control;
6930 u32 cur_ctl = secondary_exec_controls_get(vmx);
bd18bffc 6931
fe7f895d 6932 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
bd18bffc
SC
6933}
6934
4704d0be 6935/*
55d2375e
SC
6936 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
6937 * (indicating "allowed-1") if they are supported in the guest's CPUID.
4704d0be 6938 */
55d2375e 6939static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
4704d0be
NHE
6940{
6941 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 6942 struct kvm_cpuid_entry2 *entry;
4704d0be 6943
55d2375e
SC
6944 vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
6945 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
e79f245d 6946
55d2375e
SC
6947#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
6948 if (entry && (entry->_reg & (_cpuid_mask))) \
6949 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
6950} while (0)
ff651cb6 6951
55d2375e 6952 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
87382003
SC
6953 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
6954 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
6955 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
6956 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
6957 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
6958 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
6959 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
6960 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
6961 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
6962 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
6963 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
6964 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
6965 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
6966 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
61ada748 6967
55d2375e 6968 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
87382003
SC
6969 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
6970 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
6971 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
6972 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
6973 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
6974 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
cf3215d9 6975
55d2375e
SC
6976#undef cr4_fixed1_update
6977}
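/*
 * For one representative bit, the cr4_fixed1_update() invocations above
 * expand to a guarded OR, e.g. for VME (with 'entry' pointing at the
 * guest's CPUID.0x1 leaf):
 *
 *	if (entry && (entry->edx & feature_bit(VME)))
 *		vmx->nested.msrs.cr4_fixed1 |= X86_CR4_VME;
 */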
36c3cc42 6978
55d2375e
SC
6979static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
6980{
6981 struct vcpu_vmx *vmx = to_vmx(vcpu);
f459a707 6982
55d2375e
SC
6983 if (kvm_mpx_supported()) {
6984 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
4704d0be 6985
55d2375e
SC
6986 if (mpx_enabled) {
6987 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
6988 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
6989 } else {
6990 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
6991 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
6992 }
dccbfcf5 6993 }
55d2375e 6994}
4704d0be 6995
6c0f0bba
LK
6996static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
6997{
6998 struct vcpu_vmx *vmx = to_vmx(vcpu);
6999 struct kvm_cpuid_entry2 *best = NULL;
7000 int i;
7001
7002 for (i = 0; i < PT_CPUID_LEAVES; i++) {
7003 best = kvm_find_cpuid_entry(vcpu, 0x14, i);
7004 if (!best)
7005 return;
7006 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7007 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7008 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7009 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7010 }
7011
7012 /* Get the number of configurable Address Ranges for filtering */
7013 vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
7014 PT_CAP_num_address_ranges);
7015
7016 /* Initialize ctl_bitmask and clear the bits that have no CPUID dependency */
7017 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7018 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
7019
7020 /*
7021 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7022 * setting it will inject a #GP
7023 */
7024 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7025 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7026
7027 /*
7028 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7029 * PSBFreq can be set
7030 */
7031 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7032 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7033 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7034
7035 /*
7036 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1, MTCEn, BranchEn and
7037 * MTCFreq can be set
7038 */
7039 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7040 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7041 RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
7042
7043 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7044 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7045 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7046 RTIT_CTL_PTW_EN);
7047
7048 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7049 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7050 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7051
7052 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7053 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7054 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7055
7056 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1, FabricEn can be set */
7057 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7058 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7059
7060 /* unmask the address range configuration area */
7061 for (i = 0; i < vmx->pt_desc.addr_range; i++)
d14eff1b 7062 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
6c0f0bba
LK
7063}
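/*
 * Worked example for the loop above: each address range has a 4-bit
 * ADDRn_CFG field in RTIT_CTL starting at bit 32. With addr_range == 2
 * the loop clears 0xf << 32 and 0xf << 36, i.e.
 * ctl_bitmask &= ~0xff00000000ULL, so the guest may program ADDR0_CFG
 * and ADDR1_CFG while unsupported higher ranges stay reserved.
 */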
7064
55d2375e
SC
7065static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
7066{
7067 struct vcpu_vmx *vmx = to_vmx(vcpu);
4704d0be 7068
7204160e
AL
7069 /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7070 vcpu->arch.xsaves_enabled = false;
7071
55d2375e
SC
7072 if (cpu_has_secondary_exec_ctrls()) {
7073 vmx_compute_secondary_exec_control(vmx);
fe7f895d 7074 vmcs_set_secondary_exec_control(vmx);
705699a1 7075 }
4704d0be 7076
55d2375e
SC
7077 if (nested_vmx_allowed(vcpu))
7078 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
32ad73db
SC
7079 FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7080 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
55d2375e
SC
7081 else
7082 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
32ad73db
SC
7083 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
7084 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
4f350c6d 7085
55d2375e
SC
7086 if (nested_vmx_allowed(vcpu)) {
7087 nested_vmx_cr_fixed1_bits_update(vcpu);
7088 nested_vmx_entry_exit_ctls_update(vcpu);
4f350c6d 7089 }
6c0f0bba
LK
7090
7091 if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7092 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7093 update_intel_pt_cfg(vcpu);
b07a5c53
PB
7094
7095 if (boot_cpu_has(X86_FEATURE_RTM)) {
7096 struct shared_msr_entry *msr;
7097 msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
7098 if (msr) {
7099 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7100 vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7101 }
7102 }
55d2375e 7103}
09abb5e3 7104
3ec6fd8c
SC
7105static __init void vmx_set_cpu_caps(void)
7106{
7107 kvm_set_cpu_caps();
7108
7109 /* CPUID 0x1 */
7110 if (nested)
7111 kvm_cpu_cap_set(X86_FEATURE_VMX);
7112
7113 /* CPUID 0x7 */
8721f5b0
SC
7114 if (kvm_mpx_supported())
7115 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7116 if (cpu_has_vmx_invpcid())
7117 kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
7118 if (vmx_pt_mode_is_host_guest())
7119 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
3ec6fd8c
SC
7120
7121 /* PKU is not yet implemented for shadow paging. */
8721f5b0
SC
7122 if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE))
7123 kvm_cpu_cap_check_and_set(X86_FEATURE_PKU);
3ec6fd8c 7124
90d2f60f
SC
7125 if (vmx_umip_emulated())
7126 kvm_cpu_cap_set(X86_FEATURE_UMIP);
7127
b3d895d5 7128 /* CPUID 0xD.1 */
408e9a31 7129 supported_xss = 0;
b3d895d5
SC
7130 if (!vmx_xsaves_supported())
7131 kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7132
3ec6fd8c
SC
7133 /* CPUID 0x80000001 */
7134 if (!cpu_has_vmx_rdtscp())
7135 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7136}
7137
55d2375e 7138static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
42124925 7139{
55d2375e 7140 to_vmx(vcpu)->req_immediate_exit = true;
7c177938
NHE
7141}
7142
35a57134
OU
7143static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7144 struct x86_instruction_info *info)
7145{
7146 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7147 unsigned short port;
7148 bool intercept;
7149 int size;
7150
7151 if (info->intercept == x86_intercept_in ||
7152 info->intercept == x86_intercept_ins) {
7153 port = info->src_val;
7154 size = info->dst_bytes;
7155 } else {
7156 port = info->dst_val;
7157 size = info->src_bytes;
7158 }
7159
7160 /*
7161 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7162 * VM-exits depend on the 'unconditional IO exiting' VM-execution
7163 * control.
7164 *
7165 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7166 */
7167 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7168 intercept = nested_cpu_has(vmcs12,
7169 CPU_BASED_UNCOND_IO_EXITING);
7170 else
7171 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7172
7173 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
7174}
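/*
 * Illustrative case (assuming the SDM's I/O-bitmap layout): when vmcs12
 * uses I/O bitmaps, bitmap A covers ports 0x0000-0x7fff, bitmap B covers
 * 0x8000-0xffff, and every byte of the access is checked. A 2-byte OUT
 * to port 0x7fff is therefore intercepted if the bit for 0x7fff in
 * bitmap A or the bit for 0x8000 in bitmap B is set, which is what
 * nested_vmx_check_io_bitmaps() evaluates above.
 */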
7175
8a76d7f2
JR
7176static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7177 struct x86_instruction_info *info,
21f1b8f2
SC
7178 enum x86_intercept_stage stage,
7179 struct x86_exception *exception)
8a76d7f2 7180{
fb6d4d34 7181 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
fb6d4d34 7182
35a57134 7183 switch (info->intercept) {
fb6d4d34
PB
7184 /*
7185 * RDPID causes #UD if disabled through secondary execution controls.
7186 * Because it is marked as EmulateOnUD, we need to intercept it here.
7187 */
35a57134
OU
7188 case x86_intercept_rdtscp:
7189 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
21f1b8f2
SC
7190 exception->vector = UD_VECTOR;
7191 exception->error_code_valid = false;
35a57134
OU
7192 return X86EMUL_PROPAGATE_FAULT;
7193 }
7194 break;
7195
7196 case x86_intercept_in:
7197 case x86_intercept_ins:
7198 case x86_intercept_out:
7199 case x86_intercept_outs:
7200 return vmx_check_intercept_io(vcpu, info);
fb6d4d34
PB
7201
7202 /* TODO: check more intercepts... */
35a57134
OU
7203 default:
7204 break;
7205 }
7206
07721fee 7207 return X86EMUL_UNHANDLEABLE;
8a76d7f2
JR
7208}
7209
64672c95
YJ
7210#ifdef CONFIG_X86_64
7211/* (a << shift) / divisor, returns 1 on overflow, otherwise 0 */
7212static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7213 u64 divisor, u64 *result)
7214{
7215 u64 low = a << shift, high = a >> (64 - shift);
7216
7217 /* To avoid the overflow on divq */
7218 if (high >= divisor)
7219 return 1;
7220
7221 /* Low holds the result, high holds the remainder, which is discarded */
7222 asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7223 "rm" (divisor), "0" (low), "1" (high));
7224 *result = low;
7225
7226 return 0;
7227}
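/*
 * Worked example: u64_shl_div_u64() computes the 128-bit quantity
 * (a * 2^shift) / divisor via divq. Converting a guest TSC delta to host
 * ticks with delta_tsc = 1000, shift = 48 and a scaling ratio of 2 << 48
 * (guest TSC running at twice the host rate) yields
 * 1000 * 2^48 / (2 * 2^48) = 500 host ticks. The function returns 1 only
 * when the true quotient would not fit in 64 bits (high >= divisor).
 */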
7228
f9927982
SC
7229static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
7230 bool *expired)
64672c95 7231{
386c6ddb 7232 struct vcpu_vmx *vmx;
c5ce8235 7233 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
39497d76 7234 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
386c6ddb 7235
0c5f81da
WL
7236 if (kvm_mwait_in_guest(vcpu->kvm) ||
7237 kvm_can_post_timer_interrupt(vcpu))
386c6ddb
KA
7238 return -EOPNOTSUPP;
7239
7240 vmx = to_vmx(vcpu);
7241 tscl = rdtsc();
7242 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
7243 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
39497d76
SC
7244 lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
7245 ktimer->timer_advance_ns);
c5ce8235
WL
7246
7247 if (delta_tsc > lapic_timer_advance_cycles)
7248 delta_tsc -= lapic_timer_advance_cycles;
7249 else
7250 delta_tsc = 0;
64672c95
YJ
7251
7252 /* Convert to host delta tsc if tsc scaling is enabled */
7253 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
0967fa1c 7254 delta_tsc && u64_shl_div_u64(delta_tsc,
64672c95 7255 kvm_tsc_scaling_ratio_frac_bits,
0967fa1c 7256 vcpu->arch.tsc_scaling_ratio, &delta_tsc))
64672c95
YJ
7257 return -ERANGE;
7258
7259 /*
7260 * If the delta tsc can't fit in 32 bits after the preemption timer
7261 * rate shift, we can't use the preemption timer.
7262 * It's possible that it fits on later vmentries, but checking
7263 * on every vmentry is costly so we just use an hrtimer.
7264 */
7265 if (delta_tsc >> (cpu_preemption_timer_multi + 32))
7266 return -ERANGE;
7267
7268 vmx->hv_deadline_tsc = tscl + delta_tsc;
f9927982
SC
7269 *expired = !delta_tsc;
7270 return 0;
64672c95
YJ
7271}
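/*
 * Worked example for the 32-bit check above: with a preemption timer
 * rate shift of 5, any delta_tsc of 2^37 ticks or more (roughly 69
 * seconds at a 2 GHz TSC) makes
 * delta_tsc >> (cpu_preemption_timer_multi + 32) non-zero, so
 * vmx_set_hv_timer() returns -ERANGE and the caller falls back to an
 * hrtimer instead of the VMX preemption timer.
 */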
7272
7273static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
7274{
f459a707 7275 to_vmx(vcpu)->hv_deadline_tsc = -1;
64672c95
YJ
7276}
7277#endif
7278
48d89b92 7279static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
ae97a3b8 7280{
b31c114b 7281 if (!kvm_pause_in_guest(vcpu->kvm))
b4a2d31d 7282 shrink_ple_window(vcpu);
ae97a3b8
RK
7283}
7284
843e4330
KH
7285static void vmx_slot_enable_log_dirty(struct kvm *kvm,
7286 struct kvm_memory_slot *slot)
7287{
3c9bd400
JZ
7288 if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
7289 kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
843e4330
KH
7290 kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
7291}
7292
7293static void vmx_slot_disable_log_dirty(struct kvm *kvm,
7294 struct kvm_memory_slot *slot)
7295{
7296 kvm_mmu_slot_set_dirty(kvm, slot);
7297}
7298
7299static void vmx_flush_log_dirty(struct kvm *kvm)
7300{
7301 kvm_flush_pml_buffers(kvm);
7302}
7303
c5f983f6
BD
7304static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
7305{
7306 struct vmcs12 *vmcs12;
7307 struct vcpu_vmx *vmx = to_vmx(vcpu);
3d5f6beb 7308 gpa_t gpa, dst;
c5f983f6
BD
7309
7310 if (is_guest_mode(vcpu)) {
7311 WARN_ON_ONCE(vmx->nested.pml_full);
7312
7313 /*
7314 * Check if PML is enabled for the nested guest.
7315 * Whether eptp bit 6 is set is already checked
7316 * as part of A/D emulation.
7317 */
7318 vmcs12 = get_vmcs12(vcpu);
7319 if (!nested_cpu_has_pml(vmcs12))
7320 return 0;
7321
4769886b 7322 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
c5f983f6
BD
7323 vmx->nested.pml_full = true;
7324 return 1;
7325 }
7326
7327 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
3d5f6beb 7328 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
c5f983f6 7329
3d5f6beb
KA
7330 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
7331 offset_in_page(dst), sizeof(gpa)))
c5f983f6
BD
7332 return 0;
7333
3d5f6beb 7334 vmcs12->guest_pml_index--;
c5f983f6
BD
7335 }
7336
7337 return 0;
7338}
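/*
 * Worked example for the nested PML write above: the L1-provided PML
 * buffer holds PML_ENTITY_NUM (512) 8-byte entries and guest_pml_index
 * counts down. With guest_pml_index == 511 the logged GPA lands at
 * offset 511 * 8 = 4088 of the page at vmcs12->pml_address and the index
 * becomes 510; once the index wraps past zero, the next write sets
 * nested.pml_full so a PML-full exit can be reflected to L1.
 */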
7339
843e4330
KH
7340static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
7341 struct kvm_memory_slot *memslot,
7342 gfn_t offset, unsigned long mask)
7343{
7344 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
7345}
7346
cd39e117
PB
7347static void __pi_post_block(struct kvm_vcpu *vcpu)
7348{
7349 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7350 struct pi_desc old, new;
7351 unsigned int dest;
cd39e117
PB
7352
7353 do {
7354 old.control = new.control = pi_desc->control;
8b306e2f
PB
7355 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
7356 "Wakeup handler not enabled while the VCPU is blocked\n");
cd39e117
PB
7357
7358 dest = cpu_physical_id(vcpu->cpu);
7359
7360 if (x2apic_enabled())
7361 new.ndst = dest;
7362 else
7363 new.ndst = (dest << 8) & 0xFF00;
7364
cd39e117
PB
7365 /* set 'NV' to 'notification vector' */
7366 new.nv = POSTED_INTR_VECTOR;
c0a1666b
PB
7367 } while (cmpxchg64(&pi_desc->control, old.control,
7368 new.control) != old.control);
cd39e117 7369
8b306e2f
PB
7370 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
7371 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
cd39e117 7372 list_del(&vcpu->blocked_vcpu_list);
8b306e2f 7373 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
cd39e117
PB
7374 vcpu->pre_pcpu = -1;
7375 }
7376}
7377
bf9f6ac8
FW
7378/*
7379 * This routine does the following for a vCPU that is about to block,
7380 * if VT-d PI is enabled:
7381 * - Store the vCPU on the wakeup list, so that when an interrupt
7382 * arrives we can find the right vCPU to wake up.
7383 * - Change the posted-interrupt descriptor as below:
7384 * 'NDST' <-- vcpu->pre_pcpu
7385 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
7386 * - If 'ON' is set during this process, meaning at least one
7387 * interrupt is posted for this vCPU, we cannot block it; in
7388 * this case return 1, otherwise return 0.
7389 *
7390 */
bc22512b 7391static int pi_pre_block(struct kvm_vcpu *vcpu)
bf9f6ac8 7392{
bf9f6ac8
FW
7393 unsigned int dest;
7394 struct pi_desc old, new;
7395 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
7396
7397 if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
a0052191
YZ
7398 !irq_remapping_cap(IRQ_POSTING_CAP) ||
7399 !kvm_vcpu_apicv_active(vcpu))
bf9f6ac8
FW
7400 return 0;
7401
8b306e2f
PB
7402 WARN_ON(irqs_disabled());
7403 local_irq_disable();
7404 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
7405 vcpu->pre_pcpu = vcpu->cpu;
7406 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
7407 list_add_tail(&vcpu->blocked_vcpu_list,
7408 &per_cpu(blocked_vcpu_on_cpu,
7409 vcpu->pre_pcpu));
7410 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
7411 }
bf9f6ac8
FW
7412
7413 do {
7414 old.control = new.control = pi_desc->control;
7415
bf9f6ac8
FW
7416 WARN((pi_desc->sn == 1),
7417 "Warning: SN field of posted-interrupts "
7418 "is set before blocking\n");
7419
7420 /*
7421 * Since the vCPU can be preempted during this process,
7422 * vcpu->cpu could differ from pre_pcpu, so we need to
7423 * set pre_pcpu as the destination of the wakeup
7424 * notification event; then we can find the right vCPU
7425 * to wake up in the wakeup handler if an interrupt arrives
7426 * while the vCPU is in the blocked state.
7427 */
7428 dest = cpu_physical_id(vcpu->pre_pcpu);
7429
7430 if (x2apic_enabled())
7431 new.ndst = dest;
7432 else
7433 new.ndst = (dest << 8) & 0xFF00;
7434
7435 /* set 'NV' to 'wakeup vector' */
7436 new.nv = POSTED_INTR_WAKEUP_VECTOR;
c0a1666b
PB
7437 } while (cmpxchg64(&pi_desc->control, old.control,
7438 new.control) != old.control);
bf9f6ac8 7439
8b306e2f
PB
7440 /* We should not block the vCPU if an interrupt is posted for it. */
7441 if (pi_test_on(pi_desc) == 1)
7442 __pi_post_block(vcpu);
7443
7444 local_irq_enable();
7445 return (vcpu->pre_pcpu == -1);
bf9f6ac8
FW
7446}
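/*
 * Illustrative NDST encoding used above and in __pi_post_block(): in
 * x2APIC mode the full 32-bit APIC ID is stored in NDST directly, while
 * in xAPIC mode the destination ID lives in bits 15:8, hence
 * new.ndst = (dest << 8) & 0xFF00; a physical APIC ID of 3 is stored
 * as 0x300.
 */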
7447
bc22512b
YJ
7448static int vmx_pre_block(struct kvm_vcpu *vcpu)
7449{
7450 if (pi_pre_block(vcpu))
7451 return 1;
7452
64672c95
YJ
7453 if (kvm_lapic_hv_timer_in_use(vcpu))
7454 kvm_lapic_switch_to_sw_timer(vcpu);
7455
bc22512b
YJ
7456 return 0;
7457}
7458
7459static void pi_post_block(struct kvm_vcpu *vcpu)
bf9f6ac8 7460{
8b306e2f 7461 if (vcpu->pre_pcpu == -1)
bf9f6ac8
FW
7462 return;
7463
8b306e2f
PB
7464 WARN_ON(irqs_disabled());
7465 local_irq_disable();
cd39e117 7466 __pi_post_block(vcpu);
8b306e2f 7467 local_irq_enable();
bf9f6ac8
FW
7468}
7469
bc22512b
YJ
7470static void vmx_post_block(struct kvm_vcpu *vcpu)
7471{
64672c95
YJ
7472 if (kvm_x86_ops->set_hv_timer)
7473 kvm_lapic_switch_to_hv_timer(vcpu);
7474
bc22512b
YJ
7475 pi_post_block(vcpu);
7476}
7477
efc64404
FW
7478/*
7479 * vmx_update_pi_irte - set IRTE for Posted-Interrupts
7480 *
7481 * @kvm: kvm
7482 * @host_irq: host irq of the interrupt
7483 * @guest_irq: gsi of the interrupt
7484 * @set: set or unset PI
7485 * returns 0 on success, < 0 on failure
7486 */
7487static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
7488 uint32_t guest_irq, bool set)
7489{
7490 struct kvm_kernel_irq_routing_entry *e;
7491 struct kvm_irq_routing_table *irq_rt;
7492 struct kvm_lapic_irq irq;
7493 struct kvm_vcpu *vcpu;
7494 struct vcpu_data vcpu_info;
3a8b0677 7495 int idx, ret = 0;
efc64404
FW
7496
7497 if (!kvm_arch_has_assigned_device(kvm) ||
a0052191
YZ
7498 !irq_remapping_cap(IRQ_POSTING_CAP) ||
7499 !kvm_vcpu_apicv_active(kvm->vcpus[0]))
efc64404
FW
7500 return 0;
7501
7502 idx = srcu_read_lock(&kvm->irq_srcu);
7503 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
3a8b0677
JS
7504 if (guest_irq >= irq_rt->nr_rt_entries ||
7505 hlist_empty(&irq_rt->map[guest_irq])) {
7506 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
7507 guest_irq, irq_rt->nr_rt_entries);
7508 goto out;
7509 }
efc64404
FW
7510
7511 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
7512 if (e->type != KVM_IRQ_ROUTING_MSI)
7513 continue;
7514 /*
7515 * VT-d PI cannot post multicast/broadcast
7516 * interrupts to a vCPU, so we still use interrupt remapping
7517 * for these kinds of interrupts.
7518 *
7519 * For lowest-priority interrupts, we only support
7520 * those with a single CPU as the destination, e.g. the user
7521 * configures the interrupts via /proc/irq or uses
7522 * irqbalance to make the interrupts single-CPU.
7523 *
7524 * We will support full lowest-priority interrupts later.
fdcf7562
AG
7525 *
7526 * In addition, we can only inject generic interrupts using
7527 * the PI mechanism; refuse to route others through it.
efc64404
FW
7528 */
7529
37131313 7530 kvm_set_msi_irq(kvm, e, &irq);
fdcf7562
AG
7531 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
7532 !kvm_irq_is_postable(&irq)) {
23a1c257
FW
7533 /*
7534 * Make sure the IRTE is in remapped mode if
7535 * we don't handle it in posted mode.
7536 */
7537 ret = irq_set_vcpu_affinity(host_irq, NULL);
7538 if (ret < 0) {
7539 printk(KERN_INFO
7540 "failed to back to remapped mode, irq: %u\n",
7541 host_irq);
7542 goto out;
7543 }
7544
efc64404 7545 continue;
23a1c257 7546 }
efc64404
FW
7547
7548 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
7549 vcpu_info.vector = irq.vector;
7550
2698d82e 7551 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
efc64404
FW
7552 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
7553
7554 if (set)
7555 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
dc91f2eb 7556 else
efc64404 7557 ret = irq_set_vcpu_affinity(host_irq, NULL);
efc64404
FW
7558
7559 if (ret < 0) {
7560 printk(KERN_INFO "%s: failed to update PI IRTE\n",
7561 __func__);
7562 goto out;
7563 }
7564 }
7565
7566 ret = 0;
7567out:
7568 srcu_read_unlock(&kvm->irq_srcu, idx);
7569 return ret;
7570}
7571
c45dcc71
AR
7572static void vmx_setup_mce(struct kvm_vcpu *vcpu)
7573{
7574 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
7575 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
32ad73db 7576 FEAT_CTL_LMCE_ENABLED;
c45dcc71
AR
7577 else
7578 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
32ad73db 7579 ~FEAT_CTL_LMCE_ENABLED;
c45dcc71
AR
7580}
7581
72d7b374
LP
7582static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
7583{
72e9cbdb
LP
7584 /* we need a nested vmexit to enter SMM, postpone if run is pending */
7585 if (to_vmx(vcpu)->nested.nested_run_pending)
7586 return 0;
72d7b374
LP
7587 return 1;
7588}
7589
0234bf88
LP
7590static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
7591{
72e9cbdb
LP
7592 struct vcpu_vmx *vmx = to_vmx(vcpu);
7593
7594 vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
7595 if (vmx->nested.smm.guest_mode)
7596 nested_vmx_vmexit(vcpu, -1, 0, 0);
7597
7598 vmx->nested.smm.vmxon = vmx->nested.vmxon;
7599 vmx->nested.vmxon = false;
caa057a2 7600 vmx_clear_hlt(vcpu);
0234bf88
LP
7601 return 0;
7602}
7603
ed19321f 7604static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
0234bf88 7605{
72e9cbdb
LP
7606 struct vcpu_vmx *vmx = to_vmx(vcpu);
7607 int ret;
7608
7609 if (vmx->nested.smm.vmxon) {
7610 vmx->nested.vmxon = true;
7611 vmx->nested.smm.vmxon = false;
7612 }
7613
7614 if (vmx->nested.smm.guest_mode) {
a633e41e 7615 ret = nested_vmx_enter_non_root_mode(vcpu, false);
72e9cbdb
LP
7616 if (ret)
7617 return ret;
7618
7619 vmx->nested.smm.guest_mode = false;
7620 }
0234bf88
LP
7621 return 0;
7622}
7623
cc3d967f
LP
7624static int enable_smi_window(struct kvm_vcpu *vcpu)
7625{
7626 return 0;
7627}
7628
05d5a486
SB
7629static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
7630{
9481b7f1 7631 return false;
05d5a486
SB
7632}
7633
4b9852f4
LA
7634static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
7635{
7636 return to_vmx(vcpu)->nested.vmxon;
7637}
7638
a3203381
SC
7639static __init int hardware_setup(void)
7640{
7641 unsigned long host_bndcfgs;
2342080c 7642 struct desc_ptr dt;
703c335d 7643 int r, i, ept_lpage_level;
a3203381 7644
2342080c
SC
7645 store_idt(&dt);
7646 host_idt_base = dt.address;
7647
a3203381
SC
7648 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
7649 kvm_define_shared_msr(i, vmx_msr_index[i]);
7650
7651 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
7652 return -EIO;
7653
7654 if (boot_cpu_has(X86_FEATURE_NX))
7655 kvm_enable_efer_bits(EFER_NX);
7656
7657 if (boot_cpu_has(X86_FEATURE_MPX)) {
7658 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
7659 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
7660 }
7661
7f5581f5 7662 if (!cpu_has_vmx_mpx())
cfc48181
SC
7663 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
7664 XFEATURE_MASK_BNDCSR);
7665
a3203381
SC
7666 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
7667 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
7668 enable_vpid = 0;
7669
7670 if (!cpu_has_vmx_ept() ||
7671 !cpu_has_vmx_ept_4levels() ||
7672 !cpu_has_vmx_ept_mt_wb() ||
7673 !cpu_has_vmx_invept_global())
7674 enable_ept = 0;
7675
7676 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
7677 enable_ept_ad_bits = 0;
7678
7679 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
7680 enable_unrestricted_guest = 0;
7681
7682 if (!cpu_has_vmx_flexpriority())
7683 flexpriority_enabled = 0;
7684
7685 if (!cpu_has_virtual_nmis())
7686 enable_vnmi = 0;
7687
7688 /*
7689 * set_apic_access_page_addr() is used to reload the APIC access
7690 * page upon invalidation. No need to do anything if not
7691 * using the APIC_ACCESS_ADDR VMCS field.
7692 */
7693 if (!flexpriority_enabled)
7694 kvm_x86_ops->set_apic_access_page_addr = NULL;
7695
7696 if (!cpu_has_vmx_tpr_shadow())
7697 kvm_x86_ops->update_cr8_intercept = NULL;
7698
a3203381
SC
7699#if IS_ENABLED(CONFIG_HYPERV)
7700 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
1f3a3e46
LT
7701 && enable_ept) {
7702 kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
7703 kvm_x86_ops->tlb_remote_flush_with_range =
7704 hv_remote_flush_tlb_with_range;
7705 }
a3203381
SC
7706#endif
7707
7708 if (!cpu_has_vmx_ple()) {
7709 ple_gap = 0;
7710 ple_window = 0;
7711 ple_window_grow = 0;
7712 ple_window_max = 0;
7713 ple_window_shrink = 0;
7714 }
7715
7716 if (!cpu_has_vmx_apicv()) {
7717 enable_apicv = 0;
7718 kvm_x86_ops->sync_pir_to_irr = NULL;
7719 }
7720
7721 if (cpu_has_vmx_tsc_scaling()) {
7722 kvm_has_tsc_control = true;
7723 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
7724 kvm_tsc_scaling_ratio_frac_bits = 48;
7725 }
7726
7727 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
7728
7729 if (enable_ept)
7730 vmx_enable_tdp();
703c335d
SC
7731
7732 if (!enable_ept)
7733 ept_lpage_level = 0;
7734 else if (cpu_has_vmx_ept_1g_page())
7735 ept_lpage_level = PT_PDPE_LEVEL;
7736 else if (cpu_has_vmx_ept_2m_page())
7737 ept_lpage_level = PT_DIRECTORY_LEVEL;
7738 else
7739 ept_lpage_level = PT_PAGE_TABLE_LEVEL;
7740 kvm_configure_mmu(enable_ept, ept_lpage_level);
a3203381 7741
a3203381
SC
7742 /*
7743 * Only enable PML when the hardware supports the PML feature, and both EPT
7744 * and EPT A/D bit features are enabled -- PML depends on them to work.
7745 */
7746 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
7747 enable_pml = 0;
7748
7749 if (!enable_pml) {
7750 kvm_x86_ops->slot_enable_log_dirty = NULL;
7751 kvm_x86_ops->slot_disable_log_dirty = NULL;
7752 kvm_x86_ops->flush_log_dirty = NULL;
7753 kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
7754 }
7755
7756 if (!cpu_has_vmx_preemption_timer())
804939ea 7757 enable_preemption_timer = false;
a3203381 7758
804939ea
SC
7759 if (enable_preemption_timer) {
7760 u64 use_timer_freq = 5000ULL * 1000 * 1000;
a3203381
SC
7761 u64 vmx_msr;
7762
7763 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
7764 cpu_preemption_timer_multi =
7765 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
804939ea
SC
7766
7767 if (tsc_khz)
7768 use_timer_freq = (u64)tsc_khz * 1000;
7769 use_timer_freq >>= cpu_preemption_timer_multi;
7770
7771 /*
7772 * KVM "disables" the preemption timer by setting it to its max
7773 * value. Don't use the timer if it might cause spurious exits
7774 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
7775 */
7776 if (use_timer_freq > 0xffffffffu / 10)
7777 enable_preemption_timer = false;
7778 }
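/*
 * Worked example for the check above (assuming a 2,496,000 kHz TSC and
 * a preemption timer rate shift of 5): use_timer_freq =
 * 2,496,000,000 >> 5 = 78 MHz, so a full 32-bit countdown takes
 * 0xffffffff / 78,000,000 ~= 55 seconds, well above the 10-second
 * (0.1 Hz) limit, and the preemption timer stays enabled; only rates
 * above ~429 MHz would disable it here.
 */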
7779
7780 if (!enable_preemption_timer) {
a3203381
SC
7781 kvm_x86_ops->set_hv_timer = NULL;
7782 kvm_x86_ops->cancel_hv_timer = NULL;
804939ea 7783 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
a3203381
SC
7784 }
7785
a3203381 7786 kvm_set_posted_intr_wakeup_handler(wakeup_handler);
a3203381
SC
7787
7788 kvm_mce_cap_supported |= MCG_LMCE_P;
7789
f99e3daf
CP
7790 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
7791 return -EINVAL;
7792 if (!enable_ept || !cpu_has_vmx_intel_pt())
7793 pt_mode = PT_MODE_SYSTEM;
7794
a3203381 7795 if (nested) {
3e8eaccc 7796 nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
a4443267 7797 vmx_capability.ept);
3e8eaccc 7798
e4027cfa 7799 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
a3203381
SC
7800 if (r)
7801 return r;
7802 }
7803
3ec6fd8c 7804 vmx_set_cpu_caps();
66a6950f 7805
a3203381
SC
7806 r = alloc_kvm_area();
7807 if (r)
7808 nested_vmx_hardware_unsetup();
7809 return r;
7810}
7811
7812static __exit void hardware_unsetup(void)
7813{
7814 if (nested)
7815 nested_vmx_hardware_unsetup();
7816
7817 free_kvm_area();
7818}
7819
ef8efd7a
SS
7820static bool vmx_check_apicv_inhibit_reasons(ulong bit)
7821{
f4fdc0a2
SS
7822 ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
7823 BIT(APICV_INHIBIT_REASON_HYPERV);
ef8efd7a
SS
7824
7825 return supported & BIT(bit);
7826}
7827
404f6aac 7828static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
6aa8b732
AK
7829 .cpu_has_kvm_support = cpu_has_kvm_support,
7830 .disabled_by_bios = vmx_disabled_by_bios,
7831 .hardware_setup = hardware_setup,
7832 .hardware_unsetup = hardware_unsetup,
002c7f7c 7833 .check_processor_compatibility = vmx_check_processor_compat,
6aa8b732
AK
7834 .hardware_enable = hardware_enable,
7835 .hardware_disable = hardware_disable,
04547156 7836 .cpu_has_accelerated_tpr = report_flexpriority,
bc226f07 7837 .has_emulated_msr = vmx_has_emulated_msr,
6aa8b732 7838
562b6b08 7839 .vm_size = sizeof(struct kvm_vmx),
b31c114b
WL
7840 .vm_init = vmx_vm_init,
7841
6aa8b732
AK
7842 .vcpu_create = vmx_create_vcpu,
7843 .vcpu_free = vmx_free_vcpu,
04d2cc77 7844 .vcpu_reset = vmx_vcpu_reset,
6aa8b732 7845
6d6095bd 7846 .prepare_guest_switch = vmx_prepare_switch_to_guest,
6aa8b732
AK
7847 .vcpu_load = vmx_vcpu_load,
7848 .vcpu_put = vmx_vcpu_put,
7849
a96036b8 7850 .update_bp_intercept = update_exception_bitmap,
801e459a 7851 .get_msr_feature = vmx_get_msr_feature,
6aa8b732
AK
7852 .get_msr = vmx_get_msr,
7853 .set_msr = vmx_set_msr,
7854 .get_segment_base = vmx_get_segment_base,
7855 .get_segment = vmx_get_segment,
7856 .set_segment = vmx_set_segment,
2e4d2653 7857 .get_cpl = vmx_get_cpl,
6aa8b732 7858 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
e8467fda 7859 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
25c4c276 7860 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
6aa8b732 7861 .set_cr0 = vmx_set_cr0,
6aa8b732 7862 .set_cr4 = vmx_set_cr4,
6aa8b732 7863 .set_efer = vmx_set_efer,
6aa8b732
AK
7864 .get_idt = vmx_get_idt,
7865 .set_idt = vmx_set_idt,
7866 .get_gdt = vmx_get_gdt,
7867 .set_gdt = vmx_set_gdt,
73aaf249
JK
7868 .get_dr6 = vmx_get_dr6,
7869 .set_dr6 = vmx_set_dr6,
020df079 7870 .set_dr7 = vmx_set_dr7,
81908bf4 7871 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
5fdbf976 7872 .cache_reg = vmx_cache_reg,
6aa8b732
AK
7873 .get_rflags = vmx_get_rflags,
7874 .set_rflags = vmx_set_rflags,
be94f6b7 7875
6aa8b732 7876 .tlb_flush = vmx_flush_tlb,
faff8758 7877 .tlb_flush_gva = vmx_flush_tlb_gva,
6aa8b732 7878
6aa8b732 7879 .run = vmx_vcpu_run,
6062d012 7880 .handle_exit = vmx_handle_exit,
5ef8acbd
OU
7881 .skip_emulated_instruction = vmx_skip_emulated_instruction,
7882 .update_emulated_instruction = vmx_update_emulated_instruction,
2809f5d2
GC
7883 .set_interrupt_shadow = vmx_set_interrupt_shadow,
7884 .get_interrupt_shadow = vmx_get_interrupt_shadow,
102d8325 7885 .patch_hypercall = vmx_patch_hypercall,
2a8067f1 7886 .set_irq = vmx_inject_irq,
95ba8273 7887 .set_nmi = vmx_inject_nmi,
298101da 7888 .queue_exception = vmx_queue_exception,
b463a6f7 7889 .cancel_injection = vmx_cancel_injection,
78646121 7890 .interrupt_allowed = vmx_interrupt_allowed,
95ba8273 7891 .nmi_allowed = vmx_nmi_allowed,
3cfc3092
JK
7892 .get_nmi_mask = vmx_get_nmi_mask,
7893 .set_nmi_mask = vmx_set_nmi_mask,
95ba8273
GN
7894 .enable_nmi_window = enable_nmi_window,
7895 .enable_irq_window = enable_irq_window,
7896 .update_cr8_intercept = update_cr8_intercept,
8d860bbe 7897 .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
38b99173 7898 .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
d62caabb 7899 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
c7c9c56c 7900 .load_eoi_exitmap = vmx_load_eoi_exitmap,
967235d3 7901 .apicv_post_state_restore = vmx_apicv_post_state_restore,
ef8efd7a 7902 .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
c7c9c56c
YZ
7903 .hwapic_irr_update = vmx_hwapic_irr_update,
7904 .hwapic_isr_update = vmx_hwapic_isr_update,
e6c67d8c 7905 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
a20ed54d
YZ
7906 .sync_pir_to_irr = vmx_sync_pir_to_irr,
7907 .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
17e433b5 7908 .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
95ba8273 7909
cbc94022 7910 .set_tss_addr = vmx_set_tss_addr,
2ac52ab8 7911 .set_identity_map_addr = vmx_set_identity_map_addr,
67253af5 7912 .get_tdp_level = get_ept_level,
4b12f0de 7913 .get_mt_mask = vmx_get_mt_mask,
229456fc 7914
586f9607 7915 .get_exit_info = vmx_get_exit_info,
586f9607 7916
0e851880 7917 .cpuid_update = vmx_cpuid_update,
f5f48ee1
SY
7918
7919 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
99e3e30a 7920
e79f245d 7921 .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
326e7425 7922 .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
1c97f0a0 7923
727a7e27
PB
7924 .load_mmu_pgd = vmx_load_mmu_pgd,
7925
8a76d7f2 7926 .check_intercept = vmx_check_intercept,
95b5a48c 7927 .handle_exit_irqoff = vmx_handle_exit_irqoff,
b6b8a145 7928
d264ee0c 7929 .request_immediate_exit = vmx_request_immediate_exit,
ae97a3b8
RK
7930
7931 .sched_in = vmx_sched_in,
843e4330
KH
7932
7933 .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
7934 .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
7935 .flush_log_dirty = vmx_flush_log_dirty,
7936 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
c5f983f6 7937 .write_log_dirty = vmx_write_pml_buffer,
25462f7f 7938
bf9f6ac8
FW
7939 .pre_block = vmx_pre_block,
7940 .post_block = vmx_post_block,
7941
25462f7f 7942 .pmu_ops = &intel_pmu_ops,
efc64404
FW
7943
7944 .update_pi_irte = vmx_update_pi_irte,
64672c95
YJ
7945
7946#ifdef CONFIG_X86_64
7947 .set_hv_timer = vmx_set_hv_timer,
7948 .cancel_hv_timer = vmx_cancel_hv_timer,
7949#endif
c45dcc71
AR
7950
7951 .setup_mce = vmx_setup_mce,
0234bf88 7952
72d7b374 7953 .smi_allowed = vmx_smi_allowed,
0234bf88
LP
7954 .pre_enter_smm = vmx_pre_enter_smm,
7955 .pre_leave_smm = vmx_pre_leave_smm,
cc3d967f 7956 .enable_smi_window = enable_smi_window,
57b119da 7957
e4027cfa
SC
7958 .check_nested_events = NULL,
7959 .get_nested_state = NULL,
7960 .set_nested_state = NULL,
7961 .get_vmcs12_pages = NULL,
7962 .nested_enable_evmcs = NULL,
ea152987 7963 .nested_get_evmcs_version = NULL,
05d5a486 7964 .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
4b9852f4 7965 .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
6aa8b732
AK
7966};
7967
72c6d2db 7968static void vmx_cleanup_l1d_flush(void)
a47dd5f0
PB
7969{
7970 if (vmx_l1d_flush_pages) {
7971 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
7972 vmx_l1d_flush_pages = NULL;
7973 }
72c6d2db
TG
7974 /* Restore state so sysfs ignores VMX */
7975 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
a399477e
KRW
7976}
7977
a7b9020b
TG
7978static void vmx_exit(void)
7979{
7980#ifdef CONFIG_KEXEC_CORE
7981 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
7982 synchronize_rcu();
7983#endif
7984
7985 kvm_exit();
7986
7987#if IS_ENABLED(CONFIG_HYPERV)
7988 if (static_branch_unlikely(&enable_evmcs)) {
7989 int cpu;
7990 struct hv_vp_assist_page *vp_ap;
7991 /*
7992 * Reset everything to support using non-enlightened VMCS
7993 * access later (e.g. when we reload the module with
7994 * enlightened_vmcs=0)
7995 */
7996 for_each_online_cpu(cpu) {
7997 vp_ap = hv_get_vp_assist_page(cpu);
7998
7999 if (!vp_ap)
8000 continue;
8001
6f6a657c 8002 vp_ap->nested_control.features.directhypercall = 0;
a7b9020b
TG
8003 vp_ap->current_nested_vmcs = 0;
8004 vp_ap->enlighten_vmentry = 0;
8005 }
8006
8007 static_branch_disable(&enable_evmcs);
8008 }
8009#endif
8010 vmx_cleanup_l1d_flush();
8011}
8012module_exit(vmx_exit);
8013
6aa8b732
AK
8014static int __init vmx_init(void)
8015{
773e8a04
VK
8016 int r;
8017
8018#if IS_ENABLED(CONFIG_HYPERV)
8019 /*
8020 * Enlightened VMCS usage has to be recommended by the hypervisor, and
8021 * the host needs to support eVMCS v1 or above. eVMCS support can also
8022 * be disabled with a module parameter.
8023 */
8024 if (enlightened_vmcs &&
8025 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
8026 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
8027 KVM_EVMCS_VERSION) {
8028 int cpu;
8029
8030 /* Check that we have assist pages on all online CPUs */
8031 for_each_online_cpu(cpu) {
8032 if (!hv_get_vp_assist_page(cpu)) {
8033 enlightened_vmcs = false;
8034 break;
8035 }
8036 }
8037
8038 if (enlightened_vmcs) {
8039 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
8040 static_branch_enable(&enable_evmcs);
8041 }
6f6a657c
VK
8042
8043 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
8044 vmx_x86_ops.enable_direct_tlbflush
8045 = hv_enable_direct_tlbflush;
8046
773e8a04
VK
8047 } else {
8048 enlightened_vmcs = false;
8049 }
8050#endif
8051
8052 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
a7b9020b 8053 __alignof__(struct vcpu_vmx), THIS_MODULE);
fdef3ad1 8054 if (r)
34a1cd60 8055 return r;
25c5f225 8056
a7b9020b 8057 /*
7db92e16
TG
8058 * Must be called after kvm_init() so enable_ept is properly set
8059 * up. Hand in the mitigation parameter value that was stored by
8060 * the pre-module-init parser. If no parameter was given, it will
8061 * contain 'auto', which will be turned into the default 'cond'
8062 * mitigation mode.
8063 */
19a36d32
WL
8064 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8065 if (r) {
8066 vmx_exit();
8067 return r;
a47dd5f0 8068 }
25c5f225 8069
2965faa5 8070#ifdef CONFIG_KEXEC_CORE
8f536b76
ZY
8071 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8072 crash_vmclear_local_loaded_vmcss);
8073#endif
21ebf53b 8074 vmx_check_vmcs12_offsets();
8f536b76 8075
fdef3ad1 8076 return 0;
6aa8b732 8077}
a7b9020b 8078module_init(vmx_init);