1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/objtool.h>
4#include <linux/percpu.h>
5
6#include <asm/debugreg.h>
7#include <asm/mmu_context.h>
8
9#include "cpuid.h"
10#include "hyperv.h"
11#include "mmu.h"
12#include "nested.h"
13#include "pmu.h"
14#include "sgx.h"
15#include "trace.h"
16#include "vmx.h"
17#include "x86.h"
18
19static bool __read_mostly enable_shadow_vmcs = 1;
20module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
21
22static bool __read_mostly nested_early_check = 0;
23module_param(nested_early_check, bool, S_IRUGO);
24
25#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
26
27/*
28 * Hyper-V requires all of these, so mark them as supported even though
29 * they are just treated the same as all-context.
30 */
31#define VMX_VPID_EXTENT_SUPPORTED_MASK \
32 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
33 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
34 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
35 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
36
37#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
38
39enum {
40 VMX_VMREAD_BITMAP,
41 VMX_VMWRITE_BITMAP,
42 VMX_BITMAP_NR
43};
44static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
45
46#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
47#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
48
49struct shadow_vmcs_field {
50 u16 encoding;
51 u16 offset;
52};
53static struct shadow_vmcs_field shadow_read_only_fields[] = {
54#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
55#include "vmcs_shadow_fields.h"
56};
57static int max_shadow_read_only_fields =
58 ARRAY_SIZE(shadow_read_only_fields);
59
60static struct shadow_vmcs_field shadow_read_write_fields[] = {
61#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
62#include "vmcs_shadow_fields.h"
63};
64static int max_shadow_read_write_fields =
65 ARRAY_SIZE(shadow_read_write_fields);
66
67static void init_vmcs_shadow_fields(void)
68{
69 int i, j;
70
71 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
72 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
73
74 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
75 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
76 u16 field = entry.encoding;
77
78 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
79 (i + 1 == max_shadow_read_only_fields ||
80 shadow_read_only_fields[i + 1].encoding != field + 1))
81 pr_err("Missing field from shadow_read_only_field %x\n",
82 field + 1);
83
84 clear_bit(field, vmx_vmread_bitmap);
85 if (field & 1)
86#ifdef CONFIG_X86_64
87 continue;
88#else
89 entry.offset += sizeof(u32);
90#endif
91 shadow_read_only_fields[j++] = entry;
92 }
93 max_shadow_read_only_fields = j;
94
95 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
96 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
97 u16 field = entry.encoding;
98
99 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
100 (i + 1 == max_shadow_read_write_fields ||
101 shadow_read_write_fields[i + 1].encoding != field + 1))
102 pr_err("Missing field from shadow_read_write_field %x\n",
103 field + 1);
104
105 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
106 field <= GUEST_TR_AR_BYTES,
107 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
108
109 /*
110 * PML and the preemption timer can be emulated, but the
111 * processor cannot vmwrite to fields that don't exist
112 * on bare metal.
113 */
114 switch (field) {
115 case GUEST_PML_INDEX:
116 if (!cpu_has_vmx_pml())
117 continue;
118 break;
119 case VMX_PREEMPTION_TIMER_VALUE:
120 if (!cpu_has_vmx_preemption_timer())
121 continue;
122 break;
123 case GUEST_INTR_STATUS:
124 if (!cpu_has_vmx_apicv())
125 continue;
126 break;
127 default:
128 break;
129 }
130
131 clear_bit(field, vmx_vmwrite_bitmap);
132 clear_bit(field, vmx_vmread_bitmap);
133 if (field & 1)
134#ifdef CONFIG_X86_64
135 continue;
136#else
137 entry.offset += sizeof(u32);
138#endif
139 shadow_read_write_fields[j++] = entry;
140 }
141 max_shadow_read_write_fields = j;
142}
143
144/*
145 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
146 * set the success or error code of an emulated VMX instruction (as specified
147 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
148 * instruction.
149 */
150static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
151{
152 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
153 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
154 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
155 return kvm_skip_emulated_instruction(vcpu);
156}
157
158static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
159{
160 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
161 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
162 X86_EFLAGS_SF | X86_EFLAGS_OF))
163 | X86_EFLAGS_CF);
164 return kvm_skip_emulated_instruction(vcpu);
165}
166
167static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
168 u32 vm_instruction_error)
169{
170 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
171 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
172 X86_EFLAGS_SF | X86_EFLAGS_OF))
173 | X86_EFLAGS_ZF);
174 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
175 /*
176 * We don't need to force sync to shadow VMCS because
177 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
178 * fields and thus must be synced.
179 */
180 if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
181 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
182
183 return kvm_skip_emulated_instruction(vcpu);
184}
185
186static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
187{
188 struct vcpu_vmx *vmx = to_vmx(vcpu);
189
190 /*
191 * failValid writes the error number to the current VMCS, which
192 * can't be done if there isn't a current VMCS.
193 */
194 if (vmx->nested.current_vmptr == -1ull &&
195 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
196 return nested_vmx_failInvalid(vcpu);
197
198 return nested_vmx_failValid(vcpu, vm_instruction_error);
199}
200
201static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
202{
203 /* TODO: not to reset guest simply here. */
204 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
205 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
206}
207
208static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
209{
210 return fixed_bits_valid(control, low, high);
211}
212
213static inline u64 vmx_control_msr(u32 low, u32 high)
214{
215 return low | ((u64)high << 32);
216}
217
218static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
219{
220 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
221 vmcs_write64(VMCS_LINK_POINTER, -1ull);
222 vmx->nested.need_vmcs12_to_shadow_sync = false;
223}
224
225static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
226{
227 struct vcpu_vmx *vmx = to_vmx(vcpu);
228
229 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
230 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
231 vmx->nested.hv_evmcs = NULL;
232 }
233
234 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
235}
236
237static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
238 struct loaded_vmcs *prev)
239{
240 struct vmcs_host_state *dest, *src;
241
242 if (unlikely(!vmx->guest_state_loaded))
243 return;
244
245 src = &prev->host_state;
246 dest = &vmx->loaded_vmcs->host_state;
247
248 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
249 dest->ldt_sel = src->ldt_sel;
250#ifdef CONFIG_X86_64
251 dest->ds_sel = src->ds_sel;
252 dest->es_sel = src->es_sel;
253#endif
254}
255
256static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
257{
258 struct vcpu_vmx *vmx = to_vmx(vcpu);
259 struct loaded_vmcs *prev;
260 int cpu;
261
262 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
263 return;
264
265 cpu = get_cpu();
266 prev = vmx->loaded_vmcs;
267 vmx->loaded_vmcs = vmcs;
268 vmx_vcpu_load_vmcs(vcpu, cpu, prev);
269 vmx_sync_vmcs_host_state(vmx, prev);
270 put_cpu();
271
272 vmx_register_cache_reset(vcpu);
273}
274
275/*
276 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
277 * just stops using VMX.
278 */
279static void free_nested(struct kvm_vcpu *vcpu)
280{
281 struct vcpu_vmx *vmx = to_vmx(vcpu);
282
283 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
284 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
285
286 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
287 return;
288
289 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
290
291 vmx->nested.vmxon = false;
292 vmx->nested.smm.vmxon = false;
293 free_vpid(vmx->nested.vpid02);
294 vmx->nested.posted_intr_nv = -1;
295 vmx->nested.current_vmptr = -1ull;
296 if (enable_shadow_vmcs) {
297 vmx_disable_shadow_vmcs(vmx);
298 vmcs_clear(vmx->vmcs01.shadow_vmcs);
299 free_vmcs(vmx->vmcs01.shadow_vmcs);
300 vmx->vmcs01.shadow_vmcs = NULL;
301 }
302 kfree(vmx->nested.cached_vmcs12);
303 vmx->nested.cached_vmcs12 = NULL;
304 kfree(vmx->nested.cached_shadow_vmcs12);
305 vmx->nested.cached_shadow_vmcs12 = NULL;
306 /* Unpin physical memory we referred to in the vmcs02 */
307 if (vmx->nested.apic_access_page) {
308 kvm_release_page_clean(vmx->nested.apic_access_page);
309 vmx->nested.apic_access_page = NULL;
310 }
311 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
312 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
313 vmx->nested.pi_desc = NULL;
314
315 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
316
317 nested_release_evmcs(vcpu);
318
319 free_loaded_vmcs(&vmx->nested.vmcs02);
320}
321
322/*
323 * Ensure that the current vmcs of the logical processor is the
324 * vmcs01 of the vcpu before calling free_nested().
325 */
326void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
327{
328 vcpu_load(vcpu);
b4b65b56 329 vmx_leave_nested(vcpu);
55d2375e
SC
330 vcpu_put(vcpu);
331}
332
333static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
334 struct x86_exception *fault)
335{
336 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
337 struct vcpu_vmx *vmx = to_vmx(vcpu);
338 u32 vm_exit_reason;
339 unsigned long exit_qualification = vcpu->arch.exit_qualification;
340
341 if (vmx->nested.pml_full) {
342 vm_exit_reason = EXIT_REASON_PML_FULL;
343 vmx->nested.pml_full = false;
344 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
345 } else if (fault->error_code & PFERR_RSVD_MASK)
346 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
347 else
348 vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
349
350 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
351 vmcs12->guest_physical_address = fault->address;
352}
353
354static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
355{
356 WARN_ON(mmu_is_nested(vcpu));
357
358 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
359 kvm_init_shadow_ept_mmu(vcpu,
360 to_vmx(vcpu)->nested.msrs.ept_caps &
361 VMX_EPT_EXECUTE_ONLY_BIT,
362 nested_ept_ad_enabled(vcpu),
363 nested_ept_get_eptp(vcpu));
364 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
365 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
366 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
367
368 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
369}
370
371static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
372{
373 vcpu->arch.mmu = &vcpu->arch.root_mmu;
374 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
375}
376
377static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
378 u16 error_code)
379{
380 bool inequality, bit;
381
382 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
383 inequality =
384 (error_code & vmcs12->page_fault_error_code_mask) !=
385 vmcs12->page_fault_error_code_match;
386 return inequality ^ bit;
387}
388
389
390/*
391 * KVM wants to inject page-faults which it got to the guest. This function
392 * checks whether in a nested guest, we need to inject them to L1 or L2.
393 */
394static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
395{
396 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
397 unsigned int nr = vcpu->arch.exception.nr;
398 bool has_payload = vcpu->arch.exception.has_payload;
399 unsigned long payload = vcpu->arch.exception.payload;
400
401 if (nr == PF_VECTOR) {
402 if (vcpu->arch.exception.nested_apf) {
403 *exit_qual = vcpu->arch.apf.nested_apf_token;
404 return 1;
405 }
406 if (nested_vmx_is_page_fault_vmexit(vmcs12,
407 vcpu->arch.exception.error_code)) {
408 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
409 return 1;
410 }
411 } else if (vmcs12->exception_bitmap & (1u << nr)) {
412 if (nr == DB_VECTOR) {
413 if (!has_payload) {
414 payload = vcpu->arch.dr6;
415 payload &= ~DR6_BT;
416 payload ^= DR6_ACTIVE_LOW;
417 }
418 *exit_qual = payload;
419 } else
420 *exit_qual = 0;
421 return 1;
422 }
423
424 return 0;
425}
426
427
428static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
429 struct x86_exception *fault)
430{
431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
432
433 WARN_ON(!is_guest_mode(vcpu));
434
435 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
436 !to_vmx(vcpu)->nested.nested_run_pending) {
437 vmcs12->vm_exit_intr_error_code = fault->error_code;
438 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
439 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
440 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
441 fault->address);
442 } else {
443 kvm_inject_page_fault(vcpu, fault);
444 }
445}
446
447static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
448 struct vmcs12 *vmcs12)
449{
450 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
451 return 0;
452
453 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
454 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
455 return -EINVAL;
456
457 return 0;
458}
459
460static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
461 struct vmcs12 *vmcs12)
462{
463 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
464 return 0;
465
466 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
467 return -EINVAL;
468
469 return 0;
470}
471
472static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
473 struct vmcs12 *vmcs12)
474{
475 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
476 return 0;
477
478 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
479 return -EINVAL;
480
481 return 0;
482}
483
484/*
485 * Check if MSR is intercepted for L01 MSR bitmap.
486 */
487static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
488{
489 unsigned long *msr_bitmap;
490 int f = sizeof(unsigned long);
491
492 if (!cpu_has_vmx_msr_bitmap())
493 return true;
494
495 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
496
497 if (msr <= 0x1fff) {
498 return !!test_bit(msr, msr_bitmap + 0x800 / f);
499 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
500 msr &= 0x1fff;
501 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
502 }
503
504 return true;
505}
506
507/*
508 * If a msr is allowed by L0, we should check whether it is allowed by L1.
509 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
510 */
511static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
512 unsigned long *msr_bitmap_nested,
513 u32 msr, int type)
514{
515 int f = sizeof(unsigned long);
516
517 /*
518 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
519 * have the write-low and read-high bitmap offsets the wrong way round.
520 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
521 */
522 if (msr <= 0x1fff) {
523 if (type & MSR_TYPE_R &&
524 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
525 /* read-low */
526 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
527
528 if (type & MSR_TYPE_W &&
529 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
530 /* write-low */
531 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
532
533 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
534 msr &= 0x1fff;
535 if (type & MSR_TYPE_R &&
536 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
537 /* read-high */
538 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
539
540 if (type & MSR_TYPE_W &&
541 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
542 /* write-high */
543 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
544
545 }
546}
547
548static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
549{
550 int msr;
551
552 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
553 unsigned word = msr / BITS_PER_LONG;
554
555 msr_bitmap[word] = ~0;
556 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
557 }
558}
559
560/*
561 * Merge L0's and L1's MSR bitmap, return false to indicate that
562 * we do not use the hardware.
563 */
564static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
565 struct vmcs12 *vmcs12)
566{
567 int msr;
568 unsigned long *msr_bitmap_l1;
569 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
570 struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
571
572 /* Nothing to do if the MSR bitmap is not in use. */
573 if (!cpu_has_vmx_msr_bitmap() ||
574 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
575 return false;
576
577 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
578 return false;
579
580 msr_bitmap_l1 = (unsigned long *)map->hva;
581
582 /*
583 * To keep the control flow simple, pay eight 8-byte writes (sixteen
584 * 4-byte writes on 32-bit systems) up front to enable intercepts for
585 * the x2APIC MSR range and selectively disable them below.
586 */
587 enable_x2apic_msr_intercepts(msr_bitmap_l0);
588
589 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
590 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
591 /*
592 * L0 need not intercept reads for MSRs between 0x800
593 * and 0x8ff, it just lets the processor take the value
594 * from the virtual-APIC page; take those 256 bits
595 * directly from the L1 bitmap.
596 */
597 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
598 unsigned word = msr / BITS_PER_LONG;
599
600 msr_bitmap_l0[word] = msr_bitmap_l1[word];
601 }
602 }
603
604 nested_vmx_disable_intercept_for_msr(
605 msr_bitmap_l1, msr_bitmap_l0,
606 X2APIC_MSR(APIC_TASKPRI),
607 MSR_TYPE_R | MSR_TYPE_W);
608
609 if (nested_cpu_has_vid(vmcs12)) {
610 nested_vmx_disable_intercept_for_msr(
611 msr_bitmap_l1, msr_bitmap_l0,
612 X2APIC_MSR(APIC_EOI),
613 MSR_TYPE_W);
614 nested_vmx_disable_intercept_for_msr(
615 msr_bitmap_l1, msr_bitmap_l0,
616 X2APIC_MSR(APIC_SELF_IPI),
617 MSR_TYPE_W);
618 }
619 }
620
621 /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
622#ifdef CONFIG_X86_64
623 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
624 MSR_FS_BASE, MSR_TYPE_RW);
625
626 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
627 MSR_GS_BASE, MSR_TYPE_RW);
628
629 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
630 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
631#endif
632
633 /*
634 * Checking the L0->L1 bitmap is trying to verify two things:
635 *
636 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
637 * ensures that we do not accidentally generate an L02 MSR bitmap
638 * from the L12 MSR bitmap that is too permissive.
639 * 2. That L1 or L2s have actually used the MSR. This avoids
640 * unnecessarily merging of the bitmap if the MSR is unused. This
641 * works properly because we only update the L01 MSR bitmap lazily.
642 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
643 * updated to reflect this when L1 (or its L2s) actually write to
644 * the MSR.
645 */
646 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
647 nested_vmx_disable_intercept_for_msr(
648 msr_bitmap_l1, msr_bitmap_l0,
649 MSR_IA32_SPEC_CTRL,
650 MSR_TYPE_R | MSR_TYPE_W);
651
652 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
653 nested_vmx_disable_intercept_for_msr(
654 msr_bitmap_l1, msr_bitmap_l0,
655 MSR_IA32_PRED_CMD,
656 MSR_TYPE_W);
657
658 kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
659
660 return true;
661}
662
663static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
664 struct vmcs12 *vmcs12)
665{
666 struct kvm_host_map map;
667 struct vmcs12 *shadow;
668
669 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
670 vmcs12->vmcs_link_pointer == -1ull)
671 return;
672
673 shadow = get_shadow_vmcs12(vcpu);
674
675 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
676 return;
677
678 memcpy(shadow, map.hva, VMCS12_SIZE);
679 kvm_vcpu_unmap(vcpu, &map, false);
680}
681
682static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
683 struct vmcs12 *vmcs12)
684{
685 struct vcpu_vmx *vmx = to_vmx(vcpu);
686
687 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
688 vmcs12->vmcs_link_pointer == -1ull)
689 return;
690
691 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
692 get_shadow_vmcs12(vcpu), VMCS12_SIZE);
693}
694
695/*
696 * In nested virtualization, check if L1 has set
697 * VM_EXIT_ACK_INTR_ON_EXIT
698 */
699static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
700{
701 return get_vmcs12(vcpu)->vm_exit_controls &
702 VM_EXIT_ACK_INTR_ON_EXIT;
703}
704
705static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
706 struct vmcs12 *vmcs12)
707{
708 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
709 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
710 return -EINVAL;
711 else
712 return 0;
713}
714
715static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
716 struct vmcs12 *vmcs12)
717{
718 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
719 !nested_cpu_has_apic_reg_virt(vmcs12) &&
720 !nested_cpu_has_vid(vmcs12) &&
721 !nested_cpu_has_posted_intr(vmcs12))
722 return 0;
723
724 /*
725 * If virtualize x2apic mode is enabled,
726 * virtualize apic access must be disabled.
727 */
728 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
729 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
730 return -EINVAL;
731
732 /*
733 * If virtual interrupt delivery is enabled,
734 * we must exit on external interrupts.
735 */
736 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
737 return -EINVAL;
738
739 /*
740 * bits 15:8 should be zero in posted_intr_nv,
741 * the descriptor address has been already checked
742 * in nested_get_vmcs12_pages.
743 *
744 * bits 5:0 of posted_intr_desc_addr should be zero.
745 */
746 if (nested_cpu_has_posted_intr(vmcs12) &&
747 (CC(!nested_cpu_has_vid(vmcs12)) ||
748 CC(!nested_exit_intr_ack_set(vcpu)) ||
749 CC((vmcs12->posted_intr_nv & 0xff00)) ||
750 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
751 return -EINVAL;
752
753 /* tpr shadow is needed by all apicv features. */
754 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
755 return -EINVAL;
756
757 return 0;
758}
759
760static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
761 u32 count, u64 addr)
762 {
763 if (count == 0)
764 return 0;
765
766 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
767 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
768 return -EINVAL;
769
770 return 0;
771}
772
773static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
774 struct vmcs12 *vmcs12)
775 {
776 if (CC(nested_vmx_check_msr_switch(vcpu,
777 vmcs12->vm_exit_msr_load_count,
778 vmcs12->vm_exit_msr_load_addr)) ||
779 CC(nested_vmx_check_msr_switch(vcpu,
780 vmcs12->vm_exit_msr_store_count,
781 vmcs12->vm_exit_msr_store_addr)))
782 return -EINVAL;
783
784 return 0;
785}
786
787static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
788 struct vmcs12 *vmcs12)
789{
790 if (CC(nested_vmx_check_msr_switch(vcpu,
791 vmcs12->vm_entry_msr_load_count,
792 vmcs12->vm_entry_msr_load_addr)))
793 return -EINVAL;
794
795 return 0;
796}
797
798static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
799 struct vmcs12 *vmcs12)
800{
801 if (!nested_cpu_has_pml(vmcs12))
802 return 0;
803
804 if (CC(!nested_cpu_has_ept(vmcs12)) ||
805 CC(!page_address_valid(vcpu, vmcs12->pml_address)))
806 return -EINVAL;
807
808 return 0;
809}
810
811static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
812 struct vmcs12 *vmcs12)
813{
814 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
815 !nested_cpu_has_ept(vmcs12)))
816 return -EINVAL;
817 return 0;
818}
819
820static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
821 struct vmcs12 *vmcs12)
822{
823 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
824 !nested_cpu_has_ept(vmcs12)))
825 return -EINVAL;
826 return 0;
827}
828
829static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
830 struct vmcs12 *vmcs12)
831{
832 if (!nested_cpu_has_shadow_vmcs(vmcs12))
833 return 0;
834
835 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
836 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
837 return -EINVAL;
838
839 return 0;
840}
841
842static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
843 struct vmx_msr_entry *e)
844{
845 /* x2APIC MSR accesses are not allowed */
846 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
847 return -EINVAL;
848 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
849 CC(e->index == MSR_IA32_UCODE_REV))
850 return -EINVAL;
851 if (CC(e->reserved != 0))
852 return -EINVAL;
853 return 0;
854}
855
856static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
857 struct vmx_msr_entry *e)
858{
859 if (CC(e->index == MSR_FS_BASE) ||
860 CC(e->index == MSR_GS_BASE) ||
861 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
862 nested_vmx_msr_check_common(vcpu, e))
863 return -EINVAL;
864 return 0;
865}
866
867static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
868 struct vmx_msr_entry *e)
869{
870 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
871 nested_vmx_msr_check_common(vcpu, e))
872 return -EINVAL;
873 return 0;
874}
875
876static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
877{
878 struct vcpu_vmx *vmx = to_vmx(vcpu);
879 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
880 vmx->nested.msrs.misc_high);
881
882 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
883}
884
885/*
886 * Load guest's/host's msr at nested entry/exit.
887 * return 0 for success, entry index for failure.
888 *
889 * One of the failure modes for MSR load/store is when a list exceeds the
890 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
891 * as possible, process all valid entries before failing rather than precheck
892 * for a capacity violation.
893 */
894static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
895{
896 u32 i;
897 struct vmx_msr_entry e;
898 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
899
900 for (i = 0; i < count; i++) {
901 if (unlikely(i >= max_msr_list_size))
902 goto fail;
903
904 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
905 &e, sizeof(e))) {
906 pr_debug_ratelimited(
907 "%s cannot read MSR entry (%u, 0x%08llx)\n",
908 __func__, i, gpa + i * sizeof(e));
909 goto fail;
910 }
911 if (nested_vmx_load_msr_check(vcpu, &e)) {
912 pr_debug_ratelimited(
913 "%s check failed (%u, 0x%x, 0x%x)\n",
914 __func__, i, e.index, e.reserved);
915 goto fail;
916 }
917 if (kvm_set_msr(vcpu, e.index, e.value)) {
918 pr_debug_ratelimited(
919 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
920 __func__, i, e.index, e.value);
921 goto fail;
922 }
923 }
924 return 0;
925fail:
926 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
927 return i + 1;
928}
929
930static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
931 u32 msr_index,
932 u64 *data)
933{
934 struct vcpu_vmx *vmx = to_vmx(vcpu);
935
936 /*
937 * If the L0 hypervisor stored a more accurate value for the TSC that
938 * does not include the time taken for emulation of the L2->L1
939 * VM-exit in L0, use the more accurate value.
940 */
941 if (msr_index == MSR_IA32_TSC) {
942 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
943 MSR_IA32_TSC);
944
945 if (i >= 0) {
946 u64 val = vmx->msr_autostore.guest.val[i].value;
947
948 *data = kvm_read_l1_tsc(vcpu, val);
949 return true;
950 }
951 }
952
953 if (kvm_get_msr(vcpu, msr_index, data)) {
954 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
955 msr_index);
956 return false;
957 }
958 return true;
959}
960
961static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
962 struct vmx_msr_entry *e)
963{
964 if (kvm_vcpu_read_guest(vcpu,
965 gpa + i * sizeof(*e),
966 e, 2 * sizeof(u32))) {
967 pr_debug_ratelimited(
968 "%s cannot read MSR entry (%u, 0x%08llx)\n",
969 __func__, i, gpa + i * sizeof(*e));
970 return false;
971 }
972 if (nested_vmx_store_msr_check(vcpu, e)) {
973 pr_debug_ratelimited(
974 "%s check failed (%u, 0x%x, 0x%x)\n",
975 __func__, i, e->index, e->reserved);
976 return false;
977 }
978 return true;
979}
980
981static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
982{
983 u64 data;
984 u32 i;
985 struct vmx_msr_entry e;
986 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
987
988 for (i = 0; i < count; i++) {
989 if (unlikely(i >= max_msr_list_size))
990 return -EINVAL;
991
992 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
993 return -EINVAL;
994
995 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
996 return -EINVAL;
997
998 if (kvm_vcpu_write_guest(vcpu,
999 gpa + i * sizeof(e) +
1000 offsetof(struct vmx_msr_entry, value),
1001 &data, sizeof(data))) {
1002 pr_debug_ratelimited(
1003 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1004 __func__, i, e.index, data);
1005 return -EINVAL;
1006 }
1007 }
1008 return 0;
1009}
1010
1011static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1012{
1013 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1014 u32 count = vmcs12->vm_exit_msr_store_count;
1015 u64 gpa = vmcs12->vm_exit_msr_store_addr;
1016 struct vmx_msr_entry e;
1017 u32 i;
1018
1019 for (i = 0; i < count; i++) {
1020 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1021 return false;
1022
1023 if (e.index == msr_index)
1024 return true;
1025 }
1026 return false;
1027}
1028
1029static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1030 u32 msr_index)
1031{
1032 struct vcpu_vmx *vmx = to_vmx(vcpu);
1033 struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1034 bool in_vmcs12_store_list;
1035 int msr_autostore_slot;
1036 bool in_autostore_list;
1037 int last;
1038
1039 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1040 in_autostore_list = msr_autostore_slot >= 0;
1041 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1042
1043 if (in_vmcs12_store_list && !in_autostore_list) {
1044 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1045 /*
1046 * Emulated VMEntry does not fail here. Instead a less
1047 * accurate value will be returned by
1048 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1049 * instead of reading the value from the vmcs02 VMExit
1050 * MSR-store area.
1051 */
1052 pr_warn_ratelimited(
1053 "Not enough msr entries in msr_autostore. Can't add msr %x\n",
1054 msr_index);
1055 return;
1056 }
1057 last = autostore->nr++;
1058 autostore->val[last].index = msr_index;
1059 } else if (!in_vmcs12_store_list && in_autostore_list) {
1060 last = --autostore->nr;
1061 autostore->val[msr_autostore_slot] = autostore->val[last];
1062 }
1063}
1064
1065/*
1066 * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
1067 * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
1068 * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
1069 * Here's why.
1070 *
1071 * If EPT is enabled by L0 a sync is never needed:
1072 * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
1073 * cannot be unsync'd SPTEs for either L1 or L2.
1074 *
1075 * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
1076 * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
1077 * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
1078 * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
1079 * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
1080 *
1081 * If EPT is disabled by L0:
1082 * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
1083 * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
1084 * required to invalidate linear mappings (EPT is disabled so there are
1085 * no combined or guest-physical mappings), i.e. L1 can't rely on the
1086 * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
1087 *
1088 * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
1089 * linear mappings (EPT is disabled so there are no combined or guest-physical
1090 * mappings) to be invalidated on both VM-Enter and VM-Exit.
1091 *
1092 * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
1093 * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
1094 * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
1095 * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
1096 * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
1097 * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
1098 * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
1099 * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
1100 * stale TLB entries, at which point L0 will sync L2's MMU.
1101 */
1102static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
1103{
1104 return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
1105}
1106
1107/*
1108 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1109 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1110 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1111 * @entry_failure_code.
1112 */
1113static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1114 bool nested_ept, bool reload_pdptrs,
1115 enum vm_entry_failure_code *entry_failure_code)
1116 {
1117 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
1118 *entry_failure_code = ENTRY_FAIL_DEFAULT;
1119 return -EINVAL;
1120 }
1121
1122 /*
1123 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1124 * must not be dereferenced.
1125 */
1126 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1127 CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1128 *entry_failure_code = ENTRY_FAIL_PDPTE;
1129 return -EINVAL;
1130 }
1131
1132 /*
1133 * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
1134 * flushes are handled by nested_vmx_transition_tlb_flush(). See
1135 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
1136 */
1137 if (!nested_ept)
1138 kvm_mmu_new_pgd(vcpu, cr3, true,
1139 !nested_vmx_transition_mmu_sync(vcpu));
1140
1141 vcpu->arch.cr3 = cr3;
1142 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1143
1144 kvm_init_mmu(vcpu, false);
1145
1146 return 0;
1147}
1148
1149/*
1150 * Returns if KVM is able to config CPU to tag TLB entries
1151 * populated by L2 differently than TLB entries populated
1152 * by L1.
1153 *
1154 * If L0 uses EPT, L1 and L2 run with different EPTP because
1155 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1156 * are tagged with different EPTP.
1157 *
1158 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1159 * with different VPID (L1 entries are tagged with vmx->vpid
1160 * while L2 entries are tagged with vmx->nested.vpid02).
1161 */
1162static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1163{
1164 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1165
1166 return enable_ept ||
1167 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1168}
1169
1170static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1171 struct vmcs12 *vmcs12,
1172 bool is_vmenter)
1173{
1174 struct vcpu_vmx *vmx = to_vmx(vcpu);
1175
1176 /*
1177 * If VPID is disabled, linear and combined mappings are flushed on
1178 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
1179 * their associated EPTP.
1180 */
1181 if (!enable_vpid)
1182 return;
1183
1184 /*
1185 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1186 * for *all* contexts to be flushed on VM-Enter/VM-Exit.
1187 *
1188 * If VPID is enabled and used by vmc12, but L2 does not have a unique
1189 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
1190 * a VPID for L2, flush the current context as the effective ASID is
1191 * common to both L1 and L2.
1192 *
1193 * Defer the flush so that it runs after vmcs02.EPTP has been set by
1194 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
1195 * redundant flushes further down the nested pipeline.
1196 *
1197 * If a TLB flush isn't required due to any of the above, and vpid12 is
1198 * changing then the new "virtual" VPID (vpid12) will reuse the same
1199 * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
1200 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
1201 * all nested vCPUs.
1202 */
1203 if (!nested_cpu_has_vpid(vmcs12)) {
1204 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1205 } else if (!nested_has_guest_tlb_tag(vcpu)) {
1206 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1207 } else if (is_vmenter &&
1208 vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1209 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1210 vpid_sync_context(nested_get_vpid02(vcpu));
1211 }
1212}
1213
1214static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1215{
1216 superset &= mask;
1217 subset &= mask;
1218
1219 return (superset | subset) == superset;
1220}
1221
1222static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1223{
1224 const u64 feature_and_reserved =
1225 /* feature (except bit 48; see below) */
1226 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1227 /* reserved */
1228 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1229 u64 vmx_basic = vmx->nested.msrs.basic;
1230
1231 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1232 return -EINVAL;
1233
1234 /*
1235 * KVM does not emulate a version of VMX that constrains physical
1236 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1237 */
1238 if (data & BIT_ULL(48))
1239 return -EINVAL;
1240
1241 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1242 vmx_basic_vmcs_revision_id(data))
1243 return -EINVAL;
1244
1245 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1246 return -EINVAL;
1247
1248 vmx->nested.msrs.basic = data;
1249 return 0;
1250}
1251
1252static int
1253vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1254{
1255 u64 supported;
1256 u32 *lowp, *highp;
1257
1258 switch (msr_index) {
1259 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1260 lowp = &vmx->nested.msrs.pinbased_ctls_low;
1261 highp = &vmx->nested.msrs.pinbased_ctls_high;
1262 break;
1263 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1264 lowp = &vmx->nested.msrs.procbased_ctls_low;
1265 highp = &vmx->nested.msrs.procbased_ctls_high;
1266 break;
1267 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1268 lowp = &vmx->nested.msrs.exit_ctls_low;
1269 highp = &vmx->nested.msrs.exit_ctls_high;
1270 break;
1271 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1272 lowp = &vmx->nested.msrs.entry_ctls_low;
1273 highp = &vmx->nested.msrs.entry_ctls_high;
1274 break;
1275 case MSR_IA32_VMX_PROCBASED_CTLS2:
1276 lowp = &vmx->nested.msrs.secondary_ctls_low;
1277 highp = &vmx->nested.msrs.secondary_ctls_high;
1278 break;
1279 default:
1280 BUG();
1281 }
1282
1283 supported = vmx_control_msr(*lowp, *highp);
1284
1285 /* Check must-be-1 bits are still 1. */
1286 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1287 return -EINVAL;
1288
1289 /* Check must-be-0 bits are still 0. */
1290 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1291 return -EINVAL;
1292
1293 *lowp = data;
1294 *highp = data >> 32;
1295 return 0;
1296}
1297
1298static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1299{
1300 const u64 feature_and_reserved_bits =
1301 /* feature */
1302 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1303 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1304 /* reserved */
1305 GENMASK_ULL(13, 9) | BIT_ULL(31);
1306 u64 vmx_misc;
1307
1308 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1309 vmx->nested.msrs.misc_high);
1310
1311 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1312 return -EINVAL;
1313
1314 if ((vmx->nested.msrs.pinbased_ctls_high &
1315 PIN_BASED_VMX_PREEMPTION_TIMER) &&
1316 vmx_misc_preemption_timer_rate(data) !=
1317 vmx_misc_preemption_timer_rate(vmx_misc))
1318 return -EINVAL;
1319
1320 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1321 return -EINVAL;
1322
1323 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1324 return -EINVAL;
1325
1326 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1327 return -EINVAL;
1328
1329 vmx->nested.msrs.misc_low = data;
1330 vmx->nested.msrs.misc_high = data >> 32;
1331
1332 return 0;
1333}
1334
1335static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1336{
1337 u64 vmx_ept_vpid_cap;
1338
1339 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1340 vmx->nested.msrs.vpid_caps);
1341
1342 /* Every bit is either reserved or a feature bit. */
1343 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1344 return -EINVAL;
1345
1346 vmx->nested.msrs.ept_caps = data;
1347 vmx->nested.msrs.vpid_caps = data >> 32;
1348 return 0;
1349}
1350
1351static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1352{
1353 u64 *msr;
1354
1355 switch (msr_index) {
1356 case MSR_IA32_VMX_CR0_FIXED0:
1357 msr = &vmx->nested.msrs.cr0_fixed0;
1358 break;
1359 case MSR_IA32_VMX_CR4_FIXED0:
1360 msr = &vmx->nested.msrs.cr4_fixed0;
1361 break;
1362 default:
1363 BUG();
1364 }
1365
1366 /*
1367 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1368 * must be 1 in the restored value.
1369 */
1370 if (!is_bitwise_subset(data, *msr, -1ULL))
1371 return -EINVAL;
1372
1373 *msr = data;
1374 return 0;
1375}
1376
1377/*
1378 * Called when userspace is restoring VMX MSRs.
1379 *
1380 * Returns 0 on success, non-0 otherwise.
1381 */
1382int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1383{
1384 struct vcpu_vmx *vmx = to_vmx(vcpu);
1385
1386 /*
1387 * Don't allow changes to the VMX capability MSRs while the vCPU
1388 * is in VMX operation.
1389 */
1390 if (vmx->nested.vmxon)
1391 return -EBUSY;
1392
1393 switch (msr_index) {
1394 case MSR_IA32_VMX_BASIC:
1395 return vmx_restore_vmx_basic(vmx, data);
1396 case MSR_IA32_VMX_PINBASED_CTLS:
1397 case MSR_IA32_VMX_PROCBASED_CTLS:
1398 case MSR_IA32_VMX_EXIT_CTLS:
1399 case MSR_IA32_VMX_ENTRY_CTLS:
1400 /*
1401 * The "non-true" VMX capability MSRs are generated from the
1402 * "true" MSRs, so we do not support restoring them directly.
1403 *
1404 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1405 * should restore the "true" MSRs with the must-be-1 bits
1406 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1407 * DEFAULT SETTINGS".
1408 */
1409 return -EINVAL;
1410 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1411 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1412 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1413 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1414 case MSR_IA32_VMX_PROCBASED_CTLS2:
1415 return vmx_restore_control_msr(vmx, msr_index, data);
1416 case MSR_IA32_VMX_MISC:
1417 return vmx_restore_vmx_misc(vmx, data);
1418 case MSR_IA32_VMX_CR0_FIXED0:
1419 case MSR_IA32_VMX_CR4_FIXED0:
1420 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1421 case MSR_IA32_VMX_CR0_FIXED1:
1422 case MSR_IA32_VMX_CR4_FIXED1:
1423 /*
1424 * These MSRs are generated based on the vCPU's CPUID, so we
1425 * do not support restoring them directly.
1426 */
1427 return -EINVAL;
1428 case MSR_IA32_VMX_EPT_VPID_CAP:
1429 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1430 case MSR_IA32_VMX_VMCS_ENUM:
1431 vmx->nested.msrs.vmcs_enum = data;
1432 return 0;
1433 case MSR_IA32_VMX_VMFUNC:
1434 if (data & ~vmx->nested.msrs.vmfunc_controls)
1435 return -EINVAL;
1436 vmx->nested.msrs.vmfunc_controls = data;
1437 return 0;
1438 default:
1439 /*
1440 * The rest of the VMX capability MSRs do not support restore.
1441 */
1442 return -EINVAL;
1443 }
1444}
1445
1446/* Returns 0 on success, non-0 otherwise. */
1447int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1448{
1449 switch (msr_index) {
1450 case MSR_IA32_VMX_BASIC:
1451 *pdata = msrs->basic;
1452 break;
1453 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1454 case MSR_IA32_VMX_PINBASED_CTLS:
1455 *pdata = vmx_control_msr(
1456 msrs->pinbased_ctls_low,
1457 msrs->pinbased_ctls_high);
1458 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1459 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1460 break;
1461 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1462 case MSR_IA32_VMX_PROCBASED_CTLS:
1463 *pdata = vmx_control_msr(
1464 msrs->procbased_ctls_low,
1465 msrs->procbased_ctls_high);
1466 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1467 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1468 break;
1469 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1470 case MSR_IA32_VMX_EXIT_CTLS:
1471 *pdata = vmx_control_msr(
1472 msrs->exit_ctls_low,
1473 msrs->exit_ctls_high);
1474 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1475 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1476 break;
1477 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1478 case MSR_IA32_VMX_ENTRY_CTLS:
1479 *pdata = vmx_control_msr(
1480 msrs->entry_ctls_low,
1481 msrs->entry_ctls_high);
1482 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1483 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1484 break;
1485 case MSR_IA32_VMX_MISC:
1486 *pdata = vmx_control_msr(
1487 msrs->misc_low,
1488 msrs->misc_high);
1489 break;
1490 case MSR_IA32_VMX_CR0_FIXED0:
1491 *pdata = msrs->cr0_fixed0;
1492 break;
1493 case MSR_IA32_VMX_CR0_FIXED1:
1494 *pdata = msrs->cr0_fixed1;
1495 break;
1496 case MSR_IA32_VMX_CR4_FIXED0:
1497 *pdata = msrs->cr4_fixed0;
1498 break;
1499 case MSR_IA32_VMX_CR4_FIXED1:
1500 *pdata = msrs->cr4_fixed1;
1501 break;
1502 case MSR_IA32_VMX_VMCS_ENUM:
1503 *pdata = msrs->vmcs_enum;
1504 break;
1505 case MSR_IA32_VMX_PROCBASED_CTLS2:
1506 *pdata = vmx_control_msr(
1507 msrs->secondary_ctls_low,
1508 msrs->secondary_ctls_high);
1509 break;
1510 case MSR_IA32_VMX_EPT_VPID_CAP:
1511 *pdata = msrs->ept_caps |
1512 ((u64)msrs->vpid_caps << 32);
1513 break;
1514 case MSR_IA32_VMX_VMFUNC:
1515 *pdata = msrs->vmfunc_controls;
1516 break;
1517 default:
1518 return 1;
1519 }
1520
1521 return 0;
1522}
1523
1524/*
1525 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1526 * been modified by the L1 guest. Note, "writable" in this context means
1527 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1528 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1529 * VM-exit information fields (which are actually writable if the vCPU is
1530 * configured to support "VMWRITE to any supported field in the VMCS").
1531 */
1532static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1533{
1534 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1535 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1536 struct shadow_vmcs_field field;
1537 unsigned long val;
1538 int i;
1539
1540 if (WARN_ON(!shadow_vmcs))
1541 return;
1542
1543 preempt_disable();
1544
1545 vmcs_load(shadow_vmcs);
1546
1547 for (i = 0; i < max_shadow_read_write_fields; i++) {
1548 field = shadow_read_write_fields[i];
1549 val = __vmcs_readl(field.encoding);
1550 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1551 }
1552
1553 vmcs_clear(shadow_vmcs);
1554 vmcs_load(vmx->loaded_vmcs->vmcs);
1555
1556 preempt_enable();
1557}
1558
1559static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1560{
1561 const struct shadow_vmcs_field *fields[] = {
1562 shadow_read_write_fields,
1563 shadow_read_only_fields
1564 };
1565 const int max_fields[] = {
1566 max_shadow_read_write_fields,
1567 max_shadow_read_only_fields
1568 };
1569 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1570 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1571 struct shadow_vmcs_field field;
1572 unsigned long val;
1573 int i, q;
1574
1575 if (WARN_ON(!shadow_vmcs))
1576 return;
1577
1578 vmcs_load(shadow_vmcs);
1579
1580 for (q = 0; q < ARRAY_SIZE(fields); q++) {
1581 for (i = 0; i < max_fields[q]; i++) {
1582 field = fields[q][i];
1583 val = vmcs12_read_any(vmcs12, field.encoding,
1584 field.offset);
1585 __vmcs_writel(field.encoding, val);
1586 }
1587 }
1588
1589 vmcs_clear(shadow_vmcs);
1590 vmcs_load(vmx->loaded_vmcs->vmcs);
1591}
1592
1593static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1594{
1595 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1596 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1597
1598 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1599 vmcs12->tpr_threshold = evmcs->tpr_threshold;
1600 vmcs12->guest_rip = evmcs->guest_rip;
1601
1602 if (unlikely(!(hv_clean_fields &
1603 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1604 vmcs12->guest_rsp = evmcs->guest_rsp;
1605 vmcs12->guest_rflags = evmcs->guest_rflags;
1606 vmcs12->guest_interruptibility_info =
1607 evmcs->guest_interruptibility_info;
1608 }
1609
1610 if (unlikely(!(hv_clean_fields &
1611 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1612 vmcs12->cpu_based_vm_exec_control =
1613 evmcs->cpu_based_vm_exec_control;
1614 }
1615
1616 if (unlikely(!(hv_clean_fields &
1617 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1618 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1619 }
1620
1621 if (unlikely(!(hv_clean_fields &
1622 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1623 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1624 }
1625
1626 if (unlikely(!(hv_clean_fields &
1627 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1628 vmcs12->vm_entry_intr_info_field =
1629 evmcs->vm_entry_intr_info_field;
1630 vmcs12->vm_entry_exception_error_code =
1631 evmcs->vm_entry_exception_error_code;
1632 vmcs12->vm_entry_instruction_len =
1633 evmcs->vm_entry_instruction_len;
1634 }
1635
1636 if (unlikely(!(hv_clean_fields &
1637 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1638 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1639 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1640 vmcs12->host_cr0 = evmcs->host_cr0;
1641 vmcs12->host_cr3 = evmcs->host_cr3;
1642 vmcs12->host_cr4 = evmcs->host_cr4;
1643 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1644 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1645 vmcs12->host_rip = evmcs->host_rip;
1646 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1647 vmcs12->host_es_selector = evmcs->host_es_selector;
1648 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1649 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1650 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1651 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1652 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1653 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1654 }
1655
1656 if (unlikely(!(hv_clean_fields &
1657 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1658 vmcs12->pin_based_vm_exec_control =
1659 evmcs->pin_based_vm_exec_control;
1660 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1661 vmcs12->secondary_vm_exec_control =
1662 evmcs->secondary_vm_exec_control;
1663 }
1664
1665 if (unlikely(!(hv_clean_fields &
1666 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1667 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1668 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1669 }
1670
1671 if (unlikely(!(hv_clean_fields &
1672 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1673 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1674 }
1675
1676 if (unlikely(!(hv_clean_fields &
1677 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1678 vmcs12->guest_es_base = evmcs->guest_es_base;
1679 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1680 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1681 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1682 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1683 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1684 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1685 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1686 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1687 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1688 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1689 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1690 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1691 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1692 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1693 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1694 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1695 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1696 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1697 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1698 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1699 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1700 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1701 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1702 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1703 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1704 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1705 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1706 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1707 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1708 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1709 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1710 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1711 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1712 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1713 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1714 }
1715
d6bf71a1 1716 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1717 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1718 vmcs12->tsc_offset = evmcs->tsc_offset;
1719 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1720 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1721 }
1722
d6bf71a1 1723 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1724 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1725 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1726 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1727 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1728 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1729 vmcs12->guest_cr0 = evmcs->guest_cr0;
1730 vmcs12->guest_cr3 = evmcs->guest_cr3;
1731 vmcs12->guest_cr4 = evmcs->guest_cr4;
1732 vmcs12->guest_dr7 = evmcs->guest_dr7;
1733 }
1734
d6bf71a1 1735 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1736 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1737 vmcs12->host_fs_base = evmcs->host_fs_base;
1738 vmcs12->host_gs_base = evmcs->host_gs_base;
1739 vmcs12->host_tr_base = evmcs->host_tr_base;
1740 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1741 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1742 vmcs12->host_rsp = evmcs->host_rsp;
1743 }
1744
d6bf71a1 1745 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1746 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1747 vmcs12->ept_pointer = evmcs->ept_pointer;
1748 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1749 }
1750
d6bf71a1 1751 if (unlikely(!(hv_clean_fields &
55d2375e
SC
1752 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1753 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1754 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1755 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1756 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1757 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1758 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1759 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1760 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1761 vmcs12->guest_pending_dbg_exceptions =
1762 evmcs->guest_pending_dbg_exceptions;
1763 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1764 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1765 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1766 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1767 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1768 }
1769
1770 /*
1771 * Not used?
1772 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1773 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1774 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
55d2375e
SC
1775 * vmcs12->page_fault_error_code_mask =
1776 * evmcs->page_fault_error_code_mask;
1777 * vmcs12->page_fault_error_code_match =
1778 * evmcs->page_fault_error_code_match;
1779 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1780 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1781 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1782 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1783 */
1784
1785 /*
1786 * Read only fields:
1787 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1788 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1789 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1790 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1791 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1792 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1793 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1794 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1795 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1796 * vmcs12->exit_qualification = evmcs->exit_qualification;
1797 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1798 *
1799 * Not present in struct vmcs12:
1800 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1801 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1802 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1803 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1804 */
1805
25641caf 1806 return;
55d2375e
SC
1807}
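/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * Every block above follows the same pattern: a group of vmcs12 fields is
 * copied from the eVMCS only when L1 has NOT marked that group clean, i.e.
 * when the corresponding bit is clear in hv_clean_fields.  The hypothetical
 * helper below shows the pattern for a single group in isolation.
 */
#if 0	/* example only */
static void copy_group_if_dirty(struct vmcs12 *vmcs12,
				const struct hv_enlightened_vmcs *evmcs,
				u32 hv_clean_fields)
{
	/* L1 says the group is unchanged ("clean"): nothing to copy. */
	if (hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2)
		return;

	vmcs12->tsc_offset = evmcs->tsc_offset;
	vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
	vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
}
#endif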
1808
25641caf 1809static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
55d2375e
SC
1810{
1811 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1812 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1813
1814 /*
1815 * Should not be changed by KVM:
1816 *
1817 * evmcs->host_es_selector = vmcs12->host_es_selector;
1818 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1819 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1820 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1821 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1822 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1823 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1824 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1825 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1826 * evmcs->host_cr0 = vmcs12->host_cr0;
1827 * evmcs->host_cr3 = vmcs12->host_cr3;
1828 * evmcs->host_cr4 = vmcs12->host_cr4;
1829 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1830 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1831 * evmcs->host_rip = vmcs12->host_rip;
1832 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1833 * evmcs->host_fs_base = vmcs12->host_fs_base;
1834 * evmcs->host_gs_base = vmcs12->host_gs_base;
1835 * evmcs->host_tr_base = vmcs12->host_tr_base;
1836 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1837 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1838 * evmcs->host_rsp = vmcs12->host_rsp;
3731905e 1839 * sync_vmcs02_to_vmcs12() doesn't read these:
55d2375e
SC
1840 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1841 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1842 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1843 * evmcs->ept_pointer = vmcs12->ept_pointer;
1844 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1845 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1846 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1847 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
55d2375e
SC
1848 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1849 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1850 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1851 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1852 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1853 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1854 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1855 * evmcs->page_fault_error_code_mask =
1856 * vmcs12->page_fault_error_code_mask;
1857 * evmcs->page_fault_error_code_match =
1858 * vmcs12->page_fault_error_code_match;
1859 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1860 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1861 * evmcs->tsc_offset = vmcs12->tsc_offset;
1862 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1863 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1864 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1865 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1866 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1867 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1868 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1869 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1870 *
1871 * Not present in struct vmcs12:
1872 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1873 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1874 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1875 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1876 */
1877
1878 evmcs->guest_es_selector = vmcs12->guest_es_selector;
1879 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1880 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1881 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1882 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1883 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1884 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1885 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1886
1887 evmcs->guest_es_limit = vmcs12->guest_es_limit;
1888 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1889 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1890 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1891 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1892 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1893 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1894 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1895 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1896 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1897
1898 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1899 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1900 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1901 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1902 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1903 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1904 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1905 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1906
1907 evmcs->guest_es_base = vmcs12->guest_es_base;
1908 evmcs->guest_cs_base = vmcs12->guest_cs_base;
1909 evmcs->guest_ss_base = vmcs12->guest_ss_base;
1910 evmcs->guest_ds_base = vmcs12->guest_ds_base;
1911 evmcs->guest_fs_base = vmcs12->guest_fs_base;
1912 evmcs->guest_gs_base = vmcs12->guest_gs_base;
1913 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1914 evmcs->guest_tr_base = vmcs12->guest_tr_base;
1915 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1916 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1917
1918 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1919 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1920
1921 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1922 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1923 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1924 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1925
1926 evmcs->guest_pending_dbg_exceptions =
1927 vmcs12->guest_pending_dbg_exceptions;
1928 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1929 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1930
1931 evmcs->guest_activity_state = vmcs12->guest_activity_state;
1932 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1933
1934 evmcs->guest_cr0 = vmcs12->guest_cr0;
1935 evmcs->guest_cr3 = vmcs12->guest_cr3;
1936 evmcs->guest_cr4 = vmcs12->guest_cr4;
1937 evmcs->guest_dr7 = vmcs12->guest_dr7;
1938
1939 evmcs->guest_physical_address = vmcs12->guest_physical_address;
1940
1941 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1942 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1943 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1944 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1945 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1946 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1947 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1948 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1949
1950 evmcs->exit_qualification = vmcs12->exit_qualification;
1951
1952 evmcs->guest_linear_address = vmcs12->guest_linear_address;
1953 evmcs->guest_rsp = vmcs12->guest_rsp;
1954 evmcs->guest_rflags = vmcs12->guest_rflags;
1955
1956 evmcs->guest_interruptibility_info =
1957 vmcs12->guest_interruptibility_info;
1958 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1959 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1960 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1961 evmcs->vm_entry_exception_error_code =
1962 vmcs12->vm_entry_exception_error_code;
1963 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1964
1965 evmcs->guest_rip = vmcs12->guest_rip;
1966
1967 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1968
25641caf 1969 return;
55d2375e
SC
1970}
1971
1972/*
1973 * This is an equivalent of the nested hypervisor executing the vmptrld
1974 * instruction.
1975 */
b6a0653a
VK
1976static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1977 struct kvm_vcpu *vcpu, bool from_launch)
55d2375e
SC
1978{
1979 struct vcpu_vmx *vmx = to_vmx(vcpu);
a21a39c2 1980 bool evmcs_gpa_changed = false;
11e34914 1981 u64 evmcs_gpa;
55d2375e
SC
1982
1983 if (likely(!vmx->nested.enlightened_vmcs_enabled))
b6a0653a 1984 return EVMPTRLD_DISABLED;
55d2375e 1985
02761716
VK
1986 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
1987 nested_release_evmcs(vcpu);
b6a0653a 1988 return EVMPTRLD_DISABLED;
02761716 1989 }
55d2375e 1990
1e9dfbd7
VK
1991 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1992 vmx->nested.current_vmptr = -1ull;
55d2375e
SC
1993
1994 nested_release_evmcs(vcpu);
1995
11e34914 1996 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
dee9c049 1997 &vmx->nested.hv_evmcs_map))
b6a0653a 1998 return EVMPTRLD_ERROR;
55d2375e 1999
dee9c049 2000 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
55d2375e
SC
2001
2002 /*
 2003 * Currently, KVM only supports eVMCS version 1
 2004 * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
 2005 * first u32 field of the eVMCS, which specifies the eVMCS
 2006 * VersionNumber, to this value.
 2007 *
 2008 * The guest should learn the eVMCS versions supported by the host
 2009 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM
 2010 * is expected to set this CPUID leaf according to the value
 2011 * returned in vmcs_version from nested_enable_evmcs().
 2012 *
 2013 * However, it turns out that Microsoft Hyper-V fails to comply
 2014 * with its own invented interface: when Hyper-V uses eVMCS, it
 2015 * just sets the first u32 field of the eVMCS to the revision_id
 2016 * specified in MSR_IA32_VMX_BASIC instead of an eVMCS version
 2017 * number, which would be one of the supported versions specified
 2018 * in CPUID.0x4000000A.EAX[0:15].
 2019 *
 2020 * To work around this Hyper-V bug, accept here either a supported
 2021 * eVMCS version or the VMCS12 revision_id as valid values for the
 2022 * first u32 field of the eVMCS.
2023 */
2024 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2025 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2026 nested_release_evmcs(vcpu);
b6a0653a 2027 return EVMPTRLD_VMFAIL;
55d2375e
SC
2028 }
2029
11e34914 2030 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
55d2375e 2031
a21a39c2 2032 evmcs_gpa_changed = true;
55d2375e
SC
2033 /*
2034 * Unlike normal vmcs12, enlightened vmcs12 is not fully
2035 * reloaded from guest's memory (read only fields, fields not
2036 * present in struct hv_enlightened_vmcs, ...). Make sure there
2037 * are no leftovers.
2038 */
2039 if (from_launch) {
2040 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2041 memset(vmcs12, 0, sizeof(*vmcs12));
2042 vmcs12->hdr.revision_id = VMCS12_REVISION;
2043 }
2044
2045 }
a21a39c2
VK
2046
2047 /*
ffdbd50d 2048 * Clean fields data can't be used on VMLAUNCH and when we switch
a21a39c2
VK
2049 * between different L2 guests as KVM keeps a single VMCS12 per L1.
2050 */
2051 if (from_launch || evmcs_gpa_changed)
2052 vmx->nested.hv_evmcs->hv_clean_fields &=
2053 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2054
b6a0653a 2055 return EVMPTRLD_SUCCEEDED;
55d2375e
SC
2056}
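/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * The revision check above reduces to a simple predicate: accept either a
 * genuine eVMCS version number or, to tolerate the Hyper-V behaviour
 * described in the comment, the VMCS12 revision id.  The helper below is
 * hypothetical and exists only to spell the predicate out.
 */
#if 0	/* example only */
static bool evmcs_revision_is_acceptable(u32 revision_id)
{
	return revision_id == KVM_EVMCS_VERSION ||
	       revision_id == VMCS12_REVISION;
}
#endif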
2057
3731905e 2058void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
55d2375e
SC
2059{
2060 struct vcpu_vmx *vmx = to_vmx(vcpu);
2061
dc313385 2062 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
55d2375e 2063 copy_vmcs12_to_enlightened(vmx);
dc313385 2064 else
55d2375e 2065 copy_vmcs12_to_shadow(vmx);
55d2375e 2066
3731905e 2067 vmx->nested.need_vmcs12_to_shadow_sync = false;
55d2375e
SC
2068}
2069
2070static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2071{
2072 struct vcpu_vmx *vmx =
2073 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2074
2075 vmx->nested.preemption_timer_expired = true;
2076 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2077 kvm_vcpu_kick(&vmx->vcpu);
2078
2079 return HRTIMER_NORESTART;
2080}
2081
850448f3
PS
2082static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2083{
2084 struct vcpu_vmx *vmx = to_vmx(vcpu);
2085 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
850448f3
PS
2086
2087 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2088 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2089
2090 if (!vmx->nested.has_preemption_timer_deadline) {
8d7fbf01
MS
2091 vmx->nested.preemption_timer_deadline =
2092 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
850448f3 2093 vmx->nested.has_preemption_timer_deadline = true;
8d7fbf01
MS
2094 }
2095 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
850448f3
PS
2096}
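/*
 * Editor's note -- worked example, not part of the original source, and
 * the re-entry scenario is only an assumed illustration.  On the first
 * call with vmcs12->vmx_preemption_timer_value == 1000 and a scaled L1
 * TSC of, say, 50000, the deadline is latched at 51000 and 1000 is
 * returned.  On a later call, with the deadline already latched and the
 * scaled TSC now at 50400, the same deadline yields 51000 - 50400 = 600
 * remaining timer units, so time that has already elapsed is not lost.
 */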
2097
2098static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2099 u64 preemption_timeout)
55d2375e 2100{
55d2375e
SC
2101 struct vcpu_vmx *vmx = to_vmx(vcpu);
2102
2103 /*
2104 * A timer value of zero is architecturally guaranteed to cause
2105 * a VMExit prior to executing any instructions in the guest.
2106 */
2107 if (preemption_timeout == 0) {
2108 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2109 return;
2110 }
2111
2112 if (vcpu->arch.virtual_tsc_khz == 0)
2113 return;
2114
2115 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2116 preemption_timeout *= 1000000;
2117 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2118 hrtimer_start(&vmx->nested.preemption_timer,
ada0098d
JM
2119 ktime_add_ns(ktime_get(), preemption_timeout),
2120 HRTIMER_MODE_ABS_PINNED);
55d2375e
SC
2121}
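/*
 * Editor's note -- worked example, not part of the original source.  The
 * arithmetic above converts VMX-preemption-timer units to nanoseconds:
 * each timer unit is 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE = 32 TSC
 * cycles, and cycles * 1000000 / tsc_khz yields nanoseconds.  With a
 * guest timer value of 1000 and virtual_tsc_khz == 2000000 (a 2 GHz
 * guest TSC): 1000 << 5 = 32000 cycles, and
 * 32000 * 1000000 / 2000000 = 16000 ns, so the hrtimer is armed roughly
 * 16 microseconds in the future.
 */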
2122
2123static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2124{
2125 if (vmx->nested.nested_run_pending &&
2126 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2127 return vmcs12->guest_ia32_efer;
2128 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2129 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2130 else
2131 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2132}
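/*
 * Editor's note -- illustrative summary, not part of the original source.
 * The EFER used for L2 is chosen in priority order:
 *   1. vmcs12->guest_ia32_efer when this VM-entry loads IA32_EFER;
 *   2. otherwise L1's EFER with LMA/LME forced on when the entry requests
 *      IA-32e mode;
 *   3. otherwise L1's EFER with LMA/LME forced off.
 */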
2133
2134static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2135{
2136 /*
2137 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2138 * according to L0's settings (vmcs12 is irrelevant here). Host
2139 * fields that come from L0 and are not constant, e.g. HOST_CR3,
2140 * will be set as needed prior to VMLAUNCH/VMRESUME.
2141 */
2142 if (vmx->nested.vmcs02_initialized)
2143 return;
2144 vmx->nested.vmcs02_initialized = true;
2145
2146 /*
 2147 * We don't care what the EPTP value is; we just need to guarantee
2148 * it's valid so we don't get a false positive when doing early
2149 * consistency checks.
2150 */
2151 if (enable_ept && nested_early_check)
2a40b900
SC
2152 vmcs_write64(EPT_POINTER,
2153 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
55d2375e
SC
2154
2155 /* All VMFUNCs are currently emulated through L0 vmexits. */
2156 if (cpu_has_vmx_vmfunc())
2157 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2158
2159 if (cpu_has_vmx_posted_intr())
2160 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2161
2162 if (cpu_has_vmx_msr_bitmap())
2163 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2164
4d6c9892 2165 /*
c3bb9a20
SC
2166 * PML is emulated for L2, but never enabled in hardware as the MMU
2167 * handles A/D emulation. Disabling PML for L2 also avoids having to
2168 * deal with filtering out L2 GPAs from the buffer.
4d6c9892
SC
2169 */
2170 if (enable_pml) {
c3bb9a20
SC
2171 vmcs_write64(PML_ADDRESS, 0);
2172 vmcs_write16(GUEST_PML_INDEX, -1);
4d6c9892 2173 }
55d2375e 2174
c538d57f
SC
2175 if (cpu_has_vmx_encls_vmexit())
2176 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
55d2375e
SC
2177
2178 /*
2179 * Set the MSR load/store lists to match L0's settings. Only the
2180 * addresses are constant (for vmcs02), the counts can change based
2181 * on L2's behavior, e.g. switching to/from long mode.
2182 */
662f1d1d 2183 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
55d2375e
SC
2184 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2185 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2186
2187 vmx_set_constant_host_state(vmx);
2188}
2189
b1346ab2 2190static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
55d2375e
SC
2191 struct vmcs12 *vmcs12)
2192{
2193 prepare_vmcs02_constant_state(vmx);
2194
2195 vmcs_write64(VMCS_LINK_POINTER, -1ull);
2196
2197 if (enable_vpid) {
2198 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2199 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2200 else
2201 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2202 }
2203}
2204
2205static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2206{
c3bb9a20 2207 u32 exec_control;
55d2375e
SC
2208 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2209
1e9dfbd7 2210 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
b1346ab2 2211 prepare_vmcs02_early_rare(vmx, vmcs12);
55d2375e 2212
55d2375e
SC
2213 /*
2214 * PIN CONTROLS
2215 */
c075c3e4 2216 exec_control = vmx_pin_based_exec_ctrl(vmx);
804939ea
SC
2217 exec_control |= (vmcs12->pin_based_vm_exec_control &
2218 ~PIN_BASED_VMX_PREEMPTION_TIMER);
55d2375e
SC
2219
2220 /* Posted interrupts setting is only taken from vmcs12. */
2221 if (nested_cpu_has_posted_intr(vmcs12)) {
2222 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2223 vmx->nested.pi_pending = false;
2224 } else {
2225 exec_control &= ~PIN_BASED_POSTED_INTR;
2226 }
3af80fec 2227 pin_controls_set(vmx, exec_control);
55d2375e
SC
2228
2229 /*
2230 * EXEC CONTROLS
2231 */
2232 exec_control = vmx_exec_control(vmx); /* L0's desires */
9dadc2f9 2233 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
4e2a0bc5 2234 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
55d2375e
SC
2235 exec_control &= ~CPU_BASED_TPR_SHADOW;
2236 exec_control |= vmcs12->cpu_based_vm_exec_control;
2237
02d496cf 2238 vmx->nested.l1_tpr_threshold = -1;
ca2f5466 2239 if (exec_control & CPU_BASED_TPR_SHADOW)
55d2375e 2240 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
55d2375e 2241#ifdef CONFIG_X86_64
ca2f5466 2242 else
55d2375e
SC
2243 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2244 CPU_BASED_CR8_STORE_EXITING;
2245#endif
55d2375e
SC
2246
2247 /*
2248 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2249 * for I/O port accesses.
2250 */
55d2375e 2251 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
de0286b7
SC
2252 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2253
2254 /*
2255 * This bit will be computed in nested_get_vmcs12_pages, because
2256 * we do not have access to L1's MSR bitmap yet. For now, keep
2257 * the same bit as before, hoping to avoid multiple VMWRITEs that
2258 * only set/clear this bit.
2259 */
2260 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2261 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2262
3af80fec 2263 exec_controls_set(vmx, exec_control);
55d2375e
SC
2264
2265 /*
2266 * SECONDARY EXEC CONTROLS
2267 */
2268 if (cpu_has_secondary_exec_ctrls()) {
2269 exec_control = vmx->secondary_exec_control;
2270
2271 /* Take the following fields only from vmcs12 */
2272 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2273 SECONDARY_EXEC_ENABLE_INVPCID |
7f3603b6 2274 SECONDARY_EXEC_ENABLE_RDTSCP |
55d2375e 2275 SECONDARY_EXEC_XSAVES |
e69e72fa 2276 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
55d2375e
SC
2277 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2278 SECONDARY_EXEC_APIC_REGISTER_VIRT |
d041b5ea
IS
2279 SECONDARY_EXEC_ENABLE_VMFUNC |
2280 SECONDARY_EXEC_TSC_SCALING);
55d2375e 2281 if (nested_cpu_has(vmcs12,
c3bb9a20
SC
2282 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2283 exec_control |= vmcs12->secondary_vm_exec_control;
2284
2285 /* PML is emulated and never enabled in hardware for L2. */
2286 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
55d2375e
SC
2287
2288 /* VMCS shadowing for L2 is emulated for now */
2289 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2290
55d2375e 2291 /*
469debdb
SC
2292 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2293 * will not have to rewrite the controls just for this bit.
55d2375e 2294 */
469debdb
SC
2295 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2296 (vmcs12->guest_cr4 & X86_CR4_UMIP))
2297 exec_control |= SECONDARY_EXEC_DESC;
55d2375e 2298
55d2375e
SC
2299 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2300 vmcs_write16(GUEST_INTR_STATUS,
2301 vmcs12->guest_intr_status);
55d2375e 2302
bddd82d1
KS
2303 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2304 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2305
72add915
SC
2306 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2307 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2308
3af80fec 2309 secondary_exec_controls_set(vmx, exec_control);
55d2375e
SC
2310 }
2311
2312 /*
2313 * ENTRY CONTROLS
2314 *
2315 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2316 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2317 * on the related bits (if supported by the CPU) in the hope that
2318 * we can avoid VMWrites during vmx_set_efer().
2319 */
2320 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2321 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2322 if (cpu_has_load_ia32_efer()) {
2323 if (guest_efer & EFER_LMA)
2324 exec_control |= VM_ENTRY_IA32E_MODE;
2325 if (guest_efer != host_efer)
2326 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2327 }
3af80fec 2328 vm_entry_controls_set(vmx, exec_control);
55d2375e
SC
2329
2330 /*
2331 * EXIT CONTROLS
2332 *
2333 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2334 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2335 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2336 */
2337 exec_control = vmx_vmexit_ctrl();
2338 if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2339 exec_control |= VM_EXIT_LOAD_IA32_EFER;
3af80fec 2340 vm_exit_controls_set(vmx, exec_control);
55d2375e
SC
2341
2342 /*
2343 * Interrupt/Exception Fields
2344 */
2345 if (vmx->nested.nested_run_pending) {
2346 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2347 vmcs12->vm_entry_intr_info_field);
2348 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2349 vmcs12->vm_entry_exception_error_code);
2350 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2351 vmcs12->vm_entry_instruction_len);
2352 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2353 vmcs12->guest_interruptibility_info);
2354 vmx->loaded_vmcs->nmi_known_unmasked =
2355 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2356 } else {
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2358 }
2359}
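/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * Each control field above is built with the same recipe: start from L0's
 * own configuration, strip the bits that are either taken solely from
 * vmcs12 or emulated on L1's behalf, then OR in what vmcs12 requests.
 * The hypothetical helper below shows the recipe for the primary
 * processor-based controls.
 */
#if 0	/* example only */
static u32 merge_primary_exec_controls(u32 l0_wants, u32 vmcs12_wants)
{
	u32 exec_control = l0_wants;

	/* Bits decided by vmcs12 alone, or emulated by KVM for L2. */
	exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
			  CPU_BASED_NMI_WINDOW_EXITING |
			  CPU_BASED_TPR_SHADOW);

	/* Layer L1's requests on top of what remains of L0's settings. */
	exec_control |= vmcs12_wants;

	return exec_control;
}
#endif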
2360
b1346ab2 2361static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
55d2375e
SC
2362{
2363 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2364
2365 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2366 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2367 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2368 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2369 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2370 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2371 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2372 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2373 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2374 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2375 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2376 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2377 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2378 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2379 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2380 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2381 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2382 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2383 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2384 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
1c6f0b47
SC
2385 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2386 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
55d2375e
SC
2387 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2388 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2389 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2390 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2391 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2392 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2393 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2394 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2395 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2396 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2397 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2398 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2399 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2400 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2401 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2402 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
fc387d8d
SC
2403
2404 vmx->segment_cache.bitmask = 0;
55d2375e
SC
2405 }
2406
2407 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2408 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2409 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2410 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2411 vmcs12->guest_pending_dbg_exceptions);
2412 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2413 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2414
2415 /*
 2416 * L1 may access L2's PDPTRs, so save them to construct
 2417 * vmcs12.
2418 */
2419 if (enable_ept) {
2420 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2421 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2422 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2423 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2424 }
c27e5b0d
SC
2425
2426 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2427 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2428 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
55d2375e
SC
2429 }
2430
2431 if (nested_cpu_has_xsaves(vmcs12))
2432 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2433
2434 /*
2435 * Whether page-faults are trapped is determined by a combination of
a0c13434
PB
2436 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
2437 * doesn't care about page faults then we should set all of these to
2438 * L1's desires. However, if L0 does care about (some) page faults, it
 2439 * is not easy (if at all possible?) to merge L0's and L1's desires, so
 2440 * we simply ask to exit on each and every L2 page fault. This is done by
2441 * setting MASK=MATCH=0 and (see below) EB.PF=1.
55d2375e
SC
2442 * Note that below we don't need special code to set EB.PF beyond the
2443 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2444 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2445 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2446 */
a0c13434
PB
2447 if (vmx_need_pf_intercept(&vmx->vcpu)) {
2448 /*
2449 * TODO: if both L0 and L1 need the same MASK and MATCH,
2450 * go ahead and use it?
2451 */
2452 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2453 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2454 } else {
2455 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2456 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2457 }
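/*
 * Editor's note -- illustrative detail, not part of the original source.
 * Architecturally, a page fault causes a VM-exit when EB.PF equals the
 * result of ((page-fault error code & PFEC_MASK) == PFEC_MATCH).  With
 * MASK and MATCH both 0 the comparison is always true, so combining that
 * with EB.PF = 1 (as the comment above describes) traps every L2 #PF.
 */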
55d2375e
SC
2458
2459 if (cpu_has_vmx_apicv()) {
2460 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2461 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2462 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2463 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2464 }
2465
662f1d1d
AL
2466 /*
2467 * Make sure the msr_autostore list is up to date before we set the
2468 * count in the vmcs02.
2469 */
2470 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2471
2472 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
55d2375e
SC
2473 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2474 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2475
2476 set_cr4_guest_host_mask(vmx);
55d2375e
SC
2477}
2478
2479/*
2480 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2481 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2482 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2483 * guest in a way that will both be appropriate to L1's requests, and our
2484 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2485 * function also has additional necessary side-effects, like setting various
2486 * vcpu->arch fields.
2487 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2488 * is assigned to entry_failure_code on failure.
2489 */
2490static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
0f857223 2491 bool from_vmentry,
68cda40d 2492 enum vm_entry_failure_code *entry_failure_code)
55d2375e
SC
2493{
2494 struct vcpu_vmx *vmx = to_vmx(vcpu);
c7554efc 2495 bool load_guest_pdptrs_vmcs12 = false;
55d2375e 2496
1e9dfbd7 2497 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
b1346ab2 2498 prepare_vmcs02_rare(vmx, vmcs12);
55d2375e 2499 vmx->nested.dirty_vmcs12 = false;
55d2375e 2500
1e9dfbd7
VK
2501 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
2502 !(vmx->nested.hv_evmcs->hv_clean_fields &
c7554efc 2503 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
55d2375e
SC
2504 }
2505
2506 if (vmx->nested.nested_run_pending &&
2507 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2508 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2509 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2510 } else {
2511 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2512 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2513 }
3b013a29
SC
2514 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2515 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2516 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
55d2375e
SC
2517 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2518
55d2375e
SC
2519 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2520 * bitwise-or of what L1 wants to trap for L2, and what we want to
2521 * trap. Note that CR0.TS also needs updating - we do this later.
2522 */
b6a7cc35 2523 vmx_update_exception_bitmap(vcpu);
55d2375e
SC
2524 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2525 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2526
2527 if (vmx->nested.nested_run_pending &&
2528 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2529 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2530 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2531 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2532 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2533 }
2534
d041b5ea
IS
2535 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2536 vcpu->arch.l1_tsc_offset,
2537 vmx_get_l2_tsc_offset(vcpu),
2538 vmx_get_l2_tsc_multiplier(vcpu));
2539
2540 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2541 vcpu->arch.l1_tsc_scaling_ratio,
2542 vmx_get_l2_tsc_multiplier(vcpu));
2543
55d2375e 2544 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
55d2375e 2545 if (kvm_has_tsc_control)
1ab9287a 2546 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
55d2375e 2547
50b265a4 2548 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
55d2375e
SC
2549
2550 if (nested_cpu_has_ept(vmcs12))
2551 nested_ept_init_mmu_context(vcpu);
55d2375e
SC
2552
2553 /*
2554 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2555 * bits which we consider mandatory enabled.
2556 * The CR0_READ_SHADOW is what L2 should have expected to read given
 2557 * the specifications by L1; it's not enough to take
 2558 * vmcs12->cr0_read_shadow because in our cr0_guest_host_mask we may
 2559 * have more bits than L1 expected.
2560 */
2561 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2562 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2563
2564 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2565 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2566
2567 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2568 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2569 vmx_set_efer(vcpu, vcpu->arch.efer);
2570
2571 /*
2572 * Guest state is invalid and unrestricted guest is disabled,
2573 * which means L1 attempted VMEntry to L2 with invalid state.
2574 * Fail the VMEntry.
2575 */
2ba4493a 2576 if (CC(!vmx_guest_state_valid(vcpu))) {
55d2375e 2577 *entry_failure_code = ENTRY_FAIL_DEFAULT;
c80add0f 2578 return -EINVAL;
55d2375e
SC
2579 }
2580
2581 /* Shadow page tables on either EPT or shadow page tables. */
2582 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
0f857223 2583 from_vmentry, entry_failure_code))
c80add0f 2584 return -EINVAL;
55d2375e 2585
04f11ef4
SC
2586 /*
2587 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2588 * on nested VM-Exit, which can occur without actually running L2 and
727a7e27 2589 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
04f11ef4
SC
2590 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2591 * transition to HLT instead of running L2.
2592 */
2593 if (enable_ept)
2594 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2595
c7554efc
SC
2596 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2597 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2598 is_pae_paging(vcpu)) {
2599 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2600 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2601 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2602 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2603 }
2604
55d2375e
SC
2605 if (!enable_ept)
2606 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2607
71f73470 2608 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
d1968421
OU
2609 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2610 vmcs12->guest_ia32_perf_global_ctrl)))
71f73470
OU
2611 return -EINVAL;
2612
e9c16c78
PB
2613 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2614 kvm_rip_write(vcpu, vmcs12->guest_rip);
dc313385
VK
2615
2616 /*
2617 * It was observed that genuine Hyper-V running in L1 doesn't reset
 2618 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
 2619 * bits when it changes a field in the eVMCS. Mark all fields as clean
2620 * here.
2621 */
2622 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2623 vmx->nested.hv_evmcs->hv_clean_fields |=
2624 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2625
55d2375e
SC
2626 return 0;
2627}
2628
2629static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2630{
5497b955
SC
2631 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2632 nested_cpu_has_virtual_nmis(vmcs12)))
55d2375e
SC
2633 return -EINVAL;
2634
5497b955 2635 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
4e2a0bc5 2636 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
55d2375e
SC
2637 return -EINVAL;
2638
2639 return 0;
2640}
2641
ac6389ab 2642static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
55d2375e
SC
2643{
2644 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
2645
2646 /* Check for memory type validity */
ac6389ab 2647 switch (new_eptp & VMX_EPTP_MT_MASK) {
55d2375e 2648 case VMX_EPTP_MT_UC:
5497b955 2649 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
55d2375e
SC
2650 return false;
2651 break;
2652 case VMX_EPTP_MT_WB:
5497b955 2653 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
55d2375e
SC
2654 return false;
2655 break;
2656 default:
2657 return false;
2658 }
2659
bb1fcc70 2660 /* Page-walk levels validity. */
ac6389ab 2661 switch (new_eptp & VMX_EPTP_PWL_MASK) {
bb1fcc70
SC
2662 case VMX_EPTP_PWL_5:
2663 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2664 return false;
2665 break;
2666 case VMX_EPTP_PWL_4:
2667 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2668 return false;
2669 break;
2670 default:
55d2375e 2671 return false;
bb1fcc70 2672 }
55d2375e
SC
2673
2674 /* Reserved bits should not be set */
636e8b73 2675 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
55d2375e
SC
2676 return false;
2677
2678 /* AD, if set, should be supported */
ac6389ab 2679 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
5497b955 2680 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
55d2375e
SC
2681 return false;
2682 }
2683
2684 return true;
2685}
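/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * The checks above decompose the EPTP into a memory-type field, a
 * page-walk-level field, the A/D-enable bit, the bits this code treats
 * as reserved (the "(new_eptp >> 7) & 0x1f" test) and the address itself.
 * The hypothetical helper below restates just the field checks, ignoring
 * the capability-MSR and GPA checks performed by the real function.
 */
#if 0	/* example only */
static bool eptp_fields_look_sane(u64 eptp)
{
	return ((eptp & VMX_EPTP_MT_MASK) == VMX_EPTP_MT_UC ||
		(eptp & VMX_EPTP_MT_MASK) == VMX_EPTP_MT_WB) &&
	       ((eptp & VMX_EPTP_PWL_MASK) == VMX_EPTP_PWL_4 ||
		(eptp & VMX_EPTP_PWL_MASK) == VMX_EPTP_PWL_5) &&
	       !((eptp >> 7) & 0x1f);
}
#endif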
2686
461b4ba4
KS
2687/*
2688 * Checks related to VM-Execution Control Fields
2689 */
2690static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2691 struct vmcs12 *vmcs12)
55d2375e
SC
2692{
2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 2694
5497b955
SC
2695 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2696 vmx->nested.msrs.pinbased_ctls_low,
2697 vmx->nested.msrs.pinbased_ctls_high)) ||
2698 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2699 vmx->nested.msrs.procbased_ctls_low,
2700 vmx->nested.msrs.procbased_ctls_high)))
461b4ba4 2701 return -EINVAL;
55d2375e 2702
461b4ba4 2703 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
5497b955
SC
2704 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2705 vmx->nested.msrs.secondary_ctls_low,
2706 vmx->nested.msrs.secondary_ctls_high)))
461b4ba4
KS
2707 return -EINVAL;
2708
5497b955 2709 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
461b4ba4
KS
2710 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2711 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2712 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2713 nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2714 nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2715 nested_vmx_check_nmi_controls(vmcs12) ||
2716 nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2717 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2718 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2719 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
5497b955 2720 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
461b4ba4
KS
2721 return -EINVAL;
2722
bc441211
SC
2723 if (!nested_cpu_has_preemption_timer(vmcs12) &&
2724 nested_cpu_has_save_preemption_timer(vmcs12))
2725 return -EINVAL;
2726
461b4ba4 2727 if (nested_cpu_has_ept(vmcs12) &&
ac6389ab 2728 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
461b4ba4 2729 return -EINVAL;
55d2375e
SC
2730
2731 if (nested_cpu_has_vmfunc(vmcs12)) {
5497b955
SC
2732 if (CC(vmcs12->vm_function_control &
2733 ~vmx->nested.msrs.vmfunc_controls))
461b4ba4 2734 return -EINVAL;
55d2375e
SC
2735
2736 if (nested_cpu_has_eptp_switching(vmcs12)) {
5497b955
SC
2737 if (CC(!nested_cpu_has_ept(vmcs12)) ||
2738 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
461b4ba4 2739 return -EINVAL;
55d2375e
SC
2740 }
2741 }
2742
461b4ba4
KS
2743 return 0;
2744}
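/*
 * Editor's note -- illustrative sketch, not part of the original source,
 * and the helper below is hypothetical (the in-tree vmx_control_verify()
 * may be written differently).  The idea is the usual VMX capability-MSR
 * convention: the "low" word lists bits that must be 1 and the "high"
 * word lists bits that are allowed to be 1, so a control value is only
 * acceptable if it sets every mandatory bit and nothing outside the
 * permitted set.
 */
#if 0	/* example only */
static bool control_value_ok(u32 control, u32 low, u32 high)
{
	return (control & low) == low &&	/* all required bits set   */
	       !(control & ~high);		/* no unsupported bits set */
}
#endif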
2745
61446ba7
KS
2746/*
2747 * Checks related to VM-Exit Control Fields
2748 */
2749static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2750 struct vmcs12 *vmcs12)
2751{
2752 struct vcpu_vmx *vmx = to_vmx(vcpu);
2753
5497b955
SC
2754 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2755 vmx->nested.msrs.exit_ctls_low,
2756 vmx->nested.msrs.exit_ctls_high)) ||
2757 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
61446ba7
KS
2758 return -EINVAL;
2759
2760 return 0;
2761}
2762
5fbf9634
KS
2763/*
2764 * Checks related to VM-Entry Control Fields
2765 */
2766static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2767 struct vmcs12 *vmcs12)
461b4ba4
KS
2768{
2769 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 2770
5497b955
SC
2771 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2772 vmx->nested.msrs.entry_ctls_low,
2773 vmx->nested.msrs.entry_ctls_high)))
5fbf9634 2774 return -EINVAL;
55d2375e
SC
2775
2776 /*
2777 * From the Intel SDM, volume 3:
2778 * Fields relevant to VM-entry event injection must be set properly.
2779 * These fields are the VM-entry interruption-information field, the
2780 * VM-entry exception error code, and the VM-entry instruction length.
2781 */
2782 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2783 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2784 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2785 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2786 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2787 bool should_have_error_code;
2788 bool urg = nested_cpu_has2(vmcs12,
2789 SECONDARY_EXEC_UNRESTRICTED_GUEST);
2790 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2791
2792 /* VM-entry interruption-info field: interruption type */
5497b955
SC
2793 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2794 CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2795 !nested_cpu_supports_monitor_trap_flag(vcpu)))
5fbf9634 2796 return -EINVAL;
55d2375e
SC
2797
2798 /* VM-entry interruption-info field: vector */
5497b955
SC
2799 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2800 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2801 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
5fbf9634 2802 return -EINVAL;
55d2375e
SC
2803
2804 /* VM-entry interruption-info field: deliver error code */
2805 should_have_error_code =
2806 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2807 x86_exception_has_error_code(vector);
5497b955 2808 if (CC(has_error_code != should_have_error_code))
5fbf9634 2809 return -EINVAL;
55d2375e
SC
2810
2811 /* VM-entry exception error code */
5497b955 2812 if (CC(has_error_code &&
567926cc 2813 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
5fbf9634 2814 return -EINVAL;
55d2375e
SC
2815
2816 /* VM-entry interruption-info field: reserved bits */
5497b955 2817 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
5fbf9634 2818 return -EINVAL;
55d2375e
SC
2819
2820 /* VM-entry instruction length */
2821 switch (intr_type) {
2822 case INTR_TYPE_SOFT_EXCEPTION:
2823 case INTR_TYPE_SOFT_INTR:
2824 case INTR_TYPE_PRIV_SW_EXCEPTION:
5497b955
SC
2825 if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2826 CC(vmcs12->vm_entry_instruction_len == 0 &&
2827 CC(!nested_cpu_has_zero_length_injection(vcpu))))
5fbf9634 2828 return -EINVAL;
55d2375e
SC
2829 }
2830 }
2831
5fbf9634
KS
2832 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2833 return -EINVAL;
2834
2835 return 0;
2836}
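/*
 * Editor's note -- worked example, not part of the original source.  To
 * see the error-code rule above in action: if L1 injects a page fault
 * (hardware exception, vector 14) into a protected-mode L2, #PF is one of
 * the exceptions that architecturally delivers an error code, so the
 * "deliver error code" bit must be set and the upper 16 bits of the
 * error-code field must be zero.  Injecting, say, #UD (vector 6) with the
 * deliver bit set fails the same check, because #UD has no error code.
 */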
2837
5478ba34
SC
2838static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2839 struct vmcs12 *vmcs12)
2840{
2841 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2842 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2843 nested_check_vm_entry_controls(vcpu, vmcs12))
98d9e858 2844 return -EINVAL;
5478ba34 2845
a8350231
VK
2846 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2847 return nested_evmcs_check_controls(vmcs12);
2848
5478ba34
SC
2849 return 0;
2850}
2851
98d9e858
PB
2852static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2853 struct vmcs12 *vmcs12)
5fbf9634
KS
2854{
2855 bool ia32e;
2856
5497b955
SC
2857 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2858 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
636e8b73 2859 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
254b2f3b 2860 return -EINVAL;
711eff3a 2861
5497b955
SC
2862 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2863 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
711eff3a
KS
2864 return -EINVAL;
2865
f6b0db1f 2866 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
5497b955 2867 CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
f6b0db1f
KS
2868 return -EINVAL;
2869
c547cb6f
OU
2870 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2871 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2872 vmcs12->host_ia32_perf_global_ctrl)))
2873 return -EINVAL;
2874
fd3edd4a
PB
2875#ifdef CONFIG_X86_64
2876 ia32e = !!(vcpu->arch.efer & EFER_LMA);
2877#else
2878 ia32e = false;
2879#endif
2880
2881 if (ia32e) {
2882 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2883 CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2884 return -EINVAL;
2885 } else {
2886 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2887 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2888 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2889 CC((vmcs12->host_rip) >> 32))
2890 return -EINVAL;
2891 }
1ef23e1f 2892
5497b955
SC
2893 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2894 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2895 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2896 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2897 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2898 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2899 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2900 CC(vmcs12->host_cs_selector == 0) ||
2901 CC(vmcs12->host_tr_selector == 0) ||
2902 CC(vmcs12->host_ss_selector == 0 && !ia32e))
1ef23e1f
KS
2903 return -EINVAL;
2904
5497b955
SC
2905 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2906 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2907 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2908 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
fd3edd4a
PB
2909 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2910 CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
5845038c 2911 return -EINVAL;
1ef23e1f 2912
5fbf9634
KS
2913 /*
2914 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2915 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2916 * the values of the LMA and LME bits in the field must each be that of
2917 * the host address-space size VM-exit control.
2918 */
2919 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
5497b955
SC
2920 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2921 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2922 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
254b2f3b 2923 return -EINVAL;
5fbf9634
KS
2924 }
2925
55d2375e
SC
2926 return 0;
2927}
2928
2929static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2930 struct vmcs12 *vmcs12)
2931{
88925305 2932 int r = 0;
55d2375e 2933 struct vmcs12 *shadow;
88925305 2934 struct kvm_host_map map;
55d2375e
SC
2935
2936 if (vmcs12->vmcs_link_pointer == -1ull)
2937 return 0;
2938
5497b955 2939 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
55d2375e
SC
2940 return -EINVAL;
2941
5497b955 2942 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
55d2375e
SC
2943 return -EINVAL;
2944
88925305
KA
2945 shadow = map.hva;
2946
5497b955
SC
2947 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2948 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
55d2375e 2949 r = -EINVAL;
88925305
KA
2950
2951 kvm_vcpu_unmap(vcpu, &map, false);
55d2375e
SC
2952 return r;
2953}
2954
9c3e922b
SC
2955/*
2956 * Checks related to Guest Non-register State
2957 */
2958static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2959{
5497b955 2960 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
bf0cd88c
YQ
2961 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
2962 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
9c3e922b
SC
2963 return -EINVAL;
2964
2965 return 0;
2966}
2967
5478ba34
SC
2968static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2969 struct vmcs12 *vmcs12,
68cda40d 2970 enum vm_entry_failure_code *entry_failure_code)
55d2375e
SC
2971{
2972 bool ia32e;
2973
68cda40d 2974 *entry_failure_code = ENTRY_FAIL_DEFAULT;
55d2375e 2975
5497b955
SC
2976 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2977 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
c80add0f 2978 return -EINVAL;
55d2375e 2979
b91991bf
KS
2980 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
2981 CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
2982 return -EINVAL;
2983
de2bc2bf 2984 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
5497b955 2985 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
c80add0f 2986 return -EINVAL;
55d2375e
SC
2987
2988 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
68cda40d 2989 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
c80add0f 2990 return -EINVAL;
55d2375e
SC
2991 }
2992
bfc6ad6a
OU
2993 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2994 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2995 vmcs12->guest_ia32_perf_global_ctrl)))
2996 return -EINVAL;
2997
55d2375e
SC
2998 /*
2999 * If the load IA32_EFER VM-entry control is 1, the following checks
3000 * are performed on the field for the IA32_EFER MSR:
3001 * - Bits reserved in the IA32_EFER MSR must be 0.
3002 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3003 * the IA-32e mode guest VM-exit control. It must also be identical
3004 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3005 * CR0.PG) is 1.
3006 */
3007 if (to_vmx(vcpu)->nested.nested_run_pending &&
3008 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3009 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
5497b955
SC
3010 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3011 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3012 CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3013 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
c80add0f 3014 return -EINVAL;
55d2375e
SC
3015 }
3016
3017 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
5497b955
SC
3018 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3019 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
c80add0f 3020 return -EINVAL;
55d2375e 3021
9c3e922b 3022 if (nested_check_guest_non_reg_state(vmcs12))
c80add0f 3023 return -EINVAL;
55d2375e
SC
3024
3025 return 0;
3026}
3027
453eafbe 3028static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
55d2375e
SC
3029{
3030 struct vcpu_vmx *vmx = to_vmx(vcpu);
3031 unsigned long cr3, cr4;
f1727b49 3032 bool vm_fail;
55d2375e
SC
3033
3034 if (!nested_early_check)
3035 return 0;
3036
3037 if (vmx->msr_autoload.host.nr)
3038 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3039 if (vmx->msr_autoload.guest.nr)
3040 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3041
3042 preempt_disable();
3043
3044 vmx_prepare_switch_to_guest(vcpu);
3045
3046 /*
3047 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3048 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
49f933d4 3049 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
55d2375e
SC
3050 * there is no need to preserve other bits or save/restore the field.
3051 */
3052 vmcs_writel(GUEST_RFLAGS, 0);
3053
55d2375e
SC
3054 cr3 = __get_current_cr3_fast();
3055 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3056 vmcs_writel(HOST_CR3, cr3);
3057 vmx->loaded_vmcs->host_state.cr3 = cr3;
3058 }
3059
3060 cr4 = cr4_read_shadow();
3061 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3062 vmcs_writel(HOST_CR4, cr4);
3063 vmx->loaded_vmcs->host_state.cr4 = cr4;
3064 }
3065
150f17bf
UB
3066 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3067 vmx->loaded_vmcs->launched);
55d2375e 3068
55d2375e
SC
3069 if (vmx->msr_autoload.host.nr)
3070 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3071 if (vmx->msr_autoload.guest.nr)
3072 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3073
f1727b49 3074 if (vm_fail) {
380e0055
SC
3075 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3076
541e886f 3077 preempt_enable();
380e0055
SC
3078
3079 trace_kvm_nested_vmenter_failed(
3080 "early hardware check VM-instruction error: ", error);
3081 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
55d2375e
SC
3082 return 1;
3083 }
3084
3085 /*
3086 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3087 */
55d2375e
SC
3088 if (hw_breakpoint_active())
3089 set_debugreg(__this_cpu_read(cpu_dr7), 7);
84b6a349 3090 local_irq_enable();
541e886f 3091 preempt_enable();
55d2375e
SC
3092
3093 /*
3094 * A non-failing VMEntry means we somehow entered guest mode with
3095 * an illegal RIP, and that's just the tip of the iceberg. There
3096 * is no telling what memory has been modified or what state has
3097 * been exposed to unknown code. Hitting this all but guarantees
3098 * a (very critical) hardware issue.
3099 */
3100 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3101 VMX_EXIT_REASONS_FAILED_VMENTRY));
3102
3103 return 0;
3104}
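/*
 * Summary of the flow above (descriptive only): the early hardware check
 * runs only when the nested_early_check module parameter is set. It
 * temporarily zeroes the MSR autoload counts and GUEST_RFLAGS so that, if
 * the control and host-state fields pass hardware's checks, the entry takes
 * a failed-VM-entry VM-Exit instead of actually running L2; a VM-Fail here
 * is surfaced to L1 by the caller as an invalid-control-field error.
 */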
55d2375e 3105
9a78e158 3106static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
55d2375e 3107{
55d2375e 3108 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e 3109
e942dbf8
VK
3110 /*
 3111 * hv_evmcs may end up not being mapped after migration (when
 3112 * L2 was running); map it here to make sure vmcs12 changes are
3113 * properly reflected.
3114 */
1e9dfbd7 3115 if (vmx->nested.enlightened_vmcs_enabled &&
27849968 3116 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
b6a0653a
VK
3117 enum nested_evmptrld_status evmptrld_status =
3118 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3119
3120 if (evmptrld_status == EVMPTRLD_VMFAIL ||
f5c7e842 3121 evmptrld_status == EVMPTRLD_ERROR)
b6a0653a 3122 return false;
8629b625
VK
3123
3124 /*
 3125 * Post-migration, VMCS12 always provides the most up-to-date
 3126 * information; copy it to the eVMCS upon entry.
3127 */
3128 vmx->nested.need_vmcs12_to_shadow_sync = true;
b6a0653a 3129 }
e942dbf8 3130
9a78e158
PB
3131 return true;
3132}
3133
3134static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3135{
3136 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3137 struct vcpu_vmx *vmx = to_vmx(vcpu);
3138 struct kvm_host_map *map;
3139 struct page *page;
3140 u64 hpa;
3141
158a48ec
ML
3142 if (!vcpu->arch.pdptrs_from_userspace &&
3143 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
0f857223
ML
3144 /*
 3145 * Reload the guest's PDPTRs since, after a migration,
 3146 * the guest CR3 might be restored prior to setting the nested
 3147 * state, which can lead to the wrong PDPTRs being loaded.
3148 */
3149 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
3150 return false;
3151 }
3152
3153
55d2375e
SC
3154 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3155 /*
3156 * Translate L1 physical address to host physical
3157 * address for vmcs02. Keep the page pinned, so this
3158 * physical address remains valid. We keep a reference
3159 * to it so we can release it later.
3160 */
3161 if (vmx->nested.apic_access_page) { /* shouldn't happen */
b11494bc 3162 kvm_release_page_clean(vmx->nested.apic_access_page);
55d2375e
SC
3163 vmx->nested.apic_access_page = NULL;
3164 }
3165 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
55d2375e
SC
3166 if (!is_error_page(page)) {
3167 vmx->nested.apic_access_page = page;
3168 hpa = page_to_phys(vmx->nested.apic_access_page);
3169 vmcs_write64(APIC_ACCESS_ADDR, hpa);
3170 } else {
671ddc70
JM
3171 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3172 __func__);
3173 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3174 vcpu->run->internal.suberror =
3175 KVM_INTERNAL_ERROR_EMULATION;
3176 vcpu->run->internal.ndata = 0;
3177 return false;
55d2375e
SC
3178 }
3179 }
3180
3181 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
96c66e87 3182 map = &vmx->nested.virtual_apic_map;
55d2375e 3183
96c66e87
KA
3184 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3185 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
69090810
PB
3186 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3187 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3188 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3189 /*
3190 * The processor will never use the TPR shadow, simply
3191 * clear the bit from the execution control. Such a
3192 * configuration is useless, but it happens in tests.
3193 * For any other configuration, failing the vm entry is
3194 * _not_ what the processor does but it's basically the
3195 * only possibility we have.
3196 */
2183f564 3197 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
69090810 3198 } else {
ca2f5466
SC
3199 /*
3200 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3201 * force VM-Entry to fail.
3202 */
3203 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
55d2375e
SC
3204 }
3205 }
3206
3207 if (nested_cpu_has_posted_intr(vmcs12)) {
3278e049
KA
3208 map = &vmx->nested.pi_desc_map;
3209
3210 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3211 vmx->nested.pi_desc =
3212 (struct pi_desc *)(((void *)map->hva) +
3213 offset_in_page(vmcs12->posted_intr_desc_addr));
3214 vmcs_write64(POSTED_INTR_DESC_ADDR,
3215 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
966eefb8
JM
3216 } else {
3217 /*
3218 * Defer the KVM_INTERNAL_EXIT until KVM tries to
3219 * access the contents of the VMCS12 posted interrupt
3220 * descriptor. (Note that KVM may do this when it
3221 * should not, per the architectural specification.)
3222 */
3223 vmx->nested.pi_desc = NULL;
3224 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
55d2375e 3225 }
55d2375e
SC
3226 }
3227 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2183f564 3228 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
55d2375e 3229 else
2183f564 3230 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
9a78e158
PB
3231
3232 return true;
3233}
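/*
 * Descriptive note on the ordering above: after reloading stale PDPTRs for
 * a PAE guest when needed, nested_get_vmcs12_pages() pins or maps, in turn,
 * the APIC-access page, the virtual-APIC page and the posted-interrupt
 * descriptor referenced by vmcs12, and finally sets or clears
 * CPU_BASED_USE_MSR_BITMAPS in vmcs02 depending on whether the merged MSR
 * bitmap could be prepared.
 */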
3234
3235static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3236{
f5c7e842
VK
3237 if (!nested_get_evmcs_page(vcpu)) {
3238 pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3239 __func__);
3240 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3241 vcpu->run->internal.suberror =
3242 KVM_INTERNAL_ERROR_EMULATION;
3243 vcpu->run->internal.ndata = 0;
3244
9a78e158 3245 return false;
f5c7e842 3246 }
9a78e158
PB
3247
3248 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3249 return false;
3250
671ddc70 3251 return true;
55d2375e
SC
3252}
3253
02f5fb2e
SC
3254static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3255{
3256 struct vmcs12 *vmcs12;
3257 struct vcpu_vmx *vmx = to_vmx(vcpu);
3258 gpa_t dst;
3259
3260 if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3261 return 0;
3262
3263 if (WARN_ON_ONCE(vmx->nested.pml_full))
3264 return 1;
3265
3266 /*
3267 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3268 * set is already checked as part of A/D emulation.
3269 */
3270 vmcs12 = get_vmcs12(vcpu);
3271 if (!nested_cpu_has_pml(vmcs12))
3272 return 0;
3273
3274 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3275 vmx->nested.pml_full = true;
3276 return 1;
3277 }
3278
3279 gpa &= ~0xFFFull;
3280 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3281
3282 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3283 offset_in_page(dst), sizeof(gpa)))
3284 return 0;
3285
3286 vmcs12->guest_pml_index--;
3287
3288 return 0;
3289}
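/*
 * Illustrative walk-through of the arithmetic above (index and address
 * assumed for the example): with vmcs12->guest_pml_index == 2, a dirty GPA
 * of 0x12345678 is aligned down to 0x12345000 (gpa &= ~0xFFFull) and
 * written at vmcs12->pml_address + 2 * sizeof(u64); the index then drops
 * to 1. Once guest_pml_index falls outside the valid range, the next write
 * marks the nested PML buffer full instead.
 */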
3290
55d2375e
SC
3291/*
3292 * Intel's VMX Instruction Reference specifies a common set of prerequisites
3293 * for running VMX instructions (except VMXON, whose prerequisites are
3294 * slightly different). It also specifies what exception to inject otherwise.
3295 * Note that many of these exceptions have priority over VM exits, so they
3296 * don't have to be checked again here.
3297 */
3298static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3299{
3300 if (!to_vmx(vcpu)->nested.vmxon) {
3301 kvm_queue_exception(vcpu, UD_VECTOR);
3302 return 0;
3303 }
3304
3305 if (vmx_get_cpl(vcpu)) {
3306 kvm_inject_gp(vcpu, 0);
3307 return 0;
3308 }
3309
3310 return 1;
3311}
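/*
 * Example consequences of the checks above (scenarios only, no new
 * behavior): emulating e.g. VMREAD before L1 has executed VMXON raises #UD
 * in the guest, and emulating it from L1 user mode (CPL > 0) raises #GP(0);
 * in both cases the instruction handler proper is never reached.
 */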
3312
3313static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3314{
3315 u8 rvi = vmx_get_rvi();
3316 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3317
3318 return ((rvi & 0xf0) > (vppr & 0xf0));
3319}
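/*
 * Worked example for the priority-class comparison above (register values
 * assumed): RVI = 0x41 vs. VPPR = 0x30 compares 0x40 > 0x30, so a virtual
 * interrupt is deliverable; RVI = 0x35 against the same VPPR compares
 * 0x30 > 0x30, which is false, i.e. the pending vector is masked by PPR.
 */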
3320
3321static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3322 struct vmcs12 *vmcs12);
3323
3324/*
3325 * If from_vmentry is false, this is being called from state restore (either RSM
3326 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
671ddc70
JM
3327 *
3328 * Returns:
463bfeee
ML
3329 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3330 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3331 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3332 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
55d2375e 3333 */
671ddc70
JM
3334enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3335 bool from_vmentry)
55d2375e
SC
3336{
3337 struct vcpu_vmx *vmx = to_vmx(vcpu);
3338 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
68cda40d 3339 enum vm_entry_failure_code entry_failure_code;
55d2375e 3340 bool evaluate_pending_interrupts;
8e533240
SC
3341 union vmx_exit_reason exit_reason = {
3342 .basic = EXIT_REASON_INVALID_STATE,
3343 .failed_vmentry = 1,
3344 };
3345 u32 failed_index;
55d2375e 3346
eeeb4f67
SC
3347 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3348 kvm_vcpu_flush_tlb_current(vcpu);
3349
2183f564 3350 evaluate_pending_interrupts = exec_controls_get(vmx) &
4e2a0bc5 3351 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
55d2375e
SC
3352 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3353 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3354
3355 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3356 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3357 if (kvm_mpx_supported() &&
3358 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3359 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3360
f087a029
SC
3361 /*
3362 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3363 * nested early checks are disabled. In the event of a "late" VM-Fail,
3364 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3365 * software model to the pre-VMEntry host state. When EPT is disabled,
3366 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3367 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3368 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3369 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3370 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3371 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3372 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3373 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3374 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3375 * path would need to manually save/restore vmcs01.GUEST_CR3.
3376 */
3377 if (!enable_ept && !nested_early_check)
3378 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3379
55d2375e
SC
3380 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3381
3382 prepare_vmcs02_early(vmx, vmcs12);
3383
3384 if (from_vmentry) {
b89d5ad0
SC
3385 if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3386 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
671ddc70 3387 return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
b89d5ad0 3388 }
55d2375e
SC
3389
3390 if (nested_vmx_check_vmentry_hw(vcpu)) {
3391 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
671ddc70 3392 return NVMX_VMENTRY_VMFAIL;
55d2375e
SC
3393 }
3394
68cda40d
SC
3395 if (nested_vmx_check_guest_state(vcpu, vmcs12,
3396 &entry_failure_code)) {
8e533240 3397 exit_reason.basic = EXIT_REASON_INVALID_STATE;
68cda40d 3398 vmcs12->exit_qualification = entry_failure_code;
55d2375e 3399 goto vmentry_fail_vmexit;
68cda40d 3400 }
55d2375e
SC
3401 }
3402
3403 enter_guest_mode(vcpu);
55d2375e 3404
0f857223 3405 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
8e533240 3406 exit_reason.basic = EXIT_REASON_INVALID_STATE;
68cda40d 3407 vmcs12->exit_qualification = entry_failure_code;
55d2375e 3408 goto vmentry_fail_vmexit_guest_mode;
68cda40d 3409 }
55d2375e
SC
3410
3411 if (from_vmentry) {
68cda40d
SC
3412 failed_index = nested_vmx_load_msr(vcpu,
3413 vmcs12->vm_entry_msr_load_addr,
3414 vmcs12->vm_entry_msr_load_count);
3415 if (failed_index) {
8e533240 3416 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
68cda40d 3417 vmcs12->exit_qualification = failed_index;
55d2375e 3418 goto vmentry_fail_vmexit_guest_mode;
68cda40d 3419 }
55d2375e
SC
3420 } else {
3421 /*
3422 * The MMU is not initialized to point at the right entities yet and
3423 * "get pages" would need to read data from the guest (i.e. we will
3424 * need to perform gpa to hpa translation). Request a call
3425 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3426 * have already been set at vmentry time and should not be reset.
3427 */
729c15c2 3428 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
55d2375e
SC
3429 }
3430
3431 /*
3432 * If L1 had a pending IRQ/NMI until it executed
3433 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3434 * disallowed (e.g. interrupts disabled), L0 needs to
3435 * evaluate if this pending event should cause an exit from L2
 3436 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
 3437 * intercept EXTERNAL_INTERRUPT).
3438 *
3439 * Usually this would be handled by the processor noticing an
3440 * IRQ/NMI window request, or checking RVI during evaluation of
3441 * pending virtual interrupts. However, this setting was done
3442 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3443 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3444 */
3445 if (unlikely(evaluate_pending_interrupts))
3446 kvm_make_request(KVM_REQ_EVENT, vcpu);
3447
359a6c3d
PB
3448 /*
3449 * Do not start the preemption timer hrtimer until after we know
3450 * we are successful, so that only nested_vmx_vmexit needs to cancel
3451 * the timer.
3452 */
3453 vmx->nested.preemption_timer_expired = false;
850448f3
PS
3454 if (nested_cpu_has_preemption_timer(vmcs12)) {
3455 u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3456 vmx_start_preemption_timer(vcpu, timer_value);
3457 }
359a6c3d 3458
55d2375e
SC
3459 /*
3460 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3461 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3462 * returned as far as L1 is concerned. It will only return (and set
3463 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3464 */
671ddc70 3465 return NVMX_VMENTRY_SUCCESS;
55d2375e
SC
3466
3467 /*
3468 * A failed consistency check that leads to a VMExit during L1's
3469 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3470 * 26.7 "VM-entry failures during or after loading guest state".
3471 */
3472vmentry_fail_vmexit_guest_mode:
5e3d394f 3473 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
55d2375e
SC
3474 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3475 leave_guest_mode(vcpu);
3476
3477vmentry_fail_vmexit:
3478 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3479
3480 if (!from_vmentry)
671ddc70 3481 return NVMX_VMENTRY_VMEXIT;
55d2375e
SC
3482
3483 load_vmcs12_host_state(vcpu, vmcs12);
8e533240 3484 vmcs12->vm_exit_reason = exit_reason.full;
1e9dfbd7 3485 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
3731905e 3486 vmx->nested.need_vmcs12_to_shadow_sync = true;
671ddc70 3487 return NVMX_VMENTRY_VMEXIT;
55d2375e
SC
3488}
3489
3490/*
3491 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3492 * for running an L2 nested guest.
3493 */
3494static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3495{
3496 struct vmcs12 *vmcs12;
671ddc70 3497 enum nvmx_vmentry_status status;
55d2375e
SC
3498 struct vcpu_vmx *vmx = to_vmx(vcpu);
3499 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
b6a0653a 3500 enum nested_evmptrld_status evmptrld_status;
55d2375e
SC
3501
3502 if (!nested_vmx_check_permission(vcpu))
3503 return 1;
3504
b6a0653a
VK
3505 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3506 if (evmptrld_status == EVMPTRLD_ERROR) {
3507 kvm_queue_exception(vcpu, UD_VECTOR);
55d2375e 3508 return 1;
fc595f35 3509 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
b6a0653a
VK
3510 return nested_vmx_failInvalid(vcpu);
3511 }
55d2375e 3512
1e9dfbd7
VK
3513 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
3514 vmx->nested.current_vmptr == -1ull))
55d2375e
SC
3515 return nested_vmx_failInvalid(vcpu);
3516
3517 vmcs12 = get_vmcs12(vcpu);
3518
3519 /*
3520 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3521 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3522 * rather than RFLAGS.ZF, and no error number is stored to the
3523 * VM-instruction error field.
3524 */
fc595f35 3525 if (CC(vmcs12->hdr.shadow_vmcs))
55d2375e
SC
3526 return nested_vmx_failInvalid(vcpu);
3527
1e9dfbd7 3528 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
d6bf71a1 3529 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
55d2375e
SC
3530 /* Enlightened VMCS doesn't have launch state */
3531 vmcs12->launch_state = !launch;
3532 } else if (enable_shadow_vmcs) {
3533 copy_shadow_to_vmcs12(vmx);
3534 }
3535
3536 /*
3537 * The nested entry process starts with enforcing various prerequisites
 3538 * on vmcs12 as required by the Intel SDM, and acting appropriately when
 3539 * they fail: as the SDM explains, some conditions should cause the
3540 * instruction to fail, while others will cause the instruction to seem
3541 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3542 * To speed up the normal (success) code path, we should avoid checking
3543 * for misconfigurations which will anyway be caught by the processor
3544 * when using the merged vmcs02.
3545 */
fc595f35 3546 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
b2656e4d 3547 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
55d2375e 3548
fc595f35 3549 if (CC(vmcs12->launch_state == launch))
b2656e4d 3550 return nested_vmx_fail(vcpu,
55d2375e
SC
3551 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3552 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3553
98d9e858 3554 if (nested_vmx_check_controls(vcpu, vmcs12))
b2656e4d 3555 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
5478ba34 3556
98d9e858 3557 if (nested_vmx_check_host_state(vcpu, vmcs12))
b2656e4d 3558 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
55d2375e
SC
3559
3560 /*
3561 * We're finally done with prerequisite checking, and can start with
3562 * the nested entry.
3563 */
3564 vmx->nested.nested_run_pending = 1;
850448f3 3565 vmx->nested.has_preemption_timer_deadline = false;
671ddc70
JM
3566 status = nested_vmx_enter_non_root_mode(vcpu, true);
3567 if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3568 goto vmentry_failed;
55d2375e 3569
25bb2cf9
SC
3570 /* Emulate processing of posted interrupts on VM-Enter. */
3571 if (nested_cpu_has_posted_intr(vmcs12) &&
3572 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3573 vmx->nested.pi_pending = true;
3574 kvm_make_request(KVM_REQ_EVENT, vcpu);
3575 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3576 }
3577
55d2375e
SC
3578 /* Hide L1D cache contents from the nested guest. */
3579 vmx->vcpu.arch.l1tf_flush_l1d = true;
3580
3581 /*
3582 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3583 * also be used as part of restoring nVMX state for
3584 * snapshot restore (migration).
3585 *
3586 * In this flow, it is assumed that vmcs12 cache was
163b0991 3587 * transferred as part of captured nVMX state and should
55d2375e
SC
3588 * therefore not be read from guest memory (which may not
3589 * exist on destination host yet).
3590 */
3591 nested_cache_shadow_vmcs12(vcpu, vmcs12);
3592
bf0cd88c
YQ
3593 switch (vmcs12->guest_activity_state) {
3594 case GUEST_ACTIVITY_HLT:
3595 /*
3596 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3597 * awakened by event injection or by an NMI-window VM-exit or
3598 * by an interrupt-window VM-exit, halt the vcpu.
3599 */
3600 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3601 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3602 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3603 (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3604 vmx->nested.nested_run_pending = 0;
3605 return kvm_vcpu_halt(vcpu);
3606 }
3607 break;
3608 case GUEST_ACTIVITY_WAIT_SIPI:
55d2375e 3609 vmx->nested.nested_run_pending = 0;
bf0cd88c
YQ
3610 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
3611 break;
3612 default:
3613 break;
55d2375e 3614 }
bf0cd88c 3615
55d2375e 3616 return 1;
671ddc70
JM
3617
3618vmentry_failed:
3619 vmx->nested.nested_run_pending = 0;
3620 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3621 return 0;
3622 if (status == NVMX_VMENTRY_VMEXIT)
3623 return 1;
3624 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
b2656e4d 3625 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
55d2375e
SC
3626}
3627
3628/*
3629 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
67b0ae43 3630 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
55d2375e
SC
3631 * This function returns the new value we should put in vmcs12.guest_cr0.
3632 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3633 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3634 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3635 * didn't trap the bit, because if L1 did, so would L0).
3636 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3637 * been modified by L2, and L1 knows it. So just leave the old value of
3638 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3639 * isn't relevant, because if L0 traps this bit it can set it to anything.
3640 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3641 * changed these bits, and therefore they need to be updated, but L0
3642 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3643 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3644 */
3645static inline unsigned long
3646vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3647{
3648 return
3649 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3650 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3651 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3652 vcpu->arch.cr0_guest_owned_bits));
3653}
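/*
 * Illustrative reading of the three terms above (the bit is chosen only for
 * the example): if L1 leaves CR0.TS out of vmcs12->cr0_guest_host_mask but
 * L0 still intercepts it (so it is absent from cr0_guest_owned_bits), the
 * value L2 last wrote lives in vmcs02's CR0_READ_SHADOW and is picked up by
 * the third term; had L1 trapped the bit, the second term would keep
 * vmcs12->guest_cr0 unchanged instead.
 */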
3654
3655static inline unsigned long
3656vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3657{
3658 return
3659 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3660 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3661 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3662 vcpu->arch.cr4_guest_owned_bits));
3663}
3664
3665static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3666 struct vmcs12 *vmcs12)
3667{
3668 u32 idt_vectoring;
3669 unsigned int nr;
3670
3671 if (vcpu->arch.exception.injected) {
3672 nr = vcpu->arch.exception.nr;
3673 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3674
3675 if (kvm_exception_is_soft(nr)) {
3676 vmcs12->vm_exit_instruction_len =
3677 vcpu->arch.event_exit_inst_len;
3678 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3679 } else
3680 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3681
3682 if (vcpu->arch.exception.has_error_code) {
3683 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3684 vmcs12->idt_vectoring_error_code =
3685 vcpu->arch.exception.error_code;
3686 }
3687
3688 vmcs12->idt_vectoring_info_field = idt_vectoring;
3689 } else if (vcpu->arch.nmi_injected) {
3690 vmcs12->idt_vectoring_info_field =
3691 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3692 } else if (vcpu->arch.interrupt.injected) {
3693 nr = vcpu->arch.interrupt.nr;
3694 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3695
3696 if (vcpu->arch.interrupt.soft) {
3697 idt_vectoring |= INTR_TYPE_SOFT_INTR;
3698 vmcs12->vm_entry_instruction_len =
3699 vcpu->arch.event_exit_inst_len;
3700 } else
3701 idt_vectoring |= INTR_TYPE_EXT_INTR;
3702
3703 vmcs12->idt_vectoring_info_field = idt_vectoring;
3704 }
3705}
3706
3707
96b100cd 3708void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
55d2375e
SC
3709{
3710 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3711 gfn_t gfn;
3712
3713 /*
3714 * Don't need to mark the APIC access page dirty; it is never
3715 * written to by the CPU during APIC virtualization.
3716 */
3717
3718 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3719 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3720 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3721 }
3722
3723 if (nested_cpu_has_posted_intr(vmcs12)) {
3724 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3725 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3726 }
3727}
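/*
 * Example of the gfn derivation above (address assumed): a
 * virtual_apic_page_addr of 0xfee00000 yields gfn 0xfee00 after the
 * PAGE_SHIFT right-shift, and that is the page marked dirty for L1.
 */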
3728
650293c3 3729static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
55d2375e
SC
3730{
3731 struct vcpu_vmx *vmx = to_vmx(vcpu);
3732 int max_irr;
3733 void *vapic_page;
3734 u16 status;
3735
966eefb8 3736 if (!vmx->nested.pi_pending)
650293c3 3737 return 0;
55d2375e 3738
966eefb8
JM
3739 if (!vmx->nested.pi_desc)
3740 goto mmio_needed;
3741
55d2375e 3742 vmx->nested.pi_pending = false;
966eefb8 3743
55d2375e 3744 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
650293c3 3745 return 0;
55d2375e
SC
3746
3747 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3748 if (max_irr != 256) {
96c66e87
KA
3749 vapic_page = vmx->nested.virtual_apic_map.hva;
3750 if (!vapic_page)
0fe998b2 3751 goto mmio_needed;
96c66e87 3752
55d2375e
SC
3753 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3754 vapic_page, &max_irr);
55d2375e
SC
3755 status = vmcs_read16(GUEST_INTR_STATUS);
3756 if ((u8)max_irr > ((u8)status & 0xff)) {
3757 status &= ~0xff;
3758 status |= (u8)max_irr;
3759 vmcs_write16(GUEST_INTR_STATUS, status);
3760 }
3761 }
3762
3763 nested_mark_vmcs12_pages_dirty(vcpu);
650293c3 3764 return 0;
0fe998b2
JM
3765
3766mmio_needed:
3767 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3768 return -ENXIO;
55d2375e
SC
3769}
3770
3771static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3772 unsigned long exit_qual)
3773{
3774 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3775 unsigned int nr = vcpu->arch.exception.nr;
3776 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3777
3778 if (vcpu->arch.exception.has_error_code) {
3779 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3780 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3781 }
3782
3783 if (kvm_exception_is_soft(nr))
3784 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3785 else
3786 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3787
3788 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3789 vmx_get_nmi_mask(vcpu))
3790 intr_info |= INTR_INFO_UNBLOCK_NMI;
3791
3792 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3793}
3794
684c0422
OU
3795/*
3796 * Returns true if a debug trap is pending delivery.
3797 *
3798 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3799 * exception may be inferred from the presence of an exception payload.
3800 */
3801static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3802{
3803 return vcpu->arch.exception.pending &&
3804 vcpu->arch.exception.nr == DB_VECTOR &&
3805 vcpu->arch.exception.payload;
3806}
3807
3808/*
3809 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3810 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3811 * represents these debug traps with a payload that is said to be compatible
3812 * with the 'pending debug exceptions' field, write the payload to the VMCS
3813 * field if a VM-exit is delivered before the debug trap.
3814 */
3815static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3816{
3817 if (vmx_pending_dbg_trap(vcpu))
3818 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3819 vcpu->arch.exception.payload);
3820}
3821
d2060bd4
SC
3822static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
3823{
3824 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3825 to_vmx(vcpu)->nested.preemption_timer_expired;
3826}
3827
a1c77abb 3828static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
55d2375e
SC
3829{
3830 struct vcpu_vmx *vmx = to_vmx(vcpu);
3831 unsigned long exit_qual;
3832 bool block_nested_events =
3833 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
5ef8acbd 3834 bool mtf_pending = vmx->nested.mtf_pending;
4b9852f4
LA
3835 struct kvm_lapic *apic = vcpu->arch.apic;
3836
5ef8acbd
OU
3837 /*
3838 * Clear the MTF state. If a higher priority VM-exit is delivered first,
3839 * this state is discarded.
3840 */
5c8beb47
OU
3841 if (!block_nested_events)
3842 vmx->nested.mtf_pending = false;
5ef8acbd 3843
4b9852f4
LA
3844 if (lapic_in_kernel(vcpu) &&
3845 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3846 if (block_nested_events)
3847 return -EBUSY;
684c0422 3848 nested_vmx_update_pending_dbg(vcpu);
e64a8508 3849 clear_bit(KVM_APIC_INIT, &apic->pending_events);
bf0cd88c
YQ
3850 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
3851 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3852 return 0;
3853 }
3854
3855 if (lapic_in_kernel(vcpu) &&
3856 test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3857 if (block_nested_events)
3858 return -EBUSY;
3859
3860 clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3861 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3862 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
3863 apic->sipi_vector & 0xFFUL);
4b9852f4
LA
3864 return 0;
3865 }
55d2375e 3866
5ef8acbd
OU
3867 /*
3868 * Process any exceptions that are not debug traps before MTF.
4020da3b
ML
3869 *
3870 * Note that only a pending nested run can block a pending exception.
3871 * Otherwise an injected NMI/interrupt should either be
3872 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
3873 * while delivering the pending exception.
5ef8acbd 3874 */
4020da3b 3875
6ce347af 3876 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
4020da3b 3877 if (vmx->nested.nested_run_pending)
5ef8acbd 3878 return -EBUSY;
6ce347af
SC
3879 if (!nested_vmx_check_exception(vcpu, &exit_qual))
3880 goto no_vmexit;
5ef8acbd
OU
3881 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3882 return 0;
3883 }
3884
3885 if (mtf_pending) {
3886 if (block_nested_events)
3887 return -EBUSY;
3888 nested_vmx_update_pending_dbg(vcpu);
3889 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3890 return 0;
3891 }
3892
6ce347af 3893 if (vcpu->arch.exception.pending) {
4020da3b 3894 if (vmx->nested.nested_run_pending)
55d2375e 3895 return -EBUSY;
6ce347af
SC
3896 if (!nested_vmx_check_exception(vcpu, &exit_qual))
3897 goto no_vmexit;
55d2375e
SC
3898 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3899 return 0;
3900 }
3901
d2060bd4 3902 if (nested_vmx_preemption_timer_pending(vcpu)) {
55d2375e
SC
3903 if (block_nested_events)
3904 return -EBUSY;
3905 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3906 return 0;
3907 }
3908
1cd2f0b0
SC
3909 if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
3910 if (block_nested_events)
3911 return -EBUSY;
3912 goto no_vmexit;
3913 }
3914
15ff0b45 3915 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
55d2375e
SC
3916 if (block_nested_events)
3917 return -EBUSY;
15ff0b45
SC
3918 if (!nested_exit_on_nmi(vcpu))
3919 goto no_vmexit;
3920
55d2375e
SC
3921 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3922 NMI_VECTOR | INTR_TYPE_NMI_INTR |
3923 INTR_INFO_VALID_MASK, 0);
3924 /*
3925 * The NMI-triggered VM exit counts as injection:
3926 * clear this one and block further NMIs.
3927 */
3928 vcpu->arch.nmi_pending = 0;
3929 vmx_set_nmi_mask(vcpu, true);
3930 return 0;
3931 }
3932
15ff0b45 3933 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
55d2375e
SC
3934 if (block_nested_events)
3935 return -EBUSY;
15ff0b45
SC
3936 if (!nested_exit_on_intr(vcpu))
3937 goto no_vmexit;
55d2375e
SC
3938 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3939 return 0;
3940 }
3941
6ce347af 3942no_vmexit:
650293c3 3943 return vmx_complete_nested_posted_interrupt(vcpu);
55d2375e
SC
3944}
3945
3946static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3947{
3948 ktime_t remaining =
3949 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3950 u64 value;
3951
3952 if (ktime_to_ns(remaining) <= 0)
3953 return 0;
3954
3955 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3956 do_div(value, 1000000);
3957 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3958}
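/*
 * Worked example for the conversion above (TSC rate assumed): with
 * virtual_tsc_khz = 2000000 (a 2 GHz guest TSC) and 1 ms left on the
 * hrtimer, value = 1000000 ns * 2000000 / 1000000 = 2000000 TSC ticks,
 * which is then right-shifted by VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE
 * to give the units L1 sees in the VMX-preemption timer field.
 */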
3959
7952d769 3960static bool is_vmcs12_ext_field(unsigned long field)
55d2375e 3961{
7952d769
SC
3962 switch (field) {
3963 case GUEST_ES_SELECTOR:
3964 case GUEST_CS_SELECTOR:
3965 case GUEST_SS_SELECTOR:
3966 case GUEST_DS_SELECTOR:
3967 case GUEST_FS_SELECTOR:
3968 case GUEST_GS_SELECTOR:
3969 case GUEST_LDTR_SELECTOR:
3970 case GUEST_TR_SELECTOR:
3971 case GUEST_ES_LIMIT:
3972 case GUEST_CS_LIMIT:
3973 case GUEST_SS_LIMIT:
3974 case GUEST_DS_LIMIT:
3975 case GUEST_FS_LIMIT:
3976 case GUEST_GS_LIMIT:
3977 case GUEST_LDTR_LIMIT:
3978 case GUEST_TR_LIMIT:
3979 case GUEST_GDTR_LIMIT:
3980 case GUEST_IDTR_LIMIT:
3981 case GUEST_ES_AR_BYTES:
3982 case GUEST_DS_AR_BYTES:
3983 case GUEST_FS_AR_BYTES:
3984 case GUEST_GS_AR_BYTES:
3985 case GUEST_LDTR_AR_BYTES:
3986 case GUEST_TR_AR_BYTES:
3987 case GUEST_ES_BASE:
3988 case GUEST_CS_BASE:
3989 case GUEST_SS_BASE:
3990 case GUEST_DS_BASE:
3991 case GUEST_FS_BASE:
3992 case GUEST_GS_BASE:
3993 case GUEST_LDTR_BASE:
3994 case GUEST_TR_BASE:
3995 case GUEST_GDTR_BASE:
3996 case GUEST_IDTR_BASE:
3997 case GUEST_PENDING_DBG_EXCEPTIONS:
3998 case GUEST_BNDCFGS:
3999 return true;
4000 default:
4001 break;
4002 }
55d2375e 4003
7952d769
SC
4004 return false;
4005}
4006
4007static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4008 struct vmcs12 *vmcs12)
4009{
4010 struct vcpu_vmx *vmx = to_vmx(vcpu);
55d2375e
SC
4011
4012 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4013 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4014 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4015 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4016 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4017 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4018 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4019 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4020 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4021 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4022 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4023 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4024 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4025 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4026 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4027 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4028 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4029 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4030 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
55d2375e
SC
4031 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4032 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4033 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4034 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4035 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4036 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4037 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4038 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4039 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4040 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4041 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4042 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4043 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4044 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4045 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
7952d769
SC
4046 vmcs12->guest_pending_dbg_exceptions =
4047 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4048 if (kvm_mpx_supported())
4049 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
4050
4051 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4052}
4053
4054static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4055 struct vmcs12 *vmcs12)
4056{
4057 struct vcpu_vmx *vmx = to_vmx(vcpu);
4058 int cpu;
4059
4060 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4061 return;
4062
4063
4064 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4065
4066 cpu = get_cpu();
4067 vmx->loaded_vmcs = &vmx->nested.vmcs02;
1af1bb05 4068 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
7952d769
SC
4069
4070 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4071
4072 vmx->loaded_vmcs = &vmx->vmcs01;
1af1bb05 4073 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
7952d769
SC
4074 put_cpu();
4075}
4076
4077/*
4078 * Update the guest state fields of vmcs12 to reflect changes that
4079 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4080 * VM-entry controls is also updated, since this is really a guest
4081 * state bit.)
4082 */
4083static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4084{
4085 struct vcpu_vmx *vmx = to_vmx(vcpu);
4086
1e9dfbd7 4087 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
7952d769
SC
4088 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4089
1e9dfbd7
VK
4090 vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4091 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
7952d769
SC
4092
4093 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4094 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4095
4096 vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4097 vmcs12->guest_rip = kvm_rip_read(vcpu);
4098 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4099
4100 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4101 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
55d2375e
SC
4102
4103 vmcs12->guest_interruptibility_info =
4104 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
7952d769 4105
55d2375e
SC
4106 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4107 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
bf0cd88c
YQ
4108 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4109 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
55d2375e
SC
4110 else
4111 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4112
b4b65b56 4113 if (nested_cpu_has_preemption_timer(vmcs12) &&
850448f3
PS
4114 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4115 !vmx->nested.nested_run_pending)
4116 vmcs12->vmx_preemption_timer_value =
4117 vmx_get_preemption_timer_value(vcpu);
55d2375e
SC
4118
4119 /*
4120 * In some cases (usually, nested EPT), L2 is allowed to change its
4121 * own CR3 without exiting. If it has changed it, we must keep it.
4122 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4123 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4124 *
 4125 * Additionally, restore L2's PDPTRs to vmcs12.
4126 */
4127 if (enable_ept) {
4128 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
c7554efc
SC
4129 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4130 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4131 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4132 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4133 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4134 }
55d2375e
SC
4135 }
4136
4137 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4138
4139 if (nested_cpu_has_vid(vmcs12))
4140 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4141
4142 vmcs12->vm_entry_controls =
4143 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4144 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4145
699a1ac2 4146 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
55d2375e 4147 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
55d2375e 4148
55d2375e
SC
4149 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4150 vmcs12->guest_ia32_efer = vcpu->arch.efer;
55d2375e
SC
4151}
4152
4153/*
4154 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4155 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4156 * and this function updates it to reflect the changes to the guest state while
4157 * L2 was running (and perhaps made some exits which were handled directly by L0
4158 * without going back to L1), and to reflect the exit reason.
4159 * Note that we do not have to copy here all VMCS fields, just those that
4160 * could have changed by the L2 guest or the exit - i.e., the guest-state and
4161 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4162 * which already writes to vmcs12 directly.
4163 */
4164static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4dcefa31 4165 u32 vm_exit_reason, u32 exit_intr_info,
55d2375e
SC
4166 unsigned long exit_qualification)
4167{
55d2375e 4168 /* update exit information fields: */
4dcefa31 4169 vmcs12->vm_exit_reason = vm_exit_reason;
3c0c2ad1
SC
4170 if (to_vmx(vcpu)->exit_reason.enclave_mode)
4171 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
55d2375e
SC
4172 vmcs12->exit_qualification = exit_qualification;
4173 vmcs12->vm_exit_intr_info = exit_intr_info;
4174
4175 vmcs12->idt_vectoring_info_field = 0;
4176 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4177 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4178
4179 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4180 vmcs12->launch_state = 1;
4181
4182 /* vm_entry_intr_info_field is cleared on exit. Emulate this
4183 * instead of reading the real value. */
4184 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4185
4186 /*
 4187 * Transfer the event that L0 or L1 may have wanted to inject into
4188 * L2 to IDT_VECTORING_INFO_FIELD.
4189 */
4190 vmcs12_save_pending_event(vcpu, vmcs12);
a0d4f803
KS
4191
4192 /*
4193 * According to spec, there's no need to store the guest's
4194 * MSRs if the exit is due to a VM-entry failure that occurs
4195 * during or after loading the guest state. Since this exit
4196 * does not fall in that category, we need to save the MSRs.
4197 */
4198 if (nested_vmx_store_msr(vcpu,
4199 vmcs12->vm_exit_msr_store_addr,
4200 vmcs12->vm_exit_msr_store_count))
4201 nested_vmx_abort(vcpu,
4202 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
55d2375e
SC
4203 }
4204
4205 /*
4206 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4207 * preserved above and would only end up incorrectly in L1.
4208 */
4209 vcpu->arch.nmi_injected = false;
4210 kvm_clear_exception_queue(vcpu);
4211 kvm_clear_interrupt_queue(vcpu);
4212}
4213
4214/*
 4215 * Part of what we need to do when the nested L2 guest exits and we want to
 4216 * run its L1 parent is to reset L1's guest state to the host state specified
4217 * in vmcs12.
4218 * This function is to be called not only on normal nested exit, but also on
4219 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4220 * Failures During or After Loading Guest State").
4221 * This function should be called when the active VMCS is L1's (vmcs01).
4222 */
4223static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4224 struct vmcs12 *vmcs12)
4225{
68cda40d 4226 enum vm_entry_failure_code ignored;
55d2375e 4227 struct kvm_segment seg;
55d2375e
SC
4228
4229 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4230 vcpu->arch.efer = vmcs12->host_ia32_efer;
4231 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4232 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4233 else
4234 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4235 vmx_set_efer(vcpu, vcpu->arch.efer);
4236
e9c16c78
PB
4237 kvm_rsp_write(vcpu, vmcs12->host_rsp);
4238 kvm_rip_write(vcpu, vmcs12->host_rip);
55d2375e
SC
4239 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4240 vmx_set_interrupt_shadow(vcpu, 0);
4241
4242 /*
4243 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
 4244 * actually changed, because vmx_set_cr0 refers to the EFER value set above.
 4245 *
 4246 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
 4247 * (KVM doesn't change it).
4248 */
fa71e952 4249 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
55d2375e
SC
4250 vmx_set_cr0(vcpu, vmcs12->host_cr0);
4251
4252 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4253 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4254 vmx_set_cr4(vcpu, vmcs12->host_cr4);
4255
4256 nested_ept_uninit_mmu_context(vcpu);
4257
4258 /*
4259 * Only PDPTE load can fail as the value of cr3 was checked on entry and
4260 * couldn't have changed.
4261 */
0f857223 4262 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
55d2375e
SC
4263 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4264
50b265a4 4265 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
55d2375e
SC
4266
4267 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4268 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4269 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4270 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4271 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4272 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4273 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4274
4275 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4276 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4277 vmcs_write64(GUEST_BNDCFGS, 0);
4278
4279 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4280 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4281 vcpu->arch.pat = vmcs12->host_ia32_pat;
4282 }
4283 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
d1968421
OU
4284 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4285 vmcs12->host_ia32_perf_global_ctrl));
55d2375e
SC
4286
4287 /* Set L1 segment info according to Intel SDM
4288 27.5.2 Loading Host Segment and Descriptor-Table Registers */
4289 seg = (struct kvm_segment) {
4290 .base = 0,
4291 .limit = 0xFFFFFFFF,
4292 .selector = vmcs12->host_cs_selector,
4293 .type = 11,
4294 .present = 1,
4295 .s = 1,
4296 .g = 1
4297 };
4298 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4299 seg.l = 1;
4300 else
4301 seg.db = 1;
4302 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4303 seg = (struct kvm_segment) {
4304 .base = 0,
4305 .limit = 0xFFFFFFFF,
4306 .type = 3,
4307 .present = 1,
4308 .s = 1,
4309 .db = 1,
4310 .g = 1
4311 };
4312 seg.selector = vmcs12->host_ds_selector;
4313 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4314 seg.selector = vmcs12->host_es_selector;
4315 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4316 seg.selector = vmcs12->host_ss_selector;
4317 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4318 seg.selector = vmcs12->host_fs_selector;
4319 seg.base = vmcs12->host_fs_base;
4320 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4321 seg.selector = vmcs12->host_gs_selector;
4322 seg.base = vmcs12->host_gs_base;
4323 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4324 seg = (struct kvm_segment) {
4325 .base = vmcs12->host_tr_base,
4326 .limit = 0x67,
4327 .selector = vmcs12->host_tr_selector,
4328 .type = 11,
4329 .present = 1
4330 };
4331 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4332
4333 kvm_set_dr(vcpu, 7, 0x400);
4334 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4335
4336 if (cpu_has_vmx_msr_bitmap())
4337 vmx_update_msr_bitmap(vcpu);
4338
4339 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4340 vmcs12->vm_exit_msr_load_count))
4341 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4342}
4343
4344static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4345{
eb3db1b1 4346 struct vmx_uret_msr *efer_msr;
55d2375e
SC
4347 unsigned int i;
4348
4349 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4350 return vmcs_read64(GUEST_IA32_EFER);
4351
4352 if (cpu_has_load_ia32_efer())
4353 return host_efer;
4354
4355 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4356 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4357 return vmx->msr_autoload.guest.val[i].value;
4358 }
4359
d85a8034 4360 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
55d2375e
SC
4361 if (efer_msr)
4362 return efer_msr->data;
4363
4364 return host_efer;
4365}
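/*
 * Summary of the fallback order above (descriptive only): GUEST_IA32_EFER
 * when vmcs01's entry control loads EFER, else host_efer when
 * cpu_has_load_ia32_efer(), else the MSR_EFER entry on the VM-entry
 * autoload list, else the user-return MSR slot, with host_efer as the
 * final fallback.
 */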
4366
4367static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4368{
4369 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4370 struct vcpu_vmx *vmx = to_vmx(vcpu);
4371 struct vmx_msr_entry g, h;
55d2375e
SC
4372 gpa_t gpa;
4373 u32 i, j;
4374
4375 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4376
4377 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4378 /*
4379 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4380 * as vmcs01.GUEST_DR7 contains a userspace defined value
4381 * and vcpu->arch.dr7 is not squirreled away before the
4382 * nested VMENTER (not worth adding a variable in nested_vmx).
4383 */
4384 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4385 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4386 else
4387 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4388 }
4389
4390 /*
4391 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4392 * handle a variety of side effects to KVM's software model.
4393 */
4394 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4395
fa71e952 4396 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
55d2375e
SC
4397 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4398
4399 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4400 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4401
4402 nested_ept_uninit_mmu_context(vcpu);
f087a029 4403 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
cb3c1e2f 4404 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
55d2375e
SC
4405
4406 /*
4407 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4408 * from vmcs01 (if necessary). The PDPTRs are not loaded on
 4409 * VMFail; like everything else, we just need to ensure our
4410 * software model is up-to-date.
4411 */
9932b49e 4412 if (enable_ept && is_pae_paging(vcpu))
f087a029 4413 ept_save_pdptrs(vcpu);
55d2375e
SC
4414
4415 kvm_mmu_reset_context(vcpu);
4416
4417 if (cpu_has_vmx_msr_bitmap())
4418 vmx_update_msr_bitmap(vcpu);
4419
4420 /*
4421 * This nasty bit of open coding is a compromise between blindly
4422 * loading L1's MSRs using the exit load lists (incorrect emulation
4423 * of VMFail), leaving the nested VM's MSRs in the software model
4424 * (incorrect behavior) and snapshotting the modified MSRs (too
4425 * expensive since the lists are unbound by hardware). For each
4426 * MSR that was (prematurely) loaded from the nested VMEntry load
4427 * list, reload it from the exit load list if it exists and differs
4428 * from the guest value. The intent is to stuff host state as
4429 * silently as possible, not to fully process the exit load list.
4430 */
55d2375e
SC
4431 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4432 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4433 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4434 pr_debug_ratelimited(
4435 "%s read MSR index failed (%u, 0x%08llx)\n",
4436 __func__, i, gpa);
4437 goto vmabort;
4438 }
4439
4440 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4441 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4442 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4443 pr_debug_ratelimited(
4444 "%s read MSR failed (%u, 0x%08llx)\n",
4445 __func__, j, gpa);
4446 goto vmabort;
4447 }
4448 if (h.index != g.index)
4449 continue;
4450 if (h.value == g.value)
4451 break;
4452
4453 if (nested_vmx_load_msr_check(vcpu, &h)) {
4454 pr_debug_ratelimited(
4455 "%s check failed (%u, 0x%x, 0x%x)\n",
4456 __func__, j, h.index, h.reserved);
4457 goto vmabort;
4458 }
4459
f20935d8 4460 if (kvm_set_msr(vcpu, h.index, h.value)) {
55d2375e
SC
4461 pr_debug_ratelimited(
4462 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4463 __func__, j, h.index, h.value);
4464 goto vmabort;
4465 }
4466 }
4467 }
4468
4469 return;
4470
4471vmabort:
4472 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4473}
4474
4475/*
4476 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4477 * and modify vmcs12 to make it see what it would expect to see there if
4478 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4479 */
4dcefa31 4480void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
55d2375e
SC
4481 u32 exit_intr_info, unsigned long exit_qualification)
4482{
4483 struct vcpu_vmx *vmx = to_vmx(vcpu);
4484 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4485
4486 /* trying to cancel vmlaunch/vmresume is a bug */
4487 WARN_ON_ONCE(vmx->nested.nested_run_pending);
4488
cb6a32c2
SC
4489 /* Similarly, triple faults in L2 should never escape. */
4490 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
4491
f5c7e842
VK
4492 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4493 /*
4494 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4495 * Enlightened VMCS after migration and we still need to
4496 * do that when something is forcing L2->L1 exit prior to
4497 * the first L2 run.
4498 */
4499 (void)nested_get_evmcs_page(vcpu);
4500 }
f2c7ef3b 4501
eeeb4f67
SC
4502 /* Service the TLB flush request for L2 before switching to L1. */
4503 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
4504 kvm_vcpu_flush_tlb_current(vcpu);
4505
43fea4e4
PS
4506 /*
4507 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4508 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
4509 * up-to-date before switching to L1.
4510 */
4511 if (enable_ept && is_pae_paging(vcpu))
4512 vmx_ept_load_pdptrs(vcpu);
4513
55d2375e
SC
4514 leave_guest_mode(vcpu);
4515
b4b65b56
PB
4516 if (nested_cpu_has_preemption_timer(vmcs12))
4517 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4518
d041b5ea
IS
4519 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
4520 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
4521 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
4522 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
4523 }
55d2375e
SC
4524
4525 if (likely(!vmx->fail)) {
3731905e 4526 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
f4f8316d 4527
4dcefa31
SC
4528 if (vm_exit_reason != -1)
4529 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4530 exit_intr_info, exit_qualification);
55d2375e
SC
4531
4532 /*
3731905e 4533 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
55d2375e
SC
4534 * also be used to capture vmcs12 cache as part of
4535 * capturing nVMX state for snapshot (migration).
4536 *
4537 * Otherwise, this flush will dirty guest memory at a
4538 * point it is already assumed by user-space to be
4539 * immutable.
4540 */
4541 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
55d2375e
SC
4542 } else {
4543 /*
4544 * The only expected VM-instruction error is "VM entry with
4545 * invalid control field(s)." Anything else indicates a
4546 * problem with L0. And we should never get here with a
4547 * VMFail of any type if early consistency checks are enabled.
4548 */
4549 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4550 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4551 WARN_ON_ONCE(nested_early_check);
4552 }
4553
4554 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4555
4556 /* Update any VMCS fields that might have changed while L2 ran */
4557 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4558 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4559 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
1ab9287a
IS
4560 if (kvm_has_tsc_control)
4561 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
4562
02d496cf
LA
4563 if (vmx->nested.l1_tpr_threshold != -1)
4564 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
55d2375e 4565
55d2375e
SC
4566 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4567 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4568 vmx_set_virtual_apic_mode(vcpu);
55d2375e
SC
4569 }
4570
a85863c2
MS
4571 if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
4572 vmx->nested.update_vmcs01_cpu_dirty_logging = false;
4573 vmx_update_cpu_dirty_logging(vcpu);
4574 }
4575
55d2375e
SC
4576 /* Unpin physical memory we referred to in vmcs02 */
4577 if (vmx->nested.apic_access_page) {
b11494bc 4578 kvm_release_page_clean(vmx->nested.apic_access_page);
55d2375e
SC
4579 vmx->nested.apic_access_page = NULL;
4580 }
96c66e87 4581 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
3278e049
KA
4582 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4583 vmx->nested.pi_desc = NULL;
55d2375e 4584
1196cb97
SC
4585 if (vmx->nested.reload_vmcs01_apic_access_page) {
4586 vmx->nested.reload_vmcs01_apic_access_page = false;
4587 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4588 }
55d2375e 4589
4dcefa31 4590 if ((vm_exit_reason != -1) &&
1e9dfbd7 4591 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
3731905e 4592 vmx->nested.need_vmcs12_to_shadow_sync = true;
55d2375e
SC
4593
4594 /* in case we halted in L2 */
4595 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4596
4597 if (likely(!vmx->fail)) {
4dcefa31 4598 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
a1c77abb 4599 nested_exit_intr_ack_set(vcpu)) {
55d2375e
SC
4600 int irq = kvm_cpu_get_interrupt(vcpu);
4601 WARN_ON(irq < 0);
4602 vmcs12->vm_exit_intr_info = irq |
4603 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4604 }
4605
4dcefa31 4606 if (vm_exit_reason != -1)
55d2375e
SC
4607 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4608 vmcs12->exit_qualification,
4609 vmcs12->idt_vectoring_info_field,
4610 vmcs12->vm_exit_intr_info,
4611 vmcs12->vm_exit_intr_error_code,
4612 KVM_ISA_VMX);
4613
4614 load_vmcs12_host_state(vcpu, vmcs12);
4615
4616 return;
4617 }
4618
4619 /*
4620 * After an early L2 VM-entry failure, we're now back
4621 * in L1 which thinks it just finished a VMLAUNCH or
4622 * VMRESUME instruction, so we need to set the failure
4623 * flag and the VM-instruction error field of the VMCS
4624 * accordingly, and skip the emulated instruction.
4625 */
b2656e4d 4626 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
55d2375e
SC
4627
4628 /*
4629 * Restore L1's host state to KVM's software model. We're here
4630 * because a consistency check was caught by hardware, which
4631 * means some amount of guest state has been propagated to KVM's
4632 * model and needs to be unwound to the host's state.
4633 */
4634 nested_vmx_restore_host_state(vcpu);
4635
4636 vmx->fail = 0;
4637}
4638
cb6a32c2
SC
4639static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
4640{
4641 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
4642}
4643
55d2375e
SC
4644/*
4645 * Decode the memory-address operand of a vmx instruction, as recorded on an
4646 * exit caused by such an instruction (run by a guest hypervisor).
4647 * On success, returns 0. When the operand is invalid, returns 1 and throws
49f933d4 4648 * #UD, #GP, or #SS.
55d2375e
SC
4649 */
4650int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
fdb28619 4651 u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
55d2375e
SC
4652{
4653 gva_t off;
4654 bool exn;
4655 struct kvm_segment s;
4656
4657 /*
4658 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4659 * Execution", on an exit, vmx_instruction_info holds most of the
4660 * addressing components of the operand. Only the displacement part
4661 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4662 * For how an actual address is calculated from all these components,
4663 * refer to Vol. 1, "Operand Addressing".
4664 */
4665 int scaling = vmx_instruction_info & 3;
4666 int addr_size = (vmx_instruction_info >> 7) & 7;
4667 bool is_reg = vmx_instruction_info & (1u << 10);
4668 int seg_reg = (vmx_instruction_info >> 15) & 7;
4669 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4670 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4671 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4672 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4673
4674 if (is_reg) {
4675 kvm_queue_exception(vcpu, UD_VECTOR);
4676 return 1;
4677 }
4678
4679 /* Addr = segment_base + offset */
4680 /* offset = base + [index * scale] + displacement */
4681 off = exit_qualification; /* holds the displacement */
946c522b
SC
4682 if (addr_size == 1)
4683 off = (gva_t)sign_extend64(off, 31);
4684 else if (addr_size == 0)
4685 off = (gva_t)sign_extend64(off, 15);
55d2375e
SC
4686 if (base_is_valid)
4687 off += kvm_register_read(vcpu, base_reg);
4688 if (index_is_valid)
e6302698 4689 off += kvm_register_read(vcpu, index_reg) << scaling;
55d2375e 4690 vmx_get_segment(vcpu, &s, seg_reg);
55d2375e 4691
8570f9e8
SC
4692 /*
4693 * The effective address, i.e. @off, of a memory operand is truncated
4694 * based on the address size of the instruction. Note that this is
4695 * the *effective address*, i.e. the address prior to accounting for
4696 * the segment's base.
4697 */
55d2375e 4698 if (addr_size == 1) /* 32 bit */
8570f9e8
SC
4699 off &= 0xffffffff;
4700 else if (addr_size == 0) /* 16 bit */
4701 off &= 0xffff;
55d2375e
SC
4702
4703 /* Checks for #GP/#SS exceptions. */
4704 exn = false;
4705 if (is_long_mode(vcpu)) {
8570f9e8
SC
4706 /*
4707 * The virtual/linear address is never truncated in 64-bit
4708 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4709 * address when using FS/GS with a non-zero base.
4710 */
6694e480
LA
4711 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4712 *ret = s.base + off;
4713 else
4714 *ret = off;
8570f9e8 4715
55d2375e
SC
4716 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4717 * non-canonical form. This is the only check on the memory
4718 * destination for long mode!
4719 */
4720 exn = is_noncanonical_address(*ret, vcpu);
e0dfacbf 4721 } else {
8570f9e8
SC
4722 /*
4723 * When not in long mode, the virtual/linear address is
4724 * unconditionally truncated to 32 bits regardless of the
4725 * address size.
4726 */
4727 *ret = (s.base + off) & 0xffffffff;
4728
55d2375e
SC
4729 /* Protected mode: apply checks for segment validity in the
4730 * following order:
4731 * - segment type check (#GP(0) may be thrown)
4732 * - usability check (#GP(0)/#SS(0))
4733 * - limit check (#GP(0)/#SS(0))
4734 */
4735 if (wr)
4736 /* #GP(0) if the destination operand is located in a
4737 * read-only data segment or any code segment.
4738 */
4739 exn = ((s.type & 0xa) == 0 || (s.type & 8));
4740 else
4741 /* #GP(0) if the source operand is located in an
4742 * execute-only code segment
4743 */
4744 exn = ((s.type & 0xa) == 8);
4745 if (exn) {
4746 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4747 return 1;
4748 }
4749 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4750 */
4751 exn = (s.unusable != 0);
34333cc6
SC
4752
4753 /*
4754 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4755 * outside the segment limit. All CPUs that support VMX ignore
4756 * limit checks for flat segments, i.e. segments with base==0,
4757 * limit==0xffffffff and of type expand-up data or code.
55d2375e 4758 */
34333cc6
SC
4759 if (!(s.base == 0 && s.limit == 0xffffffff &&
4760 ((s.type & 8) || !(s.type & 4))))
fdb28619 4761 exn = exn || ((u64)off + len - 1 > s.limit);
55d2375e
SC
4762 }
4763 if (exn) {
4764 kvm_queue_exception_e(vcpu,
4765 seg_reg == VCPU_SREG_SS ?
4766 SS_VECTOR : GP_VECTOR,
4767 0);
4768 return 1;
4769 }
4770
4771 return 0;
4772}
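
The bit-field extraction at the top of get_vmx_mem_address() follows the SDM layout of the VMX instruction-information field. A minimal, self-contained sketch of that decode, using illustrative names rather than kernel symbols, is:

#include <stdint.h>
#include <stdio.h>

/* Decode the addressing components packed into the 32-bit field. */
static void decode_vmx_instruction_info(uint32_t info)
{
	printf("scaling       = %u\n", info & 3);
	printf("address size  = %u (0=16-bit, 1=32-bit, 2=64-bit)\n",
	       (info >> 7) & 7);
	printf("register form = %u\n", (info >> 10) & 1);
	printf("segment reg   = %u\n", (info >> 15) & 7);
	printf("index reg     = %u (valid=%d)\n",
	       (info >> 18) & 0xf, !((info >> 22) & 1));
	printf("base reg      = %u (valid=%d)\n",
	       (info >> 23) & 0xf, !((info >> 27) & 1));
	printf("reg2          = %u\n", (info >> 28) & 0xf);
}

Printing a value captured from the VMX_INSTRUCTION_INFO field makes the operand encoding easy to eyeball when debugging nested instruction emulation.
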
4773
03a8871a
OU
4774void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4775{
4776 struct vcpu_vmx *vmx;
4777
4778 if (!nested_vmx_allowed(vcpu))
4779 return;
4780
4781 vmx = to_vmx(vcpu);
afaf0b2f 4782 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
03a8871a
OU
4783 vmx->nested.msrs.entry_ctls_high |=
4784 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4785 vmx->nested.msrs.exit_ctls_high |=
4786 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4787 } else {
4788 vmx->nested.msrs.entry_ctls_high &=
4789 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4790 vmx->nested.msrs.exit_ctls_high &=
c6b177a3 4791 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
03a8871a
OU
4792 }
4793}
4794
7a35e515
VK
4795static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
4796 int *ret)
55d2375e
SC
4797{
4798 gva_t gva;
4799 struct x86_exception e;
7a35e515 4800 int r;
55d2375e 4801
5addc235 4802 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619 4803 vmcs_read32(VMX_INSTRUCTION_INFO), false,
7a35e515
VK
4804 sizeof(*vmpointer), &gva)) {
4805 *ret = 1;
4806 return -EINVAL;
4807 }
55d2375e 4808
7a35e515
VK
4809 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
4810 if (r != X86EMUL_CONTINUE) {
3f3393b3 4811 *ret = kvm_handle_memory_failure(vcpu, r, &e);
7a35e515 4812 return -EINVAL;
55d2375e
SC
4813 }
4814
4815 return 0;
4816}
4817
4818/*
4819 * Allocate a shadow VMCS and associate it with the currently loaded
4820 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4821 * VMCS is also VMCLEARed, so that it is ready for use.
4822 */
4823static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4824{
4825 struct vcpu_vmx *vmx = to_vmx(vcpu);
4826 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4827
4828 /*
4829 * We should allocate a shadow vmcs for vmcs01 only when L1
4830 * executes VMXON and free it when L1 executes VMXOFF.
4831 * As it is invalid to execute VMXON twice, we shouldn't reach
4832 * here when vmcs01 already have an allocated shadow vmcs.
4833 */
4834 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4835
4836 if (!loaded_vmcs->shadow_vmcs) {
4837 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4838 if (loaded_vmcs->shadow_vmcs)
4839 vmcs_clear(loaded_vmcs->shadow_vmcs);
4840 }
4841 return loaded_vmcs->shadow_vmcs;
4842}
4843
4844static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4845{
4846 struct vcpu_vmx *vmx = to_vmx(vcpu);
4847 int r;
4848
4849 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4850 if (r < 0)
4851 goto out_vmcs02;
4852
41836839 4853 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
55d2375e
SC
4854 if (!vmx->nested.cached_vmcs12)
4855 goto out_cached_vmcs12;
4856
41836839 4857 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
55d2375e
SC
4858 if (!vmx->nested.cached_shadow_vmcs12)
4859 goto out_cached_shadow_vmcs12;
4860
4861 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4862 goto out_shadow_vmcs;
4863
4864 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
ada0098d 4865 HRTIMER_MODE_ABS_PINNED);
55d2375e
SC
4866 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4867
4868 vmx->nested.vpid02 = allocate_vpid();
4869
4870 vmx->nested.vmcs02_initialized = false;
4871 vmx->nested.vmxon = true;
ee85dec2 4872
2ef7619d 4873 if (vmx_pt_mode_is_host_guest()) {
ee85dec2 4874 vmx->pt_desc.guest.ctl = 0;
476c9bd8 4875 pt_update_intercept_for_msr(vcpu);
ee85dec2
LK
4876 }
4877
55d2375e
SC
4878 return 0;
4879
4880out_shadow_vmcs:
4881 kfree(vmx->nested.cached_shadow_vmcs12);
4882
4883out_cached_shadow_vmcs12:
4884 kfree(vmx->nested.cached_vmcs12);
4885
4886out_cached_vmcs12:
4887 free_loaded_vmcs(&vmx->nested.vmcs02);
4888
4889out_vmcs02:
4890 return -ENOMEM;
4891}
4892
4893/*
4894 * Emulate the VMXON instruction.
4895 * Currently, we just remember that VMX is active, and do not save or even
4896 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4897 * do not currently need to store anything in that guest-allocated memory
4898 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4899 * argument is different from the VMXON pointer (which the spec says they do).
4900 */
4901static int handle_vmon(struct kvm_vcpu *vcpu)
4902{
4903 int ret;
4904 gpa_t vmptr;
2e408936 4905 uint32_t revision;
55d2375e 4906 struct vcpu_vmx *vmx = to_vmx(vcpu);
32ad73db
SC
4907 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4908 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
55d2375e
SC
4909
4910 /*
4911 * The Intel VMX Instruction Reference lists a bunch of bits that are
4912 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
c2fe3cd4 4913 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
55d2375e
SC
4914 * Otherwise, we should fail with #UD. But most faulting conditions
4915 * have already been checked by hardware, prior to the VM-exit for
4916 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4917 * that bit set to 1 in non-root mode.
4918 */
4919 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4920 kvm_queue_exception(vcpu, UD_VECTOR);
4921 return 1;
4922 }
4923
4924 /* CPL=0 must be checked manually. */
4925 if (vmx_get_cpl(vcpu)) {
4926 kvm_inject_gp(vcpu, 0);
4927 return 1;
4928 }
4929
4930 if (vmx->nested.vmxon)
b2656e4d 4931 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
55d2375e
SC
4932
4933 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4934 != VMXON_NEEDED_FEATURES) {
4935 kvm_inject_gp(vcpu, 0);
4936 return 1;
4937 }
4938
7a35e515
VK
4939 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
4940 return ret;
55d2375e
SC
4941
4942 /*
4943 * SDM 3: 24.11.5
4944 * The first 4 bytes of the VMXON region contain the supported
4945 * VMCS revision identifier.
4946 *
4947 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case,
4948 * which would replace the physical address width with 32.
4949 */
e0bf2665 4950 if (!page_address_valid(vcpu, vmptr))
55d2375e
SC
4951 return nested_vmx_failInvalid(vcpu);
4952
2e408936
KA
4953 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4954 revision != VMCS12_REVISION)
55d2375e 4955 return nested_vmx_failInvalid(vcpu);
55d2375e
SC
4956
4957 vmx->nested.vmxon_ptr = vmptr;
4958 ret = enter_vmx_operation(vcpu);
4959 if (ret)
4960 return ret;
4961
4962 return nested_vmx_succeed(vcpu);
4963}
4964
4965static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4966{
4967 struct vcpu_vmx *vmx = to_vmx(vcpu);
4968
4969 if (vmx->nested.current_vmptr == -1ull)
4970 return;
4971
7952d769
SC
4972 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4973
55d2375e
SC
4974 if (enable_shadow_vmcs) {
4975 /* Copy to memory all shadowed fields in case
4976 * they were modified. */
4977 copy_shadow_to_vmcs12(vmx);
55d2375e
SC
4978 vmx_disable_shadow_vmcs(vmx);
4979 }
4980 vmx->nested.posted_intr_nv = -1;
4981
4982 /* Flush VMCS12 to guest memory */
4983 kvm_vcpu_write_guest_page(vcpu,
4984 vmx->nested.current_vmptr >> PAGE_SHIFT,
4985 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4986
4987 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4988
4989 vmx->nested.current_vmptr = -1ull;
4990}
4991
4992/* Emulate the VMXOFF instruction */
4993static int handle_vmoff(struct kvm_vcpu *vcpu)
4994{
4995 if (!nested_vmx_check_permission(vcpu))
4996 return 1;
4b9852f4 4997
55d2375e 4998 free_nested(vcpu);
4b9852f4
LA
4999
5000 /* Process a latched INIT during time CPU was in VMX operation */
5001 kvm_make_request(KVM_REQ_EVENT, vcpu);
5002
55d2375e
SC
5003 return nested_vmx_succeed(vcpu);
5004}
5005
5006/* Emulate the VMCLEAR instruction */
5007static int handle_vmclear(struct kvm_vcpu *vcpu)
5008{
5009 struct vcpu_vmx *vmx = to_vmx(vcpu);
5010 u32 zero = 0;
5011 gpa_t vmptr;
11e34914 5012 u64 evmcs_gpa;
7a35e515 5013 int r;
55d2375e
SC
5014
5015 if (!nested_vmx_check_permission(vcpu))
5016 return 1;
5017
7a35e515
VK
5018 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5019 return r;
55d2375e 5020
e0bf2665 5021 if (!page_address_valid(vcpu, vmptr))
b2656e4d 5022 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
55d2375e
SC
5023
5024 if (vmptr == vmx->nested.vmxon_ptr)
b2656e4d 5025 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
55d2375e 5026
11e34914
VK
5027 /*
5028 * When Enlightened VMEntry is enabled on the calling CPU we treat
5029 * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
5030 * way to distinguish it from VMCS12) and we must not corrupt it by
5031 * writing to the non-existent 'launch_state' field. The area doesn't
5032 * have to be the currently active EVMCS on the calling CPU and there's
5033 * nothing KVM has to do to transition it from 'active' to 'non-active'
5034 * state. It is possible that the area will stay mapped as
5035 * vmx->nested.hv_evmcs but this shouldn't be a problem.
5036 */
5037 if (likely(!vmx->nested.enlightened_vmcs_enabled ||
5038 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
55d2375e
SC
5039 if (vmptr == vmx->nested.current_vmptr)
5040 nested_release_vmcs12(vcpu);
5041
5042 kvm_vcpu_write_guest(vcpu,
5043 vmptr + offsetof(struct vmcs12,
5044 launch_state),
5045 &zero, sizeof(zero));
3b19b81a
VK
5046 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
5047 nested_release_evmcs(vcpu);
55d2375e
SC
5048 }
5049
5050 return nested_vmx_succeed(vcpu);
5051}
5052
55d2375e
SC
5053/* Emulate the VMLAUNCH instruction */
5054static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5055{
5056 return nested_vmx_run(vcpu, true);
5057}
5058
5059/* Emulate the VMRESUME instruction */
5060static int handle_vmresume(struct kvm_vcpu *vcpu)
5061{
5062
5063 return nested_vmx_run(vcpu, false);
5064}
5065
5066static int handle_vmread(struct kvm_vcpu *vcpu)
5067{
dd2d6042
JM
5068 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5069 : get_vmcs12(vcpu);
5addc235 5070 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
c90f4d03
JM
5071 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5072 struct vcpu_vmx *vmx = to_vmx(vcpu);
f7eea636 5073 struct x86_exception e;
c90f4d03
JM
5074 unsigned long field;
5075 u64 value;
5076 gva_t gva = 0;
1c6f0b47 5077 short offset;
7a35e515 5078 int len, r;
55d2375e
SC
5079
5080 if (!nested_vmx_check_permission(vcpu))
5081 return 1;
5082
dd2d6042
JM
5083 /*
5084 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5085 * any VMREAD sets the ALU flags for VMfailInvalid.
5086 */
5087 if (vmx->nested.current_vmptr == -1ull ||
5088 (is_guest_mode(vcpu) &&
5089 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
55d2375e
SC
5090 return nested_vmx_failInvalid(vcpu);
5091
55d2375e 5092 /* Decode instruction info and find the field to read */
27b4a9c4 5093 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
1c6f0b47
SC
5094
5095 offset = vmcs_field_to_offset(field);
5096 if (offset < 0)
b2656e4d 5097 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
55d2375e 5098
7952d769
SC
5099 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5100 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5101
c90f4d03
JM
5102 /* Read the field, zero-extended to a u64 value */
5103 value = vmcs12_read_any(vmcs12, field, offset);
1c6f0b47 5104
55d2375e
SC
5105 /*
5106 * Now copy part of this value to register or memory, as requested.
5107 * Note that the number of bits actually copied is 32 or 64 depending
5108 * on the guest's mode (32 or 64 bit), not on the given field's length.
5109 */
c90f4d03 5110 if (instr_info & BIT(10)) {
27b4a9c4 5111 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
55d2375e 5112 } else {
fdb28619 5113 len = is_64_bit_mode(vcpu) ? 8 : 4;
55d2375e 5114 if (get_vmx_mem_address(vcpu, exit_qualification,
c90f4d03 5115 instr_info, true, len, &gva))
55d2375e
SC
5116 return 1;
5117 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
7a35e515
VK
5118 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5119 if (r != X86EMUL_CONTINUE)
3f3393b3 5120 return kvm_handle_memory_failure(vcpu, r, &e);
55d2375e
SC
5121 }
5122
5123 return nested_vmx_succeed(vcpu);
5124}
5125
e2174295
SC
5126static bool is_shadow_field_rw(unsigned long field)
5127{
5128 switch (field) {
5129#define SHADOW_FIELD_RW(x, y) case x:
5130#include "vmcs_shadow_fields.h"
5131 return true;
5132 default:
5133 break;
5134 }
5135 return false;
5136}
5137
5138static bool is_shadow_field_ro(unsigned long field)
5139{
5140 switch (field) {
5141#define SHADOW_FIELD_RO(x, y) case x:
5142#include "vmcs_shadow_fields.h"
5143 return true;
5144 default:
5145 break;
5146 }
5147 return false;
5148}
55d2375e
SC
5149
5150static int handle_vmwrite(struct kvm_vcpu *vcpu)
5151{
c90f4d03
JM
5152 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5153 : get_vmcs12(vcpu);
5addc235 5154 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
c90f4d03
JM
5155 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5156 struct vcpu_vmx *vmx = to_vmx(vcpu);
5157 struct x86_exception e;
55d2375e 5158 unsigned long field;
c90f4d03 5159 short offset;
55d2375e 5160 gva_t gva;
7a35e515 5161 int len, r;
55d2375e 5162
c90f4d03
JM
5163 /*
5164 * The value to write might be 32 or 64 bits, depending on L1's long
55d2375e
SC
5165 * mode, and eventually we need to write that into a field of several
5166 * possible lengths. The code below first zero-extends the value to 64
c90f4d03 5167 * bit (value), and then copies only the appropriate number of
55d2375e
SC
5168 * bits into the vmcs12 field.
5169 */
c90f4d03 5170 u64 value = 0;
55d2375e
SC
5171
5172 if (!nested_vmx_check_permission(vcpu))
5173 return 1;
5174
dd2d6042
JM
5175 /*
5176 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5177 * any VMWRITE sets the ALU flags for VMfailInvalid.
5178 */
5179 if (vmx->nested.current_vmptr == -1ull ||
5180 (is_guest_mode(vcpu) &&
5181 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
55d2375e
SC
5182 return nested_vmx_failInvalid(vcpu);
5183
c90f4d03 5184 if (instr_info & BIT(10))
27b4a9c4 5185 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
55d2375e 5186 else {
fdb28619 5187 len = is_64_bit_mode(vcpu) ? 8 : 4;
55d2375e 5188 if (get_vmx_mem_address(vcpu, exit_qualification,
c90f4d03 5189 instr_info, false, len, &gva))
55d2375e 5190 return 1;
7a35e515
VK
5191 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5192 if (r != X86EMUL_CONTINUE)
3f3393b3 5193 return kvm_handle_memory_failure(vcpu, r, &e);
55d2375e
SC
5194 }
5195
27b4a9c4 5196 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
693e02cc
JM
5197
5198 offset = vmcs_field_to_offset(field);
5199 if (offset < 0)
b2656e4d 5200 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
55d2375e 5201
55d2375e
SC
5202 /*
5203 * If the vCPU supports "VMWRITE to any supported field in the
5204 * VMCS," then the "read-only" fields are actually read/write.
5205 */
5206 if (vmcs_field_readonly(field) &&
5207 !nested_cpu_has_vmwrite_any_field(vcpu))
b2656e4d 5208 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
55d2375e 5209
dd2d6042
JM
5210 /*
5211 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5212 * vmcs12, else we may clobber a field or consume a stale value.
5213 */
5214 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5215 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
55d2375e
SC
5216
5217 /*
b6437805
SC
5218 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5219 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5220 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5221 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5222 * from L1 will return a different value than VMREAD from L2 (L1 sees
5223 * the stripped down value, L2 sees the full value as stored by KVM).
55d2375e 5224 */
b6437805 5225 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
c90f4d03 5226 value &= 0x1f0ff;
b6437805 5227
c90f4d03 5228 vmcs12_write_any(vmcs12, field, offset, value);
55d2375e
SC
5229
5230 /*
e2174295
SC
5231 * Do not track vmcs12 dirty-state if in guest-mode as we actually
5232 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5233 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5234 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
55d2375e 5235 */
e2174295
SC
5236 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5237 /*
5238 * L1 can read these fields without exiting, ensure the
5239 * shadow VMCS is up-to-date.
5240 */
5241 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5242 preempt_disable();
5243 vmcs_load(vmx->vmcs01.shadow_vmcs);
fadcead0 5244
c90f4d03 5245 __vmcs_writel(field, value);
fadcead0 5246
e2174295
SC
5247 vmcs_clear(vmx->vmcs01.shadow_vmcs);
5248 vmcs_load(vmx->loaded_vmcs->vmcs);
5249 preempt_enable();
55d2375e 5250 }
e2174295 5251 vmx->nested.dirty_vmcs12 = true;
55d2375e
SC
5252 }
5253
5254 return nested_vmx_succeed(vcpu);
5255}
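
The 0x1f0ff mask applied to AR-byte VMWRITEs above keeps exactly the access-rights bits that VMX defines and drops the reserved bits. A standalone sketch of that stripping, with an illustrative helper name (not a KVM function):

#include <stdint.h>

/*
 * Keep type/S/DPL/P (bits 7:0) and AVL/L/D/G/unusable (bits 16:12);
 * clear the reserved bits 11:8. This is the 0x1f0ff mask used above.
 */
static uint32_t strip_ar_byte_reserved(uint32_t ar)
{
	return ar & 0x1f0ff;
}
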
5256
5257static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5258{
5259 vmx->nested.current_vmptr = vmptr;
5260 if (enable_shadow_vmcs) {
fe7f895d 5261 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
55d2375e
SC
5262 vmcs_write64(VMCS_LINK_POINTER,
5263 __pa(vmx->vmcs01.shadow_vmcs));
3731905e 5264 vmx->nested.need_vmcs12_to_shadow_sync = true;
55d2375e
SC
5265 }
5266 vmx->nested.dirty_vmcs12 = true;
5267}
5268
5269/* Emulate the VMPTRLD instruction */
5270static int handle_vmptrld(struct kvm_vcpu *vcpu)
5271{
5272 struct vcpu_vmx *vmx = to_vmx(vcpu);
5273 gpa_t vmptr;
7a35e515 5274 int r;
55d2375e
SC
5275
5276 if (!nested_vmx_check_permission(vcpu))
5277 return 1;
5278
7a35e515
VK
5279 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5280 return r;
55d2375e 5281
e0bf2665 5282 if (!page_address_valid(vcpu, vmptr))
b2656e4d 5283 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
55d2375e
SC
5284
5285 if (vmptr == vmx->nested.vmxon_ptr)
b2656e4d 5286 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
55d2375e
SC
5287
5288 /* Forbid normal VMPTRLD if Enlightened version was used */
1e9dfbd7 5289 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
55d2375e
SC
5290 return 1;
5291
5292 if (vmx->nested.current_vmptr != vmptr) {
b146b839 5293 struct kvm_host_map map;
55d2375e 5294 struct vmcs12 *new_vmcs12;
55d2375e 5295
b146b839 5296 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
55d2375e
SC
5297 /*
5298 * Reads from an unbacked page return all 1s,
5299 * which means that the 32 bits located at the
5300 * given physical address won't match the required
5301 * VMCS12_REVISION identifier.
5302 */
b2656e4d 5303 return nested_vmx_fail(vcpu,
55d2375e 5304 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
55d2375e 5305 }
b146b839
KA
5306
5307 new_vmcs12 = map.hva;
5308
55d2375e
SC
5309 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5310 (new_vmcs12->hdr.shadow_vmcs &&
5311 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
b146b839 5312 kvm_vcpu_unmap(vcpu, &map, false);
b2656e4d 5313 return nested_vmx_fail(vcpu,
55d2375e
SC
5314 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5315 }
5316
5317 nested_release_vmcs12(vcpu);
5318
5319 /*
5320 * Load VMCS12 from guest memory since it is not already
5321 * cached.
5322 */
5323 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
b146b839 5324 kvm_vcpu_unmap(vcpu, &map, false);
55d2375e
SC
5325
5326 set_current_vmptr(vmx, vmptr);
5327 }
5328
5329 return nested_vmx_succeed(vcpu);
5330}
5331
5332/* Emulate the VMPTRST instruction */
5333static int handle_vmptrst(struct kvm_vcpu *vcpu)
5334{
5addc235 5335 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
55d2375e
SC
5336 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5337 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5338 struct x86_exception e;
5339 gva_t gva;
7a35e515 5340 int r;
55d2375e
SC
5341
5342 if (!nested_vmx_check_permission(vcpu))
5343 return 1;
5344
1e9dfbd7 5345 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
55d2375e
SC
5346 return 1;
5347
fdb28619
EK
5348 if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5349 true, sizeof(gpa_t), &gva))
55d2375e
SC
5350 return 1;
5351 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
7a35e515
VK
5352 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5353 sizeof(gpa_t), &e);
5354 if (r != X86EMUL_CONTINUE)
3f3393b3 5355 return kvm_handle_memory_failure(vcpu, r, &e);
7a35e515 5356
55d2375e
SC
5357 return nested_vmx_succeed(vcpu);
5358}
5359
ce8fe7b7
SC
5360#define EPTP_PA_MASK GENMASK_ULL(51, 12)
5361
5362static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
5363{
5364 return VALID_PAGE(root_hpa) &&
5365 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
5366}
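
nested_ept_root_matches() compares only bits 51:12 of the two EPTPs, so differences in the memory-type, page-walk-length and A/D-enable bits do not count as a different root. A self-contained illustration of that comparison, with the mask constant spelled out by hand instead of GENMASK_ULL():

#include <stdbool.h>
#include <stdint.h>

/* Physical-address bits 51:12 of an EPTP, i.e. GENMASK_ULL(51, 12). */
#define SKETCH_EPTP_PA_MASK 0x000ffffffffff000ULL

/* Two EPTPs name the same root if only their low attribute bits differ. */
static bool eptp_same_root(uint64_t a, uint64_t b)
{
	return (a & SKETCH_EPTP_PA_MASK) == (b & SKETCH_EPTP_PA_MASK);
}
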
5367
55d2375e
SC
5368/* Emulate the INVEPT instruction */
5369static int handle_invept(struct kvm_vcpu *vcpu)
5370{
5371 struct vcpu_vmx *vmx = to_vmx(vcpu);
5372 u32 vmx_instruction_info, types;
ce8fe7b7
SC
5373 unsigned long type, roots_to_free;
5374 struct kvm_mmu *mmu;
55d2375e
SC
5375 gva_t gva;
5376 struct x86_exception e;
5377 struct {
5378 u64 eptp, gpa;
5379 } operand;
7a35e515 5380 int i, r;
55d2375e
SC
5381
5382 if (!(vmx->nested.msrs.secondary_ctls_high &
5383 SECONDARY_EXEC_ENABLE_EPT) ||
5384 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5385 kvm_queue_exception(vcpu, UD_VECTOR);
5386 return 1;
5387 }
5388
5389 if (!nested_vmx_check_permission(vcpu))
5390 return 1;
5391
5392 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
27b4a9c4 5393 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
55d2375e
SC
5394
5395 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5396
5397 if (type >= 32 || !(types & (1 << type)))
b2656e4d 5398 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
55d2375e
SC
5399
5400 /* According to the Intel VMX instruction reference, the memory
5401 * operand is read even if it isn't needed (e.g., for type==global)
5402 */
5addc235 5403 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619 5404 vmx_instruction_info, false, sizeof(operand), &gva))
55d2375e 5405 return 1;
7a35e515
VK
5406 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5407 if (r != X86EMUL_CONTINUE)
3f3393b3 5408 return kvm_handle_memory_failure(vcpu, r, &e);
55d2375e 5409
ce8fe7b7
SC
5410 /*
5411 * Nested EPT roots are always held through guest_mmu,
5412 * not root_mmu.
5413 */
5414 mmu = &vcpu->arch.guest_mmu;
5415
55d2375e 5416 switch (type) {
b1190198 5417 case VMX_EPT_EXTENT_CONTEXT:
eed0030e 5418 if (!nested_vmx_check_eptp(vcpu, operand.eptp))
b2656e4d 5419 return nested_vmx_fail(vcpu,
eed0030e 5420 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
f8aa7e39 5421
ce8fe7b7 5422 roots_to_free = 0;
be01e8e2 5423 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
ce8fe7b7
SC
5424 operand.eptp))
5425 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5426
5427 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5428 if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
be01e8e2 5429 mmu->prev_roots[i].pgd,
ce8fe7b7
SC
5430 operand.eptp))
5431 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5432 }
5433 break;
eed0030e 5434 case VMX_EPT_EXTENT_GLOBAL:
ce8fe7b7 5435 roots_to_free = KVM_MMU_ROOTS_ALL;
55d2375e
SC
5436 break;
5437 default:
f9336e32 5438 BUG();
55d2375e
SC
5439 break;
5440 }
5441
ce8fe7b7
SC
5442 if (roots_to_free)
5443 kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
5444
55d2375e
SC
5445 return nested_vmx_succeed(vcpu);
5446}
5447
5448static int handle_invvpid(struct kvm_vcpu *vcpu)
5449{
5450 struct vcpu_vmx *vmx = to_vmx(vcpu);
5451 u32 vmx_instruction_info;
5452 unsigned long type, types;
5453 gva_t gva;
5454 struct x86_exception e;
5455 struct {
5456 u64 vpid;
5457 u64 gla;
5458 } operand;
5459 u16 vpid02;
7a35e515 5460 int r;
55d2375e
SC
5461
5462 if (!(vmx->nested.msrs.secondary_ctls_high &
5463 SECONDARY_EXEC_ENABLE_VPID) ||
5464 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5465 kvm_queue_exception(vcpu, UD_VECTOR);
5466 return 1;
5467 }
5468
5469 if (!nested_vmx_check_permission(vcpu))
5470 return 1;
5471
5472 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
27b4a9c4 5473 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
55d2375e
SC
5474
5475 types = (vmx->nested.msrs.vpid_caps &
5476 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5477
5478 if (type >= 32 || !(types & (1 << type)))
b2656e4d 5479 return nested_vmx_fail(vcpu,
55d2375e
SC
5480 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5481
5482 /* According to the Intel VMX instruction reference, the memory
5483 * operand is read even if it isn't needed (e.g., for type==global)
5484 */
5addc235 5485 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
fdb28619 5486 vmx_instruction_info, false, sizeof(operand), &gva))
55d2375e 5487 return 1;
7a35e515
VK
5488 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5489 if (r != X86EMUL_CONTINUE)
3f3393b3 5490 return kvm_handle_memory_failure(vcpu, r, &e);
7a35e515 5491
55d2375e 5492 if (operand.vpid >> 16)
b2656e4d 5493 return nested_vmx_fail(vcpu,
55d2375e
SC
5494 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5495
5496 vpid02 = nested_get_vpid02(vcpu);
5497 switch (type) {
5498 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5499 if (!operand.vpid ||
5500 is_noncanonical_address(operand.gla, vcpu))
b2656e4d 5501 return nested_vmx_fail(vcpu,
55d2375e 5502 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
bc41d0c4 5503 vpid_sync_vcpu_addr(vpid02, operand.gla);
55d2375e
SC
5504 break;
5505 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5506 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5507 if (!operand.vpid)
b2656e4d 5508 return nested_vmx_fail(vcpu,
55d2375e 5509 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
446ace4b 5510 vpid_sync_context(vpid02);
55d2375e
SC
5511 break;
5512 case VMX_VPID_EXTENT_ALL_CONTEXT:
446ace4b 5513 vpid_sync_context(vpid02);
55d2375e
SC
5514 break;
5515 default:
5516 WARN_ON_ONCE(1);
5517 return kvm_skip_emulated_instruction(vcpu);
5518 }
5519
d6e3f838
JS
5520 /*
5521 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
5522 * linear mappings for L2 (tagged with L2's VPID). Free all roots as
5523 * VPIDs are not tracked in the MMU role.
5524 *
5525 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
5526 * an MMU when EPT is disabled.
5527 *
5528 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5529 */
5530 if (!enable_ept)
5531 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
5532 KVM_MMU_ROOTS_ALL);
5533
55d2375e
SC
5534 return nested_vmx_succeed(vcpu);
5535}
5536
5537static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5538 struct vmcs12 *vmcs12)
5539{
2b3eaf81 5540 u32 index = kvm_rcx_read(vcpu);
ac6389ab 5541 u64 new_eptp;
55d2375e
SC
5542 bool accessed_dirty;
5543 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5544
5545 if (!nested_cpu_has_eptp_switching(vmcs12) ||
5546 !nested_cpu_has_ept(vmcs12))
5547 return 1;
5548
5549 if (index >= VMFUNC_EPTP_ENTRIES)
5550 return 1;
5551
5552
5553 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
ac6389ab 5554 &new_eptp, index * 8, 8))
55d2375e
SC
5555 return 1;
5556
ac6389ab 5557 accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
55d2375e
SC
5558
5559 /*
5560 * If the (L2) guest does a vmfunc to the currently
5561 * active ept pointer, we don't have to do anything else
5562 */
ac6389ab
SC
5563 if (vmcs12->ept_pointer != new_eptp) {
5564 if (!nested_vmx_check_eptp(vcpu, new_eptp))
55d2375e
SC
5565 return 1;
5566
55d2375e
SC
5567 mmu->ept_ad = accessed_dirty;
5568 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
ac6389ab 5569 vmcs12->ept_pointer = new_eptp;
c805f5d5
SC
5570
5571 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
55d2375e
SC
5572 }
5573
5574 return 0;
5575}
5576
5577static int handle_vmfunc(struct kvm_vcpu *vcpu)
5578{
5579 struct vcpu_vmx *vmx = to_vmx(vcpu);
5580 struct vmcs12 *vmcs12;
2b3eaf81 5581 u32 function = kvm_rax_read(vcpu);
55d2375e
SC
5582
5583 /*
5584 * VMFUNC is only supported for nested guests, but we always enable the
5585 * secondary control for simplicity; for non-nested mode, fake that we
5586 * didn't by injecting #UD.
5587 */
5588 if (!is_guest_mode(vcpu)) {
5589 kvm_queue_exception(vcpu, UD_VECTOR);
5590 return 1;
5591 }
5592
5593 vmcs12 = get_vmcs12(vcpu);
5594 if ((vmcs12->vm_function_control & (1 << function)) == 0)
5595 goto fail;
5596
5597 switch (function) {
5598 case 0:
5599 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5600 goto fail;
5601 break;
5602 default:
5603 goto fail;
5604 }
5605 return kvm_skip_emulated_instruction(vcpu);
5606
5607fail:
8e533240
SC
5608 /*
5609 * This is effectively a reflected VM-Exit, as opposed to a synthesized
5610 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
5611 * EXIT_REASON_VMFUNC as the exit reason.
5612 */
5613 nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
87915858 5614 vmx_get_intr_info(vcpu),
5addc235 5615 vmx_get_exit_qual(vcpu));
55d2375e
SC
5616 return 1;
5617}
5618
e71237d3
OU
5619/*
5620 * Return true if an IO instruction with the specified port and size should cause
5621 * a VM-exit into L1.
5622 */
5623bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5624 int size)
55d2375e 5625{
e71237d3 5626 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
55d2375e 5627 gpa_t bitmap, last_bitmap;
55d2375e
SC
5628 u8 b;
5629
55d2375e
SC
5630 last_bitmap = (gpa_t)-1;
5631 b = -1;
5632
5633 while (size > 0) {
5634 if (port < 0x8000)
5635 bitmap = vmcs12->io_bitmap_a;
5636 else if (port < 0x10000)
5637 bitmap = vmcs12->io_bitmap_b;
5638 else
5639 return true;
5640 bitmap += (port & 0x7fff) / 8;
5641
5642 if (last_bitmap != bitmap)
5643 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5644 return true;
5645 if (b & (1 << (port & 7)))
5646 return true;
5647
5648 port++;
5649 size--;
5650 last_bitmap = bitmap;
5651 }
5652
5653 return false;
5654}
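
The loop above consults one bit per port across the two 4 KiB I/O bitmaps and exits to L1 if any bit covered by the access is set. A rough standalone equivalent that operates on plain byte arrays instead of guest pages (illustrative names, no KVM APIs):

#include <stdbool.h>
#include <stdint.h>

/*
 * One bit per port; ports 0x0000-0x7fff live in bitmap A, 0x8000-0xffff in
 * bitmap B, and an access that runs past port 0xffff always exits.
 */
static bool io_access_would_exit(const uint8_t bitmap_a[4096],
				 const uint8_t bitmap_b[4096],
				 unsigned int port, int size)
{
	while (size-- > 0) {
		const uint8_t *bm;

		if (port >= 0x10000)
			return true;
		bm = port < 0x8000 ? bitmap_a : bitmap_b;
		if (bm[(port & 0x7fff) / 8] & (1u << (port & 7)))
			return true;
		port++;
	}
	return false;
}
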
5655
e71237d3
OU
5656static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5657 struct vmcs12 *vmcs12)
5658{
5659 unsigned long exit_qualification;
35a57134 5660 unsigned short port;
e71237d3
OU
5661 int size;
5662
5663 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5664 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5665
5addc235 5666 exit_qualification = vmx_get_exit_qual(vcpu);
e71237d3
OU
5667
5668 port = exit_qualification >> 16;
5669 size = (exit_qualification & 7) + 1;
5670
5671 return nested_vmx_check_io_bitmaps(vcpu, port, size);
5672}
5673
55d2375e 5674/*
463bfeee 5675 * Return true if we should exit from L2 to L1 to handle an MSR access,
55d2375e
SC
5676 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5677 * disinterest in the current event (read or write a specific MSR) by using an
5678 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5679 */
5680static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
8e533240
SC
5681 struct vmcs12 *vmcs12,
5682 union vmx_exit_reason exit_reason)
55d2375e 5683{
2b3eaf81 5684 u32 msr_index = kvm_rcx_read(vcpu);
55d2375e
SC
5685 gpa_t bitmap;
5686
5687 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5688 return true;
5689
5690 /*
5691 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5692 * for the four combinations of read/write and low/high MSR numbers.
5693 * First we need to figure out which of the four to use:
5694 */
5695 bitmap = vmcs12->msr_bitmap;
8e533240 5696 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
55d2375e
SC
5697 bitmap += 2048;
5698 if (msr_index >= 0xc0000000) {
5699 msr_index -= 0xc0000000;
5700 bitmap += 1024;
5701 }
5702
5703 /* Then read the msr_index'th bit from this bitmap: */
5704 if (msr_index < 1024*8) {
5705 unsigned char b;
5706 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5707 return true;
5708 return 1 & (b >> (msr_index & 7));
5709 } else
5710 return true; /* let L1 handle the wrong parameter */
5711}
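
The bitmap arithmetic above condenses into a small offset calculation: pick the read or write half of the page, then the low or high MSR range, then index by MSR number. A standalone sketch of that calculation (hypothetical helper, not a KVM function):

#include <stdbool.h>
#include <stdint.h>

/*
 * Byte offset of the bit controlling 'msr' inside the 4 KiB MSR bitmap:
 * read-low at 0, read-high at 1024, write-low at 2048, write-high at 3072.
 * Returns -1 for MSRs outside the two covered ranges (those always exit).
 */
static long msr_bitmap_byte_offset(uint32_t msr, bool write)
{
	long offset = write ? 2048 : 0;

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		offset += 1024;
	}
	if (msr >= 1024 * 8)
		return -1;

	return offset + msr / 8;	/* then test bit (msr & 7) */
}
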
5712
5713/*
5714 * Return true if we should exit from L2 to L1 to handle a CR access exit,
5715 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5716 * intercept (via guest_host_mask etc.) the current event.
5717 */
5718static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5719 struct vmcs12 *vmcs12)
5720{
5addc235 5721 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
55d2375e
SC
5722 int cr = exit_qualification & 15;
5723 int reg;
5724 unsigned long val;
5725
5726 switch ((exit_qualification >> 4) & 3) {
5727 case 0: /* mov to cr */
5728 reg = (exit_qualification >> 8) & 15;
27b4a9c4 5729 val = kvm_register_read(vcpu, reg);
55d2375e
SC
5730 switch (cr) {
5731 case 0:
5732 if (vmcs12->cr0_guest_host_mask &
5733 (val ^ vmcs12->cr0_read_shadow))
5734 return true;
5735 break;
5736 case 3:
55d2375e
SC
5737 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5738 return true;
5739 break;
5740 case 4:
5741 if (vmcs12->cr4_guest_host_mask &
5742 (vmcs12->cr4_read_shadow ^ val))
5743 return true;
5744 break;
5745 case 8:
5746 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5747 return true;
5748 break;
5749 }
5750 break;
5751 case 2: /* clts */
5752 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5753 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5754 return true;
5755 break;
5756 case 1: /* mov from cr */
5757 switch (cr) {
5758 case 3:
5759 if (vmcs12->cpu_based_vm_exec_control &
5760 CPU_BASED_CR3_STORE_EXITING)
5761 return true;
5762 break;
5763 case 8:
5764 if (vmcs12->cpu_based_vm_exec_control &
5765 CPU_BASED_CR8_STORE_EXITING)
5766 return true;
5767 break;
5768 }
5769 break;
5770 case 3: /* lmsw */
5771 /*
5772 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5773 * cr0. Other attempted changes are ignored, with no exit.
5774 */
5775 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5776 if (vmcs12->cr0_guest_host_mask & 0xe &
5777 (val ^ vmcs12->cr0_read_shadow))
5778 return true;
5779 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5780 !(vmcs12->cr0_read_shadow & 0x1) &&
5781 (val & 0x1))
5782 return true;
5783 break;
5784 }
5785 return false;
5786}
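
The LMSW case is the subtlest branch above, since LMSW writes only CR0 bits 3:0 and can set, but never clear, CR0.PE. A standalone restatement of just that check, with illustrative parameter names:

#include <stdbool.h>
#include <stdint.h>

static bool lmsw_wants_exit(uint64_t cr0_guest_host_mask,
			    uint64_t cr0_read_shadow, uint16_t lmsw_source)
{
	uint64_t val = lmsw_source & 0x0f;

	/* A guarded bit in 3:1 would change value. */
	if (cr0_guest_host_mask & 0xe & (val ^ cr0_read_shadow))
		return true;
	/* L1 guards PE, the shadow has PE=0 and the guest sets it. */
	return (cr0_guest_host_mask & 0x1) &&
	       !(cr0_read_shadow & 0x1) && (val & 0x1);
}
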
5787
72add915
SC
5788static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
5789 struct vmcs12 *vmcs12)
5790{
5791 u32 encls_leaf;
5792
5793 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
5794 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
5795 return false;
5796
5797 encls_leaf = kvm_rax_read(vcpu);
5798 if (encls_leaf > 62)
5799 encls_leaf = 63;
5800 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
5801}
5802
55d2375e
SC
5803static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5804 struct vmcs12 *vmcs12, gpa_t bitmap)
5805{
5806 u32 vmx_instruction_info;
5807 unsigned long field;
5808 u8 b;
5809
5810 if (!nested_cpu_has_shadow_vmcs(vmcs12))
5811 return true;
5812
5813 /* Decode instruction info and find the field to access */
5814 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5815 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5816
5817 /* Out-of-range fields always cause a VM exit from L2 to L1 */
5818 if (field >> 15)
5819 return true;
5820
5821 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5822 return true;
5823
5824 return 1 & (b >> (field & 7));
5825}
5826
b045ae90
OU
5827static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
5828{
5829 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
5830
5831 if (nested_cpu_has_mtf(vmcs12))
5832 return true;
5833
5834 /*
5835 * An MTF VM-exit may be injected into the guest by setting the
5836 * interruption-type to 7 (other event) and the vector field to 0. Such
5837 * is the case regardless of the 'monitor trap flag' VM-execution
5838 * control.
5839 */
5840 return entry_intr_info == (INTR_INFO_VALID_MASK
5841 | INTR_TYPE_OTHER_EVENT);
5842}
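
The synthesized-MTF encoding described in the comment above is a single 32-bit value: valid bit set, interruption type 7, vector 0. A tiny standalone check that spells out the constants (illustrative, not kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Valid bit (31) | type 7 "other event" (bits 10:8) | vector 0 == 0x80000700 */
static bool is_synthesized_mtf(uint32_t entry_intr_info)
{
	const uint32_t valid_bit  = 1u << 31;
	const uint32_t other_type = 7u << 8;

	return entry_intr_info == (valid_bit | other_type);
}
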
5843
55d2375e 5844/*
2c1f3323
SC
5845 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
5846 * L1 wants the exit. Only call this when in is_guest_mode (L2).
55d2375e 5847 */
8e533240
SC
5848static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
5849 union vmx_exit_reason exit_reason)
55d2375e 5850{
236871b6 5851 u32 intr_info;
55d2375e 5852
8e533240 5853 switch ((u16)exit_reason.basic) {
55d2375e 5854 case EXIT_REASON_EXCEPTION_NMI:
87915858 5855 intr_info = vmx_get_intr_info(vcpu);
55d2375e 5856 if (is_nmi(intr_info))
2c1f3323 5857 return true;
55d2375e 5858 else if (is_page_fault(intr_info))
68fd66f1 5859 return vcpu->arch.apf.host_apf_flags || !enable_ept;
55d2375e
SC
5860 else if (is_debug(intr_info) &&
5861 vcpu->guest_debug &
5862 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2c1f3323 5863 return true;
55d2375e
SC
5864 else if (is_breakpoint(intr_info) &&
5865 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2c1f3323
SC
5866 return true;
5867 return false;
5868 case EXIT_REASON_EXTERNAL_INTERRUPT:
5869 return true;
5870 case EXIT_REASON_MCE_DURING_VMENTRY:
5871 return true;
5872 case EXIT_REASON_EPT_VIOLATION:
5873 /*
5874 * L0 always deals with the EPT violation. If nested EPT is
5875 * used, and the nested mmu code discovers that the address is
5876 * missing in the guest EPT table (EPT12), the EPT violation
5877 * will be injected with nested_ept_inject_page_fault()
5878 */
5879 return true;
5880 case EXIT_REASON_EPT_MISCONFIG:
5881 /*
5882 * L2 never uses directly L1's EPT, but rather L0's own EPT
5883 * table (shadow on EPT) or a merged EPT table that L0 built
5884 * (EPT on EPT). So any problems with the structure of the
5885 * table is L0's fault.
5886 */
5887 return true;
5888 case EXIT_REASON_PREEMPTION_TIMER:
5889 return true;
5890 case EXIT_REASON_PML_FULL:
c3bb9a20
SC
5891 /*
5892 * PML is emulated for an L1 VMM and should never be enabled in
5893 * vmcs02, always "handle" PML_FULL by exiting to userspace.
5894 */
2c1f3323
SC
5895 return true;
5896 case EXIT_REASON_VMFUNC:
5897 /* VM functions are emulated through L2->L0 vmexits. */
5898 return true;
2c1f3323
SC
5899 default:
5900 break;
5901 }
5902 return false;
5903}
5904
5905/*
5906 * Return true if L1 wants to intercept an exit from L2. Only call this when in
5907 * is_guest_mode (L2).
5908 */
8e533240
SC
5909static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
5910 union vmx_exit_reason exit_reason)
2c1f3323
SC
5911{
5912 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
9bd4af24 5913 u32 intr_info;
2c1f3323 5914
8e533240 5915 switch ((u16)exit_reason.basic) {
2c1f3323 5916 case EXIT_REASON_EXCEPTION_NMI:
87915858 5917 intr_info = vmx_get_intr_info(vcpu);
2c1f3323
SC
5918 if (is_nmi(intr_info))
5919 return true;
5920 else if (is_page_fault(intr_info))
5921 return true;
55d2375e
SC
5922 return vmcs12->exception_bitmap &
5923 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5924 case EXIT_REASON_EXTERNAL_INTERRUPT:
2c1f3323 5925 return nested_exit_on_intr(vcpu);
55d2375e
SC
5926 case EXIT_REASON_TRIPLE_FAULT:
5927 return true;
9dadc2f9
XL
5928 case EXIT_REASON_INTERRUPT_WINDOW:
5929 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
55d2375e 5930 case EXIT_REASON_NMI_WINDOW:
4e2a0bc5 5931 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
55d2375e
SC
5932 case EXIT_REASON_TASK_SWITCH:
5933 return true;
5934 case EXIT_REASON_CPUID:
5935 return true;
5936 case EXIT_REASON_HLT:
5937 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5938 case EXIT_REASON_INVD:
5939 return true;
5940 case EXIT_REASON_INVLPG:
5941 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5942 case EXIT_REASON_RDPMC:
5943 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5944 case EXIT_REASON_RDRAND:
5945 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5946 case EXIT_REASON_RDSEED:
5947 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5948 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5949 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5950 case EXIT_REASON_VMREAD:
5951 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5952 vmcs12->vmread_bitmap);
5953 case EXIT_REASON_VMWRITE:
5954 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5955 vmcs12->vmwrite_bitmap);
5956 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5957 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5958 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5959 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5960 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5961 /*
5962 * VMX instructions trap unconditionally. This allows L1 to
5963 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5964 */
5965 return true;
5966 case EXIT_REASON_CR_ACCESS:
5967 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5968 case EXIT_REASON_DR_ACCESS:
5969 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5970 case EXIT_REASON_IO_INSTRUCTION:
5971 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5972 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5973 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5974 case EXIT_REASON_MSR_READ:
5975 case EXIT_REASON_MSR_WRITE:
5976 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5977 case EXIT_REASON_INVALID_STATE:
5978 return true;
5979 case EXIT_REASON_MWAIT_INSTRUCTION:
5980 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5981 case EXIT_REASON_MONITOR_TRAP_FLAG:
b045ae90 5982 return nested_vmx_exit_handled_mtf(vmcs12);
55d2375e
SC
5983 case EXIT_REASON_MONITOR_INSTRUCTION:
5984 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5985 case EXIT_REASON_PAUSE_INSTRUCTION:
5986 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5987 nested_cpu_has2(vmcs12,
5988 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5989 case EXIT_REASON_MCE_DURING_VMENTRY:
2c1f3323 5990 return true;
55d2375e
SC
5991 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5992 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5993 case EXIT_REASON_APIC_ACCESS:
5994 case EXIT_REASON_APIC_WRITE:
5995 case EXIT_REASON_EOI_INDUCED:
5996 /*
5997 * The controls for "virtualize APIC accesses," "APIC-
5998 * register virtualization," and "virtual-interrupt
5999 * delivery" only come from vmcs12.
6000 */
6001 return true;
55d2375e
SC
6002 case EXIT_REASON_INVPCID:
6003 return
6004 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
6005 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6006 case EXIT_REASON_WBINVD:
6007 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6008 case EXIT_REASON_XSETBV:
6009 return true;
6010 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
6011 /*
6012 * This should never happen, since it is not possible to
6013 * set XSS to a non-zero value---neither in L1 nor in L2.
6014 * If it were, XSS would have to be checked against
6015 * the XSS exit bitmap in vmcs12.
6016 */
6017 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
bf653b78
TX
6018 case EXIT_REASON_UMWAIT:
6019 case EXIT_REASON_TPAUSE:
6020 return nested_cpu_has2(vmcs12,
6021 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
72add915
SC
6022 case EXIT_REASON_ENCLS:
6023 return nested_vmx_exit_handled_encls(vcpu, vmcs12);
55d2375e
SC
6024 default:
6025 return true;
6026 }
6027}
6028
7b7bd87d
SC
6029/*
6030 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
6031 * reflected into L1.
6032 */
f47baaed 6033bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
7b7bd87d 6034{
fbdd5025 6035 struct vcpu_vmx *vmx = to_vmx(vcpu);
8e533240 6036 union vmx_exit_reason exit_reason = vmx->exit_reason;
87796555
SC
6037 unsigned long exit_qual;
6038 u32 exit_intr_info;
fbdd5025
SC
6039
6040 WARN_ON_ONCE(vmx->nested.nested_run_pending);
6041
6042 /*
6043 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6044 * has already loaded L2's state.
6045 */
6046 if (unlikely(vmx->fail)) {
6047 trace_kvm_nested_vmenter_failed(
6048 "hardware VM-instruction error: ",
6049 vmcs_read32(VM_INSTRUCTION_ERROR));
6050 exit_intr_info = 0;
6051 exit_qual = 0;
6052 goto reflect_vmexit;
6053 }
7b7bd87d 6054
8e533240 6055 trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
236871b6 6056
2c1f3323
SC
6057 /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6058 if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6059 return false;
6060
6061 /* If L1 doesn't want the exit, handle it in L0. */
6062 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
7b7bd87d
SC
6063 return false;
6064
6065 /*
1d283062
SC
6066 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
6067 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6068 * need to be synthesized by querying the in-kernel LAPIC, but external
6069 * interrupts are never reflected to L1 so it's a non-issue.
7b7bd87d 6070 */
02f1965f 6071 exit_intr_info = vmx_get_intr_info(vcpu);
f315f2b1 6072 if (is_exception_with_error_code(exit_intr_info)) {
7b7bd87d
SC
6073 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6074
6075 vmcs12->vm_exit_intr_error_code =
6076 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6077 }
02f1965f 6078 exit_qual = vmx_get_exit_qual(vcpu);
7b7bd87d 6079
fbdd5025 6080reflect_vmexit:
8e533240 6081 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
7b7bd87d
SC
6082 return true;
6083}
55d2375e
SC
6084
6085static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6086 struct kvm_nested_state __user *user_kvm_nested_state,
6087 u32 user_data_size)
6088{
6089 struct vcpu_vmx *vmx;
6090 struct vmcs12 *vmcs12;
6091 struct kvm_nested_state kvm_state = {
6092 .flags = 0,
6ca00dfa 6093 .format = KVM_STATE_NESTED_FORMAT_VMX,
55d2375e 6094 .size = sizeof(kvm_state),
850448f3 6095 .hdr.vmx.flags = 0,
6ca00dfa
LA
6096 .hdr.vmx.vmxon_pa = -1ull,
6097 .hdr.vmx.vmcs12_pa = -1ull,
850448f3 6098 .hdr.vmx.preemption_timer_deadline = 0,
55d2375e 6099 };
6ca00dfa
LA
6100 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6101 &user_kvm_nested_state->data.vmx[0];
55d2375e
SC
6102
6103 if (!vcpu)
6ca00dfa 6104 return kvm_state.size + sizeof(*user_vmx_nested_state);
55d2375e
SC
6105
6106 vmx = to_vmx(vcpu);
6107 vmcs12 = get_vmcs12(vcpu);
6108
55d2375e
SC
6109 if (nested_vmx_allowed(vcpu) &&
6110 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6ca00dfa
LA
6111 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6112 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
55d2375e
SC
6113
6114 if (vmx_has_valid_vmcs12(vcpu)) {
6ca00dfa 6115 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
55d2375e 6116
27849968
VK
6117 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
6118 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
323d73a8
LA
6119 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6120
55d2375e
SC
6121 if (is_guest_mode(vcpu) &&
6122 nested_cpu_has_shadow_vmcs(vmcs12) &&
6123 vmcs12->vmcs_link_pointer != -1ull)
6ca00dfa 6124 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
55d2375e
SC
6125 }
6126
6127 if (vmx->nested.smm.vmxon)
6ca00dfa 6128 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
55d2375e
SC
6129
6130 if (vmx->nested.smm.guest_mode)
6ca00dfa 6131 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
55d2375e
SC
6132
6133 if (is_guest_mode(vcpu)) {
6134 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6135
6136 if (vmx->nested.nested_run_pending)
6137 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5ef8acbd
OU
6138
6139 if (vmx->nested.mtf_pending)
6140 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
850448f3
PS
6141
6142 if (nested_cpu_has_preemption_timer(vmcs12) &&
6143 vmx->nested.has_preemption_timer_deadline) {
6144 kvm_state.hdr.vmx.flags |=
6145 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6146 kvm_state.hdr.vmx.preemption_timer_deadline =
6147 vmx->nested.preemption_timer_deadline;
6148 }
55d2375e
SC
6149 }
6150 }
6151
6152 if (user_data_size < kvm_state.size)
6153 goto out;
6154
6155 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6156 return -EFAULT;
6157
6158 if (!vmx_has_valid_vmcs12(vcpu))
6159 goto out;
6160
6161 /*
6162 * When running L2, the authoritative vmcs12 state is in the
6163 * vmcs02. When running L1, the authoritative vmcs12 state is
6164 * in the shadow or enlightened vmcs linked to vmcs01, unless
3731905e 6165 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
55d2375e
SC
6166 * vmcs12 state is in the vmcs12 already.
6167 */
6168 if (is_guest_mode(vcpu)) {
3731905e 6169 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
7952d769 6170 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
d51e1d3f
ML
6171 } else {
6172 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6173 if (!vmx->nested.need_vmcs12_to_shadow_sync) {
1e9dfbd7 6174 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
d6bf71a1
VK
6175 /*
6176 * The L1 hypervisor is not obliged to keep the eVMCS
6177 * clean-fields data up-to-date while not in guest
6178 * mode; 'hv_clean_fields' is only guaranteed to be
6179 * accurate upon vmentry, so we need to ignore it
6180 * here and do a full copy.
6181 */
6182 copy_enlightened_to_vmcs12(vmx, 0);
d51e1d3f
ML
6183 else if (enable_shadow_vmcs)
6184 copy_shadow_to_vmcs12(vmx);
6185 }
55d2375e
SC
6186 }
6187
6ca00dfa
LA
6188 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6189 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6190
3a33d030
TR
6191 /*
6192 * Copy over the full allocated size of vmcs12 rather than just the size
6193 * of the struct.
6194 */
6ca00dfa 6195 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
55d2375e
SC
6196 return -EFAULT;
6197
6198 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6199 vmcs12->vmcs_link_pointer != -1ull) {
6ca00dfa 6200 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
3a33d030 6201 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
55d2375e
SC
6202 return -EFAULT;
6203 }
55d2375e
SC
6204out:
6205 return kvm_state.size;
6206}
6207
6208/*
6209 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
6210 */
6211void vmx_leave_nested(struct kvm_vcpu *vcpu)
6212{
6213 if (is_guest_mode(vcpu)) {
6214 to_vmx(vcpu)->nested.nested_run_pending = 0;
6215 nested_vmx_vmexit(vcpu, -1, 0, 0);
6216 }
6217 free_nested(vcpu);
6218}
6219
6220static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6221 struct kvm_nested_state __user *user_kvm_nested_state,
6222 struct kvm_nested_state *kvm_state)
6223{
6224 struct vcpu_vmx *vmx = to_vmx(vcpu);
6225 struct vmcs12 *vmcs12;
68cda40d 6226 enum vm_entry_failure_code ignored;
6ca00dfa
LA
6227 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6228 &user_kvm_nested_state->data.vmx[0];
55d2375e
SC
6229 int ret;
6230
6ca00dfa 6231 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
55d2375e
SC
6232 return -EINVAL;
6233
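	/*
	 * Rough flow of restore: sanity-check the header and flags first,
	 * then drop any existing nested state, re-enter VMX operation,
	 * reload the vmcs12 (and shadow vmcs12) blobs, and finally re-enter
	 * guest mode if KVM_STATE_NESTED_GUEST_MODE was saved.
	 */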
6ca00dfa
LA
6234 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
6235 if (kvm_state->hdr.vmx.smm.flags)
55d2375e
SC
6236 return -EINVAL;
6237
6ca00dfa 6238 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
55d2375e
SC
6239 return -EINVAL;
6240
323d73a8
LA
6241 /*
 6242 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
 6243 * enable the eVMCS capability on the vCPU. The code has
 6244 * since changed so that the flag signals that vmcs12
 6245 * should be copied into the eVMCS in guest memory.
 6246 *
 6247 * To preserve backwards compatibility, allow userspace
 6248 * to set this flag even when there is no VMXON region.
6249 */
9fd58877
PB
6250 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6251 return -EINVAL;
6252 } else {
6253 if (!nested_vmx_allowed(vcpu))
6254 return -EINVAL;
55d2375e 6255
9fd58877
PB
6256 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6257 return -EINVAL;
323d73a8 6258 }
55d2375e 6259
6ca00dfa 6260 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
55d2375e
SC
6261 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6262 return -EINVAL;
6263
6ca00dfa 6264 if (kvm_state->hdr.vmx.smm.flags &
55d2375e
SC
6265 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6266 return -EINVAL;
6267
5e105c88
PB
6268 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6269 return -EINVAL;
6270
55d2375e
SC
6271 /*
6272 * SMM temporarily disables VMX, so we cannot be in guest mode,
6273 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
6274 * must be zero.
6275 */
65b712f1
LA
6276 if (is_smm(vcpu) ?
6277 (kvm_state->flags &
6278 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6279 : kvm_state->hdr.vmx.smm.flags)
55d2375e
SC
6280 return -EINVAL;
6281
6ca00dfa
LA
6282 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6283 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
55d2375e
SC
6284 return -EINVAL;
6285
323d73a8
LA
6286 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6287 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
9fd58877 6288 return -EINVAL;
55d2375e 6289
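	/*
	 * Header validation passed: drop whatever nested state is currently
	 * loaded. If userspace passed no VMXON region, that is all there is
	 * to restore.
	 */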
323d73a8 6290 vmx_leave_nested(vcpu);
9fd58877
PB
6291
6292 if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
6293 return 0;
332d0797 6294
6ca00dfa 6295 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
55d2375e
SC
6296 ret = enter_vmx_operation(vcpu);
6297 if (ret)
6298 return ret;
6299
0f02bd0a
PB
 6300 /* Empty 'VMXON' state is permitted when no VMCS is loaded. */
6301 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6302 /* See vmx_has_valid_vmcs12. */
6303 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6304 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
6305 (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
6306 return -EINVAL;
6307 else
6308 return 0;
6309 }
55d2375e 6310
6ca00dfa
LA
6311 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
6312 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6313 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
55d2375e
SC
6314 return -EINVAL;
6315
6ca00dfa 6316 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
55d2375e
SC
6317 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6318 /*
e942dbf8
VK
6319 * nested_vmx_handle_enlightened_vmptrld() cannot be called
 6320 * directly from here, as HV_X64_MSR_VP_ASSIST_PAGE may not have
 6321 * been restored yet. The eVMCS will be mapped from
6322 * nested_get_vmcs12_pages().
55d2375e 6323 */
27849968 6324 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
729c15c2 6325 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
55d2375e
SC
6326 } else {
6327 return -EINVAL;
6328 }
6329
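	/*
	 * VMXON was in effect before the vCPU entered SMM; stash it in the
	 * SMM-saved state rather than in the live vmxon flag.
	 */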
6ca00dfa 6330 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
55d2375e
SC
6331 vmx->nested.smm.vmxon = true;
6332 vmx->nested.vmxon = false;
6333
6ca00dfa 6334 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
55d2375e
SC
6335 vmx->nested.smm.guest_mode = true;
6336 }
6337
6338 vmcs12 = get_vmcs12(vcpu);
6ca00dfa 6339 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
55d2375e
SC
6340 return -EFAULT;
6341
6342 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6343 return -EINVAL;
6344
6345 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6346 return 0;
6347
21be4ca1
SC
6348 vmx->nested.nested_run_pending =
6349 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6350
5ef8acbd
OU
6351 vmx->nested.mtf_pending =
6352 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6353
21be4ca1 6354 ret = -EINVAL;
55d2375e
SC
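	/*
	 * If vmcs12 uses VMCS shadowing with a valid link pointer, the saved
	 * state must also carry the shadow vmcs12 blob; pull it in and verify
	 * its revision before re-entering L2.
	 */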
6355 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6356 vmcs12->vmcs_link_pointer != -1ull) {
6357 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6358
6ca00dfa
LA
6359 if (kvm_state->size <
6360 sizeof(*kvm_state) +
6361 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
21be4ca1 6362 goto error_guest_mode;
55d2375e
SC
6363
6364 if (copy_from_user(shadow_vmcs12,
6ca00dfa
LA
6365 user_vmx_nested_state->shadow_vmcs12,
6366 sizeof(*shadow_vmcs12))) {
21be4ca1
SC
6367 ret = -EFAULT;
6368 goto error_guest_mode;
6369 }
55d2375e
SC
6370
6371 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6372 !shadow_vmcs12->hdr.shadow_vmcs)
21be4ca1 6373 goto error_guest_mode;
55d2375e
SC
6374 }
6375
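	/*
	 * Restore the saved preemption timer deadline, if any, so the
	 * emulated VMX preemption timer continues from where it was at save
	 * time instead of being re-armed from scratch.
	 */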
83d31e52 6376 vmx->nested.has_preemption_timer_deadline = false;
850448f3
PS
6377 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6378 vmx->nested.has_preemption_timer_deadline = true;
6379 vmx->nested.preemption_timer_deadline =
6380 kvm_state->hdr.vmx.preemption_timer_deadline;
6381 }
6382
5478ba34
SC
6383 if (nested_vmx_check_controls(vcpu, vmcs12) ||
6384 nested_vmx_check_host_state(vcpu, vmcs12) ||
68cda40d 6385 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
21be4ca1 6386 goto error_guest_mode;
55d2375e
SC
6387
6388 vmx->nested.dirty_vmcs12 = true;
6389 ret = nested_vmx_enter_non_root_mode(vcpu, false);
21be4ca1
SC
6390 if (ret)
6391 goto error_guest_mode;
55d2375e
SC
6392
6393 return 0;
21be4ca1
SC
6394
6395error_guest_mode:
6396 vmx->nested.nested_run_pending = 0;
6397 return ret;
55d2375e
SC
6398}
6399
1b84292b 6400void nested_vmx_set_vmcs_shadowing_bitmap(void)
55d2375e
SC
6401{
6402 if (enable_shadow_vmcs) {
55d2375e 6403 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
fadcead0 6404 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
55d2375e
SC
6405 }
6406}
6407
6408/*
6409 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
6410 * returned for the various VMX controls MSRs when nested VMX is enabled.
6411 * The same values should also be used to verify that vmcs12 control fields are
6412 * valid during nested entry from L1 to L2.
 6413 * Each of these control MSRs has a low and a high 32-bit half: a low bit is on
6414 * if the corresponding bit in the (32-bit) control field *must* be on, and a
6415 * bit in the high half is on if the corresponding bit in the control field
6416 * may be on. See also vmx_control_verify().
6417 */
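/*
 * Equivalently: a vmcs12 control value 'val' is acceptable iff
 * (val & high) == val (no disallowed bit is set) and
 * (val & low) == low (every required bit is set).
 */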
a4443267 6418void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
55d2375e
SC
6419{
6420 /*
6421 * Note that as a general rule, the high half of the MSRs (bits in
6422 * the control fields which may be 1) should be initialized by the
6423 * intersection of the underlying hardware's MSR (i.e., features which
6424 * can be supported) and the list of features we want to expose -
6425 * because they are known to be properly supported in our code.
6426 * Also, usually, the low half of the MSRs (bits which must be 1) can
6427 * be set to 0, meaning that L1 may turn off any of these bits. The
 6428 * reason is that if such a bit is necessary for L0, it is already
 6429 * set in vmcs01, and prepare_vmcs02, which bitwise-ORs the control
 6430 * fields of vmcs01 and vmcs12, keeps it set in vmcs02 - and
2c1f3323 6431 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
55d2375e
SC
6432 * These rules have exceptions below.
6433 */
6434
6435 /* pin-based controls */
6436 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6437 msrs->pinbased_ctls_low,
6438 msrs->pinbased_ctls_high);
6439 msrs->pinbased_ctls_low |=
6440 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6441 msrs->pinbased_ctls_high &=
6442 PIN_BASED_EXT_INTR_MASK |
6443 PIN_BASED_NMI_EXITING |
6444 PIN_BASED_VIRTUAL_NMIS |
a4443267 6445 (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
55d2375e
SC
6446 msrs->pinbased_ctls_high |=
6447 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6448 PIN_BASED_VMX_PREEMPTION_TIMER;
6449
6450 /* exit controls */
6451 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6452 msrs->exit_ctls_low,
6453 msrs->exit_ctls_high);
6454 msrs->exit_ctls_low =
6455 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6456
6457 msrs->exit_ctls_high &=
6458#ifdef CONFIG_X86_64
6459 VM_EXIT_HOST_ADDR_SPACE_SIZE |
6460#endif
efc83133
CQ
6461 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6462 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
55d2375e
SC
6463 msrs->exit_ctls_high |=
6464 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6465 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6466 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6467
6468 /* We support free control of debug control saving. */
6469 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6470
6471 /* entry controls */
6472 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6473 msrs->entry_ctls_low,
6474 msrs->entry_ctls_high);
6475 msrs->entry_ctls_low =
6476 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6477 msrs->entry_ctls_high &=
6478#ifdef CONFIG_X86_64
6479 VM_ENTRY_IA32E_MODE |
6480#endif
efc83133
CQ
6481 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
6482 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
55d2375e
SC
6483 msrs->entry_ctls_high |=
6484 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6485
6486 /* We support free control of debug control loading. */
6487 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6488
6489 /* cpu-based controls */
6490 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6491 msrs->procbased_ctls_low,
6492 msrs->procbased_ctls_high);
6493 msrs->procbased_ctls_low =
6494 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6495 msrs->procbased_ctls_high &=
9dadc2f9 6496 CPU_BASED_INTR_WINDOW_EXITING |
5e3d394f 6497 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
55d2375e
SC
6498 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6499 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6500 CPU_BASED_CR3_STORE_EXITING |
6501#ifdef CONFIG_X86_64
6502 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6503#endif
6504 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6505 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6506 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6507 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6508 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6509 /*
6510 * We can allow some features even when not supported by the
6511 * hardware. For example, L1 can specify an MSR bitmap - and we
6512 * can use it to avoid exits to L1 - even when L0 runs L2
6513 * without MSR bitmaps.
6514 */
6515 msrs->procbased_ctls_high |=
6516 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6517 CPU_BASED_USE_MSR_BITMAPS;
6518
6519 /* We support free control of CR3 access interception. */
6520 msrs->procbased_ctls_low &=
6521 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6522
6523 /*
6524 * secondary cpu-based controls. Do not include those that
7c1b761b
XL
 6525 * depend on CPUID bits; they are added later by
6526 * vmx_vcpu_after_set_cpuid.
55d2375e 6527 */
6b1971c6
VK
6528 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6529 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6530 msrs->secondary_ctls_low,
6531 msrs->secondary_ctls_high);
6532
55d2375e
SC
6533 msrs->secondary_ctls_low = 0;
6534 msrs->secondary_ctls_high &=
6535 SECONDARY_EXEC_DESC |
7f3603b6 6536 SECONDARY_EXEC_ENABLE_RDTSCP |
55d2375e 6537 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6defc591 6538 SECONDARY_EXEC_WBINVD_EXITING |
55d2375e
SC
6539 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6540 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6defc591
PB
6541 SECONDARY_EXEC_RDRAND_EXITING |
6542 SECONDARY_EXEC_ENABLE_INVPCID |
6543 SECONDARY_EXEC_RDSEED_EXITING |
d041b5ea
IS
6544 SECONDARY_EXEC_XSAVES |
6545 SECONDARY_EXEC_TSC_SCALING;
55d2375e
SC
6546
6547 /*
6548 * We can emulate "VMCS shadowing," even if the hardware
6549 * doesn't support it.
6550 */
6551 msrs->secondary_ctls_high |=
6552 SECONDARY_EXEC_SHADOW_VMCS;
6553
6554 if (enable_ept) {
6555 /* nested EPT: emulate EPT also to L1 */
6556 msrs->secondary_ctls_high |=
6557 SECONDARY_EXEC_ENABLE_EPT;
bb1fcc70
SC
6558 msrs->ept_caps =
6559 VMX_EPT_PAGE_WALK_4_BIT |
6560 VMX_EPT_PAGE_WALK_5_BIT |
6561 VMX_EPTP_WB_BIT |
96d47010
SC
6562 VMX_EPT_INVEPT_BIT |
6563 VMX_EPT_EXECUTE_ONLY_BIT;
6564
55d2375e
SC
6565 msrs->ept_caps &= ept_caps;
6566 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6567 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6568 VMX_EPT_1GB_PAGE_BIT;
6569 if (enable_ept_ad_bits) {
6570 msrs->secondary_ctls_high |=
6571 SECONDARY_EXEC_ENABLE_PML;
6572 msrs->ept_caps |= VMX_EPT_AD_BIT;
6573 }
6574 }
6575
6576 if (cpu_has_vmx_vmfunc()) {
6577 msrs->secondary_ctls_high |=
6578 SECONDARY_EXEC_ENABLE_VMFUNC;
6579 /*
6580 * Advertise EPTP switching unconditionally
6581 * since we emulate it
6582 */
6583 if (enable_ept)
6584 msrs->vmfunc_controls =
6585 VMX_VMFUNC_EPTP_SWITCHING;
6586 }
6587
6588 /*
6589 * Old versions of KVM use the single-context version without
6590 * checking for support, so declare that it is supported even
 6591 * though it is treated as global context. The alternative, accepting
 6592 * single-context invvpid without advertising it, would be worse.
6593 */
6594 if (enable_vpid) {
6595 msrs->secondary_ctls_high |=
6596 SECONDARY_EXEC_ENABLE_VPID;
6597 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6598 VMX_VPID_EXTENT_SUPPORTED_MASK;
6599 }
6600
6601 if (enable_unrestricted_guest)
6602 msrs->secondary_ctls_high |=
6603 SECONDARY_EXEC_UNRESTRICTED_GUEST;
6604
6605 if (flexpriority_enabled)
6606 msrs->secondary_ctls_high |=
6607 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6608
72add915
SC
6609 if (enable_sgx)
6610 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
6611
55d2375e
SC
6612 /* miscellaneous data */
6613 rdmsr(MSR_IA32_VMX_MISC,
6614 msrs->misc_low,
6615 msrs->misc_high);
6616 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6617 msrs->misc_low |=
6618 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6619 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
bf0cd88c
YQ
6620 VMX_MISC_ACTIVITY_HLT |
6621 VMX_MISC_ACTIVITY_WAIT_SIPI;
55d2375e
SC
6622 msrs->misc_high = 0;
6623
6624 /*
6625 * This MSR reports some information about VMX support. We
6626 * should return information about the VMX we emulate for the
6627 * guest, and the VMCS structure we give it - not about the
6628 * VMX support of the underlying hardware.
6629 */
6630 msrs->basic =
6631 VMCS12_REVISION |
6632 VMX_BASIC_TRUE_CTLS |
6633 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6634 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6635
6636 if (cpu_has_vmx_basic_inout())
6637 msrs->basic |= VMX_BASIC_INOUT;
6638
6639 /*
6640 * These MSRs specify bits which the guest must keep fixed on
6641 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6642 * We picked the standard core2 setting.
6643 */
6644#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6645#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6646 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6647 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6648
6649 /* These MSRs specify bits which the guest must keep fixed off. */
6650 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6651 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
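	/*
	 * Analogously to the control MSRs, a guest CR0/CR4 value is legal in
	 * VMX operation iff (cr & fixed0) == fixed0 (required bits set) and
	 * (cr & fixed1) == cr (no forbidden bit set).
	 */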
6652
6653 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6654 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6655}
6656
6657void nested_vmx_hardware_unsetup(void)
6658{
6659 int i;
6660
6661 if (enable_shadow_vmcs) {
6662 for (i = 0; i < VMX_BITMAP_NR; i++)
6663 free_page((unsigned long)vmx_bitmap[i]);
6664 }
6665}
6666
6c1c6e58 6667__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
55d2375e
SC
6668{
6669 int i;
6670
6671 if (!cpu_has_vmx_shadow_vmcs())
6672 enable_shadow_vmcs = 0;
6673 if (enable_shadow_vmcs) {
6674 for (i = 0; i < VMX_BITMAP_NR; i++) {
41836839
BG
6675 /*
6676 * The vmx_bitmap is not tied to a VM and so should
6677 * not be charged to a memcg.
6678 */
55d2375e
SC
6679 vmx_bitmap[i] = (unsigned long *)
6680 __get_free_page(GFP_KERNEL);
6681 if (!vmx_bitmap[i]) {
6682 nested_vmx_hardware_unsetup();
6683 return -ENOMEM;
6684 }
6685 }
6686
6687 init_vmcs_shadow_fields();
6688 }
6689
cc877670
LA
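	/* Wire the nested VMX exit handlers into the main exit handler table. */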
6690 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
6691 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
6692 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
6693 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
6694 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
6695 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
6696 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
6697 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
6698 exit_handlers[EXIT_REASON_VMON] = handle_vmon;
6699 exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
6700 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
6701 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
55d2375e 6702
55d2375e
SC
6703 return 0;
6704}
33b22172
PB
6705
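/*
 * Nested-VMX callbacks handed to the common x86 code; .get_state and
 * .set_state back the KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE ioctls
 * implemented above.
 */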
6706struct kvm_x86_nested_ops vmx_nested_ops = {
6707 .check_events = vmx_check_nested_events,
d2060bd4 6708 .hv_timer_pending = nested_vmx_preemption_timer_pending,
cb6a32c2 6709 .triple_fault = nested_vmx_triple_fault,
33b22172
PB
6710 .get_state = vmx_get_nested_state,
6711 .set_state = vmx_set_nested_state,
9a78e158 6712 .get_nested_state_pages = vmx_get_nested_state_pages,
02f5fb2e 6713 .write_log_dirty = nested_vmx_write_pml_buffer,
33b22172
PB
6714 .enable_evmcs = nested_enable_evmcs,
6715 .get_evmcs_version = nested_get_evmcs_version,
6716};