1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/objtool.h>
4 #include <linux/percpu.h>
5
6 #include <asm/debugreg.h>
7 #include <asm/mmu_context.h>
8
9 #include "cpuid.h"
10 #include "hyperv.h"
11 #include "mmu.h"
12 #include "nested.h"
13 #include "pmu.h"
14 #include "sgx.h"
15 #include "trace.h"
16 #include "vmx.h"
17 #include "x86.h"
18
19 static bool __read_mostly enable_shadow_vmcs = 1;
20 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
21
22 static bool __read_mostly nested_early_check = 0;
23 module_param(nested_early_check, bool, S_IRUGO);
24
25 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
26
27 /*
28 * Hyper-V requires all of these, so mark them as supported even though
29 * they are just treated the same as all-context.
30 */
31 #define VMX_VPID_EXTENT_SUPPORTED_MASK \
32 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
33 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
34 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
35 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
36
37 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
38
39 enum {
40 VMX_VMREAD_BITMAP,
41 VMX_VMWRITE_BITMAP,
42 VMX_BITMAP_NR
43 };
44 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
45
46 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
47 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
48
49 struct shadow_vmcs_field {
50 u16 encoding;
51 u16 offset;
52 };
53 static struct shadow_vmcs_field shadow_read_only_fields[] = {
54 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
55 #include "vmcs_shadow_fields.h"
56 };
57 static int max_shadow_read_only_fields =
58 ARRAY_SIZE(shadow_read_only_fields);
59
60 static struct shadow_vmcs_field shadow_read_write_fields[] = {
61 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
62 #include "vmcs_shadow_fields.h"
63 };
64 static int max_shadow_read_write_fields =
65 ARRAY_SIZE(shadow_read_write_fields);
66
67 static void init_vmcs_shadow_fields(void)
68 {
69 int i, j;
70
71 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
72 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
73
74 for (i = j = 0; i < max_shadow_read_only_fields; i++) {
75 struct shadow_vmcs_field entry = shadow_read_only_fields[i];
76 u16 field = entry.encoding;
77
78 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
79 (i + 1 == max_shadow_read_only_fields ||
80 shadow_read_only_fields[i + 1].encoding != field + 1))
81 pr_err("Missing field from shadow_read_only_field %x\n",
82 field + 1);
83
84 clear_bit(field, vmx_vmread_bitmap);
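/*
 * Odd-numbered encodings are the high 32 bits of a 64-bit field: on
 * 64-bit hosts the full field is read/written via the even encoding,
 * so drop the high half from the table; 32-bit hosts keep it and
 * access it at offset + 4 within vmcs12.
 */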
85 if (field & 1)
86 #ifdef CONFIG_X86_64
87 continue;
88 #else
89 entry.offset += sizeof(u32);
90 #endif
91 shadow_read_only_fields[j++] = entry;
92 }
93 max_shadow_read_only_fields = j;
94
95 for (i = j = 0; i < max_shadow_read_write_fields; i++) {
96 struct shadow_vmcs_field entry = shadow_read_write_fields[i];
97 u16 field = entry.encoding;
98
99 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
100 (i + 1 == max_shadow_read_write_fields ||
101 shadow_read_write_fields[i + 1].encoding != field + 1))
102 pr_err("Missing field from shadow_read_write_field %x\n",
103 field + 1);
104
105 WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
106 field <= GUEST_TR_AR_BYTES,
107 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
108
109 /*
110 * PML and the preemption timer can be emulated, but the
111 * processor cannot vmwrite to fields that don't exist
112 * on bare metal.
113 */
114 switch (field) {
115 case GUEST_PML_INDEX:
116 if (!cpu_has_vmx_pml())
117 continue;
118 break;
119 case VMX_PREEMPTION_TIMER_VALUE:
120 if (!cpu_has_vmx_preemption_timer())
121 continue;
122 break;
123 case GUEST_INTR_STATUS:
124 if (!cpu_has_vmx_apicv())
125 continue;
126 break;
127 default:
128 break;
129 }
130
131 clear_bit(field, vmx_vmwrite_bitmap);
132 clear_bit(field, vmx_vmread_bitmap);
133 if (field & 1)
134 #ifdef CONFIG_X86_64
135 continue;
136 #else
137 entry.offset += sizeof(u32);
138 #endif
139 shadow_read_write_fields[j++] = entry;
140 }
141 max_shadow_read_write_fields = j;
142 }
143
144 /*
145 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
146 * set the success or error code of an emulated VMX instruction (as specified
147 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
148 * instruction.
149 */
150 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
151 {
152 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
153 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
154 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
155 return kvm_skip_emulated_instruction(vcpu);
156 }
157
158 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
159 {
160 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
161 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
162 X86_EFLAGS_SF | X86_EFLAGS_OF))
163 | X86_EFLAGS_CF);
164 return kvm_skip_emulated_instruction(vcpu);
165 }
166
167 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
168 u32 vm_instruction_error)
169 {
170 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
171 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
172 X86_EFLAGS_SF | X86_EFLAGS_OF))
173 | X86_EFLAGS_ZF);
174 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
175 /*
176 	 * There is no need to force a sync of the shadow VMCS because
177 	 * VM_INSTRUCTION_ERROR is not shadowed. The enlightened VMCS, however,
178 	 * 'shadows' all fields and thus must be synced.
179 */
180 if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
181 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
182
183 return kvm_skip_emulated_instruction(vcpu);
184 }
185
186 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
187 {
188 struct vcpu_vmx *vmx = to_vmx(vcpu);
189
190 /*
191 * failValid writes the error number to the current VMCS, which
192 * can't be done if there isn't a current VMCS.
193 */
194 if (vmx->nested.current_vmptr == -1ull &&
195 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
196 return nested_vmx_failInvalid(vcpu);
197
198 return nested_vmx_failValid(vcpu, vm_instruction_error);
199 }
200
201 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
202 {
203 	/* TODO: do not simply reset the guest here. */
204 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
205 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
206 }
207
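/*
 * The VMX capability MSRs for the pin-based/processor-based/exit/entry
 * controls report the allowed 0-settings in bits 31:0 and the allowed
 * 1-settings in bits 63:32 (SDM Vol. 3, Appendix A): a control may be
 * 0 only if its "low" bit is 0, and may be 1 only if its "high" bit is 1.
 */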
208 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
209 {
210 return fixed_bits_valid(control, low, high);
211 }
212
213 static inline u64 vmx_control_msr(u32 low, u32 high)
214 {
215 return low | ((u64)high << 32);
216 }
217
218 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
219 {
220 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
221 vmcs_write64(VMCS_LINK_POINTER, -1ull);
222 vmx->nested.need_vmcs12_to_shadow_sync = false;
223 }
224
225 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
226 {
227 struct vcpu_vmx *vmx = to_vmx(vcpu);
228
229 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
230 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
231 vmx->nested.hv_evmcs = NULL;
232 }
233
234 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
235 }
236
237 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
238 struct loaded_vmcs *prev)
239 {
240 struct vmcs_host_state *dest, *src;
241
242 if (unlikely(!vmx->guest_state_loaded))
243 return;
244
245 src = &prev->host_state;
246 dest = &vmx->loaded_vmcs->host_state;
247
248 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
249 dest->ldt_sel = src->ldt_sel;
250 #ifdef CONFIG_X86_64
251 dest->ds_sel = src->ds_sel;
252 dest->es_sel = src->es_sel;
253 #endif
254 }
255
256 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
257 {
258 struct vcpu_vmx *vmx = to_vmx(vcpu);
259 struct loaded_vmcs *prev;
260 int cpu;
261
262 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
263 return;
264
265 cpu = get_cpu();
266 prev = vmx->loaded_vmcs;
267 vmx->loaded_vmcs = vmcs;
268 vmx_vcpu_load_vmcs(vcpu, cpu, prev);
269 vmx_sync_vmcs_host_state(vmx, prev);
270 put_cpu();
271
272 vmx_register_cache_reset(vcpu);
273 }
274
275 /*
276 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
277 * just stops using VMX.
278 */
279 static void free_nested(struct kvm_vcpu *vcpu)
280 {
281 struct vcpu_vmx *vmx = to_vmx(vcpu);
282
283 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
284 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
285
286 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
287 return;
288
289 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
290
291 vmx->nested.vmxon = false;
292 vmx->nested.smm.vmxon = false;
293 free_vpid(vmx->nested.vpid02);
294 vmx->nested.posted_intr_nv = -1;
295 vmx->nested.current_vmptr = -1ull;
296 if (enable_shadow_vmcs) {
297 vmx_disable_shadow_vmcs(vmx);
298 vmcs_clear(vmx->vmcs01.shadow_vmcs);
299 free_vmcs(vmx->vmcs01.shadow_vmcs);
300 vmx->vmcs01.shadow_vmcs = NULL;
301 }
302 kfree(vmx->nested.cached_vmcs12);
303 vmx->nested.cached_vmcs12 = NULL;
304 kfree(vmx->nested.cached_shadow_vmcs12);
305 vmx->nested.cached_shadow_vmcs12 = NULL;
306 /* Unpin physical memory we referred to in the vmcs02 */
307 if (vmx->nested.apic_access_page) {
308 kvm_release_page_clean(vmx->nested.apic_access_page);
309 vmx->nested.apic_access_page = NULL;
310 }
311 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
312 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
313 vmx->nested.pi_desc = NULL;
314
315 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
316
317 nested_release_evmcs(vcpu);
318
319 free_loaded_vmcs(&vmx->nested.vmcs02);
320 }
321
322 /*
323 * Ensure that the current vmcs of the logical processor is the
324 * vmcs01 of the vcpu before calling free_nested().
325 */
326 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
327 {
328 vcpu_load(vcpu);
329 vmx_leave_nested(vcpu);
330 vcpu_put(vcpu);
331 }
332
333 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
334 struct x86_exception *fault)
335 {
336 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
337 struct vcpu_vmx *vmx = to_vmx(vcpu);
338 u32 vm_exit_reason;
339 unsigned long exit_qualification = vcpu->arch.exit_qualification;
340
341 if (vmx->nested.pml_full) {
342 vm_exit_reason = EXIT_REASON_PML_FULL;
343 vmx->nested.pml_full = false;
344 exit_qualification &= INTR_INFO_UNBLOCK_NMI;
345 } else if (fault->error_code & PFERR_RSVD_MASK)
346 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
347 else
348 vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
349
350 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
351 vmcs12->guest_physical_address = fault->address;
352 }
353
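/*
 * Switch vcpu->arch.mmu to a shadow-EPT context rooted at vmcs12's
 * EPTP, so that L2's guest-physical accesses are handled via nested
 * EPT, and use nested_mmu for walking L2's own page tables.
 */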
354 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
355 {
356 WARN_ON(mmu_is_nested(vcpu));
357
358 vcpu->arch.mmu = &vcpu->arch.guest_mmu;
359 kvm_init_shadow_ept_mmu(vcpu,
360 to_vmx(vcpu)->nested.msrs.ept_caps &
361 VMX_EPT_EXECUTE_ONLY_BIT,
362 nested_ept_ad_enabled(vcpu),
363 nested_ept_get_eptp(vcpu));
364 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
365 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
366 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
367
368 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
369 }
370
371 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
372 {
373 vcpu->arch.mmu = &vcpu->arch.root_mmu;
374 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
375 }
376
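/*
 * Per the SDM, a #PF triggers a VM-exit to L1 when the #PF bit of the
 * exception bitmap and the PFEC_MASK/PFEC_MATCH test agree: exit if
 * the bit is 1 and (error_code & mask) == match, or if the bit is 0
 * and (error_code & mask) != match.
 */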
377 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
378 u16 error_code)
379 {
380 bool inequality, bit;
381
382 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
383 inequality =
384 (error_code & vmcs12->page_fault_error_code_mask) !=
385 vmcs12->page_fault_error_code_match;
386 return inequality ^ bit;
387 }
388
389
390 /*
391  * KVM wants to inject the page faults it received into the guest. This function
392  * checks whether, in a nested guest, they should be injected into L1 or L2.
393 */
394 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
395 {
396 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
397 unsigned int nr = vcpu->arch.exception.nr;
398 bool has_payload = vcpu->arch.exception.has_payload;
399 unsigned long payload = vcpu->arch.exception.payload;
400
401 if (nr == PF_VECTOR) {
402 if (vcpu->arch.exception.nested_apf) {
403 *exit_qual = vcpu->arch.apf.nested_apf_token;
404 return 1;
405 }
406 if (nested_vmx_is_page_fault_vmexit(vmcs12,
407 vcpu->arch.exception.error_code)) {
408 *exit_qual = has_payload ? payload : vcpu->arch.cr2;
409 return 1;
410 }
411 } else if (vmcs12->exception_bitmap & (1u << nr)) {
412 if (nr == DB_VECTOR) {
413 if (!has_payload) {
414 payload = vcpu->arch.dr6;
415 payload &= ~DR6_BT;
416 payload ^= DR6_ACTIVE_LOW;
417 }
418 *exit_qual = payload;
419 } else
420 *exit_qual = 0;
421 return 1;
422 }
423
424 return 0;
425 }
426
427
428 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
429 struct x86_exception *fault)
430 {
431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
432
433 WARN_ON(!is_guest_mode(vcpu));
434
435 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
436 !to_vmx(vcpu)->nested.nested_run_pending) {
437 vmcs12->vm_exit_intr_error_code = fault->error_code;
438 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
439 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
440 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
441 fault->address);
442 } else {
443 kvm_inject_page_fault(vcpu, fault);
444 }
445 }
446
447 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
448 struct vmcs12 *vmcs12)
449 {
450 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
451 return 0;
452
453 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
454 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
455 return -EINVAL;
456
457 return 0;
458 }
459
460 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
461 struct vmcs12 *vmcs12)
462 {
463 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
464 return 0;
465
466 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
467 return -EINVAL;
468
469 return 0;
470 }
471
472 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
473 struct vmcs12 *vmcs12)
474 {
475 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
476 return 0;
477
478 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
479 return -EINVAL;
480
481 return 0;
482 }
483
484 /*
485  * Check if a write to the given MSR is intercepted by the L01 MSR bitmap.
486 */
487 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
488 {
489 unsigned long *msr_bitmap;
490 int f = sizeof(unsigned long);
491
492 if (!cpu_has_vmx_msr_bitmap())
493 return true;
494
495 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
496
497 if (msr <= 0x1fff) {
498 return !!test_bit(msr, msr_bitmap + 0x800 / f);
499 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
500 msr &= 0x1fff;
501 return !!test_bit(msr, msr_bitmap + 0xc00 / f);
502 }
503
504 return true;
505 }
506
507 /*
508  * If an MSR is allowed by L0, check whether it is also allowed by L1.
509  * The corresponding intercept bit is cleared only if both L0 and L1 allow it.
510 */
511 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
512 unsigned long *msr_bitmap_nested,
513 u32 msr, int type)
514 {
515 int f = sizeof(unsigned long);
516
517 /*
518 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
519 * have the write-low and read-high bitmap offsets the wrong way round.
520 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
521 */
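	/*
	 * Layout of the 4-KByte MSR bitmap (byte offsets):
	 *   0x000: reads of MSRs 0x00000000 - 0x00001fff
	 *   0x400: reads of MSRs 0xc0000000 - 0xc0001fff
	 *   0x800: writes of MSRs 0x00000000 - 0x00001fff
	 *   0xc00: writes of MSRs 0xc0000000 - 0xc0001fff
	 */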
522 if (msr <= 0x1fff) {
523 if (type & MSR_TYPE_R &&
524 !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
525 /* read-low */
526 __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
527
528 if (type & MSR_TYPE_W &&
529 !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
530 /* write-low */
531 __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
532
533 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
534 msr &= 0x1fff;
535 if (type & MSR_TYPE_R &&
536 !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
537 /* read-high */
538 __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
539
540 if (type & MSR_TYPE_W &&
541 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
542 /* write-high */
543 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
544
545 }
546 }
547
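/*
 * Set both the read (offset 0x000) and write (offset 0x800) intercept
 * bits for the entire x2APIC MSR range, 0x800 - 0x8ff.
 */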
548 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
549 {
550 int msr;
551
552 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
553 unsigned word = msr / BITS_PER_LONG;
554
555 msr_bitmap[word] = ~0;
556 msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
557 }
558 }
559
560 /*
561  * Merge L0's and L1's MSR bitmaps; returning false indicates that the
562  * hardware MSR bitmap should not be used.
563 */
564 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
565 struct vmcs12 *vmcs12)
566 {
567 int msr;
568 unsigned long *msr_bitmap_l1;
569 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
570 struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
571
572 /* Nothing to do if the MSR bitmap is not in use. */
573 if (!cpu_has_vmx_msr_bitmap() ||
574 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
575 return false;
576
577 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
578 return false;
579
580 msr_bitmap_l1 = (unsigned long *)map->hva;
581
582 /*
583 * To keep the control flow simple, pay eight 8-byte writes (sixteen
584 * 4-byte writes on 32-bit systems) up front to enable intercepts for
585 * the x2APIC MSR range and selectively disable them below.
586 */
587 enable_x2apic_msr_intercepts(msr_bitmap_l0);
588
589 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
590 if (nested_cpu_has_apic_reg_virt(vmcs12)) {
591 /*
592 * L0 need not intercept reads for MSRs between 0x800
593 * and 0x8ff, it just lets the processor take the value
594 * from the virtual-APIC page; take those 256 bits
595 * directly from the L1 bitmap.
596 */
597 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
598 unsigned word = msr / BITS_PER_LONG;
599
600 msr_bitmap_l0[word] = msr_bitmap_l1[word];
601 }
602 }
603
604 nested_vmx_disable_intercept_for_msr(
605 msr_bitmap_l1, msr_bitmap_l0,
606 X2APIC_MSR(APIC_TASKPRI),
607 MSR_TYPE_R | MSR_TYPE_W);
608
609 if (nested_cpu_has_vid(vmcs12)) {
610 nested_vmx_disable_intercept_for_msr(
611 msr_bitmap_l1, msr_bitmap_l0,
612 X2APIC_MSR(APIC_EOI),
613 MSR_TYPE_W);
614 nested_vmx_disable_intercept_for_msr(
615 msr_bitmap_l1, msr_bitmap_l0,
616 X2APIC_MSR(APIC_SELF_IPI),
617 MSR_TYPE_W);
618 }
619 }
620
621 /* KVM unconditionally exposes the FS/GS base MSRs to L1. */
622 #ifdef CONFIG_X86_64
623 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
624 MSR_FS_BASE, MSR_TYPE_RW);
625
626 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
627 MSR_GS_BASE, MSR_TYPE_RW);
628
629 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
630 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
631 #endif
632
633 /*
634 * Checking the L0->L1 bitmap is trying to verify two things:
635 *
636 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
637 * ensures that we do not accidentally generate an L02 MSR bitmap
638 * from the L12 MSR bitmap that is too permissive.
639 * 2. That L1 or L2s have actually used the MSR. This avoids
640 	 *    unnecessary merging of the bitmap if the MSR is unused. This
641 * works properly because we only update the L01 MSR bitmap lazily.
642 * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
643 * updated to reflect this when L1 (or its L2s) actually write to
644 * the MSR.
645 */
646 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
647 nested_vmx_disable_intercept_for_msr(
648 msr_bitmap_l1, msr_bitmap_l0,
649 MSR_IA32_SPEC_CTRL,
650 MSR_TYPE_R | MSR_TYPE_W);
651
652 if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
653 nested_vmx_disable_intercept_for_msr(
654 msr_bitmap_l1, msr_bitmap_l0,
655 MSR_IA32_PRED_CMD,
656 MSR_TYPE_W);
657
658 kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
659
660 return true;
661 }
662
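/*
 * Copy the shadow vmcs12, referenced by vmcs12->vmcs_link_pointer,
 * from guest memory into the per-vCPU cache.
 */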
663 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
664 struct vmcs12 *vmcs12)
665 {
666 struct kvm_host_map map;
667 struct vmcs12 *shadow;
668
669 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
670 vmcs12->vmcs_link_pointer == -1ull)
671 return;
672
673 shadow = get_shadow_vmcs12(vcpu);
674
675 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
676 return;
677
678 memcpy(shadow, map.hva, VMCS12_SIZE);
679 kvm_vcpu_unmap(vcpu, &map, false);
680 }
681
682 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
683 struct vmcs12 *vmcs12)
684 {
685 struct vcpu_vmx *vmx = to_vmx(vcpu);
686
687 if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
688 vmcs12->vmcs_link_pointer == -1ull)
689 return;
690
691 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
692 get_shadow_vmcs12(vcpu), VMCS12_SIZE);
693 }
694
695 /*
696 * In nested virtualization, check if L1 has set
697 * VM_EXIT_ACK_INTR_ON_EXIT
698 */
699 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
700 {
701 return get_vmcs12(vcpu)->vm_exit_controls &
702 VM_EXIT_ACK_INTR_ON_EXIT;
703 }
704
705 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
706 struct vmcs12 *vmcs12)
707 {
708 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
709 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
710 return -EINVAL;
711 else
712 return 0;
713 }
714
715 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
716 struct vmcs12 *vmcs12)
717 {
718 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
719 !nested_cpu_has_apic_reg_virt(vmcs12) &&
720 !nested_cpu_has_vid(vmcs12) &&
721 !nested_cpu_has_posted_intr(vmcs12))
722 return 0;
723
724 /*
725 * If virtualize x2apic mode is enabled,
726 * virtualize apic access must be disabled.
727 */
728 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
729 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
730 return -EINVAL;
731
732 /*
733 * If virtual interrupt delivery is enabled,
734 * we must exit on external interrupts.
735 */
736 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
737 return -EINVAL;
738
739 /*
740 * bits 15:8 should be zero in posted_intr_nv,
741 	 * the descriptor address has already been checked
742 * in nested_get_vmcs12_pages.
743 *
744 * bits 5:0 of posted_intr_desc_addr should be zero.
745 */
746 if (nested_cpu_has_posted_intr(vmcs12) &&
747 (CC(!nested_cpu_has_vid(vmcs12)) ||
748 CC(!nested_exit_intr_ack_set(vcpu)) ||
749 CC((vmcs12->posted_intr_nv & 0xff00)) ||
750 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
751 return -EINVAL;
752
753 /* tpr shadow is needed by all apicv features. */
754 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
755 return -EINVAL;
756
757 return 0;
758 }
759
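/*
 * The VM-entry/VM-exit MSR-load/store areas are arrays of 16-byte
 * vmx_msr_entry records: the base address must be 16-byte aligned and
 * the entire list must lie within the guest's physical address width.
 */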
760 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
761 u32 count, u64 addr)
762 {
763 if (count == 0)
764 return 0;
765
766 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
767 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
768 return -EINVAL;
769
770 return 0;
771 }
772
773 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
774 struct vmcs12 *vmcs12)
775 {
776 if (CC(nested_vmx_check_msr_switch(vcpu,
777 vmcs12->vm_exit_msr_load_count,
778 vmcs12->vm_exit_msr_load_addr)) ||
779 CC(nested_vmx_check_msr_switch(vcpu,
780 vmcs12->vm_exit_msr_store_count,
781 vmcs12->vm_exit_msr_store_addr)))
782 return -EINVAL;
783
784 return 0;
785 }
786
787 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
788 struct vmcs12 *vmcs12)
789 {
790 if (CC(nested_vmx_check_msr_switch(vcpu,
791 vmcs12->vm_entry_msr_load_count,
792 vmcs12->vm_entry_msr_load_addr)))
793 return -EINVAL;
794
795 return 0;
796 }
797
798 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
799 struct vmcs12 *vmcs12)
800 {
801 if (!nested_cpu_has_pml(vmcs12))
802 return 0;
803
804 if (CC(!nested_cpu_has_ept(vmcs12)) ||
805 CC(!page_address_valid(vcpu, vmcs12->pml_address)))
806 return -EINVAL;
807
808 return 0;
809 }
810
811 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
812 struct vmcs12 *vmcs12)
813 {
814 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
815 !nested_cpu_has_ept(vmcs12)))
816 return -EINVAL;
817 return 0;
818 }
819
820 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
821 struct vmcs12 *vmcs12)
822 {
823 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
824 !nested_cpu_has_ept(vmcs12)))
825 return -EINVAL;
826 return 0;
827 }
828
829 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
830 struct vmcs12 *vmcs12)
831 {
832 if (!nested_cpu_has_shadow_vmcs(vmcs12))
833 return 0;
834
835 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
836 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
837 return -EINVAL;
838
839 return 0;
840 }
841
842 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
843 struct vmx_msr_entry *e)
844 {
845 /* x2APIC MSR accesses are not allowed */
846 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
847 return -EINVAL;
848 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
849 CC(e->index == MSR_IA32_UCODE_REV))
850 return -EINVAL;
851 if (CC(e->reserved != 0))
852 return -EINVAL;
853 return 0;
854 }
855
856 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
857 struct vmx_msr_entry *e)
858 {
859 if (CC(e->index == MSR_FS_BASE) ||
860 CC(e->index == MSR_GS_BASE) ||
861 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
862 nested_vmx_msr_check_common(vcpu, e))
863 return -EINVAL;
864 return 0;
865 }
866
867 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
868 struct vmx_msr_entry *e)
869 {
870 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
871 nested_vmx_msr_check_common(vcpu, e))
872 return -EINVAL;
873 return 0;
874 }
875
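/*
 * IA32_VMX_MISC[27:25] reports N, where 512 * (N + 1) is the maximum
 * recommended number of entries in each MSR-load/store list.
 */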
876 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
877 {
878 struct vcpu_vmx *vmx = to_vmx(vcpu);
879 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
880 vmx->nested.msrs.misc_high);
881
882 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
883 }
884
885 /*
886  * Load the guest's/host's MSRs at nested entry/exit.
887  * Return 0 on success, or the (1-based) index of the failing entry on failure.
888 *
889 * One of the failure modes for MSR load/store is when a list exceeds the
890  * virtual hardware's capacity. To stay as close to hardware behavior as
891  * possible, process all valid entries before failing rather than prechecking
892  * for a capacity violation.
893 */
894 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
895 {
896 u32 i;
897 struct vmx_msr_entry e;
898 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
899
900 for (i = 0; i < count; i++) {
901 if (unlikely(i >= max_msr_list_size))
902 goto fail;
903
904 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
905 &e, sizeof(e))) {
906 pr_debug_ratelimited(
907 "%s cannot read MSR entry (%u, 0x%08llx)\n",
908 __func__, i, gpa + i * sizeof(e));
909 goto fail;
910 }
911 if (nested_vmx_load_msr_check(vcpu, &e)) {
912 pr_debug_ratelimited(
913 "%s check failed (%u, 0x%x, 0x%x)\n",
914 __func__, i, e.index, e.reserved);
915 goto fail;
916 }
917 if (kvm_set_msr(vcpu, e.index, e.value)) {
918 pr_debug_ratelimited(
919 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
920 __func__, i, e.index, e.value);
921 goto fail;
922 }
923 }
924 return 0;
925 fail:
926 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
927 return i + 1;
928 }
929
930 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
931 u32 msr_index,
932 u64 *data)
933 {
934 struct vcpu_vmx *vmx = to_vmx(vcpu);
935
936 /*
937 * If the L0 hypervisor stored a more accurate value for the TSC that
938 * does not include the time taken for emulation of the L2->L1
939 * VM-exit in L0, use the more accurate value.
940 */
941 if (msr_index == MSR_IA32_TSC) {
942 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
943 MSR_IA32_TSC);
944
945 if (i >= 0) {
946 u64 val = vmx->msr_autostore.guest.val[i].value;
947
948 *data = kvm_read_l1_tsc(vcpu, val);
949 return true;
950 }
951 }
952
953 if (kvm_get_msr(vcpu, msr_index, data)) {
954 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
955 msr_index);
956 return false;
957 }
958 return true;
959 }
960
961 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
962 struct vmx_msr_entry *e)
963 {
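	/*
	 * Only the index and reserved fields are needed here, so read just
	 * the first 8 bytes of the 16-byte entry.
	 */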
964 if (kvm_vcpu_read_guest(vcpu,
965 gpa + i * sizeof(*e),
966 e, 2 * sizeof(u32))) {
967 pr_debug_ratelimited(
968 "%s cannot read MSR entry (%u, 0x%08llx)\n",
969 __func__, i, gpa + i * sizeof(*e));
970 return false;
971 }
972 if (nested_vmx_store_msr_check(vcpu, e)) {
973 pr_debug_ratelimited(
974 "%s check failed (%u, 0x%x, 0x%x)\n",
975 __func__, i, e->index, e->reserved);
976 return false;
977 }
978 return true;
979 }
980
981 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
982 {
983 u64 data;
984 u32 i;
985 struct vmx_msr_entry e;
986 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
987
988 for (i = 0; i < count; i++) {
989 if (unlikely(i >= max_msr_list_size))
990 return -EINVAL;
991
992 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
993 return -EINVAL;
994
995 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
996 return -EINVAL;
997
998 if (kvm_vcpu_write_guest(vcpu,
999 gpa + i * sizeof(e) +
1000 offsetof(struct vmx_msr_entry, value),
1001 &data, sizeof(data))) {
1002 pr_debug_ratelimited(
1003 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1004 __func__, i, e.index, data);
1005 return -EINVAL;
1006 }
1007 }
1008 return 0;
1009 }
1010
1011 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1012 {
1013 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1014 u32 count = vmcs12->vm_exit_msr_store_count;
1015 u64 gpa = vmcs12->vm_exit_msr_store_addr;
1016 struct vmx_msr_entry e;
1017 u32 i;
1018
1019 for (i = 0; i < count; i++) {
1020 if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1021 return false;
1022
1023 if (e.index == msr_index)
1024 return true;
1025 }
1026 return false;
1027 }
1028
1029 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1030 u32 msr_index)
1031 {
1032 struct vcpu_vmx *vmx = to_vmx(vcpu);
1033 struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1034 bool in_vmcs12_store_list;
1035 int msr_autostore_slot;
1036 bool in_autostore_list;
1037 int last;
1038
1039 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
1040 in_autostore_list = msr_autostore_slot >= 0;
1041 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1042
1043 if (in_vmcs12_store_list && !in_autostore_list) {
1044 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
1045 /*
1046 * Emulated VMEntry does not fail here. Instead a less
1047 * accurate value will be returned by
1048 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1049 * instead of reading the value from the vmcs02 VMExit
1050 * MSR-store area.
1051 */
1052 pr_warn_ratelimited(
1053 "Not enough msr entries in msr_autostore. Can't add msr %x\n",
1054 msr_index);
1055 return;
1056 }
1057 last = autostore->nr++;
1058 autostore->val[last].index = msr_index;
1059 } else if (!in_vmcs12_store_list && in_autostore_list) {
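		/*
		 * Remove the entry by overwriting it with the last element;
		 * the order of entries in the autostore list doesn't matter.
		 */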
1060 last = --autostore->nr;
1061 autostore->val[msr_autostore_slot] = autostore->val[last];
1062 }
1063 }
1064
1065 /*
1066 * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
1067 * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
1068 * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
1069 * Here's why.
1070 *
1071 * If EPT is enabled by L0 a sync is never needed:
1072 * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
1073 * cannot be unsync'd SPTEs for either L1 or L2.
1074 *
1075  * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter,
1076  *   as VM-Enter isn't required to invalidate guest-physical mappings
1077 * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
1078 * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
1079 * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
1080 *
1081 * If EPT is disabled by L0:
1082 * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
1083 * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
1084 * required to invalidate linear mappings (EPT is disabled so there are
1085 * no combined or guest-physical mappings), i.e. L1 can't rely on the
1086 * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
1087 *
1088 * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
1089 * linear mappings (EPT is disabled so there are no combined or guest-physical
1090 * mappings) to be invalidated on both VM-Enter and VM-Exit.
1091 *
1092 * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
1093 * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
1094 * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
1095 * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
1096 * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
1097 * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
1098 * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
1099 * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
1100 * stale TLB entries, at which point L0 will sync L2's MMU.
1101 */
1102 static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
1103 {
1104 return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
1105 }
1106
1107 /*
1108 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1109 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1110 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1111 * @entry_failure_code.
1112 */
1113 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
1114 bool nested_ept, bool reload_pdptrs,
1115 enum vm_entry_failure_code *entry_failure_code)
1116 {
1117 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
1118 *entry_failure_code = ENTRY_FAIL_DEFAULT;
1119 return -EINVAL;
1120 }
1121
1122 /*
1123 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1124 * must not be dereferenced.
1125 */
1126 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
1127 CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1128 *entry_failure_code = ENTRY_FAIL_PDPTE;
1129 return -EINVAL;
1130 }
1131
1132 /*
1133 	 * Unconditionally skip the TLB flush on fast CR3 switch; all TLB
1134 * flushes are handled by nested_vmx_transition_tlb_flush(). See
1135 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
1136 */
1137 if (!nested_ept)
1138 kvm_mmu_new_pgd(vcpu, cr3, true,
1139 !nested_vmx_transition_mmu_sync(vcpu));
1140
1141 vcpu->arch.cr3 = cr3;
1142 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1143
1144 kvm_init_mmu(vcpu, false);
1145
1146 return 0;
1147 }
1148
1149 /*
1150  * Returns true if KVM is able to configure the CPU to tag TLB entries
1151 * populated by L2 differently than TLB entries populated
1152 * by L1.
1153 *
1154 * If L0 uses EPT, L1 and L2 run with different EPTP because
1155 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1156 * are tagged with different EPTP.
1157 *
1158 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1159 * with different VPID (L1 entries are tagged with vmx->vpid
1160 * while L2 entries are tagged with vmx->nested.vpid02).
1161 */
1162 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1163 {
1164 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1165
1166 return enable_ept ||
1167 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1168 }
1169
1170 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
1171 struct vmcs12 *vmcs12,
1172 bool is_vmenter)
1173 {
1174 struct vcpu_vmx *vmx = to_vmx(vcpu);
1175
1176 /*
1177 * If VPID is disabled, linear and combined mappings are flushed on
1178 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
1179 * their associated EPTP.
1180 */
1181 if (!enable_vpid)
1182 return;
1183
1184 /*
1185 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1186 * for *all* contexts to be flushed on VM-Enter/VM-Exit.
1187 *
1188 	 * If VPID is enabled and used by vmcs12, but L2 does not have a unique
1189 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
1190 * a VPID for L2, flush the current context as the effective ASID is
1191 * common to both L1 and L2.
1192 *
1193 * Defer the flush so that it runs after vmcs02.EPTP has been set by
1194 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
1195 * redundant flushes further down the nested pipeline.
1196 *
1197 * If a TLB flush isn't required due to any of the above, and vpid12 is
1198 	 * changing, then the new "virtual" VPID (vpid12) will reuse the same
1199 * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
1200 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
1201 * all nested vCPUs.
1202 */
1203 if (!nested_cpu_has_vpid(vmcs12)) {
1204 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1205 } else if (!nested_has_guest_tlb_tag(vcpu)) {
1206 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1207 } else if (is_vmenter &&
1208 vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
1209 vmx->nested.last_vpid = vmcs12->virtual_processor_id;
1210 vpid_sync_context(nested_get_vpid02(vcpu));
1211 }
1212 }
1213
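/* Return true if, within @mask, every bit set in @subset is also set in @superset. */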
1214 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
1215 {
1216 superset &= mask;
1217 subset &= mask;
1218
1219 return (superset | subset) == superset;
1220 }
1221
1222 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1223 {
1224 const u64 feature_and_reserved =
1225 /* feature (except bit 48; see below) */
1226 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1227 /* reserved */
1228 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1229 u64 vmx_basic = vmx->nested.msrs.basic;
1230
1231 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1232 return -EINVAL;
1233
1234 /*
1235 * KVM does not emulate a version of VMX that constrains physical
1236 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1237 */
1238 if (data & BIT_ULL(48))
1239 return -EINVAL;
1240
1241 if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1242 vmx_basic_vmcs_revision_id(data))
1243 return -EINVAL;
1244
1245 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1246 return -EINVAL;
1247
1248 vmx->nested.msrs.basic = data;
1249 return 0;
1250 }
1251
1252 static int
1253 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1254 {
1255 u64 supported;
1256 u32 *lowp, *highp;
1257
1258 switch (msr_index) {
1259 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1260 lowp = &vmx->nested.msrs.pinbased_ctls_low;
1261 highp = &vmx->nested.msrs.pinbased_ctls_high;
1262 break;
1263 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1264 lowp = &vmx->nested.msrs.procbased_ctls_low;
1265 highp = &vmx->nested.msrs.procbased_ctls_high;
1266 break;
1267 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1268 lowp = &vmx->nested.msrs.exit_ctls_low;
1269 highp = &vmx->nested.msrs.exit_ctls_high;
1270 break;
1271 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1272 lowp = &vmx->nested.msrs.entry_ctls_low;
1273 highp = &vmx->nested.msrs.entry_ctls_high;
1274 break;
1275 case MSR_IA32_VMX_PROCBASED_CTLS2:
1276 lowp = &vmx->nested.msrs.secondary_ctls_low;
1277 highp = &vmx->nested.msrs.secondary_ctls_high;
1278 break;
1279 default:
1280 BUG();
1281 }
1282
1283 supported = vmx_control_msr(*lowp, *highp);
1284
1285 /* Check must-be-1 bits are still 1. */
1286 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1287 return -EINVAL;
1288
1289 /* Check must-be-0 bits are still 0. */
1290 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1291 return -EINVAL;
1292
1293 *lowp = data;
1294 *highp = data >> 32;
1295 return 0;
1296 }
1297
1298 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1299 {
1300 const u64 feature_and_reserved_bits =
1301 /* feature */
1302 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1303 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1304 /* reserved */
1305 GENMASK_ULL(13, 9) | BIT_ULL(31);
1306 u64 vmx_misc;
1307
1308 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1309 vmx->nested.msrs.misc_high);
1310
1311 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1312 return -EINVAL;
1313
1314 if ((vmx->nested.msrs.pinbased_ctls_high &
1315 PIN_BASED_VMX_PREEMPTION_TIMER) &&
1316 vmx_misc_preemption_timer_rate(data) !=
1317 vmx_misc_preemption_timer_rate(vmx_misc))
1318 return -EINVAL;
1319
1320 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1321 return -EINVAL;
1322
1323 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1324 return -EINVAL;
1325
1326 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1327 return -EINVAL;
1328
1329 vmx->nested.msrs.misc_low = data;
1330 vmx->nested.msrs.misc_high = data >> 32;
1331
1332 return 0;
1333 }
1334
1335 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1336 {
1337 u64 vmx_ept_vpid_cap;
1338
1339 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1340 vmx->nested.msrs.vpid_caps);
1341
1342 /* Every bit is either reserved or a feature bit. */
1343 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1344 return -EINVAL;
1345
1346 vmx->nested.msrs.ept_caps = data;
1347 vmx->nested.msrs.vpid_caps = data >> 32;
1348 return 0;
1349 }
1350
1351 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1352 {
1353 u64 *msr;
1354
1355 switch (msr_index) {
1356 case MSR_IA32_VMX_CR0_FIXED0:
1357 msr = &vmx->nested.msrs.cr0_fixed0;
1358 break;
1359 case MSR_IA32_VMX_CR4_FIXED0:
1360 msr = &vmx->nested.msrs.cr4_fixed0;
1361 break;
1362 default:
1363 BUG();
1364 }
1365
1366 /*
1367 	 * Bits that are 1 (i.e. bits that "must be 1" during VMX operation)
1368 	 * must also be 1 in the restored value.
1369 */
1370 if (!is_bitwise_subset(data, *msr, -1ULL))
1371 return -EINVAL;
1372
1373 *msr = data;
1374 return 0;
1375 }
1376
1377 /*
1378 * Called when userspace is restoring VMX MSRs.
1379 *
1380 * Returns 0 on success, non-0 otherwise.
1381 */
1382 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1383 {
1384 struct vcpu_vmx *vmx = to_vmx(vcpu);
1385
1386 /*
1387 * Don't allow changes to the VMX capability MSRs while the vCPU
1388 * is in VMX operation.
1389 */
1390 if (vmx->nested.vmxon)
1391 return -EBUSY;
1392
1393 switch (msr_index) {
1394 case MSR_IA32_VMX_BASIC:
1395 return vmx_restore_vmx_basic(vmx, data);
1396 case MSR_IA32_VMX_PINBASED_CTLS:
1397 case MSR_IA32_VMX_PROCBASED_CTLS:
1398 case MSR_IA32_VMX_EXIT_CTLS:
1399 case MSR_IA32_VMX_ENTRY_CTLS:
1400 /*
1401 * The "non-true" VMX capability MSRs are generated from the
1402 * "true" MSRs, so we do not support restoring them directly.
1403 *
1404 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1405 * should restore the "true" MSRs with the must-be-1 bits
1406 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1407 * DEFAULT SETTINGS".
1408 */
1409 return -EINVAL;
1410 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1411 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1412 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1413 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1414 case MSR_IA32_VMX_PROCBASED_CTLS2:
1415 return vmx_restore_control_msr(vmx, msr_index, data);
1416 case MSR_IA32_VMX_MISC:
1417 return vmx_restore_vmx_misc(vmx, data);
1418 case MSR_IA32_VMX_CR0_FIXED0:
1419 case MSR_IA32_VMX_CR4_FIXED0:
1420 return vmx_restore_fixed0_msr(vmx, msr_index, data);
1421 case MSR_IA32_VMX_CR0_FIXED1:
1422 case MSR_IA32_VMX_CR4_FIXED1:
1423 /*
1424 * These MSRs are generated based on the vCPU's CPUID, so we
1425 * do not support restoring them directly.
1426 */
1427 return -EINVAL;
1428 case MSR_IA32_VMX_EPT_VPID_CAP:
1429 return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1430 case MSR_IA32_VMX_VMCS_ENUM:
1431 vmx->nested.msrs.vmcs_enum = data;
1432 return 0;
1433 case MSR_IA32_VMX_VMFUNC:
1434 if (data & ~vmx->nested.msrs.vmfunc_controls)
1435 return -EINVAL;
1436 vmx->nested.msrs.vmfunc_controls = data;
1437 return 0;
1438 default:
1439 /*
1440 * The rest of the VMX capability MSRs do not support restore.
1441 */
1442 return -EINVAL;
1443 }
1444 }
1445
1446 /* Returns 0 on success, non-0 otherwise. */
1447 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1448 {
1449 switch (msr_index) {
1450 case MSR_IA32_VMX_BASIC:
1451 *pdata = msrs->basic;
1452 break;
1453 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1454 case MSR_IA32_VMX_PINBASED_CTLS:
1455 *pdata = vmx_control_msr(
1456 msrs->pinbased_ctls_low,
1457 msrs->pinbased_ctls_high);
1458 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1459 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1460 break;
1461 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1462 case MSR_IA32_VMX_PROCBASED_CTLS:
1463 *pdata = vmx_control_msr(
1464 msrs->procbased_ctls_low,
1465 msrs->procbased_ctls_high);
1466 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1467 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1468 break;
1469 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1470 case MSR_IA32_VMX_EXIT_CTLS:
1471 *pdata = vmx_control_msr(
1472 msrs->exit_ctls_low,
1473 msrs->exit_ctls_high);
1474 if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1475 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1476 break;
1477 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1478 case MSR_IA32_VMX_ENTRY_CTLS:
1479 *pdata = vmx_control_msr(
1480 msrs->entry_ctls_low,
1481 msrs->entry_ctls_high);
1482 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1483 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1484 break;
1485 case MSR_IA32_VMX_MISC:
1486 *pdata = vmx_control_msr(
1487 msrs->misc_low,
1488 msrs->misc_high);
1489 break;
1490 case MSR_IA32_VMX_CR0_FIXED0:
1491 *pdata = msrs->cr0_fixed0;
1492 break;
1493 case MSR_IA32_VMX_CR0_FIXED1:
1494 *pdata = msrs->cr0_fixed1;
1495 break;
1496 case MSR_IA32_VMX_CR4_FIXED0:
1497 *pdata = msrs->cr4_fixed0;
1498 break;
1499 case MSR_IA32_VMX_CR4_FIXED1:
1500 *pdata = msrs->cr4_fixed1;
1501 break;
1502 case MSR_IA32_VMX_VMCS_ENUM:
1503 *pdata = msrs->vmcs_enum;
1504 break;
1505 case MSR_IA32_VMX_PROCBASED_CTLS2:
1506 *pdata = vmx_control_msr(
1507 msrs->secondary_ctls_low,
1508 msrs->secondary_ctls_high);
1509 break;
1510 case MSR_IA32_VMX_EPT_VPID_CAP:
1511 *pdata = msrs->ept_caps |
1512 ((u64)msrs->vpid_caps << 32);
1513 break;
1514 case MSR_IA32_VMX_VMFUNC:
1515 *pdata = msrs->vmfunc_controls;
1516 break;
1517 default:
1518 return 1;
1519 }
1520
1521 return 0;
1522 }
1523
1524 /*
1525 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1526 * been modified by the L1 guest. Note, "writable" in this context means
1527 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1528 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1529 * VM-exit information fields (which are actually writable if the vCPU is
1530 * configured to support "VMWRITE to any supported field in the VMCS").
1531 */
1532 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1533 {
1534 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1535 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1536 struct shadow_vmcs_field field;
1537 unsigned long val;
1538 int i;
1539
1540 if (WARN_ON(!shadow_vmcs))
1541 return;
1542
1543 preempt_disable();
1544
1545 vmcs_load(shadow_vmcs);
1546
1547 for (i = 0; i < max_shadow_read_write_fields; i++) {
1548 field = shadow_read_write_fields[i];
1549 val = __vmcs_readl(field.encoding);
1550 vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1551 }
1552
1553 vmcs_clear(shadow_vmcs);
1554 vmcs_load(vmx->loaded_vmcs->vmcs);
1555
1556 preempt_enable();
1557 }
1558
1559 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1560 {
1561 const struct shadow_vmcs_field *fields[] = {
1562 shadow_read_write_fields,
1563 shadow_read_only_fields
1564 };
1565 const int max_fields[] = {
1566 max_shadow_read_write_fields,
1567 max_shadow_read_only_fields
1568 };
1569 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1570 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1571 struct shadow_vmcs_field field;
1572 unsigned long val;
1573 int i, q;
1574
1575 if (WARN_ON(!shadow_vmcs))
1576 return;
1577
1578 vmcs_load(shadow_vmcs);
1579
1580 for (q = 0; q < ARRAY_SIZE(fields); q++) {
1581 for (i = 0; i < max_fields[q]; i++) {
1582 field = fields[q][i];
1583 val = vmcs12_read_any(vmcs12, field.encoding,
1584 field.offset);
1585 __vmcs_writel(field.encoding, val);
1586 }
1587 }
1588
1589 vmcs_clear(shadow_vmcs);
1590 vmcs_load(vmx->loaded_vmcs->vmcs);
1591 }
1592
1593 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
1594 {
1595 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1596 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1597
1598 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1599 vmcs12->tpr_threshold = evmcs->tpr_threshold;
1600 vmcs12->guest_rip = evmcs->guest_rip;
1601
1602 if (unlikely(!(hv_clean_fields &
1603 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1604 vmcs12->guest_rsp = evmcs->guest_rsp;
1605 vmcs12->guest_rflags = evmcs->guest_rflags;
1606 vmcs12->guest_interruptibility_info =
1607 evmcs->guest_interruptibility_info;
1608 }
1609
1610 if (unlikely(!(hv_clean_fields &
1611 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1612 vmcs12->cpu_based_vm_exec_control =
1613 evmcs->cpu_based_vm_exec_control;
1614 }
1615
1616 if (unlikely(!(hv_clean_fields &
1617 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1618 vmcs12->exception_bitmap = evmcs->exception_bitmap;
1619 }
1620
1621 if (unlikely(!(hv_clean_fields &
1622 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1623 vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1624 }
1625
1626 if (unlikely(!(hv_clean_fields &
1627 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1628 vmcs12->vm_entry_intr_info_field =
1629 evmcs->vm_entry_intr_info_field;
1630 vmcs12->vm_entry_exception_error_code =
1631 evmcs->vm_entry_exception_error_code;
1632 vmcs12->vm_entry_instruction_len =
1633 evmcs->vm_entry_instruction_len;
1634 }
1635
1636 if (unlikely(!(hv_clean_fields &
1637 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1638 vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1639 vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1640 vmcs12->host_cr0 = evmcs->host_cr0;
1641 vmcs12->host_cr3 = evmcs->host_cr3;
1642 vmcs12->host_cr4 = evmcs->host_cr4;
1643 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1644 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1645 vmcs12->host_rip = evmcs->host_rip;
1646 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1647 vmcs12->host_es_selector = evmcs->host_es_selector;
1648 vmcs12->host_cs_selector = evmcs->host_cs_selector;
1649 vmcs12->host_ss_selector = evmcs->host_ss_selector;
1650 vmcs12->host_ds_selector = evmcs->host_ds_selector;
1651 vmcs12->host_fs_selector = evmcs->host_fs_selector;
1652 vmcs12->host_gs_selector = evmcs->host_gs_selector;
1653 vmcs12->host_tr_selector = evmcs->host_tr_selector;
1654 }
1655
1656 if (unlikely(!(hv_clean_fields &
1657 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1658 vmcs12->pin_based_vm_exec_control =
1659 evmcs->pin_based_vm_exec_control;
1660 vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1661 vmcs12->secondary_vm_exec_control =
1662 evmcs->secondary_vm_exec_control;
1663 }
1664
1665 if (unlikely(!(hv_clean_fields &
1666 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1667 vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1668 vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1669 }
1670
1671 if (unlikely(!(hv_clean_fields &
1672 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1673 vmcs12->msr_bitmap = evmcs->msr_bitmap;
1674 }
1675
1676 if (unlikely(!(hv_clean_fields &
1677 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1678 vmcs12->guest_es_base = evmcs->guest_es_base;
1679 vmcs12->guest_cs_base = evmcs->guest_cs_base;
1680 vmcs12->guest_ss_base = evmcs->guest_ss_base;
1681 vmcs12->guest_ds_base = evmcs->guest_ds_base;
1682 vmcs12->guest_fs_base = evmcs->guest_fs_base;
1683 vmcs12->guest_gs_base = evmcs->guest_gs_base;
1684 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1685 vmcs12->guest_tr_base = evmcs->guest_tr_base;
1686 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1687 vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1688 vmcs12->guest_es_limit = evmcs->guest_es_limit;
1689 vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1690 vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1691 vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1692 vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1693 vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1694 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1695 vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1696 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1697 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1698 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1699 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1700 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1701 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1702 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1703 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1704 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1705 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1706 vmcs12->guest_es_selector = evmcs->guest_es_selector;
1707 vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1708 vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1709 vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1710 vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1711 vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1712 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1713 vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1714 }
1715
1716 if (unlikely(!(hv_clean_fields &
1717 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1718 vmcs12->tsc_offset = evmcs->tsc_offset;
1719 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1720 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1721 }
1722
1723 if (unlikely(!(hv_clean_fields &
1724 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1725 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1726 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1727 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1728 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1729 vmcs12->guest_cr0 = evmcs->guest_cr0;
1730 vmcs12->guest_cr3 = evmcs->guest_cr3;
1731 vmcs12->guest_cr4 = evmcs->guest_cr4;
1732 vmcs12->guest_dr7 = evmcs->guest_dr7;
1733 }
1734
1735 if (unlikely(!(hv_clean_fields &
1736 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1737 vmcs12->host_fs_base = evmcs->host_fs_base;
1738 vmcs12->host_gs_base = evmcs->host_gs_base;
1739 vmcs12->host_tr_base = evmcs->host_tr_base;
1740 vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1741 vmcs12->host_idtr_base = evmcs->host_idtr_base;
1742 vmcs12->host_rsp = evmcs->host_rsp;
1743 }
1744
1745 if (unlikely(!(hv_clean_fields &
1746 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1747 vmcs12->ept_pointer = evmcs->ept_pointer;
1748 vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1749 }
1750
1751 if (unlikely(!(hv_clean_fields &
1752 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1753 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1754 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1755 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1756 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1757 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1758 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1759 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1760 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1761 vmcs12->guest_pending_dbg_exceptions =
1762 evmcs->guest_pending_dbg_exceptions;
1763 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1764 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1765 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1766 vmcs12->guest_activity_state = evmcs->guest_activity_state;
1767 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1768 }
1769
1770 /*
1771 * Not used?
1772 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1773 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1774 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1775 * vmcs12->page_fault_error_code_mask =
1776 * evmcs->page_fault_error_code_mask;
1777 * vmcs12->page_fault_error_code_match =
1778 * evmcs->page_fault_error_code_match;
1779 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1780 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1781 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1782 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1783 */
1784
1785 /*
1786 * Read only fields:
1787 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1788 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1789 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1790 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1791 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1792 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1793 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1794 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1795 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1796 * vmcs12->exit_qualification = evmcs->exit_qualification;
1797 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1798 *
1799 * Not present in struct vmcs12:
1800 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1801 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1802 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1803 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1804 */
1805
1806 return;
1807 }
1808
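/*
 * Inverse of the eVMCS->vmcs12 copy above: propagate the guest state and
 * exit information that KVM may have modified while running L2 back into
 * the enlightened VMCS so that L1 observes up-to-date values.
 */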
1809 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1810 {
1811 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1812 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1813
1814 /*
1815 * Should not be changed by KVM:
1816 *
1817 * evmcs->host_es_selector = vmcs12->host_es_selector;
1818 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1819 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1820 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1821 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1822 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1823 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1824 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1825 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1826 * evmcs->host_cr0 = vmcs12->host_cr0;
1827 * evmcs->host_cr3 = vmcs12->host_cr3;
1828 * evmcs->host_cr4 = vmcs12->host_cr4;
1829 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1830 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1831 * evmcs->host_rip = vmcs12->host_rip;
1832 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1833 * evmcs->host_fs_base = vmcs12->host_fs_base;
1834 * evmcs->host_gs_base = vmcs12->host_gs_base;
1835 * evmcs->host_tr_base = vmcs12->host_tr_base;
1836 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1837 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1838 * evmcs->host_rsp = vmcs12->host_rsp;
1839 * sync_vmcs02_to_vmcs12() doesn't read these:
1840 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1841 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1842 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1843 * evmcs->ept_pointer = vmcs12->ept_pointer;
1844 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1845 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1846 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1847 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1848 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1849 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1850 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1851 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1852 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1853 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1854 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1855 * evmcs->page_fault_error_code_mask =
1856 * vmcs12->page_fault_error_code_mask;
1857 * evmcs->page_fault_error_code_match =
1858 * vmcs12->page_fault_error_code_match;
1859 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1860 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1861 * evmcs->tsc_offset = vmcs12->tsc_offset;
1862 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1863 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1864 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1865 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1866 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1867 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1868 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1869 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1870 *
1871 * Not present in struct vmcs12:
1872 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1873 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1874 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1875 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1876 */
1877
1878 evmcs->guest_es_selector = vmcs12->guest_es_selector;
1879 evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1880 evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1881 evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1882 evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1883 evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1884 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1885 evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1886
1887 evmcs->guest_es_limit = vmcs12->guest_es_limit;
1888 evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1889 evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1890 evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1891 evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1892 evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1893 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1894 evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1895 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1896 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1897
1898 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1899 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1900 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1901 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1902 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1903 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1904 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1905 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1906
1907 evmcs->guest_es_base = vmcs12->guest_es_base;
1908 evmcs->guest_cs_base = vmcs12->guest_cs_base;
1909 evmcs->guest_ss_base = vmcs12->guest_ss_base;
1910 evmcs->guest_ds_base = vmcs12->guest_ds_base;
1911 evmcs->guest_fs_base = vmcs12->guest_fs_base;
1912 evmcs->guest_gs_base = vmcs12->guest_gs_base;
1913 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1914 evmcs->guest_tr_base = vmcs12->guest_tr_base;
1915 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1916 evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1917
1918 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1919 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1920
1921 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1922 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1923 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1924 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1925
1926 evmcs->guest_pending_dbg_exceptions =
1927 vmcs12->guest_pending_dbg_exceptions;
1928 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1929 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1930
1931 evmcs->guest_activity_state = vmcs12->guest_activity_state;
1932 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1933
1934 evmcs->guest_cr0 = vmcs12->guest_cr0;
1935 evmcs->guest_cr3 = vmcs12->guest_cr3;
1936 evmcs->guest_cr4 = vmcs12->guest_cr4;
1937 evmcs->guest_dr7 = vmcs12->guest_dr7;
1938
1939 evmcs->guest_physical_address = vmcs12->guest_physical_address;
1940
1941 evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1942 evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1943 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1944 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1945 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1946 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1947 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1948 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1949
1950 evmcs->exit_qualification = vmcs12->exit_qualification;
1951
1952 evmcs->guest_linear_address = vmcs12->guest_linear_address;
1953 evmcs->guest_rsp = vmcs12->guest_rsp;
1954 evmcs->guest_rflags = vmcs12->guest_rflags;
1955
1956 evmcs->guest_interruptibility_info =
1957 vmcs12->guest_interruptibility_info;
1958 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1959 evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1960 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1961 evmcs->vm_entry_exception_error_code =
1962 vmcs12->vm_entry_exception_error_code;
1963 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1964
1965 evmcs->guest_rip = vmcs12->guest_rip;
1966
1967 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1968
1969 return;
1970 }
1971
1972 /*
1973 * This is an equivalent of the nested hypervisor executing the vmptrld
1974 * instruction.
1975 */
1976 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1977 struct kvm_vcpu *vcpu, bool from_launch)
1978 {
1979 struct vcpu_vmx *vmx = to_vmx(vcpu);
1980 bool evmcs_gpa_changed = false;
1981 u64 evmcs_gpa;
1982
1983 if (likely(!vmx->nested.enlightened_vmcs_enabled))
1984 return EVMPTRLD_DISABLED;
1985
1986 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
1987 nested_release_evmcs(vcpu);
1988 return EVMPTRLD_DISABLED;
1989 }
1990
1991 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1992 vmx->nested.current_vmptr = -1ull;
1993
1994 nested_release_evmcs(vcpu);
1995
1996 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1997 &vmx->nested.hv_evmcs_map))
1998 return EVMPTRLD_ERROR;
1999
2000 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2001
2002 /*
2003 * Currently, KVM only supports eVMCS version 1
2004 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set the
2005 * first u32 field of the eVMCS, which specifies the eVMCS
2006 * VersionNumber, to that value.
2007 *
2008 * The guest is supposed to discover the eVMCS versions supported
2009 * by the host by examining CPUID.0x4000000A.EAX[0:15]. The host
2010 * userspace VMM is expected to set this CPUID leaf according to
2011 * the value returned in vmcs_version from nested_enable_evmcs().
2012 *
2013 * However, it turns out that Microsoft Hyper-V fails to comply
2014 * with its own invented interface: when Hyper-V uses eVMCS, it
2015 * sets the first u32 field of the eVMCS to the revision_id
2016 * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version
2017 * number, i.e. one of the supported versions advertised in
2018 * CPUID.0x4000000A.EAX[0:15].
2019 *
2020 * To work around this Hyper-V bug, accept either a supported
2021 * eVMCS version or the VMCS12 revision_id as valid values for
2022 * the first u32 field of the eVMCS.
2023 */
2024 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2025 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2026 nested_release_evmcs(vcpu);
2027 return EVMPTRLD_VMFAIL;
2028 }
2029
2030 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2031
2032 evmcs_gpa_changed = true;
2033 /*
2034 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2035 * reloaded from the guest's memory (read-only fields, fields
2036 * not present in struct hv_enlightened_vmcs, ...). Make sure
2037 * there are no leftovers.
2038 */
2039 if (from_launch) {
2040 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2041 memset(vmcs12, 0, sizeof(*vmcs12));
2042 vmcs12->hdr.revision_id = VMCS12_REVISION;
2043 }
2044
2045 }
2046
2047 /*
2048 * The clean-fields data can't be used on VMLAUNCH, nor when switching
2049 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
2050 */
2051 if (from_launch || evmcs_gpa_changed)
2052 vmx->nested.hv_evmcs->hv_clean_fields &=
2053 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2054
2055 return EVMPTRLD_SUCCEEDED;
2056 }
2057
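/*
 * Flush the cached vmcs12 back to the structure L1 actually reads: the
 * enlightened VMCS when one is in use, or the shadow VMCS used to
 * accelerate emulated VMREAD/VMWRITE otherwise.
 */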
2058 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2059 {
2060 struct vcpu_vmx *vmx = to_vmx(vcpu);
2061
2062 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2063 copy_vmcs12_to_enlightened(vmx);
2064 else
2065 copy_vmcs12_to_shadow(vmx);
2066
2067 vmx->nested.need_vmcs12_to_shadow_sync = false;
2068 }
2069
2070 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2071 {
2072 struct vcpu_vmx *vmx =
2073 container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2074
2075 vmx->nested.preemption_timer_expired = true;
2076 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2077 kvm_vcpu_kick(&vmx->vcpu);
2078
2079 return HRTIMER_NORESTART;
2080 }
2081
2082 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
2083 {
2084 struct vcpu_vmx *vmx = to_vmx(vcpu);
2085 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2086
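/*
 * The emulated preemption timer ticks at the L1 TSC rate divided by
 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE, so convert the current L1
 * TSC into timer ticks before computing the remaining value.
 */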
2087 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
2088 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2089
2090 if (!vmx->nested.has_preemption_timer_deadline) {
2091 vmx->nested.preemption_timer_deadline =
2092 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
2093 vmx->nested.has_preemption_timer_deadline = true;
2094 }
2095 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
2096 }
2097
2098 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
2099 u64 preemption_timeout)
2100 {
2101 struct vcpu_vmx *vmx = to_vmx(vcpu);
2102
2103 /*
2104 * A timer value of zero is architecturally guaranteed to cause
2105 * a VMExit prior to executing any instructions in the guest.
2106 */
2107 if (preemption_timeout == 0) {
2108 vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2109 return;
2110 }
2111
2112 if (vcpu->arch.virtual_tsc_khz == 0)
2113 return;
2114
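/*
 * Convert timer ticks to nanoseconds for the hrtimer:
 * ns = ticks * (1 << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) * 1000000 / tsc_khz.
 * For example, with a 2 GHz guest TSC (tsc_khz == 2000000), 1000 ticks is
 * roughly 16 us.
 */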
2115 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2116 preemption_timeout *= 1000000;
2117 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2118 hrtimer_start(&vmx->nested.preemption_timer,
2119 ktime_add_ns(ktime_get(), preemption_timeout),
2120 HRTIMER_MODE_ABS_PINNED);
2121 }
2122
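/*
 * Compute the EFER L2 will run with: use vmcs12's value when VM-Entry
 * actually loads IA32_EFER; otherwise derive it from the current EFER,
 * forcing LMA/LME to match vmcs12's "IA-32e mode guest" entry control.
 */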
2123 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2124 {
2125 if (vmx->nested.nested_run_pending &&
2126 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2127 return vmcs12->guest_ia32_efer;
2128 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2129 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2130 else
2131 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2132 }
2133
2134 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2135 {
2136 /*
2137 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2138 * according to L0's settings (vmcs12 is irrelevant here). Host
2139 * fields that come from L0 and are not constant, e.g. HOST_CR3,
2140 * will be set as needed prior to VMLAUNCH/VMRESUME.
2141 */
2142 if (vmx->nested.vmcs02_initialized)
2143 return;
2144 vmx->nested.vmcs02_initialized = true;
2145
2146 /*
2147 * We don't care what the EPTP value is; we just need to guarantee
2148 * it's valid so that we don't get a false positive when doing the
2149 * early consistency checks.
2150 */
2151 if (enable_ept && nested_early_check)
2152 vmcs_write64(EPT_POINTER,
2153 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
2154
2155 /* All VMFUNCs are currently emulated through L0 vmexits. */
2156 if (cpu_has_vmx_vmfunc())
2157 vmcs_write64(VM_FUNCTION_CONTROL, 0);
2158
2159 if (cpu_has_vmx_posted_intr())
2160 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2161
2162 if (cpu_has_vmx_msr_bitmap())
2163 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2164
2165 /*
2166 * PML is emulated for L2, but never enabled in hardware as the MMU
2167 * handles A/D emulation. Disabling PML for L2 also avoids having to
2168 * deal with filtering out L2 GPAs from the buffer.
2169 */
2170 if (enable_pml) {
2171 vmcs_write64(PML_ADDRESS, 0);
2172 vmcs_write16(GUEST_PML_INDEX, -1);
2173 }
2174
2175 if (cpu_has_vmx_encls_vmexit())
2176 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2177
2178 /*
2179 * Set the MSR load/store lists to match L0's settings. Only the
2180 * addresses are constant (for vmcs02); the counts can change based
2181 * on L2's behavior, e.g. switching to/from long mode.
2182 */
2183 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2184 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2185 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2186
2187 vmx_set_constant_host_state(vmx);
2188 }
2189
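/*
 * vmcs02 setup that is only needed when vmcs12 has changed (or an
 * enlightened VMCS is in use). When L1 enables VPID for L2, use the
 * dedicated vpid02 so that L2's TLB entries are tagged separately from
 * L1's; otherwise reuse the vCPU's own VPID.
 */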
2190 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2191 struct vmcs12 *vmcs12)
2192 {
2193 prepare_vmcs02_constant_state(vmx);
2194
2195 vmcs_write64(VMCS_LINK_POINTER, -1ull);
2196
2197 if (enable_vpid) {
2198 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2199 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2200 else
2201 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2202 }
2203 }
2204
2205 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2206 {
2207 u32 exec_control;
2208 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2209
2210 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2211 prepare_vmcs02_early_rare(vmx, vmcs12);
2212
2213 /*
2214 * PIN CONTROLS
2215 */
2216 exec_control = vmx_pin_based_exec_ctrl(vmx);
2217 exec_control |= (vmcs12->pin_based_vm_exec_control &
2218 ~PIN_BASED_VMX_PREEMPTION_TIMER);
2219
2220 /* Posted interrupts setting is only taken from vmcs12. */
2221 if (nested_cpu_has_posted_intr(vmcs12)) {
2222 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2223 vmx->nested.pi_pending = false;
2224 } else {
2225 exec_control &= ~PIN_BASED_POSTED_INTR;
2226 }
2227 pin_controls_set(vmx, exec_control);
2228
2229 /*
2230 * EXEC CONTROLS
2231 */
2232 exec_control = vmx_exec_control(vmx); /* L0's desires */
2233 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
2234 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
2235 exec_control &= ~CPU_BASED_TPR_SHADOW;
2236 exec_control |= vmcs12->cpu_based_vm_exec_control;
2237
2238 vmx->nested.l1_tpr_threshold = -1;
2239 if (exec_control & CPU_BASED_TPR_SHADOW)
2240 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2241 #ifdef CONFIG_X86_64
2242 else
2243 exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2244 CPU_BASED_CR8_STORE_EXITING;
2245 #endif
2246
2247 /*
2248 * A vmexit (to either the L1 hypervisor or L0 userspace) is always
2249 * needed for I/O port accesses.
2250 */
2251 exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2252 exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2253
2254 /*
2255 * This bit will be computed in nested_get_vmcs12_pages, because
2256 * we do not have access to L1's MSR bitmap yet. For now, keep
2257 * the same bit as before, hoping to avoid multiple VMWRITEs that
2258 * only set/clear this bit.
2259 */
2260 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2261 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2262
2263 exec_controls_set(vmx, exec_control);
2264
2265 /*
2266 * SECONDARY EXEC CONTROLS
2267 */
2268 if (cpu_has_secondary_exec_ctrls()) {
2269 exec_control = vmx->secondary_exec_control;
2270
2271 /* Take the following fields only from vmcs12 */
2272 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2273 SECONDARY_EXEC_ENABLE_INVPCID |
2274 SECONDARY_EXEC_ENABLE_RDTSCP |
2275 SECONDARY_EXEC_XSAVES |
2276 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2277 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2278 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2279 SECONDARY_EXEC_ENABLE_VMFUNC |
2280 SECONDARY_EXEC_TSC_SCALING);
2281 if (nested_cpu_has(vmcs12,
2282 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
2283 exec_control |= vmcs12->secondary_vm_exec_control;
2284
2285 /* PML is emulated and never enabled in hardware for L2. */
2286 exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
2287
2288 /* VMCS shadowing for L2 is emulated for now */
2289 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2290
2291 /*
2292 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2293 * will not have to rewrite the controls just for this bit.
2294 */
2295 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2296 (vmcs12->guest_cr4 & X86_CR4_UMIP))
2297 exec_control |= SECONDARY_EXEC_DESC;
2298
2299 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2300 vmcs_write16(GUEST_INTR_STATUS,
2301 vmcs12->guest_intr_status);
2302
2303 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
2304 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2305
2306 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2307 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
2308
2309 secondary_exec_controls_set(vmx, exec_control);
2310 }
2311
2312 /*
2313 * ENTRY CONTROLS
2314 *
2315 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2316 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2317 * on the related bits (if supported by the CPU) in the hope that
2318 * we can avoid VMWrites during vmx_set_efer().
2319 */
2320 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2321 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2322 if (cpu_has_load_ia32_efer()) {
2323 if (guest_efer & EFER_LMA)
2324 exec_control |= VM_ENTRY_IA32E_MODE;
2325 if (guest_efer != host_efer)
2326 exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2327 }
2328 vm_entry_controls_set(vmx, exec_control);
2329
2330 /*
2331 * EXIT CONTROLS
2332 *
2333 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2334 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2335 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2336 */
2337 exec_control = vmx_vmexit_ctrl();
2338 if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2339 exec_control |= VM_EXIT_LOAD_IA32_EFER;
2340 vm_exit_controls_set(vmx, exec_control);
2341
2342 /*
2343 * Interrupt/Exception Fields
2344 */
2345 if (vmx->nested.nested_run_pending) {
2346 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2347 vmcs12->vm_entry_intr_info_field);
2348 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2349 vmcs12->vm_entry_exception_error_code);
2350 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2351 vmcs12->vm_entry_instruction_len);
2352 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2353 vmcs12->guest_interruptibility_info);
2354 vmx->loaded_vmcs->nmi_known_unmasked =
2355 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2356 } else {
2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2358 }
2359 }
2360
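/*
 * Write the vmcs12 fields that rarely change across nested VM-Enters.
 * When an enlightened VMCS is in use, the Hyper-V clean-fields bits allow
 * skipping groups that L1 hasn't touched since the last run.
 */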
2361 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2362 {
2363 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2364
2365 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2366 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2367 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2368 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2369 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2370 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2371 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2372 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2373 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2374 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2375 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2376 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2377 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2378 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2379 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2380 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2381 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2382 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2383 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2384 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2385 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2386 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2387 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2388 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2389 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2390 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2391 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2392 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2393 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2394 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2395 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2396 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2397 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2398 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2399 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2400 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2401 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2402 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2403
2404 vmx->segment_cache.bitmask = 0;
2405 }
2406
2407 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2408 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2409 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2410 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2411 vmcs12->guest_pending_dbg_exceptions);
2412 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2413 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2414
2415 /*
2416 * L1 may access L2's PDPTRs, so save them in order to construct
2417 * vmcs12.
2418 */
2419 if (enable_ept) {
2420 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2421 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2422 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2423 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2424 }
2425
2426 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2427 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2428 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2429 }
2430
2431 if (nested_cpu_has_xsaves(vmcs12))
2432 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2433
2434 /*
2435 * Whether page faults are trapped is determined by a combination of
2436 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
2437 * doesn't care about page faults, then we should set all of these to
2438 * L1's desires. However, if L0 does care about (some) page faults, it
2439 * is not easy (if at all possible?) to merge L0's and L1's desires, so
2440 * we simply ask to exit on each and every L2 page fault. This is done by
2441 * setting MASK=MATCH=0 and (see below) EB.PF=1.
2442 * Note that below we don't need special code to set EB.PF beyond the
2443 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2444 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2445 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2446 */
2447 if (vmx_need_pf_intercept(&vmx->vcpu)) {
2448 /*
2449 * TODO: if both L0 and L1 need the same MASK and MATCH,
2450 * go ahead and use it?
2451 */
2452 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2453 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2454 } else {
2455 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2456 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2457 }
2458
2459 if (cpu_has_vmx_apicv()) {
2460 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2461 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2462 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2463 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2464 }
2465
2466 /*
2467 * Make sure the msr_autostore list is up to date before we set the
2468 * count in the vmcs02.
2469 */
2470 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2471
2472 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2473 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2474 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2475
2476 set_cr4_guest_host_mask(vmx);
2477 }
2478
2479 /*
2480 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2481 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2482 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2483 * guest in a way that is appropriate both to L1's requests and to our own
2484 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2485 * function also has necessary side effects, like setting various
2486 * vcpu->arch fields.
2487 * Returns 0 on success and -EINVAL on failure; the reason for the failure
2488 * is reported via entry_failure_code.
2489 */
2490 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2491 bool from_vmentry,
2492 enum vm_entry_failure_code *entry_failure_code)
2493 {
2494 struct vcpu_vmx *vmx = to_vmx(vcpu);
2495 bool load_guest_pdptrs_vmcs12 = false;
2496
2497 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
2498 prepare_vmcs02_rare(vmx, vmcs12);
2499 vmx->nested.dirty_vmcs12 = false;
2500
2501 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
2502 !(vmx->nested.hv_evmcs->hv_clean_fields &
2503 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2504 }
2505
2506 if (vmx->nested.nested_run_pending &&
2507 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2508 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2509 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2510 } else {
2511 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2512 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2513 }
2514 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2515 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2516 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2517 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2518
2519 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2520 * bitwise-or of what L1 wants to trap for L2, and what we want to
2521 * trap. Note that CR0.TS also needs updating - we do this later.
2522 */
2523 vmx_update_exception_bitmap(vcpu);
2524 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2525 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2526
2527 if (vmx->nested.nested_run_pending &&
2528 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2529 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2530 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2531 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2532 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2533 }
2534
2535 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2536 vcpu->arch.l1_tsc_offset,
2537 vmx_get_l2_tsc_offset(vcpu),
2538 vmx_get_l2_tsc_multiplier(vcpu));
2539
2540 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2541 vcpu->arch.l1_tsc_scaling_ratio,
2542 vmx_get_l2_tsc_multiplier(vcpu));
2543
2544 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2545 if (kvm_has_tsc_control)
2546 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2547
2548 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2549
2550 if (nested_cpu_has_ept(vmcs12))
2551 nested_ept_init_mmu_context(vcpu);
2552
2553 /*
2554 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
2555 * bits that we consider mandatorily enabled.
2556 * The CR0_READ_SHADOW is what L2 should expect to read given the
2557 * specifications by L1; it's not enough to take
2558 * vmcs12->cr0_read_shadow directly, because our cr0_guest_host_mask
2559 * may have more bits set than L1 expected.
2560 */
2561 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2562 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2563
2564 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2565 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2566
2567 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2568 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2569 vmx_set_efer(vcpu, vcpu->arch.efer);
2570
2571 /*
2572 * Guest state is invalid and unrestricted guest is disabled,
2573 * which means L1 attempted VMEntry to L2 with invalid state.
2574 * Fail the VMEntry.
2575 */
2576 if (CC(!vmx_guest_state_valid(vcpu))) {
2577 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2578 return -EINVAL;
2579 }
2580
2581 /* Load vmcs12->guest_cr3; this works for both EPT and shadow page tables. */
2582 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2583 from_vmentry, entry_failure_code))
2584 return -EINVAL;
2585
2586 /*
2587 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2588 * on nested VM-Exit, which can occur without actually running L2 and
2589 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2590 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2591 * transition to HLT instead of running L2.
2592 */
2593 if (enable_ept)
2594 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2595
2596 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2597 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2598 is_pae_paging(vcpu)) {
2599 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2600 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2601 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2602 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2603 }
2604
2605 if (!enable_ept)
2606 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2607
2608 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2609 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2610 vmcs12->guest_ia32_perf_global_ctrl)))
2611 return -EINVAL;
2612
2613 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2614 kvm_rip_write(vcpu, vmcs12->guest_rip);
2615
2616 /*
2617 * It was observed that genuine Hyper-V running in L1 doesn't reset
2618 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2619 * bits when it changes a field in the eVMCS. Mark all fields as
2620 * clean here.
2621 */
2622 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2623 vmx->nested.hv_evmcs->hv_clean_fields |=
2624 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2625
2626 return 0;
2627 }
2628
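/*
 * Per the SDM, "virtual NMIs" requires "NMI exiting", and "NMI-window
 * exiting" requires "virtual NMIs".
 */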
2629 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2630 {
2631 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2632 nested_cpu_has_virtual_nmis(vmcs12)))
2633 return -EINVAL;
2634
2635 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2636 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2637 return -EINVAL;
2638
2639 return 0;
2640 }
2641
2642 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2643 {
2644 struct vcpu_vmx *vmx = to_vmx(vcpu);
2645
2646 /* Check for memory type validity */
2647 switch (new_eptp & VMX_EPTP_MT_MASK) {
2648 case VMX_EPTP_MT_UC:
2649 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2650 return false;
2651 break;
2652 case VMX_EPTP_MT_WB:
2653 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2654 return false;
2655 break;
2656 default:
2657 return false;
2658 }
2659
2660 /* Page-walk levels validity. */
2661 switch (new_eptp & VMX_EPTP_PWL_MASK) {
2662 case VMX_EPTP_PWL_5:
2663 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
2664 return false;
2665 break;
2666 case VMX_EPTP_PWL_4:
2667 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
2668 return false;
2669 break;
2670 default:
2671 return false;
2672 }
2673
2674 /* Reserved bits should not be set */
2675 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
2676 return false;
2677
2678 /* AD, if set, should be supported */
2679 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
2680 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2681 return false;
2682 }
2683
2684 return true;
2685 }
2686
2687 /*
2688 * Checks related to VM-Execution Control Fields
2689 */
2690 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2691 struct vmcs12 *vmcs12)
2692 {
2693 struct vcpu_vmx *vmx = to_vmx(vcpu);
2694
2695 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2696 vmx->nested.msrs.pinbased_ctls_low,
2697 vmx->nested.msrs.pinbased_ctls_high)) ||
2698 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2699 vmx->nested.msrs.procbased_ctls_low,
2700 vmx->nested.msrs.procbased_ctls_high)))
2701 return -EINVAL;
2702
2703 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2704 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2705 vmx->nested.msrs.secondary_ctls_low,
2706 vmx->nested.msrs.secondary_ctls_high)))
2707 return -EINVAL;
2708
2709 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2710 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2711 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2712 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2713 nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2714 nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2715 nested_vmx_check_nmi_controls(vmcs12) ||
2716 nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2717 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2718 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2719 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2720 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2721 return -EINVAL;
2722
2723 if (!nested_cpu_has_preemption_timer(vmcs12) &&
2724 nested_cpu_has_save_preemption_timer(vmcs12))
2725 return -EINVAL;
2726
2727 if (nested_cpu_has_ept(vmcs12) &&
2728 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
2729 return -EINVAL;
2730
2731 if (nested_cpu_has_vmfunc(vmcs12)) {
2732 if (CC(vmcs12->vm_function_control &
2733 ~vmx->nested.msrs.vmfunc_controls))
2734 return -EINVAL;
2735
2736 if (nested_cpu_has_eptp_switching(vmcs12)) {
2737 if (CC(!nested_cpu_has_ept(vmcs12)) ||
2738 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2739 return -EINVAL;
2740 }
2741 }
2742
2743 return 0;
2744 }
2745
2746 /*
2747 * Checks related to VM-Exit Control Fields
2748 */
2749 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2750 struct vmcs12 *vmcs12)
2751 {
2752 struct vcpu_vmx *vmx = to_vmx(vcpu);
2753
2754 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2755 vmx->nested.msrs.exit_ctls_low,
2756 vmx->nested.msrs.exit_ctls_high)) ||
2757 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2758 return -EINVAL;
2759
2760 return 0;
2761 }
2762
2763 /*
2764 * Checks related to VM-Entry Control Fields
2765 */
2766 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2767 struct vmcs12 *vmcs12)
2768 {
2769 struct vcpu_vmx *vmx = to_vmx(vcpu);
2770
2771 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2772 vmx->nested.msrs.entry_ctls_low,
2773 vmx->nested.msrs.entry_ctls_high)))
2774 return -EINVAL;
2775
2776 /*
2777 * From the Intel SDM, volume 3:
2778 * Fields relevant to VM-entry event injection must be set properly.
2779 * These fields are the VM-entry interruption-information field, the
2780 * VM-entry exception error code, and the VM-entry instruction length.
2781 */
2782 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2783 u32 intr_info = vmcs12->vm_entry_intr_info_field;
2784 u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2785 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2786 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2787 bool should_have_error_code;
2788 bool urg = nested_cpu_has2(vmcs12,
2789 SECONDARY_EXEC_UNRESTRICTED_GUEST);
2790 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2791
2792 /* VM-entry interruption-info field: interruption type */
2793 if (CC(intr_type == INTR_TYPE_RESERVED) ||
2794 CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2795 !nested_cpu_supports_monitor_trap_flag(vcpu)))
2796 return -EINVAL;
2797
2798 /* VM-entry interruption-info field: vector */
2799 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2800 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2801 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2802 return -EINVAL;
2803
2804 /* VM-entry interruption-info field: deliver error code */
2805 should_have_error_code =
2806 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2807 x86_exception_has_error_code(vector);
2808 if (CC(has_error_code != should_have_error_code))
2809 return -EINVAL;
2810
2811 /* VM-entry exception error code */
2812 if (CC(has_error_code &&
2813 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2814 return -EINVAL;
2815
2816 /* VM-entry interruption-info field: reserved bits */
2817 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2818 return -EINVAL;
2819
2820 /* VM-entry instruction length */
2821 switch (intr_type) {
2822 case INTR_TYPE_SOFT_EXCEPTION:
2823 case INTR_TYPE_SOFT_INTR:
2824 case INTR_TYPE_PRIV_SW_EXCEPTION:
2825 if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2826 CC(vmcs12->vm_entry_instruction_len == 0 &&
2827 CC(!nested_cpu_has_zero_length_injection(vcpu))))
2828 return -EINVAL;
2829 }
2830 }
2831
2832 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2833 return -EINVAL;
2834
2835 return 0;
2836 }
2837
2838 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2839 struct vmcs12 *vmcs12)
2840 {
2841 if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2842 nested_check_vm_exit_controls(vcpu, vmcs12) ||
2843 nested_check_vm_entry_controls(vcpu, vmcs12))
2844 return -EINVAL;
2845
2846 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
2847 return nested_evmcs_check_controls(vmcs12);
2848
2849 return 0;
2850 }
2851
2852 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2853 struct vmcs12 *vmcs12)
2854 {
2855 bool ia32e;
2856
2857 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2858 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2859 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
2860 return -EINVAL;
2861
2862 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2863 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2864 return -EINVAL;
2865
2866 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2867 CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2868 return -EINVAL;
2869
2870 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2871 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2872 vmcs12->host_ia32_perf_global_ctrl)))
2873 return -EINVAL;
2874
2875 #ifdef CONFIG_X86_64
2876 ia32e = !!(vcpu->arch.efer & EFER_LMA);
2877 #else
2878 ia32e = false;
2879 #endif
2880
2881 if (ia32e) {
2882 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2883 CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2884 return -EINVAL;
2885 } else {
2886 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2887 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2888 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2889 CC((vmcs12->host_rip) >> 32))
2890 return -EINVAL;
2891 }
2892
2893 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2894 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2895 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2896 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2897 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2898 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2899 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2900 CC(vmcs12->host_cs_selector == 0) ||
2901 CC(vmcs12->host_tr_selector == 0) ||
2902 CC(vmcs12->host_ss_selector == 0 && !ia32e))
2903 return -EINVAL;
2904
2905 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2906 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2907 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2908 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2909 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2910 CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2911 return -EINVAL;
2912
2913 /*
2914 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2915 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2916 * the values of the LMA and LME bits in the field must each be that of
2917 * the host address-space size VM-exit control.
2918 */
2919 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2920 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2921 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2922 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2923 return -EINVAL;
2924 }
2925
2926 return 0;
2927 }
2928
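/*
 * If vmcs12 uses a VMCS link pointer (i.e. it is not -1ull), temporarily
 * map the referenced shadow VMCS and verify that its revision_id and
 * shadow-VMCS indicator are consistent with vmcs12's settings.
 */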
2929 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2930 struct vmcs12 *vmcs12)
2931 {
2932 int r = 0;
2933 struct vmcs12 *shadow;
2934 struct kvm_host_map map;
2935
2936 if (vmcs12->vmcs_link_pointer == -1ull)
2937 return 0;
2938
2939 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2940 return -EINVAL;
2941
2942 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2943 return -EINVAL;
2944
2945 shadow = map.hva;
2946
2947 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2948 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2949 r = -EINVAL;
2950
2951 kvm_vcpu_unmap(vcpu, &map, false);
2952 return r;
2953 }
2954
2955 /*
2956 * Checks related to Guest Non-register State
2957 */
2958 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2959 {
2960 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2961 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
2962 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
2963 return -EINVAL;
2964
2965 return 0;
2966 }
2967
2968 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2969 struct vmcs12 *vmcs12,
2970 enum vm_entry_failure_code *entry_failure_code)
2971 {
2972 bool ia32e;
2973
2974 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2975
2976 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2977 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
2978 return -EINVAL;
2979
2980 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
2981 CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
2982 return -EINVAL;
2983
2984 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2985 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
2986 return -EINVAL;
2987
2988 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2989 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
2990 return -EINVAL;
2991 }
2992
2993 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2994 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2995 vmcs12->guest_ia32_perf_global_ctrl)))
2996 return -EINVAL;
2997
2998 /*
2999 * If the load IA32_EFER VM-entry control is 1, the following checks
3000 * are performed on the field for the IA32_EFER MSR:
3001 * - Bits reserved in the IA32_EFER MSR must be 0.
3002 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
3003 * the IA-32e mode guest VM-exit control. It must also be identical
3004 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
3005 * CR0.PG) is 1.
3006 */
3007 if (to_vmx(vcpu)->nested.nested_run_pending &&
3008 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
3009 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
3010 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
3011 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
3012 CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
3013 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
3014 return -EINVAL;
3015 }
3016
3017 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
3018 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
3019 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
3020 return -EINVAL;
3021
3022 if (nested_check_guest_non_reg_state(vmcs12))
3023 return -EINVAL;
3024
3025 return 0;
3026 }
3027
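/*
 * Optional (nested_early_check) hardware vetting of vmcs02 before the
 * real nested VM-Enter: GUEST_RFLAGS is deliberately made invalid so the
 * CPU never executes L2; a VMFail here therefore indicates an invalid
 * control or host-state field that KVM's software checks missed.
 */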
3028 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
3029 {
3030 struct vcpu_vmx *vmx = to_vmx(vcpu);
3031 unsigned long cr3, cr4;
3032 bool vm_fail;
3033
3034 if (!nested_early_check)
3035 return 0;
3036
3037 if (vmx->msr_autoload.host.nr)
3038 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3039 if (vmx->msr_autoload.guest.nr)
3040 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3041
3042 preempt_disable();
3043
3044 vmx_prepare_switch_to_guest(vcpu);
3045
3046 /*
3047 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
3048 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
3049 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
3050 * there is no need to preserve other bits or save/restore the field.
3051 */
3052 vmcs_writel(GUEST_RFLAGS, 0);
3053
3054 cr3 = __get_current_cr3_fast();
3055 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
3056 vmcs_writel(HOST_CR3, cr3);
3057 vmx->loaded_vmcs->host_state.cr3 = cr3;
3058 }
3059
3060 cr4 = cr4_read_shadow();
3061 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
3062 vmcs_writel(HOST_CR4, cr4);
3063 vmx->loaded_vmcs->host_state.cr4 = cr4;
3064 }
3065
3066 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
3067 vmx->loaded_vmcs->launched);
3068
3069 if (vmx->msr_autoload.host.nr)
3070 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3071 if (vmx->msr_autoload.guest.nr)
3072 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3073
3074 if (vm_fail) {
3075 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
3076
3077 preempt_enable();
3078
3079 trace_kvm_nested_vmenter_failed(
3080 "early hardware check VM-instruction error: ", error);
3081 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3082 return 1;
3083 }
3084
3085 /*
3086 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
3087 */
3088 if (hw_breakpoint_active())
3089 set_debugreg(__this_cpu_read(cpu_dr7), 7);
3090 local_irq_enable();
3091 preempt_enable();
3092
3093 /*
3094 * A non-failing VMEntry means we somehow entered guest mode with
3095 * an illegal RIP, and that's just the tip of the iceberg. There
3096 * is no telling what memory has been modified or what state has
3097 * been exposed to unknown code. Hitting this all but guarantees
3098 * a (very critical) hardware issue.
3099 */
3100 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
3101 VMX_EXIT_REASONS_FAILED_VMENTRY));
3102
3103 return 0;
3104 }
3105
3106 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
3107 {
3108 struct vcpu_vmx *vmx = to_vmx(vcpu);
3109
3110 /*
3111 * hv_evmcs may end up being unmapped after migration (when L2
3112 * was running); map it here to make sure vmcs12 changes are
3113 * properly reflected.
3114 */
3115 if (vmx->nested.enlightened_vmcs_enabled &&
3116 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
3117 enum nested_evmptrld_status evmptrld_status =
3118 nested_vmx_handle_enlightened_vmptrld(vcpu, false);
3119
3120 if (evmptrld_status == EVMPTRLD_VMFAIL ||
3121 evmptrld_status == EVMPTRLD_ERROR)
3122 return false;
3123
3124 /*
3125 * Post migration, the VMCS12 always holds the most up-to-date
3126 * information, so copy it to the eVMCS upon entry.
3127 */
3128 vmx->nested.need_vmcs12_to_shadow_sync = true;
3129 }
3130
3131 return true;
3132 }
3133
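/*
 * Map/pin the guest-physical pages referenced by vmcs12 (APIC-access
 * page, virtual-APIC page, posted-interrupt descriptor), point the
 * corresponding vmcs02 fields at them, adjust the affected execution
 * controls when a page cannot be mapped, and enable/disable the
 * MSR-bitmap control depending on whether a merged bitmap was prepared.
 */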
3134 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3135 {
3136 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3137 struct vcpu_vmx *vmx = to_vmx(vcpu);
3138 struct kvm_host_map *map;
3139 struct page *page;
3140 u64 hpa;
3141
3142 if (!vcpu->arch.pdptrs_from_userspace &&
3143 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3144 /*
3145 * Reload the guest's PDPTRs since after a migration
3146 * the guest CR3 might be restored prior to setting the nested
3147 * state, which can lead to loading the wrong PDPTRs.
3148 */
3149 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
3150 return false;
3151 }
3152
3153
3154 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3155 /*
3156 * Translate L1 physical address to host physical
3157 * address for vmcs02. Keep the page pinned, so this
3158 * physical address remains valid. We keep a reference
3159 * to it so we can release it later.
3160 */
3161 if (vmx->nested.apic_access_page) { /* shouldn't happen */
3162 kvm_release_page_clean(vmx->nested.apic_access_page);
3163 vmx->nested.apic_access_page = NULL;
3164 }
3165 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
3166 if (!is_error_page(page)) {
3167 vmx->nested.apic_access_page = page;
3168 hpa = page_to_phys(vmx->nested.apic_access_page);
3169 vmcs_write64(APIC_ACCESS_ADDR, hpa);
3170 } else {
3171 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3172 __func__);
3173 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3174 vcpu->run->internal.suberror =
3175 KVM_INTERNAL_ERROR_EMULATION;
3176 vcpu->run->internal.ndata = 0;
3177 return false;
3178 }
3179 }
3180
3181 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3182 map = &vmx->nested.virtual_apic_map;
3183
3184 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3185 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3186 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3187 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3188 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3189 /*
3190 * The processor will never use the TPR shadow, so simply
3191 * clear the bit from the execution control. Such a
3192 * configuration is useless, but it happens in tests.
3193 * For any other configuration, failing the VM entry is
3194 * _not_ what the processor does, but it's basically the
3195 * only possibility we have.
3196 */
3197 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3198 } else {
3199 /*
3200 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3201 * force VM-Entry to fail.
3202 */
3203 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3204 }
3205 }
3206
3207 if (nested_cpu_has_posted_intr(vmcs12)) {
3208 map = &vmx->nested.pi_desc_map;
3209
3210 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3211 vmx->nested.pi_desc =
3212 (struct pi_desc *)(((void *)map->hva) +
3213 offset_in_page(vmcs12->posted_intr_desc_addr));
3214 vmcs_write64(POSTED_INTR_DESC_ADDR,
3215 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3216 } else {
3217 /*
3218 * Defer the KVM_INTERNAL_EXIT until KVM tries to
3219 * access the contents of the VMCS12 posted interrupt
3220 * descriptor. (Note that KVM may do this when it
3221 * should not, per the architectural specification.)
3222 */
3223 vmx->nested.pi_desc = NULL;
3224 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
3225 }
3226 }
3227 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3228 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3229 else
3230 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3231
3232 return true;
3233 }
3234
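/*
 * Callback used to (re)load the guest pages backing nested state, e.g.
 * after KVM_SET_NESTED_STATE or RSM, when the eVMCS and vmcs12 pages may
 * not have been mappable at restore time.
 */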
3235 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
3236 {
3237 if (!nested_get_evmcs_page(vcpu)) {
3238 pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
3239 __func__);
3240 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3241 vcpu->run->internal.suberror =
3242 KVM_INTERNAL_ERROR_EMULATION;
3243 vcpu->run->internal.ndata = 0;
3244
3245 return false;
3246 }
3247
3248 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
3249 return false;
3250
3251 return true;
3252 }
3253
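/*
 * Emulate PML for L2 on behalf of L1: log the given L2 GPA into the PML
 * buffer designated by vmcs12, and flag a PML-full condition when the
 * buffer index is exhausted (PML is never enabled in hardware for L2).
 */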
3254 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
3255 {
3256 struct vmcs12 *vmcs12;
3257 struct vcpu_vmx *vmx = to_vmx(vcpu);
3258 gpa_t dst;
3259
3260 if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
3261 return 0;
3262
3263 if (WARN_ON_ONCE(vmx->nested.pml_full))
3264 return 1;
3265
3266 /*
3267 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
3268 * set is already checked as part of A/D emulation.
3269 */
3270 vmcs12 = get_vmcs12(vcpu);
3271 if (!nested_cpu_has_pml(vmcs12))
3272 return 0;
3273
3274 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
3275 vmx->nested.pml_full = true;
3276 return 1;
3277 }
3278
3279 gpa &= ~0xFFFull;
3280 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
3281
3282 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
3283 offset_in_page(dst), sizeof(gpa)))
3284 return 0;
3285
3286 vmcs12->guest_pml_index--;
3287
3288 return 0;
3289 }
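
/*
 * Illustrative stand-alone sketch of the vmcs12 PML bookkeeping above: the
 * next log slot lives at pml_address + 8 * guest_pml_index and the index
 * counts down.  Assumes a 512-entry (one 4KiB page) buffer; the helper
 * name and the zero sentinel are made up for the example and are not KVM
 * APIs.
 */
#include <stdint.h>

#define PML_ENTRIES 512	/* assumed size: one 4KiB page of 8-byte entries */

static uint64_t pml_next_slot(uint64_t pml_address, uint32_t *index)
{
	uint64_t dst;

	if (*index >= PML_ENTRIES)
		return 0;	/* full: the caller would flag a PML-full exit */

	dst = pml_address + sizeof(uint64_t) * (*index);
	(*index)--;		/* the index counts down towards zero */
	return dst;
}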
3290
3291 /*
3292 * Intel's VMX Instruction Reference specifies a common set of prerequisites
3293 * for running VMX instructions (except VMXON, whose prerequisites are
3294 * slightly different). It also specifies what exception to inject otherwise.
3295 * Note that many of these exceptions have priority over VM exits, so they
3296 * don't have to be checked again here.
3297 */
3298 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3299 {
3300 if (!to_vmx(vcpu)->nested.vmxon) {
3301 kvm_queue_exception(vcpu, UD_VECTOR);
3302 return 0;
3303 }
3304
3305 if (vmx_get_cpl(vcpu)) {
3306 kvm_inject_gp(vcpu, 0);
3307 return 0;
3308 }
3309
3310 return 1;
3311 }
3312
3313 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3314 {
3315 u8 rvi = vmx_get_rvi();
3316 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3317
3318 return ((rvi & 0xf0) > (vppr & 0xf0));
3319 }
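
/*
 * Minimal stand-alone illustration of the comparison above: an interrupt
 * is considered pending for APICv purposes only if the priority class
 * (bits 7:4) of the Requesting Virtual Interrupt is strictly higher than
 * that of the virtual PPR.  The sample values below are hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static bool apicv_interrupt_pending(uint8_t rvi, uint8_t vppr)
{
	return (rvi & 0xf0) > (vppr & 0xf0);
}

/*
 * e.g. apicv_interrupt_pending(0x35, 0x20) == true  (class 3 > class 2)
 *      apicv_interrupt_pending(0x35, 0x30) == false (same class)
 */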
3320
3321 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3322 struct vmcs12 *vmcs12);
3323
3324 /*
3325 * If from_vmentry is false, this is being called from state restore (either RSM
3326 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
3327 *
3328 * Returns:
3329 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
3330 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
3331 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
3332 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
3333 */
3334 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3335 bool from_vmentry)
3336 {
3337 struct vcpu_vmx *vmx = to_vmx(vcpu);
3338 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3339 enum vm_entry_failure_code entry_failure_code;
3340 bool evaluate_pending_interrupts;
3341 union vmx_exit_reason exit_reason = {
3342 .basic = EXIT_REASON_INVALID_STATE,
3343 .failed_vmentry = 1,
3344 };
3345 u32 failed_index;
3346
3347 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3348 kvm_vcpu_flush_tlb_current(vcpu);
3349
3350 evaluate_pending_interrupts = exec_controls_get(vmx) &
3351 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3352 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3353 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3354
3355 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3356 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3357 if (kvm_mpx_supported() &&
3358 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3359 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3360
3361 /*
3362 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3363 * nested early checks are disabled. In the event of a "late" VM-Fail,
3364 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3365 * software model to the pre-VMEntry host state. When EPT is disabled,
3366 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3367 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3368 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3369 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3370 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3371 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3372 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3373 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3374 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3375 * path would need to manually save/restore vmcs01.GUEST_CR3.
3376 */
3377 if (!enable_ept && !nested_early_check)
3378 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3379
3380 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3381
3382 prepare_vmcs02_early(vmx, vmcs12);
3383
3384 if (from_vmentry) {
3385 if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3386 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3387 return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3388 }
3389
3390 if (nested_vmx_check_vmentry_hw(vcpu)) {
3391 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3392 return NVMX_VMENTRY_VMFAIL;
3393 }
3394
3395 if (nested_vmx_check_guest_state(vcpu, vmcs12,
3396 &entry_failure_code)) {
3397 exit_reason.basic = EXIT_REASON_INVALID_STATE;
3398 vmcs12->exit_qualification = entry_failure_code;
3399 goto vmentry_fail_vmexit;
3400 }
3401 }
3402
3403 enter_guest_mode(vcpu);
3404
3405 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3406 exit_reason.basic = EXIT_REASON_INVALID_STATE;
3407 vmcs12->exit_qualification = entry_failure_code;
3408 goto vmentry_fail_vmexit_guest_mode;
3409 }
3410
3411 if (from_vmentry) {
3412 failed_index = nested_vmx_load_msr(vcpu,
3413 vmcs12->vm_entry_msr_load_addr,
3414 vmcs12->vm_entry_msr_load_count);
3415 if (failed_index) {
3416 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3417 vmcs12->exit_qualification = failed_index;
3418 goto vmentry_fail_vmexit_guest_mode;
3419 }
3420 } else {
3421 /*
3422 * The MMU is not initialized to point at the right entities yet and
3423 * "get pages" would need to read data from the guest (i.e. we will
3424 * need to perform gpa to hpa translation). Request a call
3425 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3426 * have already been set at vmentry time and should not be reset.
3427 */
3428 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3429 }
3430
3431 /*
3432 * If L1 had a pending IRQ/NMI until it executed
3433 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3434 * disallowed (e.g. interrupts disabled), L0 needs to
3435 * evaluate if this pending event should cause an exit from L2
3436 * to L1 or be delivered directly to L2 (e.g. in case L1 doesn't
3437 * intercept EXTERNAL_INTERRUPT).
3438 *
3439 * Usually this would be handled by the processor noticing an
3440 * IRQ/NMI window request, or checking RVI during evaluation of
3441 * pending virtual interrupts. However, this setting was done
3442 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3443 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3444 */
3445 if (unlikely(evaluate_pending_interrupts))
3446 kvm_make_request(KVM_REQ_EVENT, vcpu);
3447
3448 /*
3449 * Do not start the preemption timer hrtimer until after we know
3450 * we are successful, so that only nested_vmx_vmexit needs to cancel
3451 * the timer.
3452 */
3453 vmx->nested.preemption_timer_expired = false;
3454 if (nested_cpu_has_preemption_timer(vmcs12)) {
3455 u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3456 vmx_start_preemption_timer(vcpu, timer_value);
3457 }
3458
3459 /*
3460 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3461 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3462 * returned as far as L1 is concerned. It will only return (and set
3463 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3464 */
3465 return NVMX_VMENTRY_SUCCESS;
3466
3467 /*
3468 * A failed consistency check that leads to a VMExit during L1's
3469 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3470 * 26.7 "VM-entry failures during or after loading guest state".
3471 */
3472 vmentry_fail_vmexit_guest_mode:
3473 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3474 vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3475 leave_guest_mode(vcpu);
3476
3477 vmentry_fail_vmexit:
3478 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3479
3480 if (!from_vmentry)
3481 return NVMX_VMENTRY_VMEXIT;
3482
3483 load_vmcs12_host_state(vcpu, vmcs12);
3484 vmcs12->vm_exit_reason = exit_reason.full;
3485 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
3486 vmx->nested.need_vmcs12_to_shadow_sync = true;
3487 return NVMX_VMENTRY_VMEXIT;
3488 }
3489
3490 /*
3491 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3492 * for running an L2 nested guest.
3493 */
3494 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3495 {
3496 struct vmcs12 *vmcs12;
3497 enum nvmx_vmentry_status status;
3498 struct vcpu_vmx *vmx = to_vmx(vcpu);
3499 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3500 enum nested_evmptrld_status evmptrld_status;
3501
3502 if (!nested_vmx_check_permission(vcpu))
3503 return 1;
3504
3505 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
3506 if (evmptrld_status == EVMPTRLD_ERROR) {
3507 kvm_queue_exception(vcpu, UD_VECTOR);
3508 return 1;
3509 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
3510 return nested_vmx_failInvalid(vcpu);
3511 }
3512
3513 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
3514 vmx->nested.current_vmptr == -1ull))
3515 return nested_vmx_failInvalid(vcpu);
3516
3517 vmcs12 = get_vmcs12(vcpu);
3518
3519 /*
3520 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3521 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3522 * rather than RFLAGS.ZF, and no error number is stored to the
3523 * VM-instruction error field.
3524 */
3525 if (CC(vmcs12->hdr.shadow_vmcs))
3526 return nested_vmx_failInvalid(vcpu);
3527
3528 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
3529 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
3530 /* Enlightened VMCS doesn't have launch state */
3531 vmcs12->launch_state = !launch;
3532 } else if (enable_shadow_vmcs) {
3533 copy_shadow_to_vmcs12(vmx);
3534 }
3535
3536 /*
3537 * The nested entry process starts with enforcing various prerequisites
3538 * on vmcs12 as required by the Intel SDM, and acts appropriately when
3539 * they fail: As the SDM explains, some conditions should cause the
3540 * instruction to fail, while others will cause the instruction to seem
3541 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3542 * To speed up the normal (success) code path, we should avoid checking
3543 * for misconfigurations which will be caught by the processor anyway
3544 * when using the merged vmcs02.
3545 */
3546 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
3547 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3548
3549 if (CC(vmcs12->launch_state == launch))
3550 return nested_vmx_fail(vcpu,
3551 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3552 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3553
3554 if (nested_vmx_check_controls(vcpu, vmcs12))
3555 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3556
3557 if (nested_vmx_check_host_state(vcpu, vmcs12))
3558 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3559
3560 /*
3561 * We're finally done with prerequisite checking, and can start with
3562 * the nested entry.
3563 */
3564 vmx->nested.nested_run_pending = 1;
3565 vmx->nested.has_preemption_timer_deadline = false;
3566 status = nested_vmx_enter_non_root_mode(vcpu, true);
3567 if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3568 goto vmentry_failed;
3569
3570 /* Emulate processing of posted interrupts on VM-Enter. */
3571 if (nested_cpu_has_posted_intr(vmcs12) &&
3572 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
3573 vmx->nested.pi_pending = true;
3574 kvm_make_request(KVM_REQ_EVENT, vcpu);
3575 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
3576 }
3577
3578 /* Hide L1D cache contents from the nested guest. */
3579 vmx->vcpu.arch.l1tf_flush_l1d = true;
3580
3581 /*
3582 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3583 * also be used as part of restoring nVMX state for
3584 * snapshot restore (migration).
3585 *
3586 * In this flow, it is assumed that the vmcs12 cache was
3587 * transferred as part of the captured nVMX state and should
3588 * therefore not be read from guest memory (which may not
3589 * exist on the destination host yet).
3590 */
3591 nested_cache_shadow_vmcs12(vcpu, vmcs12);
3592
3593 switch (vmcs12->guest_activity_state) {
3594 case GUEST_ACTIVITY_HLT:
3595 /*
3596 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3597 * awakened by event injection or by an NMI-window VM-exit or
3598 * by an interrupt-window VM-exit, halt the vcpu.
3599 */
3600 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3601 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
3602 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
3603 (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3604 vmx->nested.nested_run_pending = 0;
3605 return kvm_vcpu_halt(vcpu);
3606 }
3607 break;
3608 case GUEST_ACTIVITY_WAIT_SIPI:
3609 vmx->nested.nested_run_pending = 0;
3610 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
3611 break;
3612 default:
3613 break;
3614 }
3615
3616 return 1;
3617
3618 vmentry_failed:
3619 vmx->nested.nested_run_pending = 0;
3620 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3621 return 0;
3622 if (status == NVMX_VMENTRY_VMEXIT)
3623 return 1;
3624 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3625 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3626 }
3627
3628 /*
3629 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3630 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3631 * This function returns the new value we should put in vmcs12.guest_cr0.
3632 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3633 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3634 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3635 * didn't trap the bit, because if L1 did, so would L0).
3636 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3637 * been modified by L2, and L1 knows it. So just leave the old value of
3638 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3639 * isn't relevant, because if L0 traps this bit it can set it to anything.
3640 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3641 * changed these bits, and therefore they need to be updated, but L0
3642 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3643 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3644 */
3645 static inline unsigned long
3646 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3647 {
3648 return
3649 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3650 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3651 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3652 vcpu->arch.cr0_guest_owned_bits));
3653 }
3654
3655 static inline unsigned long
3656 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3657 {
3658 return
3659 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3660 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3661 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3662 vcpu->arch.cr4_guest_owned_bits));
3663 }
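
/*
 * Stand-alone sketch of the three-way merge computed by the two helpers
 * above, using only <stdint.h>.  The parameter names are illustrative;
 * the caller would pass vmcs02 GUEST_CRx, vmcs02 CRx_READ_SHADOW, the
 * vmcs12 guest value, the vmcs12 guest/host mask and L0's guest-owned
 * bits, respectively.
 */
#include <stdint.h>

static uint64_t merge_guest_cr(uint64_t hw_guest_cr, uint64_t hw_read_shadow,
			       uint64_t vmcs12_guest_cr, uint64_t l1_mask,
			       uint64_t l0_guest_owned)
{
	return (hw_guest_cr     &  l0_guest_owned) |		/* case 1 */
	       (vmcs12_guest_cr &  l1_mask) |			/* case 2 */
	       (hw_read_shadow  & ~(l1_mask | l0_guest_owned));	/* case 3 */
}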
3664
3665 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3666 struct vmcs12 *vmcs12)
3667 {
3668 u32 idt_vectoring;
3669 unsigned int nr;
3670
3671 if (vcpu->arch.exception.injected) {
3672 nr = vcpu->arch.exception.nr;
3673 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3674
3675 if (kvm_exception_is_soft(nr)) {
3676 vmcs12->vm_exit_instruction_len =
3677 vcpu->arch.event_exit_inst_len;
3678 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3679 } else
3680 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3681
3682 if (vcpu->arch.exception.has_error_code) {
3683 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3684 vmcs12->idt_vectoring_error_code =
3685 vcpu->arch.exception.error_code;
3686 }
3687
3688 vmcs12->idt_vectoring_info_field = idt_vectoring;
3689 } else if (vcpu->arch.nmi_injected) {
3690 vmcs12->idt_vectoring_info_field =
3691 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3692 } else if (vcpu->arch.interrupt.injected) {
3693 nr = vcpu->arch.interrupt.nr;
3694 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3695
3696 if (vcpu->arch.interrupt.soft) {
3697 idt_vectoring |= INTR_TYPE_SOFT_INTR;
3698 vmcs12->vm_entry_instruction_len =
3699 vcpu->arch.event_exit_inst_len;
3700 } else
3701 idt_vectoring |= INTR_TYPE_EXT_INTR;
3702
3703 vmcs12->idt_vectoring_info_field = idt_vectoring;
3704 }
3705 }
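
/*
 * Stand-alone sketch of the IDT-vectoring information layout built above:
 * vector in bits 7:0, event type in bits 10:8, "deliver error code" in
 * bit 11 and "valid" in bit 31.  The enum and helper are illustrative
 * names, not the kernel's INTR_TYPE_ and VECTORING_INFO_ macros.
 */
#include <stdbool.h>
#include <stdint.h>

enum vectoring_type {
	TYPE_EXT_INTR	    = 0,
	TYPE_NMI	    = 2,
	TYPE_HARD_EXCEPTION = 3,
	TYPE_SOFT_INTR	    = 4,
	TYPE_SOFT_EXCEPTION = 6,
};

static uint32_t encode_idt_vectoring(uint8_t vector, enum vectoring_type type,
				     bool has_error_code)
{
	uint32_t info = vector;			/* bits 7:0      */

	info |= (uint32_t)type << 8;		/* bits 10:8     */
	if (has_error_code)
		info |= 1u << 11;		/* bit 11        */
	info |= 1u << 31;			/* bit 31: valid */
	return info;
}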
3706
3707
3708 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3709 {
3710 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3711 gfn_t gfn;
3712
3713 /*
3714 * Don't need to mark the APIC access page dirty; it is never
3715 * written to by the CPU during APIC virtualization.
3716 */
3717
3718 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3719 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3720 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3721 }
3722
3723 if (nested_cpu_has_posted_intr(vmcs12)) {
3724 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3725 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3726 }
3727 }
3728
3729 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3730 {
3731 struct vcpu_vmx *vmx = to_vmx(vcpu);
3732 int max_irr;
3733 void *vapic_page;
3734 u16 status;
3735
3736 if (!vmx->nested.pi_pending)
3737 return 0;
3738
3739 if (!vmx->nested.pi_desc)
3740 goto mmio_needed;
3741
3742 vmx->nested.pi_pending = false;
3743
3744 if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3745 return 0;
3746
3747 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3748 if (max_irr != 256) {
3749 vapic_page = vmx->nested.virtual_apic_map.hva;
3750 if (!vapic_page)
3751 goto mmio_needed;
3752
3753 __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3754 vapic_page, &max_irr);
3755 status = vmcs_read16(GUEST_INTR_STATUS);
3756 if ((u8)max_irr > ((u8)status & 0xff)) {
3757 status &= ~0xff;
3758 status |= (u8)max_irr;
3759 vmcs_write16(GUEST_INTR_STATUS, status);
3760 }
3761 }
3762
3763 nested_mark_vmcs12_pages_dirty(vcpu);
3764 return 0;
3765
3766 mmio_needed:
3767 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
3768 return -ENXIO;
3769 }
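
/*
 * Stand-alone sketch of the descriptor scan above, assuming the 256-bit
 * posted-interrupt request bitmap is stored as four 64-bit words.  This
 * version returns -1 when nothing is pending, whereas the kernel's
 * find_last_bit() returns the bitmap size in that case.
 */
#include <stdint.h>

static int pir_find_highest(const uint64_t pir[4])
{
	for (int word = 3; word >= 0; word--) {
		if (!pir[word])
			continue;
		/* index of the highest set bit in this 64-bit word */
		return word * 64 + 63 - __builtin_clzll(pir[word]);
	}
	return -1;	/* no pending vector */
}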
3770
3771 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3772 unsigned long exit_qual)
3773 {
3774 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3775 unsigned int nr = vcpu->arch.exception.nr;
3776 u32 intr_info = nr | INTR_INFO_VALID_MASK;
3777
3778 if (vcpu->arch.exception.has_error_code) {
3779 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3780 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3781 }
3782
3783 if (kvm_exception_is_soft(nr))
3784 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3785 else
3786 intr_info |= INTR_TYPE_HARD_EXCEPTION;
3787
3788 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3789 vmx_get_nmi_mask(vcpu))
3790 intr_info |= INTR_INFO_UNBLOCK_NMI;
3791
3792 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3793 }
3794
3795 /*
3796 * Returns true if a debug trap is pending delivery.
3797 *
3798 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
3799 * exception may be inferred from the presence of an exception payload.
3800 */
3801 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
3802 {
3803 return vcpu->arch.exception.pending &&
3804 vcpu->arch.exception.nr == DB_VECTOR &&
3805 vcpu->arch.exception.payload;
3806 }
3807
3808 /*
3809 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
3810 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
3811 * represents these debug traps with a payload that is said to be compatible
3812 * with the 'pending debug exceptions' field, write the payload to the VMCS
3813 * field if a VM-exit is delivered before the debug trap.
3814 */
3815 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
3816 {
3817 if (vmx_pending_dbg_trap(vcpu))
3818 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
3819 vcpu->arch.exception.payload);
3820 }
3821
3822 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
3823 {
3824 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3825 to_vmx(vcpu)->nested.preemption_timer_expired;
3826 }
3827
3828 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
3829 {
3830 struct vcpu_vmx *vmx = to_vmx(vcpu);
3831 unsigned long exit_qual;
3832 bool block_nested_events =
3833 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3834 bool mtf_pending = vmx->nested.mtf_pending;
3835 struct kvm_lapic *apic = vcpu->arch.apic;
3836
3837 /*
3838 * Clear the MTF state. If a higher priority VM-exit is delivered first,
3839 * this state is discarded.
3840 */
3841 if (!block_nested_events)
3842 vmx->nested.mtf_pending = false;
3843
3844 if (lapic_in_kernel(vcpu) &&
3845 test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3846 if (block_nested_events)
3847 return -EBUSY;
3848 nested_vmx_update_pending_dbg(vcpu);
3849 clear_bit(KVM_APIC_INIT, &apic->pending_events);
3850 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
3851 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3852 return 0;
3853 }
3854
3855 if (lapic_in_kernel(vcpu) &&
3856 test_bit(KVM_APIC_SIPI, &apic->pending_events)) {
3857 if (block_nested_events)
3858 return -EBUSY;
3859
3860 clear_bit(KVM_APIC_SIPI, &apic->pending_events);
3861 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
3862 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
3863 apic->sipi_vector & 0xFFUL);
3864 return 0;
3865 }
3866
3867 /*
3868 * Process any exceptions that are not debug traps before MTF.
3869 *
3870 * Note that only a pending nested run can block a pending exception.
3871 * Otherwise an injected NMI/interrupt should either be
3872 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO
3873 * while the pending exception is being delivered.
3874 */
3875
3876 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
3877 if (vmx->nested.nested_run_pending)
3878 return -EBUSY;
3879 if (!nested_vmx_check_exception(vcpu, &exit_qual))
3880 goto no_vmexit;
3881 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3882 return 0;
3883 }
3884
3885 if (mtf_pending) {
3886 if (block_nested_events)
3887 return -EBUSY;
3888 nested_vmx_update_pending_dbg(vcpu);
3889 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
3890 return 0;
3891 }
3892
3893 if (vcpu->arch.exception.pending) {
3894 if (vmx->nested.nested_run_pending)
3895 return -EBUSY;
3896 if (!nested_vmx_check_exception(vcpu, &exit_qual))
3897 goto no_vmexit;
3898 nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3899 return 0;
3900 }
3901
3902 if (nested_vmx_preemption_timer_pending(vcpu)) {
3903 if (block_nested_events)
3904 return -EBUSY;
3905 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3906 return 0;
3907 }
3908
3909 if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
3910 if (block_nested_events)
3911 return -EBUSY;
3912 goto no_vmexit;
3913 }
3914
3915 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
3916 if (block_nested_events)
3917 return -EBUSY;
3918 if (!nested_exit_on_nmi(vcpu))
3919 goto no_vmexit;
3920
3921 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3922 NMI_VECTOR | INTR_TYPE_NMI_INTR |
3923 INTR_INFO_VALID_MASK, 0);
3924 /*
3925 * The NMI-triggered VM exit counts as injection:
3926 * clear this one and block further NMIs.
3927 */
3928 vcpu->arch.nmi_pending = 0;
3929 vmx_set_nmi_mask(vcpu, true);
3930 return 0;
3931 }
3932
3933 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
3934 if (block_nested_events)
3935 return -EBUSY;
3936 if (!nested_exit_on_intr(vcpu))
3937 goto no_vmexit;
3938 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3939 return 0;
3940 }
3941
3942 no_vmexit:
3943 return vmx_complete_nested_posted_interrupt(vcpu);
3944 }
3945
3946 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3947 {
3948 ktime_t remaining =
3949 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3950 u64 value;
3951
3952 if (ktime_to_ns(remaining) <= 0)
3953 return 0;
3954
3955 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3956 do_div(value, 1000000);
3957 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3958 }
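
/*
 * Stand-alone version of the conversion above, assuming the emulated
 * preemption-timer rate of 5 used by this file (one timer tick per
 * 2^5 = 32 TSC cycles); plain division stands in for do_div().
 */
#include <stdint.h>

#define EMULATED_PREEMPTION_TIMER_RATE 5	/* assumption: matches the file */

static uint64_t ns_to_preemption_timer_ticks(uint64_t remaining_ns,
					     uint64_t tsc_khz)
{
	/* ns * kHz / 1e6 == number of TSC cycles in that many nanoseconds */
	uint64_t tsc_cycles = remaining_ns * tsc_khz / 1000000;

	/* the timer counts down once every 2^rate TSC cycles */
	return tsc_cycles >> EMULATED_PREEMPTION_TIMER_RATE;
}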
3959
3960 static bool is_vmcs12_ext_field(unsigned long field)
3961 {
3962 switch (field) {
3963 case GUEST_ES_SELECTOR:
3964 case GUEST_CS_SELECTOR:
3965 case GUEST_SS_SELECTOR:
3966 case GUEST_DS_SELECTOR:
3967 case GUEST_FS_SELECTOR:
3968 case GUEST_GS_SELECTOR:
3969 case GUEST_LDTR_SELECTOR:
3970 case GUEST_TR_SELECTOR:
3971 case GUEST_ES_LIMIT:
3972 case GUEST_CS_LIMIT:
3973 case GUEST_SS_LIMIT:
3974 case GUEST_DS_LIMIT:
3975 case GUEST_FS_LIMIT:
3976 case GUEST_GS_LIMIT:
3977 case GUEST_LDTR_LIMIT:
3978 case GUEST_TR_LIMIT:
3979 case GUEST_GDTR_LIMIT:
3980 case GUEST_IDTR_LIMIT:
3981 case GUEST_ES_AR_BYTES:
3982 case GUEST_DS_AR_BYTES:
3983 case GUEST_FS_AR_BYTES:
3984 case GUEST_GS_AR_BYTES:
3985 case GUEST_LDTR_AR_BYTES:
3986 case GUEST_TR_AR_BYTES:
3987 case GUEST_ES_BASE:
3988 case GUEST_CS_BASE:
3989 case GUEST_SS_BASE:
3990 case GUEST_DS_BASE:
3991 case GUEST_FS_BASE:
3992 case GUEST_GS_BASE:
3993 case GUEST_LDTR_BASE:
3994 case GUEST_TR_BASE:
3995 case GUEST_GDTR_BASE:
3996 case GUEST_IDTR_BASE:
3997 case GUEST_PENDING_DBG_EXCEPTIONS:
3998 case GUEST_BNDCFGS:
3999 return true;
4000 default:
4001 break;
4002 }
4003
4004 return false;
4005 }
4006
4007 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4008 struct vmcs12 *vmcs12)
4009 {
4010 struct vcpu_vmx *vmx = to_vmx(vcpu);
4011
4012 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
4013 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
4014 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
4015 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
4016 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
4017 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
4018 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
4019 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
4020 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
4021 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
4022 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
4023 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
4024 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
4025 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
4026 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
4027 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
4028 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
4029 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
4030 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
4031 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
4032 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
4033 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
4034 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
4035 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
4036 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
4037 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
4038 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
4039 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
4040 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
4041 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
4042 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
4043 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
4044 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
4045 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
4046 vmcs12->guest_pending_dbg_exceptions =
4047 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
4048 if (kvm_mpx_supported())
4049 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
4050
4051 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
4052 }
4053
4054 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
4055 struct vmcs12 *vmcs12)
4056 {
4057 struct vcpu_vmx *vmx = to_vmx(vcpu);
4058 int cpu;
4059
4060 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
4061 return;
4062
4063
4064 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
4065
4066 cpu = get_cpu();
4067 vmx->loaded_vmcs = &vmx->nested.vmcs02;
4068 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
4069
4070 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4071
4072 vmx->loaded_vmcs = &vmx->vmcs01;
4073 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
4074 put_cpu();
4075 }
4076
4077 /*
4078 * Update the guest state fields of vmcs12 to reflect changes that
4079 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4080 * VM-entry controls is also updated, since this is really a guest
4081 * state bit.)
4082 */
4083 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4084 {
4085 struct vcpu_vmx *vmx = to_vmx(vcpu);
4086
4087 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
4088 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4089
4090 vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
4091 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
4092
4093 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
4094 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
4095
4096 vmcs12->guest_rsp = kvm_rsp_read(vcpu);
4097 vmcs12->guest_rip = kvm_rip_read(vcpu);
4098 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
4099
4100 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
4101 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
4102
4103 vmcs12->guest_interruptibility_info =
4104 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
4105
4106 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
4107 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
4108 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
4109 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI;
4110 else
4111 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
4112
4113 if (nested_cpu_has_preemption_timer(vmcs12) &&
4114 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
4115 !vmx->nested.nested_run_pending)
4116 vmcs12->vmx_preemption_timer_value =
4117 vmx_get_preemption_timer_value(vcpu);
4118
4119 /*
4120 * In some cases (usually, nested EPT), L2 is allowed to change its
4121 * own CR3 without exiting. If it has changed it, we must keep it.
4122 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4123 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4124 *
4125 * Additionally, restore L2's PDPTRs to vmcs12.
4126 */
4127 if (enable_ept) {
4128 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
4129 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
4130 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
4131 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
4132 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
4133 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
4134 }
4135 }
4136
4137 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
4138
4139 if (nested_cpu_has_vid(vmcs12))
4140 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
4141
4142 vmcs12->vm_entry_controls =
4143 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
4144 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
4145
4146 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
4147 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
4148
4149 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
4150 vmcs12->guest_ia32_efer = vcpu->arch.efer;
4151 }
4152
4153 /*
4154 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4155 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4156 * and this function updates it to reflect the changes to the guest state while
4157 * L2 was running (and perhaps made some exits which were handled directly by L0
4158 * without going back to L1), and to reflect the exit reason.
4159 * Note that we do not have to copy all VMCS fields here, just those that
4160 * could have been changed by the L2 guest or the exit - i.e., the guest-state and
4161 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4162 * which already writes to vmcs12 directly.
4163 */
4164 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4165 u32 vm_exit_reason, u32 exit_intr_info,
4166 unsigned long exit_qualification)
4167 {
4168 /* update exit information fields: */
4169 vmcs12->vm_exit_reason = vm_exit_reason;
4170 if (to_vmx(vcpu)->exit_reason.enclave_mode)
4171 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4172 vmcs12->exit_qualification = exit_qualification;
4173 vmcs12->vm_exit_intr_info = exit_intr_info;
4174
4175 vmcs12->idt_vectoring_info_field = 0;
4176 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4177 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4178
4179 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4180 vmcs12->launch_state = 1;
4181
4182 /* vm_entry_intr_info_field is cleared on exit. Emulate this
4183 * instead of reading the real value. */
4184 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4185
4186 /*
4187 * Transfer the event that L0 or L1 may have wanted to inject into
4188 * L2 to IDT_VECTORING_INFO_FIELD.
4189 */
4190 vmcs12_save_pending_event(vcpu, vmcs12);
4191
4192 /*
4193 * According to spec, there's no need to store the guest's
4194 * MSRs if the exit is due to a VM-entry failure that occurs
4195 * during or after loading the guest state. Since this exit
4196 * does not fall in that category, we need to save the MSRs.
4197 */
4198 if (nested_vmx_store_msr(vcpu,
4199 vmcs12->vm_exit_msr_store_addr,
4200 vmcs12->vm_exit_msr_store_count))
4201 nested_vmx_abort(vcpu,
4202 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4203 }
4204
4205 /*
4206 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4207 * preserved above and would only end up incorrectly in L1.
4208 */
4209 vcpu->arch.nmi_injected = false;
4210 kvm_clear_exception_queue(vcpu);
4211 kvm_clear_interrupt_queue(vcpu);
4212 }
4213
4214 /*
4215 * Part of what we need to do when the nested L2 guest exits and we want to
4216 * run its L1 parent is to reset L1's guest state to the host state specified
4217 * in vmcs12.
4218 * This function is to be called not only on normal nested exit, but also on
4219 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4220 * Failures During or After Loading Guest State").
4221 * This function should be called when the active VMCS is L1's (vmcs01).
4222 */
4223 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4224 struct vmcs12 *vmcs12)
4225 {
4226 enum vm_entry_failure_code ignored;
4227 struct kvm_segment seg;
4228
4229 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4230 vcpu->arch.efer = vmcs12->host_ia32_efer;
4231 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4232 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4233 else
4234 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4235 vmx_set_efer(vcpu, vcpu->arch.efer);
4236
4237 kvm_rsp_write(vcpu, vmcs12->host_rsp);
4238 kvm_rip_write(vcpu, vmcs12->host_rip);
4239 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4240 vmx_set_interrupt_shadow(vcpu, 0);
4241
4242 /*
4243 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4244 * actually changed, because vmx_set_cr0 refers to the EFER value set above.
4245 *
4246 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4247 * (KVM doesn't change it).
4248 */
4249 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4250 vmx_set_cr0(vcpu, vmcs12->host_cr0);
4251
4252 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4253 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4254 vmx_set_cr4(vcpu, vmcs12->host_cr4);
4255
4256 nested_ept_uninit_mmu_context(vcpu);
4257
4258 /*
4259 * Only the PDPTE load can fail, as the value of cr3 was checked on entry and
4260 * couldn't have changed.
4261 */
4262 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
4263 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
4264
4265 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
4266
4267 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
4268 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
4269 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
4270 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
4271 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
4272 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
4273 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
4274
4275 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4276 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
4277 vmcs_write64(GUEST_BNDCFGS, 0);
4278
4279 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
4280 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
4281 vcpu->arch.pat = vmcs12->host_ia32_pat;
4282 }
4283 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
4284 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4285 vmcs12->host_ia32_perf_global_ctrl));
4286
4287 /* Set L1 segment info according to Intel SDM
4288 27.5.2 Loading Host Segment and Descriptor-Table Registers */
4289 seg = (struct kvm_segment) {
4290 .base = 0,
4291 .limit = 0xFFFFFFFF,
4292 .selector = vmcs12->host_cs_selector,
4293 .type = 11,
4294 .present = 1,
4295 .s = 1,
4296 .g = 1
4297 };
4298 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4299 seg.l = 1;
4300 else
4301 seg.db = 1;
4302 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4303 seg = (struct kvm_segment) {
4304 .base = 0,
4305 .limit = 0xFFFFFFFF,
4306 .type = 3,
4307 .present = 1,
4308 .s = 1,
4309 .db = 1,
4310 .g = 1
4311 };
4312 seg.selector = vmcs12->host_ds_selector;
4313 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4314 seg.selector = vmcs12->host_es_selector;
4315 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4316 seg.selector = vmcs12->host_ss_selector;
4317 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4318 seg.selector = vmcs12->host_fs_selector;
4319 seg.base = vmcs12->host_fs_base;
4320 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4321 seg.selector = vmcs12->host_gs_selector;
4322 seg.base = vmcs12->host_gs_base;
4323 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4324 seg = (struct kvm_segment) {
4325 .base = vmcs12->host_tr_base,
4326 .limit = 0x67,
4327 .selector = vmcs12->host_tr_selector,
4328 .type = 11,
4329 .present = 1
4330 };
4331 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4332
4333 kvm_set_dr(vcpu, 7, 0x400);
4334 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4335
4336 if (cpu_has_vmx_msr_bitmap())
4337 vmx_update_msr_bitmap(vcpu);
4338
4339 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4340 vmcs12->vm_exit_msr_load_count))
4341 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4342 }
4343
4344 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4345 {
4346 struct vmx_uret_msr *efer_msr;
4347 unsigned int i;
4348
4349 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4350 return vmcs_read64(GUEST_IA32_EFER);
4351
4352 if (cpu_has_load_ia32_efer())
4353 return host_efer;
4354
4355 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4356 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4357 return vmx->msr_autoload.guest.val[i].value;
4358 }
4359
4360 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
4361 if (efer_msr)
4362 return efer_msr->data;
4363
4364 return host_efer;
4365 }
4366
4367 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4368 {
4369 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4370 struct vcpu_vmx *vmx = to_vmx(vcpu);
4371 struct vmx_msr_entry g, h;
4372 gpa_t gpa;
4373 u32 i, j;
4374
4375 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4376
4377 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4378 /*
4379 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4380 * as vmcs01.GUEST_DR7 contains a userspace defined value
4381 * and vcpu->arch.dr7 is not squirreled away before the
4382 * nested VMENTER (not worth adding a variable in nested_vmx).
4383 */
4384 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4385 kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4386 else
4387 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4388 }
4389
4390 /*
4391 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4392 * handle a variety of side effects to KVM's software model.
4393 */
4394 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4395
4396 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4397 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4398
4399 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4400 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4401
4402 nested_ept_uninit_mmu_context(vcpu);
4403 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4404 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4405
4406 /*
4407 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4408 * from vmcs01 (if necessary). The PDPTRs are not loaded on
4409 * VMFail; like everything else, we just need to ensure our
4410 * software model is up-to-date.
4411 */
4412 if (enable_ept && is_pae_paging(vcpu))
4413 ept_save_pdptrs(vcpu);
4414
4415 kvm_mmu_reset_context(vcpu);
4416
4417 if (cpu_has_vmx_msr_bitmap())
4418 vmx_update_msr_bitmap(vcpu);
4419
4420 /*
4421 * This nasty bit of open coding is a compromise between blindly
4422 * loading L1's MSRs using the exit load lists (incorrect emulation
4423 * of VMFail), leaving the nested VM's MSRs in the software model
4424 * (incorrect behavior) and snapshotting the modified MSRs (too
4425 * expensive since the lists are unbound by hardware). For each
4426 * MSR that was (prematurely) loaded from the nested VMEntry load
4427 * list, reload it from the exit load list if it exists and differs
4428 * from the guest value. The intent is to stuff host state as
4429 * silently as possible, not to fully process the exit load list.
4430 */
4431 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4432 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4433 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4434 pr_debug_ratelimited(
4435 "%s read MSR index failed (%u, 0x%08llx)\n",
4436 __func__, i, gpa);
4437 goto vmabort;
4438 }
4439
4440 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4441 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4442 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4443 pr_debug_ratelimited(
4444 "%s read MSR failed (%u, 0x%08llx)\n",
4445 __func__, j, gpa);
4446 goto vmabort;
4447 }
4448 if (h.index != g.index)
4449 continue;
4450 if (h.value == g.value)
4451 break;
4452
4453 if (nested_vmx_load_msr_check(vcpu, &h)) {
4454 pr_debug_ratelimited(
4455 "%s check failed (%u, 0x%x, 0x%x)\n",
4456 __func__, j, h.index, h.reserved);
4457 goto vmabort;
4458 }
4459
4460 if (kvm_set_msr(vcpu, h.index, h.value)) {
4461 pr_debug_ratelimited(
4462 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4463 __func__, j, h.index, h.value);
4464 goto vmabort;
4465 }
4466 }
4467 }
4468
4469 return;
4470
4471 vmabort:
4472 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4473 }
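
/*
 * Stand-alone sketch of the reconciliation loop above: for every MSR in
 * the already-consumed VM-entry load list, look for a matching entry in
 * the VM-exit load list and restore that value if it differs.  Plain
 * arrays stand in for the guest-memory reads; msr_write() is a
 * hypothetical stand-in for kvm_set_msr(), and the validity check done
 * by nested_vmx_load_msr_check() is omitted.
 */
#include <stddef.h>
#include <stdint.h>

struct msr_entry {
	uint32_t index;
	uint64_t value;
};

static void restore_host_msrs(const struct msr_entry *entry_list, size_t entry_n,
			      const struct msr_entry *exit_list, size_t exit_n,
			      void (*msr_write)(uint32_t index, uint64_t value))
{
	for (size_t i = 0; i < entry_n; i++) {
		for (size_t j = 0; j < exit_n; j++) {
			if (exit_list[j].index != entry_list[i].index)
				continue;
			if (exit_list[j].value != entry_list[i].value)
				msr_write(exit_list[j].index, exit_list[j].value);
			break;	/* first matching exit-list entry wins */
		}
	}
}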
4474
4475 /*
4476 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4477 * and modify vmcs12 to make it see what it would expect to see there if
4478 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4479 */
4480 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
4481 u32 exit_intr_info, unsigned long exit_qualification)
4482 {
4483 struct vcpu_vmx *vmx = to_vmx(vcpu);
4484 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4485
4486 /* trying to cancel vmlaunch/vmresume is a bug */
4487 WARN_ON_ONCE(vmx->nested.nested_run_pending);
4488
4489 /* Similarly, triple faults in L2 should never escape. */
4490 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
4491
4492 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
4493 /*
4494 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4495 * the Enlightened VMCS after migration, and we still need to
4496 * do that when something is forcing an L2->L1 exit prior to
4497 * the first L2 run.
4498 */
4499 (void)nested_get_evmcs_page(vcpu);
4500 }
4501
4502 /* Service the TLB flush request for L2 before switching to L1. */
4503 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
4504 kvm_vcpu_flush_tlb_current(vcpu);
4505
4506 /*
4507 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4508 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
4509 * up-to-date before switching to L1.
4510 */
4511 if (enable_ept && is_pae_paging(vcpu))
4512 vmx_ept_load_pdptrs(vcpu);
4513
4514 leave_guest_mode(vcpu);
4515
4516 if (nested_cpu_has_preemption_timer(vmcs12))
4517 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4518
4519 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
4520 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
4521 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
4522 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
4523 }
4524
4525 if (likely(!vmx->fail)) {
4526 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4527
4528 if (vm_exit_reason != -1)
4529 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
4530 exit_intr_info, exit_qualification);
4531
4532 /*
4533 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4534 * also be used to capture the vmcs12 cache as part of
4535 * capturing nVMX state for a snapshot (migration).
4536 *
4537 * Otherwise, this flush will dirty guest memory at a
4538 * point where it is already assumed by user-space to be
4539 * immutable.
4540 */
4541 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4542 } else {
4543 /*
4544 * The only expected VM-instruction error is "VM entry with
4545 * invalid control field(s)." Anything else indicates a
4546 * problem with L0. And we should never get here with a
4547 * VMFail of any type if early consistency checks are enabled.
4548 */
4549 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4550 VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4551 WARN_ON_ONCE(nested_early_check);
4552 }
4553
4554 vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4555
4556 /* Update any VMCS fields that might have changed while L2 ran */
4557 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4558 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4559 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4560 if (kvm_has_tsc_control)
4561 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
4562
4563 if (vmx->nested.l1_tpr_threshold != -1)
4564 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4565
4566 if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4567 vmx->nested.change_vmcs01_virtual_apic_mode = false;
4568 vmx_set_virtual_apic_mode(vcpu);
4569 }
4570
4571 if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
4572 vmx->nested.update_vmcs01_cpu_dirty_logging = false;
4573 vmx_update_cpu_dirty_logging(vcpu);
4574 }
4575
4576 /* Unpin physical memory we referred to in vmcs02 */
4577 if (vmx->nested.apic_access_page) {
4578 kvm_release_page_clean(vmx->nested.apic_access_page);
4579 vmx->nested.apic_access_page = NULL;
4580 }
4581 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4582 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4583 vmx->nested.pi_desc = NULL;
4584
4585 if (vmx->nested.reload_vmcs01_apic_access_page) {
4586 vmx->nested.reload_vmcs01_apic_access_page = false;
4587 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4588 }
4589
4590 if ((vm_exit_reason != -1) &&
4591 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
4592 vmx->nested.need_vmcs12_to_shadow_sync = true;
4593
4594 /* in case we halted in L2 */
4595 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4596
4597 if (likely(!vmx->fail)) {
4598 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4599 nested_exit_intr_ack_set(vcpu)) {
4600 int irq = kvm_cpu_get_interrupt(vcpu);
4601 WARN_ON(irq < 0);
4602 vmcs12->vm_exit_intr_info = irq |
4603 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4604 }
4605
4606 if (vm_exit_reason != -1)
4607 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4608 vmcs12->exit_qualification,
4609 vmcs12->idt_vectoring_info_field,
4610 vmcs12->vm_exit_intr_info,
4611 vmcs12->vm_exit_intr_error_code,
4612 KVM_ISA_VMX);
4613
4614 load_vmcs12_host_state(vcpu, vmcs12);
4615
4616 return;
4617 }
4618
4619 /*
4620 * After an early L2 VM-entry failure, we're now back
4621 * in L1 which thinks it just finished a VMLAUNCH or
4622 * VMRESUME instruction, so we need to set the failure
4623 * flag and the VM-instruction error field of the VMCS
4624 * accordingly, and skip the emulated instruction.
4625 */
4626 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4627
4628 /*
4629 * Restore L1's host state to KVM's software model. We're here
4630 * because a consistency check was caught by hardware, which
4631 * means some amount of guest state has been propagated to KVM's
4632 * model and needs to be unwound to the host's state.
4633 */
4634 nested_vmx_restore_host_state(vcpu);
4635
4636 vmx->fail = 0;
4637 }
4638
4639 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
4640 {
4641 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
4642 }
4643
4644 /*
4645 * Decode the memory-address operand of a vmx instruction, as recorded on an
4646 * exit caused by such an instruction (run by a guest hypervisor).
4647 * On success, returns 0. When the operand is invalid, returns 1 and throws
4648 * #UD, #GP, or #SS.
4649 */
4650 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4651 u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4652 {
4653 gva_t off;
4654 bool exn;
4655 struct kvm_segment s;
4656
4657 /*
4658 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4659 * Execution", on an exit, vmx_instruction_info holds most of the
4660 * addressing components of the operand. Only the displacement part
4661 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4662 * For how an actual address is calculated from all these components,
4663 * refer to Vol. 1, "Operand Addressing".
4664 */
4665 int scaling = vmx_instruction_info & 3;
4666 int addr_size = (vmx_instruction_info >> 7) & 7;
4667 bool is_reg = vmx_instruction_info & (1u << 10);
4668 int seg_reg = (vmx_instruction_info >> 15) & 7;
4669 int index_reg = (vmx_instruction_info >> 18) & 0xf;
4670 bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4671 int base_reg = (vmx_instruction_info >> 23) & 0xf;
4672 bool base_is_valid = !(vmx_instruction_info & (1u << 27));
4673
4674 if (is_reg) {
4675 kvm_queue_exception(vcpu, UD_VECTOR);
4676 return 1;
4677 }
4678
4679 /* Addr = segment_base + offset */
4680 /* offset = base + [index * scale] + displacement */
4681 off = exit_qualification; /* holds the displacement */
4682 if (addr_size == 1)
4683 off = (gva_t)sign_extend64(off, 31);
4684 else if (addr_size == 0)
4685 off = (gva_t)sign_extend64(off, 15);
4686 if (base_is_valid)
4687 off += kvm_register_read(vcpu, base_reg);
4688 if (index_is_valid)
4689 off += kvm_register_read(vcpu, index_reg) << scaling;
4690 vmx_get_segment(vcpu, &s, seg_reg);
4691
4692 /*
4693 * The effective address, i.e. @off, of a memory operand is truncated
4694 * based on the address size of the instruction. Note that this is
4695 * the *effective address*, i.e. the address prior to accounting for
4696 * the segment's base.
4697 */
4698 if (addr_size == 1) /* 32 bit */
4699 off &= 0xffffffff;
4700 else if (addr_size == 0) /* 16 bit */
4701 off &= 0xffff;
4702
4703 /* Checks for #GP/#SS exceptions. */
4704 exn = false;
4705 if (is_long_mode(vcpu)) {
4706 /*
4707 * The virtual/linear address is never truncated in 64-bit
4708 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4709 * address when using FS/GS with a non-zero base.
4710 */
4711 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4712 *ret = s.base + off;
4713 else
4714 *ret = off;
4715
4716 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4717 * non-canonical form. This is the only check on the memory
4718 * destination for long mode!
4719 */
4720 exn = is_noncanonical_address(*ret, vcpu);
4721 } else {
4722 /*
4723 * When not in long mode, the virtual/linear address is
4724 * unconditionally truncated to 32 bits regardless of the
4725 * address size.
4726 */
4727 *ret = (s.base + off) & 0xffffffff;
4728
4729 /* Protected mode: apply checks for segment validity in the
4730 * following order:
4731 * - segment type check (#GP(0) may be thrown)
4732 * - usability check (#GP(0)/#SS(0))
4733 * - limit check (#GP(0)/#SS(0))
4734 */
4735 if (wr)
4736 /* #GP(0) if the destination operand is located in a
4737 * read-only data segment or any code segment.
4738 */
4739 exn = ((s.type & 0xa) == 0 || (s.type & 8));
4740 else
4741 /* #GP(0) if the source operand is located in an
4742 * execute-only code segment
4743 */
4744 exn = ((s.type & 0xa) == 8);
4745 if (exn) {
4746 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4747 return 1;
4748 }
4749 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4750 */
4751 exn = (s.unusable != 0);
4752
4753 /*
4754 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4755 * outside the segment limit. All CPUs that support VMX ignore
4756 * limit checks for flat segments, i.e. segments with base==0,
4757 * limit==0xffffffff and of type expand-up data or code.
4758 */
4759 if (!(s.base == 0 && s.limit == 0xffffffff &&
4760 ((s.type & 8) || !(s.type & 4))))
4761 exn = exn || ((u64)off + len - 1 > s.limit);
4762 }
4763 if (exn) {
4764 kvm_queue_exception_e(vcpu,
4765 seg_reg == VCPU_SREG_SS ?
4766 SS_VECTOR : GP_VECTOR,
4767 0);
4768 return 1;
4769 }
4770
4771 return 0;
4772 }
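
/*
 * Stand-alone decoder for the VMX-instruction-information bit fields
 * consumed above.  The struct and function names are illustrative; only
 * the bit positions, which match the shifts in get_vmx_mem_address(),
 * are meaningful.
 */
#include <stdbool.h>
#include <stdint.h>

struct vmx_insn_info {
	int  scaling;		/* bits 1:0   : shift applied to the index        */
	int  addr_size;		/* bits 9:7   : 0 = 16-bit, 1 = 32-bit, 2 = 64-bit */
	bool is_reg;		/* bit  10    : register (not memory) operand      */
	int  seg_reg;		/* bits 17:15 : segment register                   */
	int  index_reg;		/* bits 21:18 */
	bool index_valid;	/* bit  22 is the "index invalid" bit              */
	int  base_reg;		/* bits 26:23 */
	bool base_valid;	/* bit  27 is the "base invalid" bit               */
};

static struct vmx_insn_info decode_vmx_insn_info(uint32_t info)
{
	return (struct vmx_insn_info) {
		.scaling     = info & 3,
		.addr_size   = (info >> 7) & 7,
		.is_reg	     = info & (1u << 10),
		.seg_reg     = (info >> 15) & 7,
		.index_reg   = (info >> 18) & 0xf,
		.index_valid = !(info & (1u << 22)),
		.base_reg    = (info >> 23) & 0xf,
		.base_valid  = !(info & (1u << 27)),
	};
}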
4773
4774 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4775 {
4776 struct vcpu_vmx *vmx;
4777
4778 if (!nested_vmx_allowed(vcpu))
4779 return;
4780
4781 vmx = to_vmx(vcpu);
4782 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4783 vmx->nested.msrs.entry_ctls_high |=
4784 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4785 vmx->nested.msrs.exit_ctls_high |=
4786 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4787 } else {
4788 vmx->nested.msrs.entry_ctls_high &=
4789 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4790 vmx->nested.msrs.exit_ctls_high &=
4791 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4792 }
4793 }
4794
4795 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
4796 int *ret)
4797 {
4798 gva_t gva;
4799 struct x86_exception e;
4800 int r;
4801
4802 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
4803 vmcs_read32(VMX_INSTRUCTION_INFO), false,
4804 sizeof(*vmpointer), &gva)) {
4805 *ret = 1;
4806 return -EINVAL;
4807 }
4808
4809 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
4810 if (r != X86EMUL_CONTINUE) {
4811 *ret = kvm_handle_memory_failure(vcpu, r, &e);
4812 return -EINVAL;
4813 }
4814
4815 return 0;
4816 }
4817
4818 /*
4819 * Allocate a shadow VMCS and associate it with the currently loaded
4820 * VMCS, unless such a shadow VMCS already exists. The newly allocated
4821 * VMCS is also VMCLEARed, so that it is ready for use.
4822 */
4823 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4824 {
4825 struct vcpu_vmx *vmx = to_vmx(vcpu);
4826 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4827
4828 /*
4829 * We should allocate a shadow vmcs for vmcs01 only when L1
4830 * executes VMXON and free it when L1 executes VMXOFF.
4831 * As it is invalid to execute VMXON twice, we shouldn't reach
4832 * here when vmcs01 already have an allocated shadow vmcs.
4833 */
4834 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4835
4836 if (!loaded_vmcs->shadow_vmcs) {
4837 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4838 if (loaded_vmcs->shadow_vmcs)
4839 vmcs_clear(loaded_vmcs->shadow_vmcs);
4840 }
4841 return loaded_vmcs->shadow_vmcs;
4842 }
4843
4844 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4845 {
4846 struct vcpu_vmx *vmx = to_vmx(vcpu);
4847 int r;
4848
4849 r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4850 if (r < 0)
4851 goto out_vmcs02;
4852
4853 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4854 if (!vmx->nested.cached_vmcs12)
4855 goto out_cached_vmcs12;
4856
4857 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4858 if (!vmx->nested.cached_shadow_vmcs12)
4859 goto out_cached_shadow_vmcs12;
4860
4861 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4862 goto out_shadow_vmcs;
4863
4864 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4865 HRTIMER_MODE_ABS_PINNED);
4866 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4867
4868 vmx->nested.vpid02 = allocate_vpid();
4869
4870 vmx->nested.vmcs02_initialized = false;
4871 vmx->nested.vmxon = true;
4872
4873 if (vmx_pt_mode_is_host_guest()) {
4874 vmx->pt_desc.guest.ctl = 0;
4875 pt_update_intercept_for_msr(vcpu);
4876 }
4877
4878 return 0;
4879
4880 out_shadow_vmcs:
4881 kfree(vmx->nested.cached_shadow_vmcs12);
4882
4883 out_cached_shadow_vmcs12:
4884 kfree(vmx->nested.cached_vmcs12);
4885
4886 out_cached_vmcs12:
4887 free_loaded_vmcs(&vmx->nested.vmcs02);
4888
4889 out_vmcs02:
4890 return -ENOMEM;
4891 }
4892
4893 /*
4894 * Emulate the VMXON instruction.
4895 * Currently, we just remember that VMX is active, and do not store anything
4896 * in the guest-allocated memory region pointed to by the VMXON pointer. We
4897 * do remember the pointer itself and verify that its first four bytes match
4898 * the supported VMCS revision identifier, and VMCLEAR and VMPTRLD reject the
4899 * VMXON pointer as their argument (as the spec requires).
4900 */
4901 static int handle_vmon(struct kvm_vcpu *vcpu)
4902 {
4903 int ret;
4904 gpa_t vmptr;
4905 uint32_t revision;
4906 struct vcpu_vmx *vmx = to_vmx(vcpu);
4907 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4908 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
4909
4910 /*
4911 * The Intel VMX Instruction Reference lists a bunch of bits that are
4912 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4913 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
4914 * Otherwise, we should fail with #UD. But most faulting conditions
4915 * have already been checked by hardware, prior to the VM-exit for
4916 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4917 * that bit set to 1 in non-root mode.
4918 */
4919 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4920 kvm_queue_exception(vcpu, UD_VECTOR);
4921 return 1;
4922 }
4923
4924 /* CPL=0 must be checked manually. */
4925 if (vmx_get_cpl(vcpu)) {
4926 kvm_inject_gp(vcpu, 0);
4927 return 1;
4928 }
4929
4930 if (vmx->nested.vmxon)
4931 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4932
4933 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4934 != VMXON_NEEDED_FEATURES) {
4935 kvm_inject_gp(vcpu, 0);
4936 return 1;
4937 }
4938
4939 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
4940 return ret;
4941
4942 /*
4943 * SDM 3: 24.11.5
4944 * The first 4 bytes of the VMXON region contain the supported
4945 * VMCS revision identifier.
4946 *
4947 * Note: IA32_VMX_BASIC[48] will never be 1 for the nested case, i.e.
4948 * the physical addresses of VMX structures are never limited to 32 bits.
4949 */
4950 if (!page_address_valid(vcpu, vmptr))
4951 return nested_vmx_failInvalid(vcpu);
4952
4953 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4954 revision != VMCS12_REVISION)
4955 return nested_vmx_failInvalid(vcpu);
4956
4957 vmx->nested.vmxon_ptr = vmptr;
4958 ret = enter_vmx_operation(vcpu);
4959 if (ret)
4960 return ret;
4961
4962 return nested_vmx_succeed(vcpu);
4963 }
4964
4965 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4966 {
4967 struct vcpu_vmx *vmx = to_vmx(vcpu);
4968
4969 if (vmx->nested.current_vmptr == -1ull)
4970 return;
4971
4972 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4973
4974 if (enable_shadow_vmcs) {
4975 /* copy to memory all shadowed fields in case
4976  * they were modified */
4977 copy_shadow_to_vmcs12(vmx);
4978 vmx_disable_shadow_vmcs(vmx);
4979 }
4980 vmx->nested.posted_intr_nv = -1;
4981
4982 /* Flush VMCS12 to guest memory */
4983 kvm_vcpu_write_guest_page(vcpu,
4984 vmx->nested.current_vmptr >> PAGE_SHIFT,
4985 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4986
4987 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4988
4989 vmx->nested.current_vmptr = -1ull;
4990 }
4991
4992 /* Emulate the VMXOFF instruction */
4993 static int handle_vmoff(struct kvm_vcpu *vcpu)
4994 {
4995 if (!nested_vmx_check_permission(vcpu))
4996 return 1;
4997
4998 free_nested(vcpu);
4999
5000 /* Process a latched INIT during the time the CPU was in VMX operation */
5001 kvm_make_request(KVM_REQ_EVENT, vcpu);
5002
5003 return nested_vmx_succeed(vcpu);
5004 }
5005
5006 /* Emulate the VMCLEAR instruction */
5007 static int handle_vmclear(struct kvm_vcpu *vcpu)
5008 {
5009 struct vcpu_vmx *vmx = to_vmx(vcpu);
5010 u32 zero = 0;
5011 gpa_t vmptr;
5012 u64 evmcs_gpa;
5013 int r;
5014
5015 if (!nested_vmx_check_permission(vcpu))
5016 return 1;
5017
5018 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5019 return r;
5020
5021 if (!page_address_valid(vcpu, vmptr))
5022 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5023
5024 if (vmptr == vmx->nested.vmxon_ptr)
5025 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5026
5027 /*
5028 * When Enlightened VMEntry is enabled on the calling CPU we treat
5029 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good
5030 * way to distinguish it from VMCS12) and we must not corrupt it by
5031 * writing to the non-existent 'launch_state' field. The area doesn't
5032 * have to be the currently active EVMCS on the calling CPU and there's
5033 * nothing KVM has to do to transition it from 'active' to 'non-active'
5034 * state. It is possible that the area will stay mapped as
5035 * vmx->nested.hv_evmcs but this shouldn't be a problem.
5036 */
5037 if (likely(!vmx->nested.enlightened_vmcs_enabled ||
5038 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
5039 if (vmptr == vmx->nested.current_vmptr)
5040 nested_release_vmcs12(vcpu);
5041
5042 kvm_vcpu_write_guest(vcpu,
5043 vmptr + offsetof(struct vmcs12,
5044 launch_state),
5045 &zero, sizeof(zero));
5046 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
5047 nested_release_evmcs(vcpu);
5048 }
5049
5050 return nested_vmx_succeed(vcpu);
5051 }
5052
5053 /* Emulate the VMLAUNCH instruction */
5054 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5055 {
5056 return nested_vmx_run(vcpu, true);
5057 }
5058
5059 /* Emulate the VMRESUME instruction */
5060 static int handle_vmresume(struct kvm_vcpu *vcpu)
5061 {
5062
5063 return nested_vmx_run(vcpu, false);
5064 }
5065
5066 static int handle_vmread(struct kvm_vcpu *vcpu)
5067 {
5068 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5069 : get_vmcs12(vcpu);
5070 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5071 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5072 struct vcpu_vmx *vmx = to_vmx(vcpu);
5073 struct x86_exception e;
5074 unsigned long field;
5075 u64 value;
5076 gva_t gva = 0;
5077 short offset;
5078 int len, r;
5079
5080 if (!nested_vmx_check_permission(vcpu))
5081 return 1;
5082
5083 /*
5084 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5085 * any VMREAD sets the ALU flags for VMfailInvalid.
5086 */
5087 if (vmx->nested.current_vmptr == -1ull ||
5088 (is_guest_mode(vcpu) &&
5089 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
5090 return nested_vmx_failInvalid(vcpu);
5091
5092 /* Decode instruction info and find the field to read */
5093 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5094
5095 offset = vmcs_field_to_offset(field);
5096 if (offset < 0)
5097 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5098
5099 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
5100 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5101
5102 /* Read the field, zero-extended to a u64 value */
5103 value = vmcs12_read_any(vmcs12, field, offset);
5104
5105 /*
5106 * Now copy part of this value to register or memory, as requested.
5107 * Note that the number of bits actually copied is 32 or 64 depending
5108 * on the guest's mode (32 or 64 bit), not on the given field's length.
5109 */
5110 if (instr_info & BIT(10)) {
5111 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
5112 } else {
5113 len = is_64_bit_mode(vcpu) ? 8 : 4;
5114 if (get_vmx_mem_address(vcpu, exit_qualification,
5115 instr_info, true, len, &gva))
5116 return 1;
5117 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
5118 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
5119 if (r != X86EMUL_CONTINUE)
5120 return kvm_handle_memory_failure(vcpu, r, &e);
5121 }
5122
5123 return nested_vmx_succeed(vcpu);
5124 }
5125
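/*
 * A minimal sketch of the VMX_INSTRUCTION_INFO decode used by handle_vmread()
 * above (and by handle_vmwrite() below), assuming the same bit layout: bit 10
 * selects a register (rather than memory) operand, bits 6:3 name that
 * register, and bits 31:28 name the register holding the VMCS field encoding.
 */
struct vmx_instr_info_sketch {
        bool reg_operand;       /* instr_info & BIT(10) */
        int value_reg;          /* bits 6:3, only meaningful for register operands */
        int field_reg;          /* bits 31:28 */
};

static inline struct vmx_instr_info_sketch
vmx_decode_instr_info_sketch(u32 instr_info)
{
        struct vmx_instr_info_sketch d = {
                .reg_operand    = instr_info & BIT(10),
                .value_reg      = (instr_info >> 3) & 0xf,
                .field_reg      = (instr_info >> 28) & 0xf,
        };

        return d;
}
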
5126 static bool is_shadow_field_rw(unsigned long field)
5127 {
5128 switch (field) {
5129 #define SHADOW_FIELD_RW(x, y) case x:
5130 #include "vmcs_shadow_fields.h"
5131 return true;
5132 default:
5133 break;
5134 }
5135 return false;
5136 }
5137
5138 static bool is_shadow_field_ro(unsigned long field)
5139 {
5140 switch (field) {
5141 #define SHADOW_FIELD_RO(x, y) case x:
5142 #include "vmcs_shadow_fields.h"
5143 return true;
5144 default:
5145 break;
5146 }
5147 return false;
5148 }
5149
5150 static int handle_vmwrite(struct kvm_vcpu *vcpu)
5151 {
5152 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5153 : get_vmcs12(vcpu);
5154 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5155 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5156 struct vcpu_vmx *vmx = to_vmx(vcpu);
5157 struct x86_exception e;
5158 unsigned long field;
5159 short offset;
5160 gva_t gva;
5161 int len, r;
5162
5163 /*
5164 * The value to write might be 32 or 64 bits, depending on L1's long
5165 * mode, and eventually we need to write that into a field of several
5166 * possible lengths. The code below first zero-extends the value to 64
5167 * bit (value), and then copies only the appropriate number of
5168 * bits into the vmcs12 field.
5169 */
5170 u64 value = 0;
5171
5172 if (!nested_vmx_check_permission(vcpu))
5173 return 1;
5174
5175 /*
5176 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5177 * any VMWRITE sets the ALU flags for VMfailInvalid.
5178 */
5179 if (vmx->nested.current_vmptr == -1ull ||
5180 (is_guest_mode(vcpu) &&
5181 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
5182 return nested_vmx_failInvalid(vcpu);
5183
5184 if (instr_info & BIT(10))
5185 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf));
5186 else {
5187 len = is_64_bit_mode(vcpu) ? 8 : 4;
5188 if (get_vmx_mem_address(vcpu, exit_qualification,
5189 instr_info, false, len, &gva))
5190 return 1;
5191 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
5192 if (r != X86EMUL_CONTINUE)
5193 return kvm_handle_memory_failure(vcpu, r, &e);
5194 }
5195
5196 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5197
5198 offset = vmcs_field_to_offset(field);
5199 if (offset < 0)
5200 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
5201
5202 /*
5203 * If the vCPU supports "VMWRITE to any supported field in the
5204 * VMCS," then the "read-only" fields are actually read/write.
5205 */
5206 if (vmcs_field_readonly(field) &&
5207 !nested_cpu_has_vmwrite_any_field(vcpu))
5208 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
5209
5210 /*
5211 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5212 * vmcs12, else we may clobber a field or consume a stale value.
5213 */
5214 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
5215 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5216
5217 /*
5218 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5219 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5220 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5221 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5222 * from L1 will return a different value than VMREAD from L2 (L1 sees
5223 * the stripped down value, L2 sees the full value as stored by KVM).
5224 */
5225 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
5226 value &= 0x1f0ff;
5227
5228 vmcs12_write_any(vmcs12, field, offset, value);
5229
5230 /*
5231 * Do not track vmcs12 dirty-state if in guest-mode as we actually
5232 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5233 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5234 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5235 */
5236 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
5237 /*
5238 * L1 can read these fields without exiting, ensure the
5239 * shadow VMCS is up-to-date.
5240 */
5241 if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
5242 preempt_disable();
5243 vmcs_load(vmx->vmcs01.shadow_vmcs);
5244
5245 __vmcs_writel(field, value);
5246
5247 vmcs_clear(vmx->vmcs01.shadow_vmcs);
5248 vmcs_load(vmx->loaded_vmcs->vmcs);
5249 preempt_enable();
5250 }
5251 vmx->nested.dirty_vmcs12 = true;
5252 }
5253
5254 return nested_vmx_succeed(vcpu);
5255 }
5256
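/*
 * A minimal sketch of the access-rights mask applied in handle_vmwrite()
 * above: 0x1f0ff keeps the architectural AR bits (type 3:0, S 4, DPL 6:5,
 * P 7, AVL 12, L 13, D/B 14, G 15, unusable 16) and strips the reserved
 * bits 11:8 and 31:17, matching the hardware behavior being emulated.
 */
#define VMX_AR_BYTES_KEPT_SKETCH        0x1f0ffu
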
5257 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
5258 {
5259 vmx->nested.current_vmptr = vmptr;
5260 if (enable_shadow_vmcs) {
5261 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
5262 vmcs_write64(VMCS_LINK_POINTER,
5263 __pa(vmx->vmcs01.shadow_vmcs));
5264 vmx->nested.need_vmcs12_to_shadow_sync = true;
5265 }
5266 vmx->nested.dirty_vmcs12 = true;
5267 }
5268
5269 /* Emulate the VMPTRLD instruction */
5270 static int handle_vmptrld(struct kvm_vcpu *vcpu)
5271 {
5272 struct vcpu_vmx *vmx = to_vmx(vcpu);
5273 gpa_t vmptr;
5274 int r;
5275
5276 if (!nested_vmx_check_permission(vcpu))
5277 return 1;
5278
5279 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
5280 return r;
5281
5282 if (!page_address_valid(vcpu, vmptr))
5283 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
5284
5285 if (vmptr == vmx->nested.vmxon_ptr)
5286 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
5287
5288 /* Forbid normal VMPTRLD if Enlightened version was used */
5289 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
5290 return 1;
5291
5292 if (vmx->nested.current_vmptr != vmptr) {
5293 struct kvm_host_map map;
5294 struct vmcs12 *new_vmcs12;
5295
5296 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
5297 /*
5298 * Reads from an unbacked page return all 1s,
5299 * which means that the 32 bits located at the
5300 * given physical address won't match the required
5301 * VMCS12_REVISION identifier.
5302 */
5303 return nested_vmx_fail(vcpu,
5304 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5305 }
5306
5307 new_vmcs12 = map.hva;
5308
5309 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5310 (new_vmcs12->hdr.shadow_vmcs &&
5311 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5312 kvm_vcpu_unmap(vcpu, &map, false);
5313 return nested_vmx_fail(vcpu,
5314 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5315 }
5316
5317 nested_release_vmcs12(vcpu);
5318
5319 /*
5320 * Load VMCS12 from guest memory since it is not already
5321 * cached.
5322 */
5323 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
5324 kvm_vcpu_unmap(vcpu, &map, false);
5325
5326 set_current_vmptr(vmx, vmptr);
5327 }
5328
5329 return nested_vmx_succeed(vcpu);
5330 }
5331
5332 /* Emulate the VMPTRST instruction */
5333 static int handle_vmptrst(struct kvm_vcpu *vcpu)
5334 {
5335 unsigned long exit_qual = vmx_get_exit_qual(vcpu);
5336 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5337 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5338 struct x86_exception e;
5339 gva_t gva;
5340 int r;
5341
5342 if (!nested_vmx_check_permission(vcpu))
5343 return 1;
5344
5345 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
5346 return 1;
5347
5348 if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5349 true, sizeof(gpa_t), &gva))
5350 return 1;
5351 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5352 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5353 sizeof(gpa_t), &e);
5354 if (r != X86EMUL_CONTINUE)
5355 return kvm_handle_memory_failure(vcpu, r, &e);
5356
5357 return nested_vmx_succeed(vcpu);
5358 }
5359
5360 #define EPTP_PA_MASK GENMASK_ULL(51, 12)
5361
5362 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
5363 {
5364 return VALID_PAGE(root_hpa) &&
5365 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
5366 }
5367
5368 /* Emulate the INVEPT instruction */
5369 static int handle_invept(struct kvm_vcpu *vcpu)
5370 {
5371 struct vcpu_vmx *vmx = to_vmx(vcpu);
5372 u32 vmx_instruction_info, types;
5373 unsigned long type, roots_to_free;
5374 struct kvm_mmu *mmu;
5375 gva_t gva;
5376 struct x86_exception e;
5377 struct {
5378 u64 eptp, gpa;
5379 } operand;
5380 int i, r;
5381
5382 if (!(vmx->nested.msrs.secondary_ctls_high &
5383 SECONDARY_EXEC_ENABLE_EPT) ||
5384 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5385 kvm_queue_exception(vcpu, UD_VECTOR);
5386 return 1;
5387 }
5388
5389 if (!nested_vmx_check_permission(vcpu))
5390 return 1;
5391
5392 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5393 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
5394
5395 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5396
5397 if (type >= 32 || !(types & (1 << type)))
5398 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5399
5400 /* According to the Intel VMX instruction reference, the memory
5401 * operand is read even if it isn't needed (e.g., for type==global)
5402 */
5403 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5404 vmx_instruction_info, false, sizeof(operand), &gva))
5405 return 1;
5406 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5407 if (r != X86EMUL_CONTINUE)
5408 return kvm_handle_memory_failure(vcpu, r, &e);
5409
5410 /*
5411 * Nested EPT roots are always held through guest_mmu,
5412 * not root_mmu.
5413 */
5414 mmu = &vcpu->arch.guest_mmu;
5415
5416 switch (type) {
5417 case VMX_EPT_EXTENT_CONTEXT:
5418 if (!nested_vmx_check_eptp(vcpu, operand.eptp))
5419 return nested_vmx_fail(vcpu,
5420 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5421
5422 roots_to_free = 0;
5423 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
5424 operand.eptp))
5425 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5426
5427 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5428 if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
5429 mmu->prev_roots[i].pgd,
5430 operand.eptp))
5431 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5432 }
5433 break;
5434 case VMX_EPT_EXTENT_GLOBAL:
5435 roots_to_free = KVM_MMU_ROOTS_ALL;
5436 break;
5437 default:
5438 BUG();
5439 break;
5440 }
5441
5442 if (roots_to_free)
5443 kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
5444
5445 return nested_vmx_succeed(vcpu);
5446 }
5447
5448 static int handle_invvpid(struct kvm_vcpu *vcpu)
5449 {
5450 struct vcpu_vmx *vmx = to_vmx(vcpu);
5451 u32 vmx_instruction_info;
5452 unsigned long type, types;
5453 gva_t gva;
5454 struct x86_exception e;
5455 struct {
5456 u64 vpid;
5457 u64 gla;
5458 } operand;
5459 u16 vpid02;
5460 int r;
5461
5462 if (!(vmx->nested.msrs.secondary_ctls_high &
5463 SECONDARY_EXEC_ENABLE_VPID) ||
5464 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5465 kvm_queue_exception(vcpu, UD_VECTOR);
5466 return 1;
5467 }
5468
5469 if (!nested_vmx_check_permission(vcpu))
5470 return 1;
5471
5472 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5473 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
5474
5475 types = (vmx->nested.msrs.vpid_caps &
5476 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5477
5478 if (type >= 32 || !(types & (1 << type)))
5479 return nested_vmx_fail(vcpu,
5480 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5481
5482 /* According to the Intel VMX instruction reference, the memory
5483 * operand is read even if it isn't needed (e.g., for type==global)
5484 */
5485 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
5486 vmx_instruction_info, false, sizeof(operand), &gva))
5487 return 1;
5488 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
5489 if (r != X86EMUL_CONTINUE)
5490 return kvm_handle_memory_failure(vcpu, r, &e);
5491
5492 if (operand.vpid >> 16)
5493 return nested_vmx_fail(vcpu,
5494 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5495
5496 vpid02 = nested_get_vpid02(vcpu);
5497 switch (type) {
5498 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5499 if (!operand.vpid ||
5500 is_noncanonical_address(operand.gla, vcpu))
5501 return nested_vmx_fail(vcpu,
5502 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5503 vpid_sync_vcpu_addr(vpid02, operand.gla);
5504 break;
5505 case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5506 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5507 if (!operand.vpid)
5508 return nested_vmx_fail(vcpu,
5509 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5510 vpid_sync_context(vpid02);
5511 break;
5512 case VMX_VPID_EXTENT_ALL_CONTEXT:
5513 vpid_sync_context(vpid02);
5514 break;
5515 default:
5516 WARN_ON_ONCE(1);
5517 return kvm_skip_emulated_instruction(vcpu);
5518 }
5519
5520 /*
5521 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
5522 * linear mappings for L2 (tagged with L2's VPID). Free all roots as
5523 * VPIDs are not tracked in the MMU role.
5524 *
5525 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
5526 * an MMU when EPT is disabled.
5527 *
5528 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
5529 */
5530 if (!enable_ept)
5531 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
5532 KVM_MMU_ROOTS_ALL);
5533
5534 return nested_vmx_succeed(vcpu);
5535 }
5536
5537 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5538 struct vmcs12 *vmcs12)
5539 {
5540 u32 index = kvm_rcx_read(vcpu);
5541 u64 new_eptp;
5542 bool accessed_dirty;
5543 struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5544
5545 if (!nested_cpu_has_eptp_switching(vmcs12) ||
5546 !nested_cpu_has_ept(vmcs12))
5547 return 1;
5548
5549 if (index >= VMFUNC_EPTP_ENTRIES)
5550 return 1;
5551
5552
5553 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5554 &new_eptp, index * 8, 8))
5555 return 1;
5556
5557 accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
5558
5559 /*
5560 * If the (L2) guest does a vmfunc to the currently
5561 * active ept pointer, we don't have to do anything else
5562 */
5563 if (vmcs12->ept_pointer != new_eptp) {
5564 if (!nested_vmx_check_eptp(vcpu, new_eptp))
5565 return 1;
5566
5567 mmu->ept_ad = accessed_dirty;
5568 mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5569 vmcs12->ept_pointer = new_eptp;
5570
5571 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
5572 }
5573
5574 return 0;
5575 }
5576
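/*
 * A minimal sketch of the EPTP-list lookup performed above: the list is a
 * guest page holding up to VMFUNC_EPTP_ENTRIES 64-bit EPT pointers, indexed
 * by the value L2 passed in ECX.
 */
static inline gpa_t vmfunc_eptp_entry_gpa_sketch(struct vmcs12 *vmcs12, u32 index)
{
        return vmcs12->eptp_list_address + index * sizeof(u64);
}
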
5577 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5578 {
5579 struct vcpu_vmx *vmx = to_vmx(vcpu);
5580 struct vmcs12 *vmcs12;
5581 u32 function = kvm_rax_read(vcpu);
5582
5583 /*
5584 * VMFUNC is only supported for nested guests, but we always enable the
5585 * secondary control for simplicity; for non-nested mode, fake that we
5586 * didn't enable it by injecting #UD.
5587 */
5588 if (!is_guest_mode(vcpu)) {
5589 kvm_queue_exception(vcpu, UD_VECTOR);
5590 return 1;
5591 }
5592
5593 vmcs12 = get_vmcs12(vcpu);
5594 if ((vmcs12->vm_function_control & (1 << function)) == 0)
5595 goto fail;
5596
5597 switch (function) {
5598 case 0:
5599 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5600 goto fail;
5601 break;
5602 default:
5603 goto fail;
5604 }
5605 return kvm_skip_emulated_instruction(vcpu);
5606
5607 fail:
5608 /*
5609 * This is effectively a reflected VM-Exit, as opposed to a synthesized
5610 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
5611 * EXIT_REASON_VMFUNC as the exit reason.
5612 */
5613 nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
5614 vmx_get_intr_info(vcpu),
5615 vmx_get_exit_qual(vcpu));
5616 return 1;
5617 }
5618
5619 /*
5620 * Return true if an IO instruction with the specified port and size should cause
5621 * a VM-exit into L1.
5622 */
5623 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
5624 int size)
5625 {
5626 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5627 gpa_t bitmap, last_bitmap;
5628 u8 b;
5629
5630 last_bitmap = (gpa_t)-1;
5631 b = -1;
5632
5633 while (size > 0) {
5634 if (port < 0x8000)
5635 bitmap = vmcs12->io_bitmap_a;
5636 else if (port < 0x10000)
5637 bitmap = vmcs12->io_bitmap_b;
5638 else
5639 return true;
5640 bitmap += (port & 0x7fff) / 8;
5641
5642 if (last_bitmap != bitmap)
5643 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5644 return true;
5645 if (b & (1 << (port & 7)))
5646 return true;
5647
5648 port++;
5649 size--;
5650 last_bitmap = bitmap;
5651 }
5652
5653 return false;
5654 }
5655
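/*
 * A minimal sketch of the I/O-bitmap lookup above for a single port, assuming
 * the same split used by hardware: ports 0x0000-0x7fff are covered by
 * io_bitmap_a and ports 0x8000-0xffff by io_bitmap_b, one bit per port.
 */
static inline gpa_t io_bitmap_byte_gpa_sketch(struct vmcs12 *vmcs12,
                                              unsigned int port)
{
        gpa_t bitmap = port < 0x8000 ? vmcs12->io_bitmap_a : vmcs12->io_bitmap_b;

        return bitmap + (port & 0x7fff) / 8;    /* bit (port & 7) of this byte */
}
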
5656 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5657 struct vmcs12 *vmcs12)
5658 {
5659 unsigned long exit_qualification;
5660 unsigned short port;
5661 int size;
5662
5663 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5664 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5665
5666 exit_qualification = vmx_get_exit_qual(vcpu);
5667
5668 port = exit_qualification >> 16;
5669 size = (exit_qualification & 7) + 1;
5670
5671 return nested_vmx_check_io_bitmaps(vcpu, port, size);
5672 }
5673
5674 /*
5675 * Return true if we should exit from L2 to L1 to handle an MSR access,
5676 * rather than handle it ourselves in L0, i.e. check whether L1 wants to
5677 * intercept the current MSR read or write via its MSR bitmap. This may be
5678 * the case even when L0 itself doesn't use MSR bitmaps.
5679 */
5680 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5681 struct vmcs12 *vmcs12,
5682 union vmx_exit_reason exit_reason)
5683 {
5684 u32 msr_index = kvm_rcx_read(vcpu);
5685 gpa_t bitmap;
5686
5687 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5688 return true;
5689
5690 /*
5691 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5692 * for the four combinations of read/write and low/high MSR numbers.
5693 * First we need to figure out which of the four to use:
5694 */
5695 bitmap = vmcs12->msr_bitmap;
5696 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
5697 bitmap += 2048;
5698 if (msr_index >= 0xc0000000) {
5699 msr_index -= 0xc0000000;
5700 bitmap += 1024;
5701 }
5702
5703 /* Then read the msr_index'th bit from this bitmap: */
5704 if (msr_index < 1024*8) {
5705 unsigned char b;
5706 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5707 return true;
5708 return 1 & (b >> (msr_index & 7));
5709 } else
5710 return true; /* let L1 handle the wrong parameter */
5711 }
5712
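/*
 * A minimal sketch of the MSR-bitmap offset computation above, assuming the
 * layout described in the comment: read bitmaps occupy the first 2048 bytes,
 * write bitmaps the second 2048, and within each half the low MSRs
 * (0x00000000-0x00001fff) come before the high MSRs (0xc0000000-0xc0001fff).
 */
static inline gpa_t msr_bitmap_byte_gpa_sketch(gpa_t bitmap, u32 msr_index,
                                               bool write)
{
        if (write)
                bitmap += 2048;
        if (msr_index >= 0xc0000000) {
                msr_index -= 0xc0000000;
                bitmap += 1024;
        }
        return bitmap + msr_index / 8;  /* bit (msr_index & 7) of this byte */
}
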
5713 /*
5714 * Return true if we should exit from L2 to L1 to handle a CR access exit,
5715 * rather than handle it ourselves in L0, i.e. check whether L1 wants to
5716 * intercept the current event (via guest_host_mask etc.).
5717 */
5718 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5719 struct vmcs12 *vmcs12)
5720 {
5721 unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5722 int cr = exit_qualification & 15;
5723 int reg;
5724 unsigned long val;
5725
5726 switch ((exit_qualification >> 4) & 3) {
5727 case 0: /* mov to cr */
5728 reg = (exit_qualification >> 8) & 15;
5729 val = kvm_register_read(vcpu, reg);
5730 switch (cr) {
5731 case 0:
5732 if (vmcs12->cr0_guest_host_mask &
5733 (val ^ vmcs12->cr0_read_shadow))
5734 return true;
5735 break;
5736 case 3:
5737 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5738 return true;
5739 break;
5740 case 4:
5741 if (vmcs12->cr4_guest_host_mask &
5742 (vmcs12->cr4_read_shadow ^ val))
5743 return true;
5744 break;
5745 case 8:
5746 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5747 return true;
5748 break;
5749 }
5750 break;
5751 case 2: /* clts */
5752 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5753 (vmcs12->cr0_read_shadow & X86_CR0_TS))
5754 return true;
5755 break;
5756 case 1: /* mov from cr */
5757 switch (cr) {
5758 case 3:
5759 if (vmcs12->cpu_based_vm_exec_control &
5760 CPU_BASED_CR3_STORE_EXITING)
5761 return true;
5762 break;
5763 case 8:
5764 if (vmcs12->cpu_based_vm_exec_control &
5765 CPU_BASED_CR8_STORE_EXITING)
5766 return true;
5767 break;
5768 }
5769 break;
5770 case 3: /* lmsw */
5771 /*
5772 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5773 * cr0. Other attempted changes are ignored, with no exit.
5774 */
5775 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5776 if (vmcs12->cr0_guest_host_mask & 0xe &
5777 (val ^ vmcs12->cr0_read_shadow))
5778 return true;
5779 if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5780 !(vmcs12->cr0_read_shadow & 0x1) &&
5781 (val & 0x1))
5782 return true;
5783 break;
5784 }
5785 return false;
5786 }
5787
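/*
 * A minimal sketch of the guest/host-mask test used above for MOV-to-CR0:
 * L1 owns exactly the CR0 bits it set in cr0_guest_host_mask, so the write
 * needs to be reflected only when it changes an owned bit relative to L1's
 * read shadow.
 */
static inline bool cr0_write_needs_exit_sketch(struct vmcs12 *vmcs12,
                                               unsigned long new_cr0)
{
        return vmcs12->cr0_guest_host_mask & (new_cr0 ^ vmcs12->cr0_read_shadow);
}
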
5788 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
5789 struct vmcs12 *vmcs12)
5790 {
5791 u32 encls_leaf;
5792
5793 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
5794 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
5795 return false;
5796
5797 encls_leaf = kvm_rax_read(vcpu);
5798 if (encls_leaf > 62)
5799 encls_leaf = 63;
5800 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
5801 }
5802
5803 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5804 struct vmcs12 *vmcs12, gpa_t bitmap)
5805 {
5806 u32 vmx_instruction_info;
5807 unsigned long field;
5808 u8 b;
5809
5810 if (!nested_cpu_has_shadow_vmcs(vmcs12))
5811 return true;
5812
5813 /* Decode instruction info and find the field to access */
5814 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5815 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5816
5817 /* Out-of-range fields always cause a VM exit from L2 to L1 */
5818 if (field >> 15)
5819 return true;
5820
5821 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5822 return true;
5823
5824 return 1 & (b >> (field & 7));
5825 }
5826
5827 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
5828 {
5829 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
5830
5831 if (nested_cpu_has_mtf(vmcs12))
5832 return true;
5833
5834 /*
5835 * An MTF VM-exit may be injected into the guest by setting the
5836 * interruption-type to 7 (other event) and the vector field to 0. Such
5837 * is the case regardless of the 'monitor trap flag' VM-execution
5838 * control.
5839 */
5840 return entry_intr_info == (INTR_INFO_VALID_MASK
5841 | INTR_TYPE_OTHER_EVENT);
5842 }
5843
5844 /*
5845 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
5846 * L1 wants the exit. Only call this when in is_guest_mode (L2).
5847 */
5848 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
5849 union vmx_exit_reason exit_reason)
5850 {
5851 u32 intr_info;
5852
5853 switch ((u16)exit_reason.basic) {
5854 case EXIT_REASON_EXCEPTION_NMI:
5855 intr_info = vmx_get_intr_info(vcpu);
5856 if (is_nmi(intr_info))
5857 return true;
5858 else if (is_page_fault(intr_info))
5859 return vcpu->arch.apf.host_apf_flags || !enable_ept;
5860 else if (is_debug(intr_info) &&
5861 vcpu->guest_debug &
5862 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5863 return true;
5864 else if (is_breakpoint(intr_info) &&
5865 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5866 return true;
5867 return false;
5868 case EXIT_REASON_EXTERNAL_INTERRUPT:
5869 return true;
5870 case EXIT_REASON_MCE_DURING_VMENTRY:
5871 return true;
5872 case EXIT_REASON_EPT_VIOLATION:
5873 /*
5874 * L0 always deals with the EPT violation. If nested EPT is
5875 * used, and the nested mmu code discovers that the address is
5876 * missing in the guest EPT table (EPT12), the EPT violation
5877 * will be injected with nested_ept_inject_page_fault()
5878 */
5879 return true;
5880 case EXIT_REASON_EPT_MISCONFIG:
5881 /*
5882 * L2 never uses L1's EPT directly, but rather L0's own EPT
5883 * table (shadow on EPT) or a merged EPT table that L0 built
5884 * (EPT on EPT). So any problems with the structure of the
5885 * table is L0's fault.
5886 */
5887 return true;
5888 case EXIT_REASON_PREEMPTION_TIMER:
5889 return true;
5890 case EXIT_REASON_PML_FULL:
5891 /*
5892 * PML is emulated for an L1 VMM and should never be enabled in
5893 * vmcs02, always "handle" PML_FULL by exiting to userspace.
5894 */
5895 return true;
5896 case EXIT_REASON_VMFUNC:
5897 /* VM functions are emulated through L2->L0 vmexits. */
5898 return true;
5899 default:
5900 break;
5901 }
5902 return false;
5903 }
5904
5905 /*
5906 * Return true if L1 wants to intercept an exit from L2. Only call this when in
5907 * is_guest_mode (L2).
5908 */
5909 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
5910 union vmx_exit_reason exit_reason)
5911 {
5912 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5913 u32 intr_info;
5914
5915 switch ((u16)exit_reason.basic) {
5916 case EXIT_REASON_EXCEPTION_NMI:
5917 intr_info = vmx_get_intr_info(vcpu);
5918 if (is_nmi(intr_info))
5919 return true;
5920 else if (is_page_fault(intr_info))
5921 return true;
5922 return vmcs12->exception_bitmap &
5923 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
5924 case EXIT_REASON_EXTERNAL_INTERRUPT:
5925 return nested_exit_on_intr(vcpu);
5926 case EXIT_REASON_TRIPLE_FAULT:
5927 return true;
5928 case EXIT_REASON_INTERRUPT_WINDOW:
5929 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
5930 case EXIT_REASON_NMI_WINDOW:
5931 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
5932 case EXIT_REASON_TASK_SWITCH:
5933 return true;
5934 case EXIT_REASON_CPUID:
5935 return true;
5936 case EXIT_REASON_HLT:
5937 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5938 case EXIT_REASON_INVD:
5939 return true;
5940 case EXIT_REASON_INVLPG:
5941 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5942 case EXIT_REASON_RDPMC:
5943 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5944 case EXIT_REASON_RDRAND:
5945 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5946 case EXIT_REASON_RDSEED:
5947 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5948 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5949 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5950 case EXIT_REASON_VMREAD:
5951 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5952 vmcs12->vmread_bitmap);
5953 case EXIT_REASON_VMWRITE:
5954 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5955 vmcs12->vmwrite_bitmap);
5956 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5957 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5958 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5959 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5960 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5961 /*
5962 * VMX instructions trap unconditionally. This allows L1 to
5963 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5964 */
5965 return true;
5966 case EXIT_REASON_CR_ACCESS:
5967 return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5968 case EXIT_REASON_DR_ACCESS:
5969 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5970 case EXIT_REASON_IO_INSTRUCTION:
5971 return nested_vmx_exit_handled_io(vcpu, vmcs12);
5972 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5973 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5974 case EXIT_REASON_MSR_READ:
5975 case EXIT_REASON_MSR_WRITE:
5976 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5977 case EXIT_REASON_INVALID_STATE:
5978 return true;
5979 case EXIT_REASON_MWAIT_INSTRUCTION:
5980 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5981 case EXIT_REASON_MONITOR_TRAP_FLAG:
5982 return nested_vmx_exit_handled_mtf(vmcs12);
5983 case EXIT_REASON_MONITOR_INSTRUCTION:
5984 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5985 case EXIT_REASON_PAUSE_INSTRUCTION:
5986 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5987 nested_cpu_has2(vmcs12,
5988 SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5989 case EXIT_REASON_MCE_DURING_VMENTRY:
5990 return true;
5991 case EXIT_REASON_TPR_BELOW_THRESHOLD:
5992 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5993 case EXIT_REASON_APIC_ACCESS:
5994 case EXIT_REASON_APIC_WRITE:
5995 case EXIT_REASON_EOI_INDUCED:
5996 /*
5997 * The controls for "virtualize APIC accesses," "APIC-
5998 * register virtualization," and "virtual-interrupt
5999 * delivery" only come from vmcs12.
6000 */
6001 return true;
6002 case EXIT_REASON_INVPCID:
6003 return
6004 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
6005 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
6006 case EXIT_REASON_WBINVD:
6007 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
6008 case EXIT_REASON_XSETBV:
6009 return true;
6010 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
6011 /*
6012 * This should never happen, since it is not possible to
6013 * set XSS to a non-zero value---neither in L1 nor in L2.
6014 * If it were, XSS would have to be checked against
6015 * the XSS exit bitmap in vmcs12.
6016 */
6017 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
6018 case EXIT_REASON_UMWAIT:
6019 case EXIT_REASON_TPAUSE:
6020 return nested_cpu_has2(vmcs12,
6021 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
6022 case EXIT_REASON_ENCLS:
6023 return nested_vmx_exit_handled_encls(vcpu, vmcs12);
6024 default:
6025 return true;
6026 }
6027 }
6028
6029 /*
6030 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
6031 * reflected into L1.
6032 */
6033 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
6034 {
6035 struct vcpu_vmx *vmx = to_vmx(vcpu);
6036 union vmx_exit_reason exit_reason = vmx->exit_reason;
6037 unsigned long exit_qual;
6038 u32 exit_intr_info;
6039
6040 WARN_ON_ONCE(vmx->nested.nested_run_pending);
6041
6042 /*
6043 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
6044 * has already loaded L2's state.
6045 */
6046 if (unlikely(vmx->fail)) {
6047 trace_kvm_nested_vmenter_failed(
6048 "hardware VM-instruction error: ",
6049 vmcs_read32(VM_INSTRUCTION_ERROR));
6050 exit_intr_info = 0;
6051 exit_qual = 0;
6052 goto reflect_vmexit;
6053 }
6054
6055 trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
6056
6057 /* If L0 (KVM) wants the exit, it trumps L1's desires. */
6058 if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
6059 return false;
6060
6061 /* If L1 doesn't want the exit, handle it in L0. */
6062 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
6063 return false;
6064
6065 /*
6066 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
6067 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
6068 * need to be synthesized by querying the in-kernel LAPIC, but external
6069 * interrupts are never reflected to L1 so it's a non-issue.
6070 */
6071 exit_intr_info = vmx_get_intr_info(vcpu);
6072 if (is_exception_with_error_code(exit_intr_info)) {
6073 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6074
6075 vmcs12->vm_exit_intr_error_code =
6076 vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
6077 }
6078 exit_qual = vmx_get_exit_qual(vcpu);
6079
6080 reflect_vmexit:
6081 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
6082 return true;
6083 }
6084
6085 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
6086 struct kvm_nested_state __user *user_kvm_nested_state,
6087 u32 user_data_size)
6088 {
6089 struct vcpu_vmx *vmx;
6090 struct vmcs12 *vmcs12;
6091 struct kvm_nested_state kvm_state = {
6092 .flags = 0,
6093 .format = KVM_STATE_NESTED_FORMAT_VMX,
6094 .size = sizeof(kvm_state),
6095 .hdr.vmx.flags = 0,
6096 .hdr.vmx.vmxon_pa = -1ull,
6097 .hdr.vmx.vmcs12_pa = -1ull,
6098 .hdr.vmx.preemption_timer_deadline = 0,
6099 };
6100 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6101 &user_kvm_nested_state->data.vmx[0];
6102
6103 if (!vcpu)
6104 return kvm_state.size + sizeof(*user_vmx_nested_state);
6105
6106 vmx = to_vmx(vcpu);
6107 vmcs12 = get_vmcs12(vcpu);
6108
6109 if (nested_vmx_allowed(vcpu) &&
6110 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
6111 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
6112 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
6113
6114 if (vmx_has_valid_vmcs12(vcpu)) {
6115 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
6116
6117 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
6118 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
6119 kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
6120
6121 if (is_guest_mode(vcpu) &&
6122 nested_cpu_has_shadow_vmcs(vmcs12) &&
6123 vmcs12->vmcs_link_pointer != -1ull)
6124 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
6125 }
6126
6127 if (vmx->nested.smm.vmxon)
6128 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
6129
6130 if (vmx->nested.smm.guest_mode)
6131 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
6132
6133 if (is_guest_mode(vcpu)) {
6134 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
6135
6136 if (vmx->nested.nested_run_pending)
6137 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
6138
6139 if (vmx->nested.mtf_pending)
6140 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
6141
6142 if (nested_cpu_has_preemption_timer(vmcs12) &&
6143 vmx->nested.has_preemption_timer_deadline) {
6144 kvm_state.hdr.vmx.flags |=
6145 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
6146 kvm_state.hdr.vmx.preemption_timer_deadline =
6147 vmx->nested.preemption_timer_deadline;
6148 }
6149 }
6150 }
6151
6152 if (user_data_size < kvm_state.size)
6153 goto out;
6154
6155 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
6156 return -EFAULT;
6157
6158 if (!vmx_has_valid_vmcs12(vcpu))
6159 goto out;
6160
6161 /*
6162 * When running L2, the authoritative vmcs12 state is in the
6163 * vmcs02. When running L1, the authoritative vmcs12 state is
6164 * in the shadow or enlightened vmcs linked to vmcs01, unless
6165 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
6166 * vmcs12 state is in the vmcs12 already.
6167 */
6168 if (is_guest_mode(vcpu)) {
6169 sync_vmcs02_to_vmcs12(vcpu, vmcs12);
6170 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
6171 } else {
6172 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
6173 if (!vmx->nested.need_vmcs12_to_shadow_sync) {
6174 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
6175 /*
6176 * The L1 hypervisor is not obliged to keep the
6177 * eVMCS clean-fields data up-to-date while not in
6178 * guest mode; 'hv_clean_fields' is only guaranteed
6179 * to be accurate at VM-entry, so ignore it here
6180 * and do a full copy.
6181 */
6182 copy_enlightened_to_vmcs12(vmx, 0);
6183 else if (enable_shadow_vmcs)
6184 copy_shadow_to_vmcs12(vmx);
6185 }
6186 }
6187
6188 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
6189 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
6190
6191 /*
6192 * Copy over the full allocated size of vmcs12 rather than just the size
6193 * of the struct.
6194 */
6195 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
6196 return -EFAULT;
6197
6198 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6199 vmcs12->vmcs_link_pointer != -1ull) {
6200 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
6201 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
6202 return -EFAULT;
6203 }
6204 out:
6205 return kvm_state.size;
6206 }
6207
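/*
 * A minimal sketch of the size computation performed above, assuming each
 * vmcs12 blob copied to userspace is exactly VMCS12_SIZE bytes (the
 * BUILD_BUG_ONs above only bound the userspace buffers from below): the
 * fixed header is always present, the vmcs12 image is added whenever one is
 * cached, and the shadow vmcs12 image only when L2 runs with shadow VMCS and
 * a valid VMCS link pointer.
 */
static inline u32 nested_state_size_sketch(bool has_vmcs12, bool has_shadow_vmcs12)
{
        u32 size = sizeof(struct kvm_nested_state);

        if (has_vmcs12)
                size += VMCS12_SIZE;
        if (has_shadow_vmcs12)
                size += VMCS12_SIZE;
        return size;
}
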
6208 /*
6209 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
6210 */
6211 void vmx_leave_nested(struct kvm_vcpu *vcpu)
6212 {
6213 if (is_guest_mode(vcpu)) {
6214 to_vmx(vcpu)->nested.nested_run_pending = 0;
6215 nested_vmx_vmexit(vcpu, -1, 0, 0);
6216 }
6217 free_nested(vcpu);
6218 }
6219
6220 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
6221 struct kvm_nested_state __user *user_kvm_nested_state,
6222 struct kvm_nested_state *kvm_state)
6223 {
6224 struct vcpu_vmx *vmx = to_vmx(vcpu);
6225 struct vmcs12 *vmcs12;
6226 enum vm_entry_failure_code ignored;
6227 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
6228 &user_kvm_nested_state->data.vmx[0];
6229 int ret;
6230
6231 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
6232 return -EINVAL;
6233
6234 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
6235 if (kvm_state->hdr.vmx.smm.flags)
6236 return -EINVAL;
6237
6238 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
6239 return -EINVAL;
6240
6241 /*
6242 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
6243 * enable the eVMCS capability on the vCPU. The code has
6244 * since been changed so that the flag signals that vmcs12
6245 * should be copied into the eVMCS in guest memory.
6246 *
6247 * To preserve backwards compatibility, allow userspace
6248 * to set this flag even when there is no VMXON region.
6249 */
6250 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
6251 return -EINVAL;
6252 } else {
6253 if (!nested_vmx_allowed(vcpu))
6254 return -EINVAL;
6255
6256 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
6257 return -EINVAL;
6258 }
6259
6260 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6261 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6262 return -EINVAL;
6263
6264 if (kvm_state->hdr.vmx.smm.flags &
6265 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
6266 return -EINVAL;
6267
6268 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
6269 return -EINVAL;
6270
6271 /*
6272 * SMM temporarily disables VMX, so we cannot be in guest mode,
6273 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
6274 * must be zero.
6275 */
6276 if (is_smm(vcpu) ?
6277 (kvm_state->flags &
6278 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
6279 : kvm_state->hdr.vmx.smm.flags)
6280 return -EINVAL;
6281
6282 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
6283 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
6284 return -EINVAL;
6285
6286 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
6287 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
6288 return -EINVAL;
6289
6290 vmx_leave_nested(vcpu);
6291
6292 if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
6293 return 0;
6294
6295 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
6296 ret = enter_vmx_operation(vcpu);
6297 if (ret)
6298 return ret;
6299
6300 /* Empty 'VMXON' state is permitted if no VMCS loaded */
6301 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
6302 /* See vmx_has_valid_vmcs12. */
6303 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
6304 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
6305 (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
6306 return -EINVAL;
6307 else
6308 return 0;
6309 }
6310
6311 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
6312 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
6313 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
6314 return -EINVAL;
6315
6316 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
6317 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
6318 /*
6319 * nested_vmx_handle_enlightened_vmptrld() cannot be called
6320 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
6321 * restored yet. EVMCS will be mapped from
6322 * nested_get_vmcs12_pages().
6323 */
6324 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
6325 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
6326 } else {
6327 return -EINVAL;
6328 }
6329
6330 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
6331 vmx->nested.smm.vmxon = true;
6332 vmx->nested.vmxon = false;
6333
6334 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
6335 vmx->nested.smm.guest_mode = true;
6336 }
6337
6338 vmcs12 = get_vmcs12(vcpu);
6339 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
6340 return -EFAULT;
6341
6342 if (vmcs12->hdr.revision_id != VMCS12_REVISION)
6343 return -EINVAL;
6344
6345 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
6346 return 0;
6347
6348 vmx->nested.nested_run_pending =
6349 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
6350
6351 vmx->nested.mtf_pending =
6352 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
6353
6354 ret = -EINVAL;
6355 if (nested_cpu_has_shadow_vmcs(vmcs12) &&
6356 vmcs12->vmcs_link_pointer != -1ull) {
6357 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
6358
6359 if (kvm_state->size <
6360 sizeof(*kvm_state) +
6361 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
6362 goto error_guest_mode;
6363
6364 if (copy_from_user(shadow_vmcs12,
6365 user_vmx_nested_state->shadow_vmcs12,
6366 sizeof(*shadow_vmcs12))) {
6367 ret = -EFAULT;
6368 goto error_guest_mode;
6369 }
6370
6371 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
6372 !shadow_vmcs12->hdr.shadow_vmcs)
6373 goto error_guest_mode;
6374 }
6375
6376 vmx->nested.has_preemption_timer_deadline = false;
6377 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
6378 vmx->nested.has_preemption_timer_deadline = true;
6379 vmx->nested.preemption_timer_deadline =
6380 kvm_state->hdr.vmx.preemption_timer_deadline;
6381 }
6382
6383 if (nested_vmx_check_controls(vcpu, vmcs12) ||
6384 nested_vmx_check_host_state(vcpu, vmcs12) ||
6385 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
6386 goto error_guest_mode;
6387
6388 vmx->nested.dirty_vmcs12 = true;
6389 ret = nested_vmx_enter_non_root_mode(vcpu, false);
6390 if (ret)
6391 goto error_guest_mode;
6392
6393 return 0;
6394
6395 error_guest_mode:
6396 vmx->nested.nested_run_pending = 0;
6397 return ret;
6398 }
6399
6400 void nested_vmx_set_vmcs_shadowing_bitmap(void)
6401 {
6402 if (enable_shadow_vmcs) {
6403 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
6404 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
6405 }
6406 }
6407
6408 /*
6409 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
6410 * returned for the various VMX controls MSRs when nested VMX is enabled.
6411 * The same values should also be used to verify that vmcs12 control fields are
6412 * valid during nested entry from L1 to L2.
6413 * Each of these control msrs has a low and high 32-bit half: A low bit is on
6414 * if the corresponding bit in the (32-bit) control field *must* be on, and a
6415 * bit in the high half is on if the corresponding bit in the control field
6416 * may be on. See also vmx_control_verify().
6417 */
6418 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
6419 {
6420 /*
6421 * Note that as a general rule, the high half of the MSRs (bits in
6422 * the control fields which may be 1) should be initialized by the
6423 * intersection of the underlying hardware's MSR (i.e., features which
6424 * can be supported) and the list of features we want to expose -
6425 * because they are known to be properly supported in our code.
6426 * Also, usually, the low half of the MSRs (bits which must be 1) can
6427 * be set to 0, meaning that L1 may turn off any of these bits. The
6428 * reason is that if one of these bits is necessary, it will appear
6429 * in vmcs01, and prepare_vmcs02(), which bitwise-ORs the control
6430 * fields of vmcs01 and vmcs12, will keep these bits set in vmcs02 - and
6431 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
6432 * These rules have exceptions below.
6433 */
6434
6435 /* pin-based controls */
6436 rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
6437 msrs->pinbased_ctls_low,
6438 msrs->pinbased_ctls_high);
6439 msrs->pinbased_ctls_low |=
6440 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6441 msrs->pinbased_ctls_high &=
6442 PIN_BASED_EXT_INTR_MASK |
6443 PIN_BASED_NMI_EXITING |
6444 PIN_BASED_VIRTUAL_NMIS |
6445 (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
6446 msrs->pinbased_ctls_high |=
6447 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6448 PIN_BASED_VMX_PREEMPTION_TIMER;
6449
6450 /* exit controls */
6451 rdmsr(MSR_IA32_VMX_EXIT_CTLS,
6452 msrs->exit_ctls_low,
6453 msrs->exit_ctls_high);
6454 msrs->exit_ctls_low =
6455 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
6456
6457 msrs->exit_ctls_high &=
6458 #ifdef CONFIG_X86_64
6459 VM_EXIT_HOST_ADDR_SPACE_SIZE |
6460 #endif
6461 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
6462 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
6463 msrs->exit_ctls_high |=
6464 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
6465 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
6466 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
6467
6468 /* We support free control of debug control saving. */
6469 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
6470
6471 /* entry controls */
6472 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
6473 msrs->entry_ctls_low,
6474 msrs->entry_ctls_high);
6475 msrs->entry_ctls_low =
6476 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
6477 msrs->entry_ctls_high &=
6478 #ifdef CONFIG_X86_64
6479 VM_ENTRY_IA32E_MODE |
6480 #endif
6481 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
6482 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
6483 msrs->entry_ctls_high |=
6484 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
6485
6486 /* We support free control of debug control loading. */
6487 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
6488
6489 /* cpu-based controls */
6490 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
6491 msrs->procbased_ctls_low,
6492 msrs->procbased_ctls_high);
6493 msrs->procbased_ctls_low =
6494 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
6495 msrs->procbased_ctls_high &=
6496 CPU_BASED_INTR_WINDOW_EXITING |
6497 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
6498 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
6499 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
6500 CPU_BASED_CR3_STORE_EXITING |
6501 #ifdef CONFIG_X86_64
6502 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
6503 #endif
6504 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
6505 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
6506 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
6507 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
6508 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
6509 /*
6510 * We can allow some features even when not supported by the
6511 * hardware. For example, L1 can specify an MSR bitmap - and we
6512 * can use it to avoid exits to L1 - even when L0 runs L2
6513 * without MSR bitmaps.
6514 */
6515 msrs->procbased_ctls_high |=
6516 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
6517 CPU_BASED_USE_MSR_BITMAPS;
6518
6519 /* We support free control of CR3 access interception. */
6520 msrs->procbased_ctls_low &=
6521 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
6522
6523 /*
6524 * secondary cpu-based controls. Do not include those that
6525 * depend on CPUID bits, they are added later by
6526 * vmx_vcpu_after_set_cpuid.
6527 */
6528 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
6529 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
6530 msrs->secondary_ctls_low,
6531 msrs->secondary_ctls_high);
6532
6533 msrs->secondary_ctls_low = 0;
6534 msrs->secondary_ctls_high &=
6535 SECONDARY_EXEC_DESC |
6536 SECONDARY_EXEC_ENABLE_RDTSCP |
6537 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
6538 SECONDARY_EXEC_WBINVD_EXITING |
6539 SECONDARY_EXEC_APIC_REGISTER_VIRT |
6540 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
6541 SECONDARY_EXEC_RDRAND_EXITING |
6542 SECONDARY_EXEC_ENABLE_INVPCID |
6543 SECONDARY_EXEC_RDSEED_EXITING |
6544 SECONDARY_EXEC_XSAVES |
6545 SECONDARY_EXEC_TSC_SCALING;
6546
6547 /*
6548 * We can emulate "VMCS shadowing," even if the hardware
6549 * doesn't support it.
6550 */
6551 msrs->secondary_ctls_high |=
6552 SECONDARY_EXEC_SHADOW_VMCS;
6553
6554 if (enable_ept) {
6555 /* nested EPT: emulate EPT also to L1 */
6556 msrs->secondary_ctls_high |=
6557 SECONDARY_EXEC_ENABLE_EPT;
6558 msrs->ept_caps =
6559 VMX_EPT_PAGE_WALK_4_BIT |
6560 VMX_EPT_PAGE_WALK_5_BIT |
6561 VMX_EPTP_WB_BIT |
6562 VMX_EPT_INVEPT_BIT |
6563 VMX_EPT_EXECUTE_ONLY_BIT;
6564
6565 msrs->ept_caps &= ept_caps;
6566 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
6567 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
6568 VMX_EPT_1GB_PAGE_BIT;
6569 if (enable_ept_ad_bits) {
6570 msrs->secondary_ctls_high |=
6571 SECONDARY_EXEC_ENABLE_PML;
6572 msrs->ept_caps |= VMX_EPT_AD_BIT;
6573 }
6574 }
6575
6576 if (cpu_has_vmx_vmfunc()) {
6577 msrs->secondary_ctls_high |=
6578 SECONDARY_EXEC_ENABLE_VMFUNC;
6579 /*
6580 * Advertise EPTP switching unconditionally
6581 * since we emulate it
6582 */
6583 if (enable_ept)
6584 msrs->vmfunc_controls =
6585 VMX_VMFUNC_EPTP_SWITCHING;
6586 }
6587
6588 /*
6589 * Old versions of KVM use the single-context version without
6590 * checking for support, so declare that it is supported even
6591 * though it is treated as global context. The alternative is
6592 * not failing the single-context invvpid, and it is worse.
6593 */
6594 if (enable_vpid) {
6595 msrs->secondary_ctls_high |=
6596 SECONDARY_EXEC_ENABLE_VPID;
6597 msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
6598 VMX_VPID_EXTENT_SUPPORTED_MASK;
6599 }
6600
6601 if (enable_unrestricted_guest)
6602 msrs->secondary_ctls_high |=
6603 SECONDARY_EXEC_UNRESTRICTED_GUEST;
6604
6605 if (flexpriority_enabled)
6606 msrs->secondary_ctls_high |=
6607 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6608
6609 if (enable_sgx)
6610 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
6611
6612 /* miscellaneous data */
6613 rdmsr(MSR_IA32_VMX_MISC,
6614 msrs->misc_low,
6615 msrs->misc_high);
6616 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
6617 msrs->misc_low |=
6618 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
6619 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
6620 VMX_MISC_ACTIVITY_HLT |
6621 VMX_MISC_ACTIVITY_WAIT_SIPI;
6622 msrs->misc_high = 0;
6623
6624 /*
6625 * This MSR reports some information about VMX support. We
6626 * should return information about the VMX we emulate for the
6627 * guest, and the VMCS structure we give it - not about the
6628 * VMX support of the underlying hardware.
6629 */
6630 msrs->basic =
6631 VMCS12_REVISION |
6632 VMX_BASIC_TRUE_CTLS |
6633 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
6634 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
6635
6636 if (cpu_has_vmx_basic_inout())
6637 msrs->basic |= VMX_BASIC_INOUT;
6638
6639 /*
6640 * These MSRs specify bits which the guest must keep fixed on
6641 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
6642 * We picked the standard core2 setting.
6643 */
6644 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
6645 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE
6646 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
6647 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
6648
6649 /* These MSRs specify bits which the guest must keep fixed off. */
6650 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
6651 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
6652
6653 /* highest index: VMX_PREEMPTION_TIMER_VALUE */
6654 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
6655 }
6656
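/*
 * A minimal sketch of the check described in the header comment of
 * nested_vmx_setup_ctls_msrs() (see also vmx_control_verify()): a 32-bit
 * control value is consistent with a low/high MSR pair when every must-be-1
 * bit from the low half is set and no bit outside the may-be-1 high half is
 * set.
 */
static inline bool control_msr_verify_sketch(u32 control, u32 low, u32 high)
{
        return (control & low) == low && !(control & ~high);
}
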
6657 void nested_vmx_hardware_unsetup(void)
6658 {
6659 int i;
6660
6661 if (enable_shadow_vmcs) {
6662 for (i = 0; i < VMX_BITMAP_NR; i++)
6663 free_page((unsigned long)vmx_bitmap[i]);
6664 }
6665 }
6666
6667 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6668 {
6669 int i;
6670
6671 if (!cpu_has_vmx_shadow_vmcs())
6672 enable_shadow_vmcs = 0;
6673 if (enable_shadow_vmcs) {
6674 for (i = 0; i < VMX_BITMAP_NR; i++) {
6675 /*
6676 * The vmx_bitmap is not tied to a VM and so should
6677 * not be charged to a memcg.
6678 */
6679 vmx_bitmap[i] = (unsigned long *)
6680 __get_free_page(GFP_KERNEL);
6681 if (!vmx_bitmap[i]) {
6682 nested_vmx_hardware_unsetup();
6683 return -ENOMEM;
6684 }
6685 }
6686
6687 init_vmcs_shadow_fields();
6688 }
6689
6690 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
6691 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
6692 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
6693 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
6694 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
6695 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
6696 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
6697 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
6698 exit_handlers[EXIT_REASON_VMON] = handle_vmon;
6699 exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
6700 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
6701 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
6702
6703 return 0;
6704 }
6705
6706 struct kvm_x86_nested_ops vmx_nested_ops = {
6707 .check_events = vmx_check_nested_events,
6708 .hv_timer_pending = nested_vmx_preemption_timer_pending,
6709 .triple_fault = nested_vmx_triple_fault,
6710 .get_state = vmx_get_nested_state,
6711 .set_state = vmx_set_nested_state,
6712 .get_nested_state_pages = vmx_get_nested_state_pages,
6713 .write_log_dirty = nested_vmx_write_pml_buffer,
6714 .enable_evmcs = nested_enable_evmcs,
6715 .get_evmcs_version = nested_get_evmcs_version,
6716 };