// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};

static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);
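
/*
 * Build the VMREAD/VMWRITE bitmaps used with shadow VMCS and compact the
 * shadow field tables: fields this CPU cannot shadow are dropped so that
 * accesses to them fall back to the ordinary VMREAD/VMWRITE emulation path.
 */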
static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;
	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);

		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}
/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}
static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}
static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
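
/*
 * Helpers for the VMX capability MSR layout: the low 32 bits hold the
 * allowed 0-settings and the high 32 bits hold the allowed 1-settings of a
 * VMX control; vmx_control_msr() packs them, vmx_control_verify() checks a
 * control value against them.
 */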
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}
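
/* Unmap the enlightened VMCS (if mapped) and invalidate the cached pointer. */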
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}
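
/*
 * Copy the cached host state from the previously loaded VMCS into the newly
 * loaded one so the host-state area of the new VMCS stays in sync.
 */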
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}
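
/* Switch the vCPU's current VMCS, e.g. between vmcs01 and vmcs02. */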
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_register_cache_reset(vcpu);
}
/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}
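
/*
 * Reflect an EPT fault hit while running L2 into L1 as a PML-full,
 * EPT-misconfig or EPT-violation VM-exit, as appropriate.
 */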
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
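
/*
 * Decide whether a page fault with the given error code should cause a
 * VM-exit to L1, based on vmcs12's #PF exception bitmap bit and the
 * page-fault error-code mask/match fields.
 */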
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}
/*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~DR6_BT;
				payload ^= DR6_ACTIVE_LOW;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}
/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}
/*
 * If a msr is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}
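
/*
 * Set both the read and write intercept bits for the entire x2APIC MSR
 * range (0x800 - 0x8ff) in the given MSR bitmap.
 */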
static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}
/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
#ifdef CONFIG_X86_64
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessarily merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}
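
/*
 * Copy the vmcs12 referenced by vmcs12->vmcs_link_pointer (L1's shadow
 * VMCS) from guest memory into KVM's cache so it can be used while
 * emulating VM-entry to L2.
 */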
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}
/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						  struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}
715 static int nested_vmx_check_apicv_controls(struct kvm_vcpu
*vcpu
,
716 struct vmcs12
*vmcs12
)
718 if (!nested_cpu_has_virt_x2apic_mode(vmcs12
) &&
719 !nested_cpu_has_apic_reg_virt(vmcs12
) &&
720 !nested_cpu_has_vid(vmcs12
) &&
721 !nested_cpu_has_posted_intr(vmcs12
))
725 * If virtualize x2apic mode is enabled,
726 * virtualize apic access must be disabled.
728 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12
) &&
729 nested_cpu_has2(vmcs12
, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES
)))
733 * If virtual interrupt delivery is enabled,
734 * we must exit on external interrupts.
736 if (CC(nested_cpu_has_vid(vmcs12
) && !nested_exit_on_intr(vcpu
)))
740 * bits 15:8 should be zero in posted_intr_nv,
741 * the descriptor address has been already checked
742 * in nested_get_vmcs12_pages.
744 * bits 5:0 of posted_intr_desc_addr should be zero.
746 if (nested_cpu_has_posted_intr(vmcs12
) &&
747 (CC(!nested_cpu_has_vid(vmcs12
)) ||
748 CC(!nested_exit_intr_ack_set(vcpu
)) ||
749 CC((vmcs12
->posted_intr_nv
& 0xff00)) ||
750 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu
, vmcs12
->posted_intr_desc_addr
, 64))))
753 /* tpr shadow is needed by all apicv features. */
754 if (CC(!nested_cpu_has(vmcs12
, CPU_BASED_TPR_SHADOW
)))
760 static int nested_vmx_check_msr_switch(struct kvm_vcpu
*vcpu
,
766 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu
, addr
, 16) ||
767 !kvm_vcpu_is_legal_gpa(vcpu
, (addr
+ count
* sizeof(struct vmx_msr_entry
) - 1)))
773 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu
*vcpu
,
774 struct vmcs12
*vmcs12
)
776 if (CC(nested_vmx_check_msr_switch(vcpu
,
777 vmcs12
->vm_exit_msr_load_count
,
778 vmcs12
->vm_exit_msr_load_addr
)) ||
779 CC(nested_vmx_check_msr_switch(vcpu
,
780 vmcs12
->vm_exit_msr_store_count
,
781 vmcs12
->vm_exit_msr_store_addr
)))
787 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu
*vcpu
,
788 struct vmcs12
*vmcs12
)
790 if (CC(nested_vmx_check_msr_switch(vcpu
,
791 vmcs12
->vm_entry_msr_load_count
,
792 vmcs12
->vm_entry_msr_load_addr
)))
798 static int nested_vmx_check_pml_controls(struct kvm_vcpu
*vcpu
,
799 struct vmcs12
*vmcs12
)
801 if (!nested_cpu_has_pml(vmcs12
))
804 if (CC(!nested_cpu_has_ept(vmcs12
)) ||
805 CC(!page_address_valid(vcpu
, vmcs12
->pml_address
)))
811 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu
*vcpu
,
812 struct vmcs12
*vmcs12
)
814 if (CC(nested_cpu_has2(vmcs12
, SECONDARY_EXEC_UNRESTRICTED_GUEST
) &&
815 !nested_cpu_has_ept(vmcs12
)))
820 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu
*vcpu
,
821 struct vmcs12
*vmcs12
)
823 if (CC(nested_cpu_has2(vmcs12
, SECONDARY_EXEC_MODE_BASED_EPT_EXEC
) &&
824 !nested_cpu_has_ept(vmcs12
)))
829 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu
*vcpu
,
830 struct vmcs12
*vmcs12
)
832 if (!nested_cpu_has_shadow_vmcs(vmcs12
))
835 if (CC(!page_address_valid(vcpu
, vmcs12
->vmread_bitmap
)) ||
836 CC(!page_address_valid(vcpu
, vmcs12
->vmwrite_bitmap
)))
842 static int nested_vmx_msr_check_common(struct kvm_vcpu
*vcpu
,
843 struct vmx_msr_entry
*e
)
845 /* x2APIC MSR accesses are not allowed */
846 if (CC(vcpu
->arch
.apic_base
& X2APIC_ENABLE
&& e
->index
>> 8 == 0x8))
848 if (CC(e
->index
== MSR_IA32_UCODE_WRITE
) || /* SDM Table 35-2 */
849 CC(e
->index
== MSR_IA32_UCODE_REV
))
851 if (CC(e
->reserved
!= 0))
856 static int nested_vmx_load_msr_check(struct kvm_vcpu
*vcpu
,
857 struct vmx_msr_entry
*e
)
859 if (CC(e
->index
== MSR_FS_BASE
) ||
860 CC(e
->index
== MSR_GS_BASE
) ||
861 CC(e
->index
== MSR_IA32_SMM_MONITOR_CTL
) || /* SMM is not supported */
862 nested_vmx_msr_check_common(vcpu
, e
))
867 static int nested_vmx_store_msr_check(struct kvm_vcpu
*vcpu
,
868 struct vmx_msr_entry
*e
)
870 if (CC(e
->index
== MSR_IA32_SMBASE
) || /* SMM is not supported */
871 nested_vmx_msr_check_common(vcpu
, e
))
876 static u32
nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu
*vcpu
)
878 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
879 u64 vmx_misc
= vmx_control_msr(vmx
->nested
.msrs
.misc_low
,
880 vmx
->nested
.msrs
.misc_high
);
882 return (vmx_misc_max_msr(vmx_misc
) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER
;
886 * Load guest's/host's msr at nested entry/exit.
887 * return 0 for success, entry index for failure.
889 * One of the failure modes for MSR load/store is when a list exceeds the
890 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
891 * as possible, process all valid entries before failing rather than precheck
892 * for a capacity violation.
894 static u32
nested_vmx_load_msr(struct kvm_vcpu
*vcpu
, u64 gpa
, u32 count
)
897 struct vmx_msr_entry e
;
898 u32 max_msr_list_size
= nested_vmx_max_atomic_switch_msrs(vcpu
);
900 for (i
= 0; i
< count
; i
++) {
901 if (unlikely(i
>= max_msr_list_size
))
904 if (kvm_vcpu_read_guest(vcpu
, gpa
+ i
* sizeof(e
),
906 pr_debug_ratelimited(
907 "%s cannot read MSR entry (%u, 0x%08llx)\n",
908 __func__
, i
, gpa
+ i
* sizeof(e
));
911 if (nested_vmx_load_msr_check(vcpu
, &e
)) {
912 pr_debug_ratelimited(
913 "%s check failed (%u, 0x%x, 0x%x)\n",
914 __func__
, i
, e
.index
, e
.reserved
);
917 if (kvm_set_msr(vcpu
, e
.index
, e
.value
)) {
918 pr_debug_ratelimited(
919 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
920 __func__
, i
, e
.index
, e
.value
);
926 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
930 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu
*vcpu
,
934 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
937 * If the L0 hypervisor stored a more accurate value for the TSC that
938 * does not include the time taken for emulation of the L2->L1
939 * VM-exit in L0, use the more accurate value.
941 if (msr_index
== MSR_IA32_TSC
) {
942 int i
= vmx_find_loadstore_msr_slot(&vmx
->msr_autostore
.guest
,
946 u64 val
= vmx
->msr_autostore
.guest
.val
[i
].value
;
948 *data
= kvm_read_l1_tsc(vcpu
, val
);
953 if (kvm_get_msr(vcpu
, msr_index
, data
)) {
954 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__
,
961 static bool read_and_check_msr_entry(struct kvm_vcpu
*vcpu
, u64 gpa
, int i
,
962 struct vmx_msr_entry
*e
)
964 if (kvm_vcpu_read_guest(vcpu
,
965 gpa
+ i
* sizeof(*e
),
966 e
, 2 * sizeof(u32
))) {
967 pr_debug_ratelimited(
968 "%s cannot read MSR entry (%u, 0x%08llx)\n",
969 __func__
, i
, gpa
+ i
* sizeof(*e
));
972 if (nested_vmx_store_msr_check(vcpu
, e
)) {
973 pr_debug_ratelimited(
974 "%s check failed (%u, 0x%x, 0x%x)\n",
975 __func__
, i
, e
->index
, e
->reserved
);
981 static int nested_vmx_store_msr(struct kvm_vcpu
*vcpu
, u64 gpa
, u32 count
)
985 struct vmx_msr_entry e
;
986 u32 max_msr_list_size
= nested_vmx_max_atomic_switch_msrs(vcpu
);
988 for (i
= 0; i
< count
; i
++) {
989 if (unlikely(i
>= max_msr_list_size
))
992 if (!read_and_check_msr_entry(vcpu
, gpa
, i
, &e
))
995 if (!nested_vmx_get_vmexit_msr_value(vcpu
, e
.index
, &data
))
998 if (kvm_vcpu_write_guest(vcpu
,
999 gpa
+ i
* sizeof(e
) +
1000 offsetof(struct vmx_msr_entry
, value
),
1001 &data
, sizeof(data
))) {
1002 pr_debug_ratelimited(
1003 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1004 __func__
, i
, e
.index
, data
);
1011 static bool nested_msr_store_list_has_msr(struct kvm_vcpu
*vcpu
, u32 msr_index
)
1013 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
1014 u32 count
= vmcs12
->vm_exit_msr_store_count
;
1015 u64 gpa
= vmcs12
->vm_exit_msr_store_addr
;
1016 struct vmx_msr_entry e
;
1019 for (i
= 0; i
< count
; i
++) {
1020 if (!read_and_check_msr_entry(vcpu
, gpa
, i
, &e
))
1023 if (e
.index
== msr_index
)
1029 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu
*vcpu
,
1032 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1033 struct vmx_msrs
*autostore
= &vmx
->msr_autostore
.guest
;
1034 bool in_vmcs12_store_list
;
1035 int msr_autostore_slot
;
1036 bool in_autostore_list
;
1039 msr_autostore_slot
= vmx_find_loadstore_msr_slot(autostore
, msr_index
);
1040 in_autostore_list
= msr_autostore_slot
>= 0;
1041 in_vmcs12_store_list
= nested_msr_store_list_has_msr(vcpu
, msr_index
);
1043 if (in_vmcs12_store_list
&& !in_autostore_list
) {
1044 if (autostore
->nr
== MAX_NR_LOADSTORE_MSRS
) {
1046 * Emulated VMEntry does not fail here. Instead a less
1047 * accurate value will be returned by
1048 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1049 * instead of reading the value from the vmcs02 VMExit
1052 pr_warn_ratelimited(
1053 "Not enough msr entries in msr_autostore. Can't add msr %x\n",
1057 last
= autostore
->nr
++;
1058 autostore
->val
[last
].index
= msr_index
;
1059 } else if (!in_vmcs12_store_list
&& in_autostore_list
) {
1060 last
= --autostore
->nr
;
1061 autostore
->val
[msr_autostore_slot
] = autostore
->val
[last
];
1066 * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
1067 * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
1068 * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
1071 * If EPT is enabled by L0 a sync is never needed:
1072 * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
1073 * cannot be unsync'd SPTEs for either L1 or L2.
1075 * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
1076 * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
1077 * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
1078 * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
1079 * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
1081 * If EPT is disabled by L0:
1082 * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
1083 * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
1084 * required to invalidate linear mappings (EPT is disabled so there are
1085 * no combined or guest-physical mappings), i.e. L1 can't rely on the
1086 * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
1088 * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
1089 * linear mappings (EPT is disabled so there are no combined or guest-physical
1090 * mappings) to be invalidated on both VM-Enter and VM-Exit.
1092 * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
1093 * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
1094 * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
1095 * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
1096 * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
1097 * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
1098 * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
1099 * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
1100 * stale TLB entries, at which point L0 will sync L2's MMU.
1102 static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu
*vcpu
)
1104 return !enable_ept
&& !nested_cpu_has_vpid(get_vmcs12(vcpu
));
1108 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
1109 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
1110 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
1111 * @entry_failure_code.
1113 static int nested_vmx_load_cr3(struct kvm_vcpu
*vcpu
, unsigned long cr3
,
1114 bool nested_ept
, bool reload_pdptrs
,
1115 enum vm_entry_failure_code
*entry_failure_code
)
1117 if (CC(kvm_vcpu_is_illegal_gpa(vcpu
, cr3
))) {
1118 *entry_failure_code
= ENTRY_FAIL_DEFAULT
;
1123 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1124 * must not be dereferenced.
1126 if (reload_pdptrs
&& !nested_ept
&& is_pae_paging(vcpu
) &&
1127 CC(!load_pdptrs(vcpu
, vcpu
->arch
.walk_mmu
, cr3
))) {
1128 *entry_failure_code
= ENTRY_FAIL_PDPTE
;
1133 * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
1134 * flushes are handled by nested_vmx_transition_tlb_flush(). See
1135 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
1138 kvm_mmu_new_pgd(vcpu
, cr3
, true,
1139 !nested_vmx_transition_mmu_sync(vcpu
));
1141 vcpu
->arch
.cr3
= cr3
;
1142 kvm_register_mark_available(vcpu
, VCPU_EXREG_CR3
);
1144 kvm_init_mmu(vcpu
, false);
1150 * Returns if KVM is able to config CPU to tag TLB entries
1151 * populated by L2 differently than TLB entries populated
1154 * If L0 uses EPT, L1 and L2 run with different EPTP because
1155 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1156 * are tagged with different EPTP.
1158 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1159 * with different VPID (L1 entries are tagged with vmx->vpid
1160 * while L2 entries are tagged with vmx->nested.vpid02).
1162 static bool nested_has_guest_tlb_tag(struct kvm_vcpu
*vcpu
)
1164 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
1166 return enable_ept
||
1167 (nested_cpu_has_vpid(vmcs12
) && to_vmx(vcpu
)->nested
.vpid02
);
1170 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu
*vcpu
,
1171 struct vmcs12
*vmcs12
,
1174 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1177 * If VPID is disabled, linear and combined mappings are flushed on
1178 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
1179 * their associated EPTP.
1185 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
1186 * for *all* contexts to be flushed on VM-Enter/VM-Exit.
1188 * If VPID is enabled and used by vmc12, but L2 does not have a unique
1189 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
1190 * a VPID for L2, flush the current context as the effective ASID is
1191 * common to both L1 and L2.
1193 * Defer the flush so that it runs after vmcs02.EPTP has been set by
1194 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
1195 * redundant flushes further down the nested pipeline.
1197 * If a TLB flush isn't required due to any of the above, and vpid12 is
1198 * changing then the new "virtual" VPID (vpid12) will reuse the same
1199 * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
1200 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
1203 if (!nested_cpu_has_vpid(vmcs12
)) {
1204 kvm_make_request(KVM_REQ_TLB_FLUSH
, vcpu
);
1205 } else if (!nested_has_guest_tlb_tag(vcpu
)) {
1206 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT
, vcpu
);
1207 } else if (is_vmenter
&&
1208 vmcs12
->virtual_processor_id
!= vmx
->nested
.last_vpid
) {
1209 vmx
->nested
.last_vpid
= vmcs12
->virtual_processor_id
;
1210 vpid_sync_context(nested_get_vpid02(vcpu
));
1214 static bool is_bitwise_subset(u64 superset
, u64 subset
, u64 mask
)
1219 return (superset
| subset
) == superset
;
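
/*
 * Validate a userspace-provided IA32_VMX_BASIC value: feature bits may only
 * be cleared relative to what KVM reports, and the VMCS revision ID and size
 * must remain compatible with the emulated VMX implementation.
 */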
1222 static int vmx_restore_vmx_basic(struct vcpu_vmx
*vmx
, u64 data
)
1224 const u64 feature_and_reserved
=
1225 /* feature (except bit 48; see below) */
1226 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1228 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1229 u64 vmx_basic
= vmx
->nested
.msrs
.basic
;
1231 if (!is_bitwise_subset(vmx_basic
, data
, feature_and_reserved
))
1235 * KVM does not emulate a version of VMX that constrains physical
1236 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1238 if (data
& BIT_ULL(48))
1241 if (vmx_basic_vmcs_revision_id(vmx_basic
) !=
1242 vmx_basic_vmcs_revision_id(data
))
1245 if (vmx_basic_vmcs_size(vmx_basic
) > vmx_basic_vmcs_size(data
))
1248 vmx
->nested
.msrs
.basic
= data
;
1253 vmx_restore_control_msr(struct vcpu_vmx
*vmx
, u32 msr_index
, u64 data
)
1258 switch (msr_index
) {
1259 case MSR_IA32_VMX_TRUE_PINBASED_CTLS
:
1260 lowp
= &vmx
->nested
.msrs
.pinbased_ctls_low
;
1261 highp
= &vmx
->nested
.msrs
.pinbased_ctls_high
;
1263 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS
:
1264 lowp
= &vmx
->nested
.msrs
.procbased_ctls_low
;
1265 highp
= &vmx
->nested
.msrs
.procbased_ctls_high
;
1267 case MSR_IA32_VMX_TRUE_EXIT_CTLS
:
1268 lowp
= &vmx
->nested
.msrs
.exit_ctls_low
;
1269 highp
= &vmx
->nested
.msrs
.exit_ctls_high
;
1271 case MSR_IA32_VMX_TRUE_ENTRY_CTLS
:
1272 lowp
= &vmx
->nested
.msrs
.entry_ctls_low
;
1273 highp
= &vmx
->nested
.msrs
.entry_ctls_high
;
1275 case MSR_IA32_VMX_PROCBASED_CTLS2
:
1276 lowp
= &vmx
->nested
.msrs
.secondary_ctls_low
;
1277 highp
= &vmx
->nested
.msrs
.secondary_ctls_high
;
1283 supported
= vmx_control_msr(*lowp
, *highp
);
1285 /* Check must-be-1 bits are still 1. */
1286 if (!is_bitwise_subset(data
, supported
, GENMASK_ULL(31, 0)))
1289 /* Check must-be-0 bits are still 0. */
1290 if (!is_bitwise_subset(supported
, data
, GENMASK_ULL(63, 32)))
1294 *highp
= data
>> 32;
1298 static int vmx_restore_vmx_misc(struct vcpu_vmx
*vmx
, u64 data
)
1300 const u64 feature_and_reserved_bits
=
1302 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1303 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1305 GENMASK_ULL(13, 9) | BIT_ULL(31);
1308 vmx_misc
= vmx_control_msr(vmx
->nested
.msrs
.misc_low
,
1309 vmx
->nested
.msrs
.misc_high
);
1311 if (!is_bitwise_subset(vmx_misc
, data
, feature_and_reserved_bits
))
1314 if ((vmx
->nested
.msrs
.pinbased_ctls_high
&
1315 PIN_BASED_VMX_PREEMPTION_TIMER
) &&
1316 vmx_misc_preemption_timer_rate(data
) !=
1317 vmx_misc_preemption_timer_rate(vmx_misc
))
1320 if (vmx_misc_cr3_count(data
) > vmx_misc_cr3_count(vmx_misc
))
1323 if (vmx_misc_max_msr(data
) > vmx_misc_max_msr(vmx_misc
))
1326 if (vmx_misc_mseg_revid(data
) != vmx_misc_mseg_revid(vmx_misc
))
1329 vmx
->nested
.msrs
.misc_low
= data
;
1330 vmx
->nested
.msrs
.misc_high
= data
>> 32;
1335 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx
*vmx
, u64 data
)
1337 u64 vmx_ept_vpid_cap
;
1339 vmx_ept_vpid_cap
= vmx_control_msr(vmx
->nested
.msrs
.ept_caps
,
1340 vmx
->nested
.msrs
.vpid_caps
);
1342 /* Every bit is either reserved or a feature bit. */
1343 if (!is_bitwise_subset(vmx_ept_vpid_cap
, data
, -1ULL))
1346 vmx
->nested
.msrs
.ept_caps
= data
;
1347 vmx
->nested
.msrs
.vpid_caps
= data
>> 32;
1351 static int vmx_restore_fixed0_msr(struct vcpu_vmx
*vmx
, u32 msr_index
, u64 data
)
1355 switch (msr_index
) {
1356 case MSR_IA32_VMX_CR0_FIXED0
:
1357 msr
= &vmx
->nested
.msrs
.cr0_fixed0
;
1359 case MSR_IA32_VMX_CR4_FIXED0
:
1360 msr
= &vmx
->nested
.msrs
.cr4_fixed0
;
1367 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1368 * must be 1 in the restored value.
1370 if (!is_bitwise_subset(data
, *msr
, -1ULL))
1378 * Called when userspace is restoring VMX MSRs.
1380 * Returns 0 on success, non-0 otherwise.
1382 int vmx_set_vmx_msr(struct kvm_vcpu
*vcpu
, u32 msr_index
, u64 data
)
1384 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
1387 * Don't allow changes to the VMX capability MSRs while the vCPU
1388 * is in VMX operation.
1390 if (vmx
->nested
.vmxon
)
1393 switch (msr_index
) {
1394 case MSR_IA32_VMX_BASIC
:
1395 return vmx_restore_vmx_basic(vmx
, data
);
1396 case MSR_IA32_VMX_PINBASED_CTLS
:
1397 case MSR_IA32_VMX_PROCBASED_CTLS
:
1398 case MSR_IA32_VMX_EXIT_CTLS
:
1399 case MSR_IA32_VMX_ENTRY_CTLS
:
1401 * The "non-true" VMX capability MSRs are generated from the
1402 * "true" MSRs, so we do not support restoring them directly.
1404 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1405 * should restore the "true" MSRs with the must-be-1 bits
1406 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1407 * DEFAULT SETTINGS".
1410 case MSR_IA32_VMX_TRUE_PINBASED_CTLS
:
1411 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS
:
1412 case MSR_IA32_VMX_TRUE_EXIT_CTLS
:
1413 case MSR_IA32_VMX_TRUE_ENTRY_CTLS
:
1414 case MSR_IA32_VMX_PROCBASED_CTLS2
:
1415 return vmx_restore_control_msr(vmx
, msr_index
, data
);
1416 case MSR_IA32_VMX_MISC
:
1417 return vmx_restore_vmx_misc(vmx
, data
);
1418 case MSR_IA32_VMX_CR0_FIXED0
:
1419 case MSR_IA32_VMX_CR4_FIXED0
:
1420 return vmx_restore_fixed0_msr(vmx
, msr_index
, data
);
1421 case MSR_IA32_VMX_CR0_FIXED1
:
1422 case MSR_IA32_VMX_CR4_FIXED1
:
1424 * These MSRs are generated based on the vCPU's CPUID, so we
1425 * do not support restoring them directly.
1428 case MSR_IA32_VMX_EPT_VPID_CAP
:
1429 return vmx_restore_vmx_ept_vpid_cap(vmx
, data
);
1430 case MSR_IA32_VMX_VMCS_ENUM
:
1431 vmx
->nested
.msrs
.vmcs_enum
= data
;
1433 case MSR_IA32_VMX_VMFUNC
:
1434 if (data
& ~vmx
->nested
.msrs
.vmfunc_controls
)
1436 vmx
->nested
.msrs
.vmfunc_controls
= data
;
1440 * The rest of the VMX capability MSRs do not support restore.
1446 /* Returns 0 on success, non-0 otherwise. */
1447 int vmx_get_vmx_msr(struct nested_vmx_msrs
*msrs
, u32 msr_index
, u64
*pdata
)
1449 switch (msr_index
) {
1450 case MSR_IA32_VMX_BASIC
:
1451 *pdata
= msrs
->basic
;
1453 case MSR_IA32_VMX_TRUE_PINBASED_CTLS
:
1454 case MSR_IA32_VMX_PINBASED_CTLS
:
1455 *pdata
= vmx_control_msr(
1456 msrs
->pinbased_ctls_low
,
1457 msrs
->pinbased_ctls_high
);
1458 if (msr_index
== MSR_IA32_VMX_PINBASED_CTLS
)
1459 *pdata
|= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR
;
1461 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS
:
1462 case MSR_IA32_VMX_PROCBASED_CTLS
:
1463 *pdata
= vmx_control_msr(
1464 msrs
->procbased_ctls_low
,
1465 msrs
->procbased_ctls_high
);
1466 if (msr_index
== MSR_IA32_VMX_PROCBASED_CTLS
)
1467 *pdata
|= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR
;
1469 case MSR_IA32_VMX_TRUE_EXIT_CTLS
:
1470 case MSR_IA32_VMX_EXIT_CTLS
:
1471 *pdata
= vmx_control_msr(
1472 msrs
->exit_ctls_low
,
1473 msrs
->exit_ctls_high
);
1474 if (msr_index
== MSR_IA32_VMX_EXIT_CTLS
)
1475 *pdata
|= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR
;
1477 case MSR_IA32_VMX_TRUE_ENTRY_CTLS
:
1478 case MSR_IA32_VMX_ENTRY_CTLS
:
1479 *pdata
= vmx_control_msr(
1480 msrs
->entry_ctls_low
,
1481 msrs
->entry_ctls_high
);
1482 if (msr_index
== MSR_IA32_VMX_ENTRY_CTLS
)
1483 *pdata
|= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR
;
1485 case MSR_IA32_VMX_MISC
:
1486 *pdata
= vmx_control_msr(
1490 case MSR_IA32_VMX_CR0_FIXED0
:
1491 *pdata
= msrs
->cr0_fixed0
;
1493 case MSR_IA32_VMX_CR0_FIXED1
:
1494 *pdata
= msrs
->cr0_fixed1
;
1496 case MSR_IA32_VMX_CR4_FIXED0
:
1497 *pdata
= msrs
->cr4_fixed0
;
1499 case MSR_IA32_VMX_CR4_FIXED1
:
1500 *pdata
= msrs
->cr4_fixed1
;
1502 case MSR_IA32_VMX_VMCS_ENUM
:
1503 *pdata
= msrs
->vmcs_enum
;
1505 case MSR_IA32_VMX_PROCBASED_CTLS2
:
1506 *pdata
= vmx_control_msr(
1507 msrs
->secondary_ctls_low
,
1508 msrs
->secondary_ctls_high
);
1510 case MSR_IA32_VMX_EPT_VPID_CAP
:
1511 *pdata
= msrs
->ept_caps
|
1512 ((u64
)msrs
->vpid_caps
<< 32);
1514 case MSR_IA32_VMX_VMFUNC
:
1515 *pdata
= msrs
->vmfunc_controls
;
1525 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1526 * been modified by the L1 guest. Note, "writable" in this context means
1527 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1528 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1529 * VM-exit information fields (which are actually writable if the vCPU is
1530 * configured to support "VMWRITE to any supported field in the VMCS").
1532 static void copy_shadow_to_vmcs12(struct vcpu_vmx
*vmx
)
1534 struct vmcs
*shadow_vmcs
= vmx
->vmcs01
.shadow_vmcs
;
1535 struct vmcs12
*vmcs12
= get_vmcs12(&vmx
->vcpu
);
1536 struct shadow_vmcs_field field
;
1540 if (WARN_ON(!shadow_vmcs
))
1545 vmcs_load(shadow_vmcs
);
1547 for (i
= 0; i
< max_shadow_read_write_fields
; i
++) {
1548 field
= shadow_read_write_fields
[i
];
1549 val
= __vmcs_readl(field
.encoding
);
1550 vmcs12_write_any(vmcs12
, field
.encoding
, field
.offset
, val
);
1553 vmcs_clear(shadow_vmcs
);
1554 vmcs_load(vmx
->loaded_vmcs
->vmcs
);
1559 static void copy_vmcs12_to_shadow(struct vcpu_vmx
*vmx
)
1561 const struct shadow_vmcs_field
*fields
[] = {
1562 shadow_read_write_fields
,
1563 shadow_read_only_fields
1565 const int max_fields
[] = {
1566 max_shadow_read_write_fields
,
1567 max_shadow_read_only_fields
1569 struct vmcs
*shadow_vmcs
= vmx
->vmcs01
.shadow_vmcs
;
1570 struct vmcs12
*vmcs12
= get_vmcs12(&vmx
->vcpu
);
1571 struct shadow_vmcs_field field
;
1575 if (WARN_ON(!shadow_vmcs
))
1578 vmcs_load(shadow_vmcs
);
1580 for (q
= 0; q
< ARRAY_SIZE(fields
); q
++) {
1581 for (i
= 0; i
< max_fields
[q
]; i
++) {
1582 field
= fields
[q
][i
];
1583 val
= vmcs12_read_any(vmcs12
, field
.encoding
,
1585 __vmcs_writel(field
.encoding
, val
);
1589 vmcs_clear(shadow_vmcs
);
1590 vmcs_load(vmx
->loaded_vmcs
->vmcs
);
1593 static void copy_enlightened_to_vmcs12(struct vcpu_vmx
*vmx
, u32 hv_clean_fields
)
1595 struct vmcs12
*vmcs12
= vmx
->nested
.cached_vmcs12
;
1596 struct hv_enlightened_vmcs
*evmcs
= vmx
->nested
.hv_evmcs
;
1598 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1599 vmcs12
->tpr_threshold
= evmcs
->tpr_threshold
;
1600 vmcs12
->guest_rip
= evmcs
->guest_rip
;
1602 if (unlikely(!(hv_clean_fields
&
1603 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC
))) {
1604 vmcs12
->guest_rsp
= evmcs
->guest_rsp
;
1605 vmcs12
->guest_rflags
= evmcs
->guest_rflags
;
1606 vmcs12
->guest_interruptibility_info
=
1607 evmcs
->guest_interruptibility_info
;
1610 if (unlikely(!(hv_clean_fields
&
1611 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC
))) {
1612 vmcs12
->cpu_based_vm_exec_control
=
1613 evmcs
->cpu_based_vm_exec_control
;
1616 if (unlikely(!(hv_clean_fields
&
1617 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN
))) {
1618 vmcs12
->exception_bitmap
= evmcs
->exception_bitmap
;
1621 if (unlikely(!(hv_clean_fields
&
1622 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY
))) {
1623 vmcs12
->vm_entry_controls
= evmcs
->vm_entry_controls
;
1626 if (unlikely(!(hv_clean_fields
&
1627 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT
))) {
1628 vmcs12
->vm_entry_intr_info_field
=
1629 evmcs
->vm_entry_intr_info_field
;
1630 vmcs12
->vm_entry_exception_error_code
=
1631 evmcs
->vm_entry_exception_error_code
;
1632 vmcs12
->vm_entry_instruction_len
=
1633 evmcs
->vm_entry_instruction_len
;
1636 if (unlikely(!(hv_clean_fields
&
1637 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1
))) {
1638 vmcs12
->host_ia32_pat
= evmcs
->host_ia32_pat
;
1639 vmcs12
->host_ia32_efer
= evmcs
->host_ia32_efer
;
1640 vmcs12
->host_cr0
= evmcs
->host_cr0
;
1641 vmcs12
->host_cr3
= evmcs
->host_cr3
;
1642 vmcs12
->host_cr4
= evmcs
->host_cr4
;
1643 vmcs12
->host_ia32_sysenter_esp
= evmcs
->host_ia32_sysenter_esp
;
1644 vmcs12
->host_ia32_sysenter_eip
= evmcs
->host_ia32_sysenter_eip
;
1645 vmcs12
->host_rip
= evmcs
->host_rip
;
1646 vmcs12
->host_ia32_sysenter_cs
= evmcs
->host_ia32_sysenter_cs
;
1647 vmcs12
->host_es_selector
= evmcs
->host_es_selector
;
1648 vmcs12
->host_cs_selector
= evmcs
->host_cs_selector
;
1649 vmcs12
->host_ss_selector
= evmcs
->host_ss_selector
;
1650 vmcs12
->host_ds_selector
= evmcs
->host_ds_selector
;
1651 vmcs12
->host_fs_selector
= evmcs
->host_fs_selector
;
1652 vmcs12
->host_gs_selector
= evmcs
->host_gs_selector
;
1653 vmcs12
->host_tr_selector
= evmcs
->host_tr_selector
;
1656 if (unlikely(!(hv_clean_fields
&
1657 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1
))) {
1658 vmcs12
->pin_based_vm_exec_control
=
1659 evmcs
->pin_based_vm_exec_control
;
1660 vmcs12
->vm_exit_controls
= evmcs
->vm_exit_controls
;
1661 vmcs12
->secondary_vm_exec_control
=
1662 evmcs
->secondary_vm_exec_control
;
1665 if (unlikely(!(hv_clean_fields
&
1666 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP
))) {
1667 vmcs12
->io_bitmap_a
= evmcs
->io_bitmap_a
;
1668 vmcs12
->io_bitmap_b
= evmcs
->io_bitmap_b
;
1671 if (unlikely(!(hv_clean_fields
&
1672 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP
))) {
1673 vmcs12
->msr_bitmap
= evmcs
->msr_bitmap
;
1676 if (unlikely(!(hv_clean_fields
&
1677 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2
))) {
1678 vmcs12
->guest_es_base
= evmcs
->guest_es_base
;
1679 vmcs12
->guest_cs_base
= evmcs
->guest_cs_base
;
1680 vmcs12
->guest_ss_base
= evmcs
->guest_ss_base
;
1681 vmcs12
->guest_ds_base
= evmcs
->guest_ds_base
;
1682 vmcs12
->guest_fs_base
= evmcs
->guest_fs_base
;
1683 vmcs12
->guest_gs_base
= evmcs
->guest_gs_base
;
1684 vmcs12
->guest_ldtr_base
= evmcs
->guest_ldtr_base
;
1685 vmcs12
->guest_tr_base
= evmcs
->guest_tr_base
;
1686 vmcs12
->guest_gdtr_base
= evmcs
->guest_gdtr_base
;
1687 vmcs12
->guest_idtr_base
= evmcs
->guest_idtr_base
;
1688 vmcs12
->guest_es_limit
= evmcs
->guest_es_limit
;
1689 vmcs12
->guest_cs_limit
= evmcs
->guest_cs_limit
;
1690 vmcs12
->guest_ss_limit
= evmcs
->guest_ss_limit
;
1691 vmcs12
->guest_ds_limit
= evmcs
->guest_ds_limit
;
1692 vmcs12
->guest_fs_limit
= evmcs
->guest_fs_limit
;
1693 vmcs12
->guest_gs_limit
= evmcs
->guest_gs_limit
;
1694 vmcs12
->guest_ldtr_limit
= evmcs
->guest_ldtr_limit
;
1695 vmcs12
->guest_tr_limit
= evmcs
->guest_tr_limit
;
1696 vmcs12
->guest_gdtr_limit
= evmcs
->guest_gdtr_limit
;
1697 vmcs12
->guest_idtr_limit
= evmcs
->guest_idtr_limit
;
1698 vmcs12
->guest_es_ar_bytes
= evmcs
->guest_es_ar_bytes
;
1699 vmcs12
->guest_cs_ar_bytes
= evmcs
->guest_cs_ar_bytes
;
1700 vmcs12
->guest_ss_ar_bytes
= evmcs
->guest_ss_ar_bytes
;
1701 vmcs12
->guest_ds_ar_bytes
= evmcs
->guest_ds_ar_bytes
;
1702 vmcs12
->guest_fs_ar_bytes
= evmcs
->guest_fs_ar_bytes
;
1703 vmcs12
->guest_gs_ar_bytes
= evmcs
->guest_gs_ar_bytes
;
1704 vmcs12
->guest_ldtr_ar_bytes
= evmcs
->guest_ldtr_ar_bytes
;
1705 vmcs12
->guest_tr_ar_bytes
= evmcs
->guest_tr_ar_bytes
;
1706 vmcs12
->guest_es_selector
= evmcs
->guest_es_selector
;
1707 vmcs12
->guest_cs_selector
= evmcs
->guest_cs_selector
;
1708 vmcs12
->guest_ss_selector
= evmcs
->guest_ss_selector
;
1709 vmcs12
->guest_ds_selector
= evmcs
->guest_ds_selector
;
1710 vmcs12
->guest_fs_selector
= evmcs
->guest_fs_selector
;
1711 vmcs12
->guest_gs_selector
= evmcs
->guest_gs_selector
;
1712 vmcs12
->guest_ldtr_selector
= evmcs
->guest_ldtr_selector
;
1713 vmcs12
->guest_tr_selector
= evmcs
->guest_tr_selector
;
1716 if (unlikely(!(hv_clean_fields
&
1717 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2
))) {
1718 vmcs12
->tsc_offset
= evmcs
->tsc_offset
;
1719 vmcs12
->virtual_apic_page_addr
= evmcs
->virtual_apic_page_addr
;
1720 vmcs12
->xss_exit_bitmap
= evmcs
->xss_exit_bitmap
;
1723 if (unlikely(!(hv_clean_fields
&
1724 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR
))) {
1725 vmcs12
->cr0_guest_host_mask
= evmcs
->cr0_guest_host_mask
;
1726 vmcs12
->cr4_guest_host_mask
= evmcs
->cr4_guest_host_mask
;
1727 vmcs12
->cr0_read_shadow
= evmcs
->cr0_read_shadow
;
1728 vmcs12
->cr4_read_shadow
= evmcs
->cr4_read_shadow
;
1729 vmcs12
->guest_cr0
= evmcs
->guest_cr0
;
1730 vmcs12
->guest_cr3
= evmcs
->guest_cr3
;
1731 vmcs12
->guest_cr4
= evmcs
->guest_cr4
;
1732 vmcs12
->guest_dr7
= evmcs
->guest_dr7
;
1735 if (unlikely(!(hv_clean_fields
&
1736 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER
))) {
1737 vmcs12
->host_fs_base
= evmcs
->host_fs_base
;
1738 vmcs12
->host_gs_base
= evmcs
->host_gs_base
;
1739 vmcs12
->host_tr_base
= evmcs
->host_tr_base
;
1740 vmcs12
->host_gdtr_base
= evmcs
->host_gdtr_base
;
1741 vmcs12
->host_idtr_base
= evmcs
->host_idtr_base
;
1742 vmcs12
->host_rsp
= evmcs
->host_rsp
;
1745 if (unlikely(!(hv_clean_fields
&
1746 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT
))) {
1747 vmcs12
->ept_pointer
= evmcs
->ept_pointer
;
1748 vmcs12
->virtual_processor_id
= evmcs
->virtual_processor_id
;
1751 if (unlikely(!(hv_clean_fields
&
1752 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1
))) {
1753 vmcs12
->vmcs_link_pointer
= evmcs
->vmcs_link_pointer
;
1754 vmcs12
->guest_ia32_debugctl
= evmcs
->guest_ia32_debugctl
;
1755 vmcs12
->guest_ia32_pat
= evmcs
->guest_ia32_pat
;
1756 vmcs12
->guest_ia32_efer
= evmcs
->guest_ia32_efer
;
1757 vmcs12
->guest_pdptr0
= evmcs
->guest_pdptr0
;
1758 vmcs12
->guest_pdptr1
= evmcs
->guest_pdptr1
;
1759 vmcs12
->guest_pdptr2
= evmcs
->guest_pdptr2
;
1760 vmcs12
->guest_pdptr3
= evmcs
->guest_pdptr3
;
1761 vmcs12
->guest_pending_dbg_exceptions
=
1762 evmcs
->guest_pending_dbg_exceptions
;
1763 vmcs12
->guest_sysenter_esp
= evmcs
->guest_sysenter_esp
;
1764 vmcs12
->guest_sysenter_eip
= evmcs
->guest_sysenter_eip
;
1765 vmcs12
->guest_bndcfgs
= evmcs
->guest_bndcfgs
;
1766 vmcs12
->guest_activity_state
= evmcs
->guest_activity_state
;
1767 vmcs12
->guest_sysenter_cs
= evmcs
->guest_sysenter_cs
;
1772 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1773 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1774 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1775 * vmcs12->page_fault_error_code_mask =
1776 * evmcs->page_fault_error_code_mask;
1777 * vmcs12->page_fault_error_code_match =
1778 * evmcs->page_fault_error_code_match;
1779 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1780 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1781 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1782 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1787 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1788 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1789 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1790 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1791 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1792 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1793 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1794 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1795 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1796 * vmcs12->exit_qualification = evmcs->exit_qualification;
1797 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1799 * Not present in struct vmcs12:
1800 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1801 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1802 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1803 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1809 static void copy_vmcs12_to_enlightened(struct vcpu_vmx
*vmx
)
1811 struct vmcs12
*vmcs12
= vmx
->nested
.cached_vmcs12
;
1812 struct hv_enlightened_vmcs
*evmcs
= vmx
->nested
.hv_evmcs
;
1815 * Should not be changed by KVM:
1817 * evmcs->host_es_selector = vmcs12->host_es_selector;
1818 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1819 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1820 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1821 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1822 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1823 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1824 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1825 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1826 * evmcs->host_cr0 = vmcs12->host_cr0;
1827 * evmcs->host_cr3 = vmcs12->host_cr3;
1828 * evmcs->host_cr4 = vmcs12->host_cr4;
1829 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1830 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1831 * evmcs->host_rip = vmcs12->host_rip;
1832 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1833 * evmcs->host_fs_base = vmcs12->host_fs_base;
1834 * evmcs->host_gs_base = vmcs12->host_gs_base;
1835 * evmcs->host_tr_base = vmcs12->host_tr_base;
1836 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1837 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1838 * evmcs->host_rsp = vmcs12->host_rsp;
1839 * sync_vmcs02_to_vmcs12() doesn't read these:
1840 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1841 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1842 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1843 * evmcs->ept_pointer = vmcs12->ept_pointer;
1844 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1845 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1846 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1847 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1848 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1849 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1850 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1851 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1852 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1853 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1854 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1855 * evmcs->page_fault_error_code_mask =
1856 * vmcs12->page_fault_error_code_mask;
1857 * evmcs->page_fault_error_code_match =
1858 * vmcs12->page_fault_error_code_match;
1859 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1860 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1861 * evmcs->tsc_offset = vmcs12->tsc_offset;
1862 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1863 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1864 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1865 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1866 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1867 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1868 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1869 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1871 * Not present in struct vmcs12:
1872 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1873 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1874 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1875 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!vmx->nested.enlightened_vmcs_enabled))
		return EVMPTRLD_DISABLED;

	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = -1ull;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
		 * value to first u32 field of eVMCS which should specify eVMCS
		 * VersionNumber.
		 *
		 * Guest should be aware of supported eVMCS versions by host by
		 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
		 * expected to set this CPUID leaf according to the value
		 * returned in vmcs_version from nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * to their own invented interface: When Hyper-V use eVMCS, it
		 * just sets first u32 field of eVMCS to revision_id specified
		 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
		 * which is one of the supported versions specified in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To overcome Hyper-V bug, we accept here either a supported
		 * eVMCS version or VMCS12 revision_id as valid values for first
		 * u32 field of eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
		 * reloaded from guest's memory (read only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}
	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed)
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return EVMPTRLD_SUCCEEDED;
}
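
/*
 * Illustrative note (not part of the original file): hv_clean_fields acts as
 * a per-group dirty mask.  A minimal sketch of the consumer side, assuming a
 * hypothetical helper per field group:
 *
 *	if (!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))
 *		copy_guest_grp2_from_evmcs(vmcs12, evmcs);	// group is dirty
 *
 * Clearing every bit, as done above for VMLAUNCH or when the eVMCS GPA
 * changes, simply forces all groups to be re-read on the next entry.
 */
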
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		copy_vmcs12_to_enlightened(vmx);
	else
		copy_vmcs12_to_shadow(vmx);

	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}

static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
			VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;

	if (!vmx->nested.has_preemption_timer_deadline) {
		vmx->nested.preemption_timer_deadline =
			vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
		vmx->nested.has_preemption_timer_deadline = true;
	}
	return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
}

static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
					u64 preemption_timeout)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ktime_add_ns(ktime_get(), preemption_timeout),
		      HRTIMER_MODE_ABS_PINNED);
}
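
/*
 * Illustrative example (not in the original source): the conversion above
 * turns vmcs12 preemption-timer ticks into nanoseconds as
 *
 *	ns = (ticks << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) * 1000000
 *			/ vcpu->arch.virtual_tsc_khz
 *
 * e.g. assuming a 2 GHz guest TSC (virtual_tsc_khz == 2000000) and a rate
 * shift of 5, a vmcs12 value of 1000 ticks arms the hrtimer for
 * (1000 << 5) * 1000000 / 2000000 = 16000 ns.
 */
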
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
		return vmcs12->guest_ia32_efer;
	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
	else
		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
}

static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here).  Host
	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
	 * will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	vmx->nested.vmcs02_initialized = true;

	/*
	 * We don't care what the EPTP value is we just need to guarantee
	 * it's valid so we don't get a false positive when doing early
	 * consistency checks.
	 */
	if (enable_ept && nested_early_check)
		vmcs_write64(EPT_POINTER,
			     construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));

	/* All VMFUNCs are currently emulated through L0 vmexits. */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}

static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, -1ull);

	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}

static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 */
	exec_control = vmx_pin_based_exec_ctrl(vmx);
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12. */
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
		vmx->nested.pi_pending = false;
	} else {
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = vmx_exec_control(vmx); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	vmx->nested.l1_tpr_threshold = -1;
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = vmx->secondary_exec_control;

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_TSC_SCALING);
		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
		    (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 */
	exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != host_efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = vmx_vmexit_ctrl();
	if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 */
	if (vmx->nested.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}
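
/*
 * Illustrative sketch (not in the original source) of the control-merging
 * pattern used above: start from L0's required bits, strip the bits that KVM
 * emulates itself, then OR in what vmcs12 (L1) asked for, e.g.
 *
 *	exec_control  = vmx_exec_control(vmx);		// L0's desires
 *	exec_control &= ~CPU_BASED_TPR_SHADOW;		// emulated, decided later
 *	exec_control |= vmcs12->cpu_based_vm_exec_control;
 *	exec_controls_set(vmx, exec_control);
 *
 * The same structure repeats for the pin-based, secondary, entry and exit
 * controls.
 */
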
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		vmx->segment_cache.bitmask = 0;
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access the L2's PDPTR, so save them to construct
		 * vmcs12
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, we
	 * simply ask to exit on each and every L2 page fault. This is done by
	 * setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	set_cr4_guest_host_mask(vmx);
}
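
/*
 * Illustrative note (not in the original source): architecturally a #PF with
 * error code PFEC causes a VM-exit iff
 *
 *	((PFEC & PFEC_MASK) == PFEC_MATCH) == EB.PF
 *
 * so writing MASK = MATCH = 0 together with EB.PF = 1, as done above when L0
 * needs to intercept page faults, makes the left-hand side always true and
 * therefore forces an exit on every L2 #PF.
 */
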
/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 * is assigned to entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
			!(vmx->nested.hv_evmcs->hv_clean_fields &
			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
	}
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
	 * bits which we consider mandatory enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; It's not enough to take
	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
	 * have more bits than L1 expected.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 */
	if (CC(!vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if (!enable_ept)
		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
				     vmcs12->guest_ia32_perf_global_ctrl)))
		return -EINVAL;

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself, it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		vmx->nested.hv_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Page-walk levels validity. */
	switch (new_eptp & VMX_EPTP_PWL_MASK) {
	case VMX_EPTP_PWL_5:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
			return false;
		break;
	case VMX_EPTP_PWL_4:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Reserved bits should not be set */
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
		return false;

	/* AD, if set, should be supported */
	if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
			return false;
	}

	return true;
}
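
/*
 * Illustrative example (not in the original source): a typical EPTP that
 * passes the checks above encodes, in its low bits, the memory type, the
 * page-walk length and (optionally) A/D enabling, e.g.
 *
 *	eptp = root_hpa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | VMX_EPTP_AD_ENABLE_BIT;
 *
 * where root_hpa stands for the page-aligned root of the EPT paging
 * structures, assuming the corresponding capabilities are reported in
 * vmx->nested.msrs.ept_caps.
 */
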
/*
 * Checks related to VM-Execution Control Fields
 */
static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
				   vmx->nested.msrs.pinbased_ctls_low,
				   vmx->nested.msrs.pinbased_ctls_high)) ||
	    CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
				   vmx->nested.msrs.procbased_ctls_low,
				   vmx->nested.msrs.procbased_ctls_high)))
		return -EINVAL;

	if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
	    CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
				   vmx->nested.msrs.secondary_ctls_low,
				   vmx->nested.msrs.secondary_ctls_high)))
		return -EINVAL;

	if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
	    nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
	    nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
	    nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
	    nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
	    nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
	    nested_vmx_check_nmi_controls(vmcs12) ||
	    nested_vmx_check_pml_controls(vcpu, vmcs12) ||
	    nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
	    nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
	    nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
	    CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
		return -EINVAL;

	if (!nested_cpu_has_preemption_timer(vmcs12) &&
	    nested_cpu_has_save_preemption_timer(vmcs12))
		return -EINVAL;

	if (nested_cpu_has_ept(vmcs12) &&
	    CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
		return -EINVAL;

	if (nested_cpu_has_vmfunc(vmcs12)) {
		if (CC(vmcs12->vm_function_control &
		       ~vmx->nested.msrs.vmfunc_controls))
			return -EINVAL;

		if (nested_cpu_has_eptp_switching(vmcs12)) {
			if (CC(!nested_cpu_has_ept(vmcs12)) ||
			    CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
				return -EINVAL;
		}
	}

	return 0;
}

/*
 * Checks related to VM-Exit Control Fields
 */
static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
				   vmx->nested.msrs.exit_ctls_low,
				   vmx->nested.msrs.exit_ctls_high)) ||
	    CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
		return -EINVAL;

	return 0;
}

/*
 * Checks related to VM-Entry Control Fields
 */
static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
				   vmx->nested.msrs.entry_ctls_low,
				   vmx->nested.msrs.entry_ctls_high)))
		return -EINVAL;

	/*
	 * From the Intel SDM, volume 3:
	 * Fields relevant to VM-entry event injection must be set properly.
	 * These fields are the VM-entry interruption-information field, the
	 * VM-entry exception error code, and the VM-entry instruction length.
	 */
	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
		u32 intr_info = vmcs12->vm_entry_intr_info_field;
		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
		bool should_have_error_code;
		bool urg = nested_cpu_has2(vmcs12,
					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;

		/* VM-entry interruption-info field: interruption type */
		if (CC(intr_type == INTR_TYPE_RESERVED) ||
		    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
		       !nested_cpu_supports_monitor_trap_flag(vcpu)))
			return -EINVAL;

		/* VM-entry interruption-info field: vector */
		if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
		    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
		    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
			return -EINVAL;

		/* VM-entry interruption-info field: deliver error code */
		should_have_error_code =
			intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
			x86_exception_has_error_code(vector);
		if (CC(has_error_code != should_have_error_code))
			return -EINVAL;

		/* VM-entry exception error code */
		if (CC(has_error_code &&
		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
			return -EINVAL;

		/* VM-entry interruption-info field: reserved bits */
		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
			return -EINVAL;

		/* VM-entry instruction length */
		switch (intr_type) {
		case INTR_TYPE_SOFT_EXCEPTION:
		case INTR_TYPE_SOFT_INTR:
		case INTR_TYPE_PRIV_SW_EXCEPTION:
			if (CC(vmcs12->vm_entry_instruction_len > 15) ||
			    CC(vmcs12->vm_entry_instruction_len == 0 &&
			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
				return -EINVAL;
		}
	}

	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
	    nested_check_vm_entry_controls(vcpu, vmcs12))
		return -EINVAL;

	if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
		return nested_evmcs_check_controls(vmcs12);

	return 0;
}

static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	bool ia32e;

	if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
	    CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
	    CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
		return -EINVAL;

	if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
		return -EINVAL;

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
		return -EINVAL;

	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->host_ia32_perf_global_ctrl)))
		return -EINVAL;

#ifdef CONFIG_X86_64
	ia32e = !!(vcpu->arch.efer & EFER_LMA);
#else
	ia32e = false;
#endif

	if (ia32e) {
		if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
		    CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
			return -EINVAL;
	} else {
		if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
		    CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
		    CC((vmcs12->host_rip) >> 32))
			return -EINVAL;
	}

	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_cs_selector == 0) ||
	    CC(vmcs12->host_tr_selector == 0) ||
	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
		return -EINVAL;

	if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
	 * the values of the LMA and LME bits in the field must each be that of
	 * the host address-space size VM-exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
			return -EINVAL;
	}

	return 0;
}

static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	int r = 0;
	struct vmcs12 *shadow;
	struct kvm_host_map map;

	if (vmcs12->vmcs_link_pointer == -1ull)
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
		return -EINVAL;

	if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
		return -EINVAL;

	shadow = map.hva;

	if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
	    CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
		r = -EINVAL;

	kvm_vcpu_unmap(vcpu, &map, false);
	return r;
}

/*
 * Checks related to Guest Non-register State
 */
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{
	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT &&
	       vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					enum vm_entry_failure_code *entry_failure_code)
{
	bool ia32e;

	*entry_failure_code = ENTRY_FAIL_DEFAULT;

	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
		return -EINVAL;

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
	    CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
		return -EINVAL;

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
		return -EINVAL;

	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
		*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
		return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->guest_ia32_perf_global_ctrl)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-entry control is 1, the following checks
	 * are performed on the field for the IA32_EFER MSR:
	 * - Bits reserved in the IA32_EFER MSR must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-exit control. It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;
	bool vm_fail;

	if (!nested_early_check)
		return 0;

	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	preempt_disable();

	vmx_prepare_switch_to_guest(vcpu);

	/*
	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
	 * there is no need to preserve other bits or save/restore the field.
	 */
	vmcs_writel(GUEST_RFLAGS, 0);

	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
				 vmx->loaded_vmcs->launched);

	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vm_fail) {
		u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);

		preempt_enable();

		trace_kvm_nested_vmenter_failed(
			"early hardware check VM-instruction error: ", error);
		WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		return 1;
	}

	/*
	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
	 */
	if (hw_breakpoint_active())
		set_debugreg(__this_cpu_read(cpu_dr7), 7);
	local_irq_enable();
	preempt_enable();

	/*
	 * A non-failing VMEntry means we somehow entered guest mode with
	 * an illegal RIP, and that's just the tip of the iceberg.  There
	 * is no telling what memory has been modified or what state has
	 * been exposed to unknown code.  Hitting this all but guarantees
	 * a (very critical) hardware issue.
	 */
	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
		VMX_EXIT_REASONS_FAILED_VMENTRY));

	return 0;
}

static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up being not mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
	 */
	if (vmx->nested.enlightened_vmcs_enabled &&
	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
		enum nested_evmptrld_status evmptrld_status =
			nested_vmx_handle_enlightened_vmptrld(vcpu, false);

		if (evmptrld_status == EVMPTRLD_VMFAIL ||
		    evmptrld_status == EVMPTRLD_ERROR)
			return false;

		/*
		 * Post migration VMCS12 always provides the most actual
		 * information, copy it to eVMCS upon entry.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}

	return true;
}

static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;
	struct page *page;
	u64 hpa;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
			return false;
	}

	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		/*
		 * Translate L1 physical address to host physical
		 * address for vmcs02. Keep the page pinned, so this
		 * physical address remains valid. We keep a reference
		 * to it so we can release it later.
		 */
		if (vmx->nested.apic_access_page) { /* shouldn't happen */
			kvm_release_page_clean(vmx->nested.apic_access_page);
			vmx->nested.apic_access_page = NULL;
		}
		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
		if (!is_error_page(page)) {
			vmx->nested.apic_access_page = page;
			hpa = page_to_phys(vmx->nested.apic_access_page);
			vmcs_write64(APIC_ACCESS_ADDR, hpa);
		} else {
			pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
					     __func__);
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
				KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return false;
		}
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		map = &vmx->nested.virtual_apic_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
			/*
			 * The processor will never use the TPR shadow, simply
			 * clear the bit from the execution control.  Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		} else {
			/*
			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
			 * access the contents of the VMCS12 posted interrupt
			 * descriptor. (Note that KVM may do this when it
			 * should not, per the architectural specification.)
			 */
			vmx->nested.pi_desc = NULL;
			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
		}
	}
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);

	return true;
}

static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
	if (!nested_get_evmcs_page(vcpu)) {
		pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
				     __func__);
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_EMULATION;
		vcpu->run->internal.ndata = 0;

		return false;
	}

	if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
		return false;

	return true;
}

static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct vmcs12 *vmcs12;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t dst;

	if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
		return 0;

	if (WARN_ON_ONCE(vmx->nested.pml_full))
		return 1;

	/*
	 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
	 * set is already checked as part of A/D emulation.
	 */
	vmcs12 = get_vmcs12(vcpu);
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
		vmx->nested.pml_full = true;
		return 1;
	}

	gpa &= ~0xFFFull;
	dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;

	if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
				 offset_in_page(dst), sizeof(gpa)))
		return 0;

	vmcs12->guest_pml_index--;

	return 0;
}
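
/*
 * Illustrative example (not in the original source): with a 512-entry PML
 * buffer, a guest_pml_index of 511 and pml_address == 0x1000, the GPA above
 * is written to guest physical address 0x1000 + 8 * 511 = 0x1ff8 and the
 * index is then decremented, mirroring how hardware fills the log from the
 * top down.
 */
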
/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}

	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 0;
	}

	return 1;
}

static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
{
	u8 rvi = vmx_get_rvi();
	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);

	return ((rvi & 0xf0) > (vppr & 0xf0));
}
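
/*
 * Illustrative example (not in the original source): only the upper nibble
 * (the priority class) matters, so rvi == 0x51 against vppr == 0x30 compares
 * 0x50 > 0x30 and reports a deliverable interrupt, while rvi == 0x35 against
 * the same vppr does not.
 */
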
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12);

/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
 *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
 *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
 */
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
							bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	enum vm_entry_failure_code entry_failure_code;
	bool evaluate_pending_interrupts;
	union vmx_exit_reason exit_reason = {
		.basic = EXIT_REASON_INVALID_STATE,
		.failed_vmentry = 1,
	};
	u32 failed_index;

	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
		kvm_vcpu_flush_tlb_current(vcpu);

	evaluate_pending_interrupts = exec_controls_get(vmx) &
		(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);

	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	if (kvm_mpx_supported() &&
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
	 * nested early checks are disabled.  In the event of a "late" VM-Fail,
	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
	 * software model to the pre-VMEntry host state.  When EPT is disabled,
	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
	 * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
	 * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
	 * path would need to manually save/restore vmcs01.GUEST_CR3.
	 */
	if (!enable_ept && !nested_early_check)
		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);

	prepare_vmcs02_early(vmx, vmcs12);

	if (from_vmentry) {
		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
		}

		if (nested_vmx_check_vmentry_hw(vcpu)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_VMFAIL;
		}

		if (nested_vmx_check_guest_state(vcpu, vmcs12,
						 &entry_failure_code)) {
			exit_reason.basic = EXIT_REASON_INVALID_STATE;
			vmcs12->exit_qualification = entry_failure_code;
			goto vmentry_fail_vmexit;
		}
	}

	enter_guest_mode(vcpu);

	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
		exit_reason.basic = EXIT_REASON_INVALID_STATE;
		vmcs12->exit_qualification = entry_failure_code;
		goto vmentry_fail_vmexit_guest_mode;
	}

	if (from_vmentry) {
		failed_index = nested_vmx_load_msr(vcpu,
						   vmcs12->vm_entry_msr_load_addr,
						   vmcs12->vm_entry_msr_load_count);
		if (failed_index) {
			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
			vmcs12->exit_qualification = failed_index;
			goto vmentry_fail_vmexit_guest_mode;
		}
	} else {
		/*
		 * The MMU is not initialized to point at the right entities yet and
		 * "get pages" would need to read data from the guest (i.e. we will
		 * need to perform gpa to hpa translation). Request a call
		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
		 * have already been set at vmentry time and should not be reset.
		 */
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	}

	/*
	 * If L1 had a pending IRQ/NMI until it executed
	 * VMLAUNCH/VMRESUME which wasn't delivered because it was
	 * disallowed (e.g. interrupts disabled), L0 needs to
	 * evaluate if this pending event should cause an exit from L2
	 * to L1 or delivered directly to L2 (e.g. In case L1 don't
	 * intercept EXTERNAL_INTERRUPT).
	 *
	 * Usually this would be handled by the processor noticing an
	 * IRQ/NMI window request, or checking RVI during evaluation of
	 * pending virtual interrupts.  However, this setting was done
	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
	 */
	if (unlikely(evaluate_pending_interrupts))
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * Do not start the preemption timer hrtimer until after we know
	 * we are successful, so that only nested_vmx_vmexit needs to cancel
	 * the timer.
	 */
	vmx->nested.preemption_timer_expired = false;
	if (nested_cpu_has_preemption_timer(vmcs12)) {
		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
		vmx_start_preemption_timer(vcpu, timer_value);
	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
	 * returned as far as L1 is concerned. It will only return (and set
	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
	 */
	return NVMX_VMENTRY_SUCCESS;

	/*
	 * A failed consistency check that leads to a VMExit during L1's
	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
	 * 26.7 "VM-entry failures during or after loading guest state".
	 */
vmentry_fail_vmexit_guest_mode:
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	} else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
		return nested_vmx_failInvalid(vcpu);
	}

	if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
	       vmx->nested.current_vmptr == -1ull))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and act appropriately when
	 * they fail: As the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will anyway be caught by the processor
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vmx->nested.nested_run_pending = 1;
	vmx->nested.has_preemption_timer_deadline = false;
	status = nested_vmx_enter_non_root_mode(vcpu, true);
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

	/* Emulate processing of posted interrupts on VM-Enter. */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
	}

	/* Hide L1D cache contents from the nested guest. */
	vmx->vcpu.arch.l1tf_flush_l1d = true;

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that vmcs12 cache was
	 * transferred as part of captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	switch (vmcs12->guest_activity_state) {
	case GUEST_ACTIVITY_HLT:
		/*
		 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
		 * awakened by event injection or by an NMI-window VM-exit or
		 * by an interrupt-window VM-exit, halt the vcpu.
		 */
		if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
		    !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
		    !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
		      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
			vmx->nested.nested_run_pending = 0;
			return kvm_vcpu_halt(vcpu);
		}
		break;
	case GUEST_ACTIVITY_WAIT_SIPI:
		vmx->nested.nested_run_pending = 0;
		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
		break;
	default:
		break;
	}

	return 1;

vmentry_failed:
	vmx->nested.nested_run_pending = 0;
	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
		return 0;
	if (status == NVMX_VMENTRY_VMEXIT)
		return 1;
	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
	return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
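
/*
 * Like all exit handlers, nested_vmx_run() returns 1 to resume the vCPU (the
 * normal case, whether the entry into L2 succeeded or a VMfail/VM-exit was
 * emulated for L1) and 0 to bail out to userspace, e.g. on
 * NVMX_VMENTRY_KVM_INTERNAL_ERROR.
 */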
/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0).
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
			vcpu->arch.cr0_guest_owned_bits));
}
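
/*
 * A concrete instance of case 3 above: if, for instance, L0 traps CR0.TS (to
 * manage FPU state lazily) but L1 does not, L2's updates to CR0.TS end up in
 * vmcs02's CR0_READ_SHADOW rather than in GUEST_CR0, so that shadow is where
 * the value L1 expects to see must be taken from.
 */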
static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	return
	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
			vcpu->arch.cr4_guest_owned_bits));
}
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				      struct vmcs12 *vmcs12)
{
	u32 idt_vectoring;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (kvm_exception_is_soft(nr)) {
			vmcs12->vm_exit_instruction_len =
				vcpu->arch.event_exit_inst_len;
			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
		} else
			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;

		if (vcpu->arch.exception.has_error_code) {
			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
			vmcs12->idt_vectoring_error_code =
				vcpu->arch.exception.error_code;
		}

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else if (vcpu->arch.nmi_injected) {
		vmcs12->idt_vectoring_info_field =
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (vcpu->arch.interrupt.soft) {
			idt_vectoring |= INTR_TYPE_SOFT_INTR;
			vmcs12->vm_entry_instruction_len =
				vcpu->arch.event_exit_inst_len;
		} else
			idt_vectoring |= INTR_TYPE_EXT_INTR;

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	}
}
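
/*
 * The value built here follows the hardware IDT-vectoring information layout:
 * bits 7:0 hold the vector, bits 10:8 the event type (hard/soft exception,
 * NMI, external or soft interrupt), bit 11 the "error code valid" flag, and
 * bit 31 the overall valid bit, which is what the masks above encode.
 */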
void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	gfn_t gfn;

	/*
	 * Don't need to mark the APIC access page dirty; it is never
	 * written to by the CPU during APIC virtualization.
	 */

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
	}
}
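
/*
 * Marking these pages dirty matters because the CPU (or KVM on its behalf)
 * may write to the virtual-APIC page and the posted-interrupt descriptor
 * while L2 runs; without this, dirty logging (e.g. during live migration of
 * L1) could miss those updates.
 */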
3729 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu
*vcpu
)
3731 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3736 if (!vmx
->nested
.pi_pending
)
3739 if (!vmx
->nested
.pi_desc
)
3742 vmx
->nested
.pi_pending
= false;
3744 if (!pi_test_and_clear_on(vmx
->nested
.pi_desc
))
3747 max_irr
= find_last_bit((unsigned long *)vmx
->nested
.pi_desc
->pir
, 256);
3748 if (max_irr
!= 256) {
3749 vapic_page
= vmx
->nested
.virtual_apic_map
.hva
;
3753 __kvm_apic_update_irr(vmx
->nested
.pi_desc
->pir
,
3754 vapic_page
, &max_irr
);
3755 status
= vmcs_read16(GUEST_INTR_STATUS
);
3756 if ((u8
)max_irr
> ((u8
)status
& 0xff)) {
3758 status
|= (u8
)max_irr
;
3759 vmcs_write16(GUEST_INTR_STATUS
, status
);
3763 nested_mark_vmcs12_pages_dirty(vcpu
);
3767 kvm_handle_memory_failure(vcpu
, X86EMUL_IO_NEEDED
, NULL
);
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
					       unsigned long exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	if (vcpu->arch.exception.has_error_code) {
		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (kvm_exception_is_soft(nr))
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
	    vmx_get_nmi_mask(vcpu))
		intr_info |= INTR_INFO_UNBLOCK_NMI;

	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}
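
/*
 * The synthesized exit presents the exception to L1 the way hardware would:
 * vector, type and error-code-valid flag in VM_EXIT_INTR_INFO, the error code
 * (if any) in VM_EXIT_INTR_ERROR_CODE, and the payload (e.g. the faulting
 * address for #PF) in the exit qualification supplied by the caller.
 */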
/*
 * Returns true if a debug trap is pending delivery.
 *
 * In KVM, debug traps bear an exception payload. As such, the class of a #DB
 * exception may be inferred from the presence of an exception payload.
 */
static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.exception.pending &&
	       vcpu->arch.exception.nr == DB_VECTOR &&
	       vcpu->arch.exception.payload;
}

/*
 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
 * represents these debug traps with a payload that is said to be compatible
 * with the 'pending debug exceptions' field, write the payload to the VMCS
 * field if a VM-exit is delivered before the debug trap.
 */
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{
	if (vmx_pending_dbg_trap(vcpu))
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vcpu->arch.exception.payload);
}

static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
	       to_vmx(vcpu)->nested.preemption_timer_expired;
}
3828 static int vmx_check_nested_events(struct kvm_vcpu
*vcpu
)
3830 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
3831 unsigned long exit_qual
;
3832 bool block_nested_events
=
3833 vmx
->nested
.nested_run_pending
|| kvm_event_needs_reinjection(vcpu
);
3834 bool mtf_pending
= vmx
->nested
.mtf_pending
;
3835 struct kvm_lapic
*apic
= vcpu
->arch
.apic
;
3838 * Clear the MTF state. If a higher priority VM-exit is delivered first,
3839 * this state is discarded.
3841 if (!block_nested_events
)
3842 vmx
->nested
.mtf_pending
= false;
3844 if (lapic_in_kernel(vcpu
) &&
3845 test_bit(KVM_APIC_INIT
, &apic
->pending_events
)) {
3846 if (block_nested_events
)
3848 nested_vmx_update_pending_dbg(vcpu
);
3849 clear_bit(KVM_APIC_INIT
, &apic
->pending_events
);
3850 if (vcpu
->arch
.mp_state
!= KVM_MP_STATE_INIT_RECEIVED
)
3851 nested_vmx_vmexit(vcpu
, EXIT_REASON_INIT_SIGNAL
, 0, 0);
3855 if (lapic_in_kernel(vcpu
) &&
3856 test_bit(KVM_APIC_SIPI
, &apic
->pending_events
)) {
3857 if (block_nested_events
)
3860 clear_bit(KVM_APIC_SIPI
, &apic
->pending_events
);
3861 if (vcpu
->arch
.mp_state
== KVM_MP_STATE_INIT_RECEIVED
)
3862 nested_vmx_vmexit(vcpu
, EXIT_REASON_SIPI_SIGNAL
, 0,
3863 apic
->sipi_vector
& 0xFFUL
);
3868 * Process any exceptions that are not debug traps before MTF.
3870 * Note that only a pending nested run can block a pending exception.
3871 * Otherwise an injected NMI/interrupt should either be
3872 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
3873 * while delivering the pending exception.
3876 if (vcpu
->arch
.exception
.pending
&& !vmx_pending_dbg_trap(vcpu
)) {
3877 if (vmx
->nested
.nested_run_pending
)
3879 if (!nested_vmx_check_exception(vcpu
, &exit_qual
))
3881 nested_vmx_inject_exception_vmexit(vcpu
, exit_qual
);
3886 if (block_nested_events
)
3888 nested_vmx_update_pending_dbg(vcpu
);
3889 nested_vmx_vmexit(vcpu
, EXIT_REASON_MONITOR_TRAP_FLAG
, 0, 0);
3893 if (vcpu
->arch
.exception
.pending
) {
3894 if (vmx
->nested
.nested_run_pending
)
3896 if (!nested_vmx_check_exception(vcpu
, &exit_qual
))
3898 nested_vmx_inject_exception_vmexit(vcpu
, exit_qual
);
3902 if (nested_vmx_preemption_timer_pending(vcpu
)) {
3903 if (block_nested_events
)
3905 nested_vmx_vmexit(vcpu
, EXIT_REASON_PREEMPTION_TIMER
, 0, 0);
3909 if (vcpu
->arch
.smi_pending
&& !is_smm(vcpu
)) {
3910 if (block_nested_events
)
3915 if (vcpu
->arch
.nmi_pending
&& !vmx_nmi_blocked(vcpu
)) {
3916 if (block_nested_events
)
3918 if (!nested_exit_on_nmi(vcpu
))
3921 nested_vmx_vmexit(vcpu
, EXIT_REASON_EXCEPTION_NMI
,
3922 NMI_VECTOR
| INTR_TYPE_NMI_INTR
|
3923 INTR_INFO_VALID_MASK
, 0);
3925 * The NMI-triggered VM exit counts as injection:
3926 * clear this one and block further NMIs.
3928 vcpu
->arch
.nmi_pending
= 0;
3929 vmx_set_nmi_mask(vcpu
, true);
3933 if (kvm_cpu_has_interrupt(vcpu
) && !vmx_interrupt_blocked(vcpu
)) {
3934 if (block_nested_events
)
3936 if (!nested_exit_on_intr(vcpu
))
3938 nested_vmx_vmexit(vcpu
, EXIT_REASON_EXTERNAL_INTERRUPT
, 0, 0);
3943 return vmx_complete_nested_posted_interrupt(vcpu
);
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
	ktime_t remaining =
		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
	u64 value;

	if (ktime_to_ns(remaining) <= 0)
		return 0;

	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
	do_div(value, 1000000);
	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}
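
/*
 * Worked example of the conversion above: with 2,000,000 ns left on the
 * hrtimer and virtual_tsc_khz == 3000000 (a 3 GHz guest TSC),
 * value = 2e6 * 3e6 / 1e6 = 6,000,000 TSC ticks, and shifting by
 * VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE (5) yields 187,500 timer units,
 * i.e. the emulated preemption timer ticks once per 32 TSC cycles.
 *
 * A minimal standalone sketch of the same math (hypothetical helper, kept
 * compiled out; not part of the original source):
 */
#if 0
static inline u64 ns_to_preemption_timer_units(u64 ns, u64 virtual_tsc_khz)
{
	u64 tsc_ticks = ns * virtual_tsc_khz;	/* kHz * ns */

	do_div(tsc_ticks, 1000000);		/* ... / 1e6 = TSC ticks */
	return tsc_ticks >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}
#endif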
3960 static bool is_vmcs12_ext_field(unsigned long field
)
3963 case GUEST_ES_SELECTOR
:
3964 case GUEST_CS_SELECTOR
:
3965 case GUEST_SS_SELECTOR
:
3966 case GUEST_DS_SELECTOR
:
3967 case GUEST_FS_SELECTOR
:
3968 case GUEST_GS_SELECTOR
:
3969 case GUEST_LDTR_SELECTOR
:
3970 case GUEST_TR_SELECTOR
:
3971 case GUEST_ES_LIMIT
:
3972 case GUEST_CS_LIMIT
:
3973 case GUEST_SS_LIMIT
:
3974 case GUEST_DS_LIMIT
:
3975 case GUEST_FS_LIMIT
:
3976 case GUEST_GS_LIMIT
:
3977 case GUEST_LDTR_LIMIT
:
3978 case GUEST_TR_LIMIT
:
3979 case GUEST_GDTR_LIMIT
:
3980 case GUEST_IDTR_LIMIT
:
3981 case GUEST_ES_AR_BYTES
:
3982 case GUEST_DS_AR_BYTES
:
3983 case GUEST_FS_AR_BYTES
:
3984 case GUEST_GS_AR_BYTES
:
3985 case GUEST_LDTR_AR_BYTES
:
3986 case GUEST_TR_AR_BYTES
:
3993 case GUEST_LDTR_BASE
:
3995 case GUEST_GDTR_BASE
:
3996 case GUEST_IDTR_BASE
:
3997 case GUEST_PENDING_DBG_EXCEPTIONS
:
4007 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu
*vcpu
,
4008 struct vmcs12
*vmcs12
)
4010 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4012 vmcs12
->guest_es_selector
= vmcs_read16(GUEST_ES_SELECTOR
);
4013 vmcs12
->guest_cs_selector
= vmcs_read16(GUEST_CS_SELECTOR
);
4014 vmcs12
->guest_ss_selector
= vmcs_read16(GUEST_SS_SELECTOR
);
4015 vmcs12
->guest_ds_selector
= vmcs_read16(GUEST_DS_SELECTOR
);
4016 vmcs12
->guest_fs_selector
= vmcs_read16(GUEST_FS_SELECTOR
);
4017 vmcs12
->guest_gs_selector
= vmcs_read16(GUEST_GS_SELECTOR
);
4018 vmcs12
->guest_ldtr_selector
= vmcs_read16(GUEST_LDTR_SELECTOR
);
4019 vmcs12
->guest_tr_selector
= vmcs_read16(GUEST_TR_SELECTOR
);
4020 vmcs12
->guest_es_limit
= vmcs_read32(GUEST_ES_LIMIT
);
4021 vmcs12
->guest_cs_limit
= vmcs_read32(GUEST_CS_LIMIT
);
4022 vmcs12
->guest_ss_limit
= vmcs_read32(GUEST_SS_LIMIT
);
4023 vmcs12
->guest_ds_limit
= vmcs_read32(GUEST_DS_LIMIT
);
4024 vmcs12
->guest_fs_limit
= vmcs_read32(GUEST_FS_LIMIT
);
4025 vmcs12
->guest_gs_limit
= vmcs_read32(GUEST_GS_LIMIT
);
4026 vmcs12
->guest_ldtr_limit
= vmcs_read32(GUEST_LDTR_LIMIT
);
4027 vmcs12
->guest_tr_limit
= vmcs_read32(GUEST_TR_LIMIT
);
4028 vmcs12
->guest_gdtr_limit
= vmcs_read32(GUEST_GDTR_LIMIT
);
4029 vmcs12
->guest_idtr_limit
= vmcs_read32(GUEST_IDTR_LIMIT
);
4030 vmcs12
->guest_es_ar_bytes
= vmcs_read32(GUEST_ES_AR_BYTES
);
4031 vmcs12
->guest_ds_ar_bytes
= vmcs_read32(GUEST_DS_AR_BYTES
);
4032 vmcs12
->guest_fs_ar_bytes
= vmcs_read32(GUEST_FS_AR_BYTES
);
4033 vmcs12
->guest_gs_ar_bytes
= vmcs_read32(GUEST_GS_AR_BYTES
);
4034 vmcs12
->guest_ldtr_ar_bytes
= vmcs_read32(GUEST_LDTR_AR_BYTES
);
4035 vmcs12
->guest_tr_ar_bytes
= vmcs_read32(GUEST_TR_AR_BYTES
);
4036 vmcs12
->guest_es_base
= vmcs_readl(GUEST_ES_BASE
);
4037 vmcs12
->guest_cs_base
= vmcs_readl(GUEST_CS_BASE
);
4038 vmcs12
->guest_ss_base
= vmcs_readl(GUEST_SS_BASE
);
4039 vmcs12
->guest_ds_base
= vmcs_readl(GUEST_DS_BASE
);
4040 vmcs12
->guest_fs_base
= vmcs_readl(GUEST_FS_BASE
);
4041 vmcs12
->guest_gs_base
= vmcs_readl(GUEST_GS_BASE
);
4042 vmcs12
->guest_ldtr_base
= vmcs_readl(GUEST_LDTR_BASE
);
4043 vmcs12
->guest_tr_base
= vmcs_readl(GUEST_TR_BASE
);
4044 vmcs12
->guest_gdtr_base
= vmcs_readl(GUEST_GDTR_BASE
);
4045 vmcs12
->guest_idtr_base
= vmcs_readl(GUEST_IDTR_BASE
);
4046 vmcs12
->guest_pending_dbg_exceptions
=
4047 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS
);
4048 if (kvm_mpx_supported())
4049 vmcs12
->guest_bndcfgs
= vmcs_read64(GUEST_BNDCFGS
);
4051 vmx
->nested
.need_sync_vmcs02_to_vmcs12_rare
= false;
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
		return;

	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);

	cpu = get_cpu();
	vmx->loaded_vmcs = &vmx->nested.vmcs02;
	vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);

	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
	put_cpu();
}
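
/*
 * The dance above temporarily makes vmcs02 the loaded VMCS so that the
 * vmcs_read*() calls in sync_vmcs02_to_vmcs12_rare() read L2's values, then
 * restores vmcs01 as the working VMCS; get_cpu()/put_cpu() keep the task from
 * migrating while the per-CPU current-VMCS state is being switched back and
 * forth.
 */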
4078 * Update the guest state fields of vmcs12 to reflect changes that
4079 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
4080 * VM-entry controls is also updated, since this is really a guest
4083 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu
*vcpu
, struct vmcs12
*vmcs12
)
4085 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4087 if (evmptr_is_valid(vmx
->nested
.hv_evmcs_vmptr
))
4088 sync_vmcs02_to_vmcs12_rare(vcpu
, vmcs12
);
4090 vmx
->nested
.need_sync_vmcs02_to_vmcs12_rare
=
4091 !evmptr_is_valid(vmx
->nested
.hv_evmcs_vmptr
);
4093 vmcs12
->guest_cr0
= vmcs12_guest_cr0(vcpu
, vmcs12
);
4094 vmcs12
->guest_cr4
= vmcs12_guest_cr4(vcpu
, vmcs12
);
4096 vmcs12
->guest_rsp
= kvm_rsp_read(vcpu
);
4097 vmcs12
->guest_rip
= kvm_rip_read(vcpu
);
4098 vmcs12
->guest_rflags
= vmcs_readl(GUEST_RFLAGS
);
4100 vmcs12
->guest_cs_ar_bytes
= vmcs_read32(GUEST_CS_AR_BYTES
);
4101 vmcs12
->guest_ss_ar_bytes
= vmcs_read32(GUEST_SS_AR_BYTES
);
4103 vmcs12
->guest_interruptibility_info
=
4104 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO
);
4106 if (vcpu
->arch
.mp_state
== KVM_MP_STATE_HALTED
)
4107 vmcs12
->guest_activity_state
= GUEST_ACTIVITY_HLT
;
4108 else if (vcpu
->arch
.mp_state
== KVM_MP_STATE_INIT_RECEIVED
)
4109 vmcs12
->guest_activity_state
= GUEST_ACTIVITY_WAIT_SIPI
;
4111 vmcs12
->guest_activity_state
= GUEST_ACTIVITY_ACTIVE
;
4113 if (nested_cpu_has_preemption_timer(vmcs12
) &&
4114 vmcs12
->vm_exit_controls
& VM_EXIT_SAVE_VMX_PREEMPTION_TIMER
&&
4115 !vmx
->nested
.nested_run_pending
)
4116 vmcs12
->vmx_preemption_timer_value
=
4117 vmx_get_preemption_timer_value(vcpu
);
4120 * In some cases (usually, nested EPT), L2 is allowed to change its
4121 * own CR3 without exiting. If it has changed it, we must keep it.
4122 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
4123 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
4125 * Additionally, restore L2's PDPTR to vmcs12.
4128 vmcs12
->guest_cr3
= vmcs_readl(GUEST_CR3
);
4129 if (nested_cpu_has_ept(vmcs12
) && is_pae_paging(vcpu
)) {
4130 vmcs12
->guest_pdptr0
= vmcs_read64(GUEST_PDPTR0
);
4131 vmcs12
->guest_pdptr1
= vmcs_read64(GUEST_PDPTR1
);
4132 vmcs12
->guest_pdptr2
= vmcs_read64(GUEST_PDPTR2
);
4133 vmcs12
->guest_pdptr3
= vmcs_read64(GUEST_PDPTR3
);
4137 vmcs12
->guest_linear_address
= vmcs_readl(GUEST_LINEAR_ADDRESS
);
4139 if (nested_cpu_has_vid(vmcs12
))
4140 vmcs12
->guest_intr_status
= vmcs_read16(GUEST_INTR_STATUS
);
4142 vmcs12
->vm_entry_controls
=
4143 (vmcs12
->vm_entry_controls
& ~VM_ENTRY_IA32E_MODE
) |
4144 (vm_entry_controls_get(to_vmx(vcpu
)) & VM_ENTRY_IA32E_MODE
);
4146 if (vmcs12
->vm_exit_controls
& VM_EXIT_SAVE_DEBUG_CONTROLS
)
4147 kvm_get_dr(vcpu
, 7, (unsigned long *)&vmcs12
->guest_dr7
);
4149 if (vmcs12
->vm_exit_controls
& VM_EXIT_SAVE_IA32_EFER
)
4150 vmcs12
->guest_ia32_efer
= vcpu
->arch
.efer
;
4154 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
4155 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
4156 * and this function updates it to reflect the changes to the guest state while
4157 * L2 was running (and perhaps made some exits which were handled directly by L0
4158 * without going back to L1), and to reflect the exit reason.
4159 * Note that we do not have to copy here all VMCS fields, just those that
4160 * could have changed by the L2 guest or the exit - i.e., the guest-state and
4161 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
4162 * which already writes to vmcs12 directly.
4164 static void prepare_vmcs12(struct kvm_vcpu
*vcpu
, struct vmcs12
*vmcs12
,
4165 u32 vm_exit_reason
, u32 exit_intr_info
,
4166 unsigned long exit_qualification
)
4168 /* update exit information fields: */
4169 vmcs12
->vm_exit_reason
= vm_exit_reason
;
4170 if (to_vmx(vcpu
)->exit_reason
.enclave_mode
)
4171 vmcs12
->vm_exit_reason
|= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE
;
4172 vmcs12
->exit_qualification
= exit_qualification
;
4173 vmcs12
->vm_exit_intr_info
= exit_intr_info
;
4175 vmcs12
->idt_vectoring_info_field
= 0;
4176 vmcs12
->vm_exit_instruction_len
= vmcs_read32(VM_EXIT_INSTRUCTION_LEN
);
4177 vmcs12
->vmx_instruction_info
= vmcs_read32(VMX_INSTRUCTION_INFO
);
4179 if (!(vmcs12
->vm_exit_reason
& VMX_EXIT_REASONS_FAILED_VMENTRY
)) {
4180 vmcs12
->launch_state
= 1;
4182 /* vm_entry_intr_info_field is cleared on exit. Emulate this
4183 * instead of reading the real value. */
4184 vmcs12
->vm_entry_intr_info_field
&= ~INTR_INFO_VALID_MASK
;
4187 * Transfer the event that L0 or L1 may wanted to inject into
4188 * L2 to IDT_VECTORING_INFO_FIELD.
4190 vmcs12_save_pending_event(vcpu
, vmcs12
);
4193 * According to spec, there's no need to store the guest's
4194 * MSRs if the exit is due to a VM-entry failure that occurs
4195 * during or after loading the guest state. Since this exit
4196 * does not fall in that category, we need to save the MSRs.
4198 if (nested_vmx_store_msr(vcpu
,
4199 vmcs12
->vm_exit_msr_store_addr
,
4200 vmcs12
->vm_exit_msr_store_count
))
4201 nested_vmx_abort(vcpu
,
4202 VMX_ABORT_SAVE_GUEST_MSR_FAIL
);
4206 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4207 * preserved above and would only end up incorrectly in L1.
4209 vcpu
->arch
.nmi_injected
= false;
4210 kvm_clear_exception_queue(vcpu
);
4211 kvm_clear_interrupt_queue(vcpu
);
4215 * A part of what we need to when the nested L2 guest exits and we want to
4216 * run its L1 parent, is to reset L1's guest state to the host state specified
4218 * This function is to be called not only on normal nested exit, but also on
4219 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4220 * Failures During or After Loading Guest State").
4221 * This function should be called when the active VMCS is L1's (vmcs01).
4223 static void load_vmcs12_host_state(struct kvm_vcpu
*vcpu
,
4224 struct vmcs12
*vmcs12
)
4226 enum vm_entry_failure_code ignored
;
4227 struct kvm_segment seg
;
4229 if (vmcs12
->vm_exit_controls
& VM_EXIT_LOAD_IA32_EFER
)
4230 vcpu
->arch
.efer
= vmcs12
->host_ia32_efer
;
4231 else if (vmcs12
->vm_exit_controls
& VM_EXIT_HOST_ADDR_SPACE_SIZE
)
4232 vcpu
->arch
.efer
|= (EFER_LMA
| EFER_LME
);
4234 vcpu
->arch
.efer
&= ~(EFER_LMA
| EFER_LME
);
4235 vmx_set_efer(vcpu
, vcpu
->arch
.efer
);
4237 kvm_rsp_write(vcpu
, vmcs12
->host_rsp
);
4238 kvm_rip_write(vcpu
, vmcs12
->host_rip
);
4239 vmx_set_rflags(vcpu
, X86_EFLAGS_FIXED
);
4240 vmx_set_interrupt_shadow(vcpu
, 0);
4243 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4244 * actually changed, because vmx_set_cr0 refers to efer set above.
4246 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4247 * (KVM doesn't change it);
4249 vcpu
->arch
.cr0_guest_owned_bits
= KVM_POSSIBLE_CR0_GUEST_BITS
;
4250 vmx_set_cr0(vcpu
, vmcs12
->host_cr0
);
4252 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
4253 vcpu
->arch
.cr4_guest_owned_bits
= ~vmcs_readl(CR4_GUEST_HOST_MASK
);
4254 vmx_set_cr4(vcpu
, vmcs12
->host_cr4
);
4256 nested_ept_uninit_mmu_context(vcpu
);
4259 * Only PDPTE load can fail as the value of cr3 was checked on entry and
4260 * couldn't have changed.
4262 if (nested_vmx_load_cr3(vcpu
, vmcs12
->host_cr3
, false, true, &ignored
))
4263 nested_vmx_abort(vcpu
, VMX_ABORT_LOAD_HOST_PDPTE_FAIL
);
4265 nested_vmx_transition_tlb_flush(vcpu
, vmcs12
, false);
4267 vmcs_write32(GUEST_SYSENTER_CS
, vmcs12
->host_ia32_sysenter_cs
);
4268 vmcs_writel(GUEST_SYSENTER_ESP
, vmcs12
->host_ia32_sysenter_esp
);
4269 vmcs_writel(GUEST_SYSENTER_EIP
, vmcs12
->host_ia32_sysenter_eip
);
4270 vmcs_writel(GUEST_IDTR_BASE
, vmcs12
->host_idtr_base
);
4271 vmcs_writel(GUEST_GDTR_BASE
, vmcs12
->host_gdtr_base
);
4272 vmcs_write32(GUEST_IDTR_LIMIT
, 0xFFFF);
4273 vmcs_write32(GUEST_GDTR_LIMIT
, 0xFFFF);
4275 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
4276 if (vmcs12
->vm_exit_controls
& VM_EXIT_CLEAR_BNDCFGS
)
4277 vmcs_write64(GUEST_BNDCFGS
, 0);
4279 if (vmcs12
->vm_exit_controls
& VM_EXIT_LOAD_IA32_PAT
) {
4280 vmcs_write64(GUEST_IA32_PAT
, vmcs12
->host_ia32_pat
);
4281 vcpu
->arch
.pat
= vmcs12
->host_ia32_pat
;
4283 if (vmcs12
->vm_exit_controls
& VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL
)
4284 WARN_ON_ONCE(kvm_set_msr(vcpu
, MSR_CORE_PERF_GLOBAL_CTRL
,
4285 vmcs12
->host_ia32_perf_global_ctrl
));
4287 /* Set L1 segment info according to Intel SDM
4288 27.5.2 Loading Host Segment and Descriptor-Table Registers */
4289 seg
= (struct kvm_segment
) {
4291 .limit
= 0xFFFFFFFF,
4292 .selector
= vmcs12
->host_cs_selector
,
4298 if (vmcs12
->vm_exit_controls
& VM_EXIT_HOST_ADDR_SPACE_SIZE
)
4302 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_CS
);
4303 seg
= (struct kvm_segment
) {
4305 .limit
= 0xFFFFFFFF,
4312 seg
.selector
= vmcs12
->host_ds_selector
;
4313 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_DS
);
4314 seg
.selector
= vmcs12
->host_es_selector
;
4315 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_ES
);
4316 seg
.selector
= vmcs12
->host_ss_selector
;
4317 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_SS
);
4318 seg
.selector
= vmcs12
->host_fs_selector
;
4319 seg
.base
= vmcs12
->host_fs_base
;
4320 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_FS
);
4321 seg
.selector
= vmcs12
->host_gs_selector
;
4322 seg
.base
= vmcs12
->host_gs_base
;
4323 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_GS
);
4324 seg
= (struct kvm_segment
) {
4325 .base
= vmcs12
->host_tr_base
,
4327 .selector
= vmcs12
->host_tr_selector
,
4331 vmx_set_segment(vcpu
, &seg
, VCPU_SREG_TR
);
4333 kvm_set_dr(vcpu
, 7, 0x400);
4334 vmcs_write64(GUEST_IA32_DEBUGCTL
, 0);
4336 if (cpu_has_vmx_msr_bitmap())
4337 vmx_update_msr_bitmap(vcpu
);
4339 if (nested_vmx_load_msr(vcpu
, vmcs12
->vm_exit_msr_load_addr
,
4340 vmcs12
->vm_exit_msr_load_count
))
4341 nested_vmx_abort(vcpu
, VMX_ABORT_LOAD_HOST_MSR_FAIL
);
4344 static inline u64
nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx
*vmx
)
4346 struct vmx_uret_msr
*efer_msr
;
4349 if (vm_entry_controls_get(vmx
) & VM_ENTRY_LOAD_IA32_EFER
)
4350 return vmcs_read64(GUEST_IA32_EFER
);
4352 if (cpu_has_load_ia32_efer())
4355 for (i
= 0; i
< vmx
->msr_autoload
.guest
.nr
; ++i
) {
4356 if (vmx
->msr_autoload
.guest
.val
[i
].index
== MSR_EFER
)
4357 return vmx
->msr_autoload
.guest
.val
[i
].value
;
4360 efer_msr
= vmx_find_uret_msr(vmx
, MSR_EFER
);
4362 return efer_msr
->data
;
4367 static void nested_vmx_restore_host_state(struct kvm_vcpu
*vcpu
)
4369 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
4370 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4371 struct vmx_msr_entry g
, h
;
4375 vcpu
->arch
.pat
= vmcs_read64(GUEST_IA32_PAT
);
4377 if (vmcs12
->vm_entry_controls
& VM_ENTRY_LOAD_DEBUG_CONTROLS
) {
4379 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4380 * as vmcs01.GUEST_DR7 contains a userspace defined value
4381 * and vcpu->arch.dr7 is not squirreled away before the
4382 * nested VMENTER (not worth adding a variable in nested_vmx).
4384 if (vcpu
->guest_debug
& KVM_GUESTDBG_USE_HW_BP
)
4385 kvm_set_dr(vcpu
, 7, DR7_FIXED_1
);
4387 WARN_ON(kvm_set_dr(vcpu
, 7, vmcs_readl(GUEST_DR7
)));
4391 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4392 * handle a variety of side effects to KVM's software model.
4394 vmx_set_efer(vcpu
, nested_vmx_get_vmcs01_guest_efer(vmx
));
4396 vcpu
->arch
.cr0_guest_owned_bits
= KVM_POSSIBLE_CR0_GUEST_BITS
;
4397 vmx_set_cr0(vcpu
, vmcs_readl(CR0_READ_SHADOW
));
4399 vcpu
->arch
.cr4_guest_owned_bits
= ~vmcs_readl(CR4_GUEST_HOST_MASK
);
4400 vmx_set_cr4(vcpu
, vmcs_readl(CR4_READ_SHADOW
));
4402 nested_ept_uninit_mmu_context(vcpu
);
4403 vcpu
->arch
.cr3
= vmcs_readl(GUEST_CR3
);
4404 kvm_register_mark_available(vcpu
, VCPU_EXREG_CR3
);
4407 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4408 * from vmcs01 (if necessary). The PDPTRs are not loaded on
4409 * VMFail, like everything else we just need to ensure our
4410 * software model is up-to-date.
4412 if (enable_ept
&& is_pae_paging(vcpu
))
4413 ept_save_pdptrs(vcpu
);
4415 kvm_mmu_reset_context(vcpu
);
4417 if (cpu_has_vmx_msr_bitmap())
4418 vmx_update_msr_bitmap(vcpu
);
4421 * This nasty bit of open coding is a compromise between blindly
4422 * loading L1's MSRs using the exit load lists (incorrect emulation
4423 * of VMFail), leaving the nested VM's MSRs in the software model
4424 * (incorrect behavior) and snapshotting the modified MSRs (too
4425 * expensive since the lists are unbound by hardware). For each
4426 * MSR that was (prematurely) loaded from the nested VMEntry load
4427 * list, reload it from the exit load list if it exists and differs
4428 * from the guest value. The intent is to stuff host state as
4429 * silently as possible, not to fully process the exit load list.
4431 for (i
= 0; i
< vmcs12
->vm_entry_msr_load_count
; i
++) {
4432 gpa
= vmcs12
->vm_entry_msr_load_addr
+ (i
* sizeof(g
));
4433 if (kvm_vcpu_read_guest(vcpu
, gpa
, &g
, sizeof(g
))) {
4434 pr_debug_ratelimited(
4435 "%s read MSR index failed (%u, 0x%08llx)\n",
4440 for (j
= 0; j
< vmcs12
->vm_exit_msr_load_count
; j
++) {
4441 gpa
= vmcs12
->vm_exit_msr_load_addr
+ (j
* sizeof(h
));
4442 if (kvm_vcpu_read_guest(vcpu
, gpa
, &h
, sizeof(h
))) {
4443 pr_debug_ratelimited(
4444 "%s read MSR failed (%u, 0x%08llx)\n",
4448 if (h
.index
!= g
.index
)
4450 if (h
.value
== g
.value
)
4453 if (nested_vmx_load_msr_check(vcpu
, &h
)) {
4454 pr_debug_ratelimited(
4455 "%s check failed (%u, 0x%x, 0x%x)\n",
4456 __func__
, j
, h
.index
, h
.reserved
);
4460 if (kvm_set_msr(vcpu
, h
.index
, h
.value
)) {
4461 pr_debug_ratelimited(
4462 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4463 __func__
, j
, h
.index
, h
.value
);
4472 nested_vmx_abort(vcpu
, VMX_ABORT_LOAD_HOST_MSR_FAIL
);
4476 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4477 * and modify vmcs12 to make it see what it would expect to see there if
4478 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4480 void nested_vmx_vmexit(struct kvm_vcpu
*vcpu
, u32 vm_exit_reason
,
4481 u32 exit_intr_info
, unsigned long exit_qualification
)
4483 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4484 struct vmcs12
*vmcs12
= get_vmcs12(vcpu
);
4486 /* trying to cancel vmlaunch/vmresume is a bug */
4487 WARN_ON_ONCE(vmx
->nested
.nested_run_pending
);
4489 /* Similarly, triple faults in L2 should never escape. */
4490 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT
, vcpu
));
4492 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES
, vcpu
)) {
4494 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
4495 * Enlightened VMCS after migration and we still need to
4496 * do that when something is forcing L2->L1 exit prior to
4499 (void)nested_get_evmcs_page(vcpu
);
4502 /* Service the TLB flush request for L2 before switching to L1. */
4503 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT
, vcpu
))
4504 kvm_vcpu_flush_tlb_current(vcpu
);
4507 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
4508 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
4509 * up-to-date before switching to L1.
4511 if (enable_ept
&& is_pae_paging(vcpu
))
4512 vmx_ept_load_pdptrs(vcpu
);
4514 leave_guest_mode(vcpu
);
4516 if (nested_cpu_has_preemption_timer(vmcs12
))
4517 hrtimer_cancel(&to_vmx(vcpu
)->nested
.preemption_timer
);
4519 if (nested_cpu_has(vmcs12
, CPU_BASED_USE_TSC_OFFSETTING
)) {
4520 vcpu
->arch
.tsc_offset
= vcpu
->arch
.l1_tsc_offset
;
4521 if (nested_cpu_has2(vmcs12
, SECONDARY_EXEC_TSC_SCALING
))
4522 vcpu
->arch
.tsc_scaling_ratio
= vcpu
->arch
.l1_tsc_scaling_ratio
;
4525 if (likely(!vmx
->fail
)) {
4526 sync_vmcs02_to_vmcs12(vcpu
, vmcs12
);
4528 if (vm_exit_reason
!= -1)
4529 prepare_vmcs12(vcpu
, vmcs12
, vm_exit_reason
,
4530 exit_intr_info
, exit_qualification
);
4533 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4534 * also be used to capture vmcs12 cache as part of
4535 * capturing nVMX state for snapshot (migration).
4537 * Otherwise, this flush will dirty guest memory at a
4538 * point it is already assumed by user-space to be
4541 nested_flush_cached_shadow_vmcs12(vcpu
, vmcs12
);
4544 * The only expected VM-instruction error is "VM entry with
4545 * invalid control field(s)." Anything else indicates a
4546 * problem with L0. And we should never get here with a
4547 * VMFail of any type if early consistency checks are enabled.
4549 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR
) !=
4550 VMXERR_ENTRY_INVALID_CONTROL_FIELD
);
4551 WARN_ON_ONCE(nested_early_check
);
4554 vmx_switch_vmcs(vcpu
, &vmx
->vmcs01
);
4556 /* Update any VMCS fields that might have changed while L2 ran */
4557 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT
, vmx
->msr_autoload
.host
.nr
);
4558 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT
, vmx
->msr_autoload
.guest
.nr
);
4559 vmcs_write64(TSC_OFFSET
, vcpu
->arch
.tsc_offset
);
4560 if (kvm_has_tsc_control
)
4561 vmcs_write64(TSC_MULTIPLIER
, vcpu
->arch
.tsc_scaling_ratio
);
4563 if (vmx
->nested
.l1_tpr_threshold
!= -1)
4564 vmcs_write32(TPR_THRESHOLD
, vmx
->nested
.l1_tpr_threshold
);
4566 if (vmx
->nested
.change_vmcs01_virtual_apic_mode
) {
4567 vmx
->nested
.change_vmcs01_virtual_apic_mode
= false;
4568 vmx_set_virtual_apic_mode(vcpu
);
4571 if (vmx
->nested
.update_vmcs01_cpu_dirty_logging
) {
4572 vmx
->nested
.update_vmcs01_cpu_dirty_logging
= false;
4573 vmx_update_cpu_dirty_logging(vcpu
);
4576 /* Unpin physical memory we referred to in vmcs02 */
4577 if (vmx
->nested
.apic_access_page
) {
4578 kvm_release_page_clean(vmx
->nested
.apic_access_page
);
4579 vmx
->nested
.apic_access_page
= NULL
;
4581 kvm_vcpu_unmap(vcpu
, &vmx
->nested
.virtual_apic_map
, true);
4582 kvm_vcpu_unmap(vcpu
, &vmx
->nested
.pi_desc_map
, true);
4583 vmx
->nested
.pi_desc
= NULL
;
4585 if (vmx
->nested
.reload_vmcs01_apic_access_page
) {
4586 vmx
->nested
.reload_vmcs01_apic_access_page
= false;
4587 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD
, vcpu
);
4590 if ((vm_exit_reason
!= -1) &&
4591 (enable_shadow_vmcs
|| evmptr_is_valid(vmx
->nested
.hv_evmcs_vmptr
)))
4592 vmx
->nested
.need_vmcs12_to_shadow_sync
= true;
4594 /* in case we halted in L2 */
4595 vcpu
->arch
.mp_state
= KVM_MP_STATE_RUNNABLE
;
4597 if (likely(!vmx
->fail
)) {
4598 if ((u16
)vm_exit_reason
== EXIT_REASON_EXTERNAL_INTERRUPT
&&
4599 nested_exit_intr_ack_set(vcpu
)) {
4600 int irq
= kvm_cpu_get_interrupt(vcpu
);
4602 vmcs12
->vm_exit_intr_info
= irq
|
4603 INTR_INFO_VALID_MASK
| INTR_TYPE_EXT_INTR
;
4606 if (vm_exit_reason
!= -1)
4607 trace_kvm_nested_vmexit_inject(vmcs12
->vm_exit_reason
,
4608 vmcs12
->exit_qualification
,
4609 vmcs12
->idt_vectoring_info_field
,
4610 vmcs12
->vm_exit_intr_info
,
4611 vmcs12
->vm_exit_intr_error_code
,
4614 load_vmcs12_host_state(vcpu
, vmcs12
);
4620 * After an early L2 VM-entry failure, we're now back
4621 * in L1 which thinks it just finished a VMLAUNCH or
4622 * VMRESUME instruction, so we need to set the failure
4623 * flag and the VM-instruction error field of the VMCS
4624 * accordingly, and skip the emulated instruction.
4626 (void)nested_vmx_fail(vcpu
, VMXERR_ENTRY_INVALID_CONTROL_FIELD
);
4629 * Restore L1's host state to KVM's software model. We're here
4630 * because a consistency check was caught by hardware, which
4631 * means some amount of guest state has been propagated to KVM's
4632 * model and needs to be unwound to the host's state.
4634 nested_vmx_restore_host_state(vcpu
);
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{
	nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
}
4645 * Decode the memory-address operand of a vmx instruction, as recorded on an
4646 * exit caused by such an instruction (run by a guest hypervisor).
4647 * On success, returns 0. When the operand is invalid, returns 1 and throws
4650 int get_vmx_mem_address(struct kvm_vcpu
*vcpu
, unsigned long exit_qualification
,
4651 u32 vmx_instruction_info
, bool wr
, int len
, gva_t
*ret
)
4655 struct kvm_segment s
;
4658 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4659 * Execution", on an exit, vmx_instruction_info holds most of the
4660 * addressing components of the operand. Only the displacement part
4661 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4662 * For how an actual address is calculated from all these components,
4663 * refer to Vol. 1, "Operand Addressing".
4665 int scaling
= vmx_instruction_info
& 3;
4666 int addr_size
= (vmx_instruction_info
>> 7) & 7;
4667 bool is_reg
= vmx_instruction_info
& (1u << 10);
4668 int seg_reg
= (vmx_instruction_info
>> 15) & 7;
4669 int index_reg
= (vmx_instruction_info
>> 18) & 0xf;
4670 bool index_is_valid
= !(vmx_instruction_info
& (1u << 22));
4671 int base_reg
= (vmx_instruction_info
>> 23) & 0xf;
4672 bool base_is_valid
= !(vmx_instruction_info
& (1u << 27));
4675 kvm_queue_exception(vcpu
, UD_VECTOR
);
4679 /* Addr = segment_base + offset */
4680 /* offset = base + [index * scale] + displacement */
4681 off
= exit_qualification
; /* holds the displacement */
4683 off
= (gva_t
)sign_extend64(off
, 31);
4684 else if (addr_size
== 0)
4685 off
= (gva_t
)sign_extend64(off
, 15);
4687 off
+= kvm_register_read(vcpu
, base_reg
);
4689 off
+= kvm_register_read(vcpu
, index_reg
) << scaling
;
4690 vmx_get_segment(vcpu
, &s
, seg_reg
);
4693 * The effective address, i.e. @off, of a memory operand is truncated
4694 * based on the address size of the instruction. Note that this is
4695 * the *effective address*, i.e. the address prior to accounting for
4696 * the segment's base.
4698 if (addr_size
== 1) /* 32 bit */
4700 else if (addr_size
== 0) /* 16 bit */
4703 /* Checks for #GP/#SS exceptions. */
4705 if (is_long_mode(vcpu
)) {
4707 * The virtual/linear address is never truncated in 64-bit
4708 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4709 * address when using FS/GS with a non-zero base.
4711 if (seg_reg
== VCPU_SREG_FS
|| seg_reg
== VCPU_SREG_GS
)
4712 *ret
= s
.base
+ off
;
4716 /* Long mode: #GP(0)/#SS(0) if the memory address is in a
4717 * non-canonical form. This is the only check on the memory
4718 * destination for long mode!
4720 exn
= is_noncanonical_address(*ret
, vcpu
);
4723 * When not in long mode, the virtual/linear address is
4724 * unconditionally truncated to 32 bits regardless of the
4727 *ret
= (s
.base
+ off
) & 0xffffffff;
4729 /* Protected mode: apply checks for segment validity in the
4731 * - segment type check (#GP(0) may be thrown)
4732 * - usability check (#GP(0)/#SS(0))
4733 * - limit check (#GP(0)/#SS(0))
4736 /* #GP(0) if the destination operand is located in a
4737 * read-only data segment or any code segment.
4739 exn
= ((s
.type
& 0xa) == 0 || (s
.type
& 8));
4741 /* #GP(0) if the source operand is located in an
4742 * execute-only code segment
4744 exn
= ((s
.type
& 0xa) == 8);
4746 kvm_queue_exception_e(vcpu
, GP_VECTOR
, 0);
4749 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4751 exn
= (s
.unusable
!= 0);
4754 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4755 * outside the segment limit. All CPUs that support VMX ignore
4756 * limit checks for flat segments, i.e. segments with base==0,
4757 * limit==0xffffffff and of type expand-up data or code.
4759 if (!(s
.base
== 0 && s
.limit
== 0xffffffff &&
4760 ((s
.type
& 8) || !(s
.type
& 4))))
4761 exn
= exn
|| ((u64
)off
+ len
- 1 > s
.limit
);
4764 kvm_queue_exception_e(vcpu
,
4765 seg_reg
== VCPU_SREG_SS
?
4766 SS_VECTOR
: GP_VECTOR
,
4774 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu
*vcpu
)
4776 struct vcpu_vmx
*vmx
;
4778 if (!nested_vmx_allowed(vcpu
))
4782 if (kvm_x86_ops
.pmu_ops
->is_valid_msr(vcpu
, MSR_CORE_PERF_GLOBAL_CTRL
)) {
4783 vmx
->nested
.msrs
.entry_ctls_high
|=
4784 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL
;
4785 vmx
->nested
.msrs
.exit_ctls_high
|=
4786 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL
;
4788 vmx
->nested
.msrs
.entry_ctls_high
&=
4789 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL
;
4790 vmx
->nested
.msrs
.exit_ctls_high
&=
4791 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL
;
4795 static int nested_vmx_get_vmptr(struct kvm_vcpu
*vcpu
, gpa_t
*vmpointer
,
4799 struct x86_exception e
;
4802 if (get_vmx_mem_address(vcpu
, vmx_get_exit_qual(vcpu
),
4803 vmcs_read32(VMX_INSTRUCTION_INFO
), false,
4804 sizeof(*vmpointer
), &gva
)) {
4809 r
= kvm_read_guest_virt(vcpu
, gva
, vmpointer
, sizeof(*vmpointer
), &e
);
4810 if (r
!= X86EMUL_CONTINUE
) {
4811 *ret
= kvm_handle_memory_failure(vcpu
, r
, &e
);
/*
 * Allocate a shadow VMCS and associate it with the currently loaded
 * VMCS, unless such a shadow VMCS already exists. The newly allocated
 * VMCS is also VMCLEARed, so that it is ready for use.
 */
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;

	/*
	 * We should allocate a shadow vmcs for vmcs01 only when L1
	 * executes VMXON and free it when L1 executes VMXOFF.
	 * As it is invalid to execute VMXON twice, we shouldn't reach
	 * here when vmcs01 already have an allocated shadow vmcs.
	 */
	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);

	if (!loaded_vmcs->shadow_vmcs) {
		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
		if (loaded_vmcs->shadow_vmcs)
			vmcs_clear(loaded_vmcs->shadow_vmcs);
	}
	return loaded_vmcs->shadow_vmcs;
}
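
/*
 * The shadow VMCS allocated here is what makes enable_shadow_vmcs pay off:
 * hardware can satisfy L1's VMREAD/VMWRITE of the shadowed fields directly
 * from this VMCS instead of taking a VM-exit to L0 for every access.
 */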
4844 static int enter_vmx_operation(struct kvm_vcpu
*vcpu
)
4846 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4849 r
= alloc_loaded_vmcs(&vmx
->nested
.vmcs02
);
4853 vmx
->nested
.cached_vmcs12
= kzalloc(VMCS12_SIZE
, GFP_KERNEL_ACCOUNT
);
4854 if (!vmx
->nested
.cached_vmcs12
)
4855 goto out_cached_vmcs12
;
4857 vmx
->nested
.cached_shadow_vmcs12
= kzalloc(VMCS12_SIZE
, GFP_KERNEL_ACCOUNT
);
4858 if (!vmx
->nested
.cached_shadow_vmcs12
)
4859 goto out_cached_shadow_vmcs12
;
4861 if (enable_shadow_vmcs
&& !alloc_shadow_vmcs(vcpu
))
4862 goto out_shadow_vmcs
;
4864 hrtimer_init(&vmx
->nested
.preemption_timer
, CLOCK_MONOTONIC
,
4865 HRTIMER_MODE_ABS_PINNED
);
4866 vmx
->nested
.preemption_timer
.function
= vmx_preemption_timer_fn
;
4868 vmx
->nested
.vpid02
= allocate_vpid();
4870 vmx
->nested
.vmcs02_initialized
= false;
4871 vmx
->nested
.vmxon
= true;
4873 if (vmx_pt_mode_is_host_guest()) {
4874 vmx
->pt_desc
.guest
.ctl
= 0;
4875 pt_update_intercept_for_msr(vcpu
);
4881 kfree(vmx
->nested
.cached_shadow_vmcs12
);
4883 out_cached_shadow_vmcs12
:
4884 kfree(vmx
->nested
.cached_vmcs12
);
4887 free_loaded_vmcs(&vmx
->nested
.vmcs02
);
4894 * Emulate the VMXON instruction.
4895 * Currently, we just remember that VMX is active, and do not save or even
4896 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4897 * do not currently need to store anything in that guest-allocated memory
4898 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4899 * argument is different from the VMXON pointer (which the spec says they do).
4901 static int handle_vmon(struct kvm_vcpu
*vcpu
)
4906 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4907 const u64 VMXON_NEEDED_FEATURES
= FEAT_CTL_LOCKED
4908 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX
;
4911 * The Intel VMX Instruction Reference lists a bunch of bits that are
4912 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4913 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
4914 * Otherwise, we should fail with #UD. But most faulting conditions
4915 * have already been checked by hardware, prior to the VM-exit for
4916 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4917 * that bit set to 1 in non-root mode.
4919 if (!kvm_read_cr4_bits(vcpu
, X86_CR4_VMXE
)) {
4920 kvm_queue_exception(vcpu
, UD_VECTOR
);
4924 /* CPL=0 must be checked manually. */
4925 if (vmx_get_cpl(vcpu
)) {
4926 kvm_inject_gp(vcpu
, 0);
4930 if (vmx
->nested
.vmxon
)
4931 return nested_vmx_fail(vcpu
, VMXERR_VMXON_IN_VMX_ROOT_OPERATION
);
4933 if ((vmx
->msr_ia32_feature_control
& VMXON_NEEDED_FEATURES
)
4934 != VMXON_NEEDED_FEATURES
) {
4935 kvm_inject_gp(vcpu
, 0);
4939 if (nested_vmx_get_vmptr(vcpu
, &vmptr
, &ret
))
4944 * The first 4 bytes of VMXON region contain the supported
4945 * VMCS revision identifier
4947 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4948 * which replaces physical address width with 32
4950 if (!page_address_valid(vcpu
, vmptr
))
4951 return nested_vmx_failInvalid(vcpu
);
4953 if (kvm_read_guest(vcpu
->kvm
, vmptr
, &revision
, sizeof(revision
)) ||
4954 revision
!= VMCS12_REVISION
)
4955 return nested_vmx_failInvalid(vcpu
);
4957 vmx
->nested
.vmxon_ptr
= vmptr
;
4958 ret
= enter_vmx_operation(vcpu
);
4962 return nested_vmx_succeed(vcpu
);
4965 static inline void nested_release_vmcs12(struct kvm_vcpu
*vcpu
)
4967 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
4969 if (vmx
->nested
.current_vmptr
== -1ull)
4972 copy_vmcs02_to_vmcs12_rare(vcpu
, get_vmcs12(vcpu
));
4974 if (enable_shadow_vmcs
) {
4975 /* copy to memory all shadowed fields in case
4976 they were modified */
4977 copy_shadow_to_vmcs12(vmx
);
4978 vmx_disable_shadow_vmcs(vmx
);
4980 vmx
->nested
.posted_intr_nv
= -1;
4982 /* Flush VMCS12 to guest memory */
4983 kvm_vcpu_write_guest_page(vcpu
,
4984 vmx
->nested
.current_vmptr
>> PAGE_SHIFT
,
4985 vmx
->nested
.cached_vmcs12
, 0, VMCS12_SIZE
);
4987 kvm_mmu_free_roots(vcpu
, &vcpu
->arch
.guest_mmu
, KVM_MMU_ROOTS_ALL
);
4989 vmx
->nested
.current_vmptr
= -1ull;
/* Emulate the VMXOFF instruction */
static int handle_vmoff(struct kvm_vcpu *vcpu)
{
	if (!nested_vmx_check_permission(vcpu))
		return 1;

	free_nested(vcpu);

	/* Process a latched INIT during time CPU was in VMX operation */
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	return nested_vmx_succeed(vcpu);
}
5006 /* Emulate the VMCLEAR instruction */
5007 static int handle_vmclear(struct kvm_vcpu
*vcpu
)
5009 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5015 if (!nested_vmx_check_permission(vcpu
))
5018 if (nested_vmx_get_vmptr(vcpu
, &vmptr
, &r
))
5021 if (!page_address_valid(vcpu
, vmptr
))
5022 return nested_vmx_fail(vcpu
, VMXERR_VMCLEAR_INVALID_ADDRESS
);
5024 if (vmptr
== vmx
->nested
.vmxon_ptr
)
5025 return nested_vmx_fail(vcpu
, VMXERR_VMCLEAR_VMXON_POINTER
);
5028 * When Enlightened VMEntry is enabled on the calling CPU we treat
5029 * memory area pointer by vmptr as Enlightened VMCS (as there's no good
5030 * way to distinguish it from VMCS12) and we must not corrupt it by
5031 * writing to the non-existent 'launch_state' field. The area doesn't
5032 * have to be the currently active EVMCS on the calling CPU and there's
5033 * nothing KVM has to do to transition it from 'active' to 'non-active'
5034 * state. It is possible that the area will stay mapped as
5035 * vmx->nested.hv_evmcs but this shouldn't be a problem.
5037 if (likely(!vmx
->nested
.enlightened_vmcs_enabled
||
5038 !nested_enlightened_vmentry(vcpu
, &evmcs_gpa
))) {
5039 if (vmptr
== vmx
->nested
.current_vmptr
)
5040 nested_release_vmcs12(vcpu
);
5042 kvm_vcpu_write_guest(vcpu
,
5043 vmptr
+ offsetof(struct vmcs12
,
5045 &zero
, sizeof(zero
));
5046 } else if (vmx
->nested
.hv_evmcs
&& vmptr
== vmx
->nested
.hv_evmcs_vmptr
) {
5047 nested_release_evmcs(vcpu
);
5050 return nested_vmx_succeed(vcpu
);
/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
	return nested_vmx_run(vcpu, true);
}

/* Emulate the VMRESUME instruction */
static int handle_vmresume(struct kvm_vcpu *vcpu)
{
	return nested_vmx_run(vcpu, false);
}
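
/*
 * VMLAUNCH and VMRESUME share all of the emulation above; the only difference
 * is the 'launch' flag, which nested_vmx_run() checks against
 * vmcs12->launch_state (VMLAUNCH requires a clear VMCS, VMRESUME a previously
 * launched one).
 */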
5066 static int handle_vmread(struct kvm_vcpu
*vcpu
)
5068 struct vmcs12
*vmcs12
= is_guest_mode(vcpu
) ? get_shadow_vmcs12(vcpu
)
5070 unsigned long exit_qualification
= vmx_get_exit_qual(vcpu
);
5071 u32 instr_info
= vmcs_read32(VMX_INSTRUCTION_INFO
);
5072 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5073 struct x86_exception e
;
5074 unsigned long field
;
5080 if (!nested_vmx_check_permission(vcpu
))
5084 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5085 * any VMREAD sets the ALU flags for VMfailInvalid.
5087 if (vmx
->nested
.current_vmptr
== -1ull ||
5088 (is_guest_mode(vcpu
) &&
5089 get_vmcs12(vcpu
)->vmcs_link_pointer
== -1ull))
5090 return nested_vmx_failInvalid(vcpu
);
5092 /* Decode instruction info and find the field to read */
5093 field
= kvm_register_read(vcpu
, (((instr_info
) >> 28) & 0xf));
5095 offset
= vmcs_field_to_offset(field
);
5097 return nested_vmx_fail(vcpu
, VMXERR_UNSUPPORTED_VMCS_COMPONENT
);
5099 if (!is_guest_mode(vcpu
) && is_vmcs12_ext_field(field
))
5100 copy_vmcs02_to_vmcs12_rare(vcpu
, vmcs12
);
5102 /* Read the field, zero-extended to a u64 value */
5103 value
= vmcs12_read_any(vmcs12
, field
, offset
);
5106 * Now copy part of this value to register or memory, as requested.
5107 * Note that the number of bits actually copied is 32 or 64 depending
5108 * on the guest's mode (32 or 64 bit), not on the given field's length.
5110 if (instr_info
& BIT(10)) {
5111 kvm_register_write(vcpu
, (((instr_info
) >> 3) & 0xf), value
);
5113 len
= is_64_bit_mode(vcpu
) ? 8 : 4;
5114 if (get_vmx_mem_address(vcpu
, exit_qualification
,
5115 instr_info
, true, len
, &gva
))
5117 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
5118 r
= kvm_write_guest_virt_system(vcpu
, gva
, &value
, len
, &e
);
5119 if (r
!= X86EMUL_CONTINUE
)
5120 return kvm_handle_memory_failure(vcpu
, r
, &e
);
5123 return nested_vmx_succeed(vcpu
);
static bool is_shadow_field_rw(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RW(x, y) case x:
#include "vmcs_shadow_fields.h"
		return true;
	default:
		break;
	}
	return false;
}

static bool is_shadow_field_ro(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RO(x, y) case x:
#include "vmcs_shadow_fields.h"
		return true;
	default:
		break;
	}
	return false;
}
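
/*
 * The two helpers above rely on the usual "X-macro" trick: vmcs_shadow_fields.h
 * is included with SHADOW_FIELD_RW/SHADOW_FIELD_RO temporarily defined to emit
 * one "case x:" label per shadowed field, turning the shadow-field lists into
 * switch statements.
 */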
5150 static int handle_vmwrite(struct kvm_vcpu
*vcpu
)
5152 struct vmcs12
*vmcs12
= is_guest_mode(vcpu
) ? get_shadow_vmcs12(vcpu
)
5154 unsigned long exit_qualification
= vmx_get_exit_qual(vcpu
);
5155 u32 instr_info
= vmcs_read32(VMX_INSTRUCTION_INFO
);
5156 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5157 struct x86_exception e
;
5158 unsigned long field
;
5164 * The value to write might be 32 or 64 bits, depending on L1's long
5165 * mode, and eventually we need to write that into a field of several
5166 * possible lengths. The code below first zero-extends the value to 64
5167 * bit (value), and then copies only the appropriate number of
5168 * bits into the vmcs12 field.
5172 if (!nested_vmx_check_permission(vcpu
))
5176 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
5177 * any VMWRITE sets the ALU flags for VMfailInvalid.
5179 if (vmx
->nested
.current_vmptr
== -1ull ||
5180 (is_guest_mode(vcpu
) &&
5181 get_vmcs12(vcpu
)->vmcs_link_pointer
== -1ull))
5182 return nested_vmx_failInvalid(vcpu
);
5184 if (instr_info
& BIT(10))
5185 value
= kvm_register_read(vcpu
, (((instr_info
) >> 3) & 0xf));
5187 len
= is_64_bit_mode(vcpu
) ? 8 : 4;
5188 if (get_vmx_mem_address(vcpu
, exit_qualification
,
5189 instr_info
, false, len
, &gva
))
5191 r
= kvm_read_guest_virt(vcpu
, gva
, &value
, len
, &e
);
5192 if (r
!= X86EMUL_CONTINUE
)
5193 return kvm_handle_memory_failure(vcpu
, r
, &e
);
5196 field
= kvm_register_read(vcpu
, (((instr_info
) >> 28) & 0xf));
5198 offset
= vmcs_field_to_offset(field
);
5200 return nested_vmx_fail(vcpu
, VMXERR_UNSUPPORTED_VMCS_COMPONENT
);
5203 * If the vCPU supports "VMWRITE to any supported field in the
5204 * VMCS," then the "read-only" fields are actually read/write.
5206 if (vmcs_field_readonly(field
) &&
5207 !nested_cpu_has_vmwrite_any_field(vcpu
))
5208 return nested_vmx_fail(vcpu
, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT
);
5211 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
5212 * vmcs12, else we may crush a field or consume a stale value.
5214 if (!is_guest_mode(vcpu
) && !is_shadow_field_rw(field
))
5215 copy_vmcs02_to_vmcs12_rare(vcpu
, vmcs12
);
5218 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
5219 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM
5220 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
5221 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
5222 * from L1 will return a different value than VMREAD from L2 (L1 sees
5223 * the stripped down value, L2 sees the full value as stored by KVM).
5225 if (field
>= GUEST_ES_AR_BYTES
&& field
<= GUEST_TR_AR_BYTES
)
5228 vmcs12_write_any(vmcs12
, field
, offset
, value
);
5231 * Do not track vmcs12 dirty-state if in guest-mode as we actually
5232 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated
5233 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
5234 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
5236 if (!is_guest_mode(vcpu
) && !is_shadow_field_rw(field
)) {
5238 * L1 can read these fields without exiting, ensure the
5239 * shadow VMCS is up-to-date.
5241 if (enable_shadow_vmcs
&& is_shadow_field_ro(field
)) {
5243 vmcs_load(vmx
->vmcs01
.shadow_vmcs
);
5245 __vmcs_writel(field
, value
);
5247 vmcs_clear(vmx
->vmcs01
.shadow_vmcs
);
5248 vmcs_load(vmx
->loaded_vmcs
->vmcs
);
5251 vmx
->nested
.dirty_vmcs12
= true;
5254 return nested_vmx_succeed(vcpu
);
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	vmx->nested.current_vmptr = vmptr;
	if (enable_shadow_vmcs) {
		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
		vmcs_write64(VMCS_LINK_POINTER,
			     __pa(vmx->vmcs01.shadow_vmcs));
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}
	vmx->nested.dirty_vmcs12 = true;
}
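
/*
 * Pointing VMCS_LINK_POINTER at vmcs01's shadow VMCS (together with setting
 * SECONDARY_EXEC_SHADOW_VMCS) is what lets L1's subsequent VMREAD/VMWRITE hit
 * that shadow VMCS in hardware; need_vmcs12_to_shadow_sync ensures the shadow
 * is repopulated from the newly loaded vmcs12 before re-entering L1.
 */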
5269 /* Emulate the VMPTRLD instruction */
5270 static int handle_vmptrld(struct kvm_vcpu
*vcpu
)
5272 struct vcpu_vmx
*vmx
= to_vmx(vcpu
);
5276 if (!nested_vmx_check_permission(vcpu
))
5279 if (nested_vmx_get_vmptr(vcpu
, &vmptr
, &r
))
5282 if (!page_address_valid(vcpu
, vmptr
))
5283 return nested_vmx_fail(vcpu
, VMXERR_VMPTRLD_INVALID_ADDRESS
);
5285 if (vmptr
== vmx
->nested
.vmxon_ptr
)
5286 return nested_vmx_fail(vcpu
, VMXERR_VMPTRLD_VMXON_POINTER
);
5288 /* Forbid normal VMPTRLD if Enlightened version was used */
5289 if (evmptr_is_valid(vmx
->nested
.hv_evmcs_vmptr
))
5292 if (vmx
->nested
.current_vmptr
!= vmptr
) {
5293 struct kvm_host_map map
;
5294 struct vmcs12
*new_vmcs12
;
5296 if (kvm_vcpu_map(vcpu
, gpa_to_gfn(vmptr
), &map
)) {
5298 * Reads from an unbacked page return all 1s,
5299 * which means that the 32 bits located at the
5300 * given physical address won't match the required
5301 * VMCS12_REVISION identifier.
5303 return nested_vmx_fail(vcpu
,
5304 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID
);
5307 new_vmcs12
= map
.hva
;
5309 if (new_vmcs12
->hdr
.revision_id
!= VMCS12_REVISION
||
5310 (new_vmcs12
->hdr
.shadow_vmcs
&&
5311 !nested_cpu_has_vmx_shadow_vmcs(vcpu
))) {
5312 kvm_vcpu_unmap(vcpu
, &map
, false);
5313 return nested_vmx_fail(vcpu
,
5314 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID
);
5317 nested_release_vmcs12(vcpu
);
5320 * Load VMCS12 from guest memory since it is not already
5323 memcpy(vmx
->nested
.cached_vmcs12
, new_vmcs12
, VMCS12_SIZE
);
5324 kvm_vcpu_unmap(vcpu
, &map
, false);
5326 set_current_vmptr(vmx
, vmptr
);
5329 return nested_vmx_succeed(vcpu
);
5332 /* Emulate the VMPTRST instruction */
5333 static int handle_vmptrst(struct kvm_vcpu
*vcpu
)
5335 unsigned long exit_qual
= vmx_get_exit_qual(vcpu
);
5336 u32 instr_info
= vmcs_read32(VMX_INSTRUCTION_INFO
);
5337 gpa_t current_vmptr
= to_vmx(vcpu
)->nested
.current_vmptr
;
5338 struct x86_exception e
;
5342 if (!nested_vmx_check_permission(vcpu
))
5345 if (unlikely(evmptr_is_valid(to_vmx(vcpu
)->nested
.hv_evmcs_vmptr
)))
5348 if (get_vmx_mem_address(vcpu
, exit_qual
, instr_info
,
5349 true, sizeof(gpa_t
), &gva
))
5351 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5352 r
= kvm_write_guest_virt_system(vcpu
, gva
, (void *)¤t_vmptr
,
5354 if (r
!= X86EMUL_CONTINUE
)
5355 return kvm_handle_memory_failure(vcpu
, r
, &e
);
5357 return nested_vmx_succeed(vcpu
);
#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}
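
/*
 * Only bits 51:12 of the EPTP (the physical address of the top-level EPT
 * table) are compared; the low attribute bits (memory type, page-walk length,
 * A/D enable) are masked off so they do not affect root matching.
 */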
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info, types;
	unsigned long type, roots_to_free;
	struct kvm_mmu *mmu;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 eptp, gpa;
	} operand;
	int i, r;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_EPT) ||
	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);

	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	/*
	 * Nested EPT roots are always held through guest_mmu,
	 * not root_mmu.
	 */
	mmu = &vcpu->arch.guest_mmu;

	switch (type) {
	case VMX_EPT_EXTENT_CONTEXT:
		if (!nested_vmx_check_eptp(vcpu, operand.eptp))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

		roots_to_free = 0;
		if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
					    operand.eptp))
			roots_to_free |= KVM_MMU_ROOT_CURRENT;

		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
			if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
						    mmu->prev_roots[i].pgd,
						    operand.eptp))
				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
		}
		break;
	case VMX_EPT_EXTENT_GLOBAL:
		roots_to_free = KVM_MMU_ROOTS_ALL;
		break;
	default:
		BUG();
		break;
	}

	if (roots_to_free)
		kvm_mmu_free_roots(vcpu, mmu, roots_to_free);

	return nested_vmx_succeed(vcpu);
}
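/*
 * Illustrative note, not from the original source: "types" above keeps only
 * bits 1 and 2 of the reported EPT capabilities, i.e. single-context
 * (INVEPT type 1) and global (INVEPT type 2) invalidation.  Any other
 * requested type fails the "types & (1 << type)" test and is reported back
 * to L1 as VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID.
 */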
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info;
	unsigned long type, types;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 vpid;
		u64 gla;
	} operand;
	u16 vpid02;
	int r;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_VPID) ||
	    !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);

	types = (vmx->nested.msrs.vpid_caps &
			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* according to the intel vmx instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	if (operand.vpid >> 16)
		return nested_vmx_fail(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	vpid02 = nested_get_vpid02(vcpu);
	switch (type) {
	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
		if (!operand.vpid ||
		    is_noncanonical_address(operand.gla, vcpu))
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_vcpu_addr(vpid02, operand.gla);
		break;
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid)
			return nested_vmx_fail(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		vpid_sync_context(vpid02);
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		vpid_sync_context(vpid02);
		break;
	default:
		WARN_ON_ONCE(1);
		return kvm_skip_emulated_instruction(vcpu);
	}

	/*
	 * Sync the shadow page tables if EPT is disabled, L1 is invalidating
	 * linear mappings for L2 (tagged with L2's VPID). Free all roots as
	 * VPIDs are not tracked in the MMU role.
	 *
	 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
	 * an MMU when EPT is disabled.
	 *
	 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR.
	 */
	if (!enable_ept)
		kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
				   KVM_MMU_ROOTS_ALL);

	return nested_vmx_succeed(vcpu);
}
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = kvm_rcx_read(vcpu);
	u64 new_eptp;
	bool accessed_dirty;
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!nested_cpu_has_eptp_switching(vmcs12) ||
	    !nested_cpu_has_ept(vmcs12))
		return 1;

	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;

	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &new_eptp, index * 8, 8))
		return 1;

	accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);

	/*
	 * If the (L2) guest does a vmfunc to the currently
	 * active ept pointer, we don't have to do anything else
	 */
	if (vmcs12->ept_pointer != new_eptp) {
		if (!nested_vmx_check_eptp(vcpu, new_eptp))
			return 1;

		mmu->ept_ad = accessed_dirty;
		mmu->mmu_role.base.ad_disabled = !accessed_dirty;
		vmcs12->ept_pointer = new_eptp;

		kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
	}

	return 0;
}
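/*
 * Illustrative note, not from the original source: the EPTP list referenced
 * by vmcs12->eptp_list_address is an array of 64-bit EPT pointers, so the
 * entry selected by "index" (taken from RCX) lives at guest-physical offset
 * index * 8 and is fetched with a single 8-byte read.  On success the new
 * value replaces vmcs12->ept_pointer and the walk_mmu A/D configuration is
 * updated to match the new pointer's AD-enable bit.
 */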
static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 function = kvm_rax_read(vcpu);

	/*
	 * VMFUNC is only supported for nested guests, but we always enable the
	 * secondary control for simplicity; for non-nested mode, fake that we
	 * didn't by injecting #UD.
	 */
	if (!is_guest_mode(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcs12 = get_vmcs12(vcpu);
	if ((vmcs12->vm_function_control & (1 << function)) == 0)
		goto fail;

	switch (function) {
	case 0:
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);

fail:
	/*
	 * This is effectively a reflected VM-Exit, as opposed to a synthesized
	 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
	 * EXIT_REASON_VMFUNC as the exit reason.
	 */
	nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
			  vmx_get_intr_info(vcpu),
			  vmx_get_exit_qual(vcpu));
	return 1;
}
/*
 * Return true if an IO instruction with the specified port and size should cause
 * a VM-exit into L1.
 */
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
				 int size)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	gpa_t bitmap, last_bitmap;
	u8 b;

	last_bitmap = (gpa_t)-1;
	b = -1;

	while (size > 0) {
		if (port < 0x8000)
			bitmap = vmcs12->io_bitmap_a;
		else if (port < 0x10000)
			bitmap = vmcs12->io_bitmap_b;
		else
			return true;
		bitmap += (port & 0x7fff) / 8;

		if (last_bitmap != bitmap)
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
				return true;
		if (b & (1 << (port & 7)))
			return true;

		port++;
		size--;
		last_bitmap = bitmap;
	}

	return false;
}
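/*
 * Worked example for the lookup above (illustrative, not from the original
 * source): port 0x3f8 (COM1) is below 0x8000, so it is looked up in
 * io_bitmap_a at byte 0x3f8 / 8 = 0x7f, bit 0x3f8 & 7 = 0.  Ports 0x8000
 * and above use io_bitmap_b with the same byte/bit math on (port & 0x7fff).
 */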
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification;
	unsigned short port;
	int size;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	exit_qualification = vmx_get_exit_qual(vcpu);

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

	return nested_vmx_check_io_bitmaps(vcpu, port, size);
}
/*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					union vmx_exit_reason exit_reason)
{
	u32 msr_index = kvm_rcx_read(vcpu);
	gpa_t bitmap;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return true;

	/*
	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
	 * for the four combinations of read/write and low/high MSR numbers.
	 * First we need to figure out which of the four to use:
	 */
	bitmap = vmcs12->msr_bitmap;
	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
		bitmap += 2048;
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;
		bitmap += 1024;
	}

	/* Then read the msr_index'th bit from this bitmap: */
	if (msr_index < 1024*8) {
		unsigned char b;
		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
			return true;
		return 1 & (b >> (msr_index & 7));
	} else
		return true; /* let L1 handle the wrong parameter */
}
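/*
 * Worked example for the bitmap math above (illustrative, not from the
 * original source): a write to MSR 0xc0000080 (IA32_EFER) selects the write
 * half (bitmap += 2048) and the high-MSR quarter (bitmap += 1024), then
 * tests bit (0xc0000080 - 0xc0000000) = 0x80, i.e. byte 16, bit 0 of that
 * quarter.
 */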
/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	int cr = exit_qualification & 15;
	int reg;
	unsigned long val;

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_read(vcpu, reg);
		switch (cr) {
		case 0:
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return true;
			break;
		case 3:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return true;
			break;
		case 4:
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return true;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return true;
			break;
		}
		break;
	case 2: /* clts */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return true;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return true;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return true;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return true;
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return true;
		break;
	}
	return false;
}
static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	u32 encls_leaf;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
		return false;

	encls_leaf = kvm_rax_read(vcpu);
	if (encls_leaf > 62)
		encls_leaf = 63;

	return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf);
}
static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12, gpa_t bitmap)
{
	u32 vmx_instruction_info;
	unsigned long field;
	u8 b;

	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return true;

	/* Decode instruction info and find the field to access */
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));

	/* Out-of-range fields always cause a VM exit from L2 to L1 */
	if (field >> 15)
		return true;

	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
		return true;

	return 1 & (b >> (field & 7));
}
static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
{
	u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;

	if (nested_cpu_has_mtf(vmcs12))
		return true;

	/*
	 * An MTF VM-exit may be injected into the guest by setting the
	 * interruption-type to 7 (other event) and the vector field to 0. Such
	 * is the case regardless of the 'monitor trap flag' VM-execution
	 * control.
	 */
	return entry_intr_info == (INTR_INFO_VALID_MASK
				   | INTR_TYPE_OTHER_EVENT);
}
/*
 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
 * L1 wants the exit. Only call this when in is_guest_mode (L2).
 */
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return vcpu->arch.apf.host_apf_flags || !enable_ept;
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return true;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return true;
		return false;
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return true;
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return true;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses directly L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table is L0's fault.
		 */
		return true;
	case EXIT_REASON_PREEMPTION_TIMER:
		return true;
	case EXIT_REASON_PML_FULL:
		/*
		 * PML is emulated for an L1 VMM and should never be enabled in
		 * vmcs02, always "handle" PML_FULL by exiting to userspace.
		 */
		return true;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return true;
	default:
		break;
	}
	return false;
}
/*
 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in
 * is_guest_mode (L2).
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	default:
		return true;
	}
}
/*
 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = -1ull,
		.hdr.vmx.vmcs12_pa = -1ull,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != -1ull)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else  {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
				/*
				 * L1 hypervisor is not obliged to keep eVMCS
				 * clean fields data always up-to-date while
				 * not in guest mode, 'hv_clean_fields' is only
				 * supposed to be actual upon vmentry so we need
				 * to ignore it here and do full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}
/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable eVMCS capability on vCPU. However, since then
		 * code was changed such that flag signals vmcs12 should
		 * be copied into eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!nested_vmx_allowed(vcpu))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	} else {
		return 0;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}
void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}
/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
	      msrs->pinbased_ctls_low,
	      msrs->pinbased_ctls_high);
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
	      msrs->exit_ctls_low,
	      msrs->exit_ctls_high);
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
	      msrs->entry_ctls_low,
	      msrs->entry_ctls_high);
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
		VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
	      msrs->procbased_ctls_low,
	      msrs->procbased_ctls_high);
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls. Do not include those that
	 * depend on CPUID bits, they are added later by
	 * vmx_vcpu_after_set_cpuid.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      msrs->secondary_ctls_low,
		      msrs->secondary_ctls_high);

	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES |
		SECONDARY_EXEC_TSC_SCALING;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
	      msrs->misc_low,
	      msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
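/*
 * Illustrative note, not from the original source: per the comment at the top
 * of nested_vmx_setup_ctls_msrs(), a bit set only in a *_ctls_high field
 * means L1 may enable that control, while a bit set in *_ctls_low means L1
 * must keep it set.  Controls that KVM emulates itself (e.g. VMCS shadowing
 * above) can therefore be advertised in the high half even when the
 * underlying hardware MSR does not report them.
 */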
void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}
struct kvm_x86_nested_ops vmx_nested_ops = {
	.check_events = vmx_check_nested_events,
	.hv_timer_pending = nested_vmx_preemption_timer_pending,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,