// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>

#include <asm/msr-index.h>
#include <asm/debugreg.h>

#include "kvm_emulate.h"
#include "trace.h"
#include "mmu.h"
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"

static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
				       struct x86_exception *fault)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
		/*
		 * TODO: track the cause of the nested page fault, and
		 * correctly fill in the high bits of exit_info_1.
		 */
		svm->vmcb->control.exit_code    = SVM_EXIT_NPF;
		svm->vmcb->control.exit_code_hi = 0;
		svm->vmcb->control.exit_info_1  = (1ULL << 32);
		svm->vmcb->control.exit_info_2  = fault->address;
	}

	svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
	svm->vmcb->control.exit_info_1 |= fault->error_code;

	nested_svm_vmexit(svm);
}

static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 cr3 = svm->nested.ctl.nested_cr3;
	u64 pdpte;
	int ret;

	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
				       offset_in_page(cr3) + index * 8, 8);
	if (ret)
		return 0;
	return pdpte;
}

static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.nested_cr3;
}

static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *hsave = svm->nested.hsave;

	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer,
				svm->nested.ctl.nested_cr3);
	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
	reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

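/*
 * Note: while L2 is running, the effective intercept bitmaps in the active
 * VMCB are the union of what L1 requested (svm->nested.ctl) and what KVM
 * itself needs (the host state saved in hsave), with the small adjustments
 * made below for CR8 and VINTR when L1 uses virtual interrupt masking.
 */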
void recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_control_area *c, *h, *g;

	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

	if (!is_guest_mode(&svm->vcpu))
		return;

	c = &svm->vmcb->control;
	h = &svm->nested.hsave->control;
	g = &svm->nested.ctl;

	svm->nested.host_intercept_exceptions = h->intercept_exceptions;

	c->intercept_cr = h->intercept_cr;
	c->intercept_dr = h->intercept_dr;
	c->intercept_exceptions = h->intercept_exceptions;
	c->intercept = h->intercept;

	if (g->int_ctl & V_INTR_MASKING_MASK) {
		/* We only want the cr8 intercept bits of L1 */
		c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ);
		c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE);

		/*
		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
		 * affect any interrupt we may want to inject; therefore,
		 * interrupt window vmexits are irrelevant to L0.
		 */
		c->intercept &= ~(1ULL << INTERCEPT_VINTR);
	}

	/* We don't want to see VMMCALLs from a nested guest */
	c->intercept &= ~(1ULL << INTERCEPT_VMMCALL);

	c->intercept_cr |= g->intercept_cr;
	c->intercept_dr |= g->intercept_dr;
	c->intercept_exceptions |= g->intercept_exceptions;
	c->intercept |= g->intercept;
}

static void copy_vmcb_control_area(struct vmcb_control_area *dst,
				   struct vmcb_control_area *from)
{
	dst->intercept_cr         = from->intercept_cr;
	dst->intercept_dr         = from->intercept_dr;
	dst->intercept_exceptions = from->intercept_exceptions;
	dst->intercept            = from->intercept;
	dst->iopm_base_pa         = from->iopm_base_pa;
	dst->msrpm_base_pa        = from->msrpm_base_pa;
	dst->tsc_offset           = from->tsc_offset;
	/* asid not copied, it is handled manually for svm->vmcb.  */
	dst->tlb_ctl              = from->tlb_ctl;
	dst->int_ctl              = from->int_ctl;
	dst->int_vector           = from->int_vector;
	dst->int_state            = from->int_state;
	dst->exit_code            = from->exit_code;
	dst->exit_code_hi         = from->exit_code_hi;
	dst->exit_info_1          = from->exit_info_1;
	dst->exit_info_2          = from->exit_info_2;
	dst->exit_int_info        = from->exit_int_info;
	dst->exit_int_info_err    = from->exit_int_info_err;
	dst->nested_ctl           = from->nested_ctl;
	dst->event_inj            = from->event_inj;
	dst->event_inj_err        = from->event_inj_err;
	dst->nested_cr3           = from->nested_cr3;
	dst->virt_ext             = from->virt_ext;
	dst->pause_filter_count   = from->pause_filter_count;
	dst->pause_filter_thresh  = from->pause_filter_thresh;
}

static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
	/*
	 * This function merges the msr permission bitmaps of kvm and the
	 * nested vmcb. It is optimized in that it only merges the parts where
	 * the kvm msr permission bitmap may contain zero bits.
	 */
	int i;

	if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
		return true;

	for (i = 0; i < MSRPM_OFFSETS; i++) {
		u32 value, p;
		u64 offset;

		if (msrpm_offsets[i] == 0xffffffff)
			break;

		p      = msrpm_offsets[i];
		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);

		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
			return false;

		svm->nested.msrpm[p] = svm->msrpm[p] | value;
	}

	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));

	return true;
}

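/*
 * Minimal consistency checks on the control area L1 hands to VMRUN: the
 * VMRUN intercept must be set, the ASID must be non-zero, and nested
 * paging may only be requested when the host itself has NPT enabled.
 */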
static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
{
	if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
		return false;

	if (control->asid == 0)
		return false;

	if ((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
	    !npt_enabled)
		return false;

	return true;
}

static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb)
{
	bool nested_vmcb_lma;

	if ((vmcb->save.efer & EFER_SVME) == 0)
		return false;

	if (((vmcb->save.cr0 & X86_CR0_CD) == 0) &&
	    (vmcb->save.cr0 & X86_CR0_NW))
		return false;

	if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7))
		return false;

	nested_vmcb_lma = (vmcb->save.efer & EFER_LME) &&
			  (vmcb->save.cr0 & X86_CR0_PG);

	if (!nested_vmcb_lma) {
		if (vmcb->save.cr4 & X86_CR4_PAE) {
			if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
				return false;
		} else {
			if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
				return false;
		}
	} else {
		if (!(vmcb->save.cr4 & X86_CR4_PAE) ||
		    !(vmcb->save.cr0 & X86_CR0_PE) ||
		    (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK))
			return false;
	}

	if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4))
		return false;

	return nested_vmcb_check_controls(&vmcb->control);
}

static void load_nested_vmcb_control(struct vcpu_svm *svm,
				     struct vmcb_control_area *control)
{
	copy_vmcb_control_area(&svm->nested.ctl, control);

	/* Copy it here because nested_svm_check_controls will check it.  */
	svm->nested.ctl.asid           = control->asid;
	svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
	svm->nested.ctl.iopm_base_pa  &= ~0x0fffULL;
}

/*
 * Synchronize fields that are written by the processor, so that
 * they can be copied back into the nested_vmcb.
 */
void sync_nested_vmcb_control(struct vcpu_svm *svm)
{
	u32 mask;

	svm->nested.ctl.event_inj     = svm->vmcb->control.event_inj;
	svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;

	/* Only a few fields of int_ctl are written by the processor.  */
	mask = V_IRQ_MASK | V_TPR_MASK;
	if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
	    svm_is_intercept(svm, INTERCEPT_VINTR)) {
		/*
		 * In order to request an interrupt window, L0 is usurping
		 * svm->vmcb->control.int_ctl and possibly setting V_IRQ
		 * even if it was clear in L1's VMCB.  Restoring it would be
		 * wrong.  However, in this case V_IRQ will remain true until
		 * interrupt_window_interception calls svm_clear_vintr and
		 * restores int_ctl.  We can just leave it aside.
		 */
		mask &= ~V_IRQ_MASK;
	}
	svm->nested.ctl.int_ctl &= ~mask;
	svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}

/*
 * Transfer any event that L0 or L1 wanted to inject into L2 to
 * EXIT_INT_INFO.
 */
static void nested_vmcb_save_pending_event(struct vcpu_svm *svm,
					   struct vmcb *nested_vmcb)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	u32 exit_int_info = 0;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;

		if (vcpu->arch.exception.has_error_code) {
			exit_int_info |= SVM_EVTINJ_VALID_ERR;
			nested_vmcb->control.exit_int_info_err =
				vcpu->arch.exception.error_code;
		}

	} else if (vcpu->arch.nmi_injected) {
		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;

	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		exit_int_info = nr | SVM_EVTINJ_VALID;

		if (vcpu->arch.interrupt.soft)
			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
		else
			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
	}

	nested_vmcb->control.exit_int_info = exit_int_info;
}

static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
	return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}

/*
 * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
 * if we are emulating VM-Entry into a guest with NPT enabled.
 */
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_npt)
{
	if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
		return -EINVAL;

	if (!nested_npt && is_pae_paging(vcpu) &&
	    (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
		if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
			return -EINVAL;
	}

	/*
	 * TODO: optimize unconditional TLB flush/MMU sync here and in
	 * kvm_init_shadow_npt_mmu().
	 */
	if (!nested_npt)
		kvm_mmu_new_pgd(vcpu, cr3, false, false);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
}

static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb)
{
	/* Load the nested guest state */
	svm->vmcb->save.es = nested_vmcb->save.es;
	svm->vmcb->save.cs = nested_vmcb->save.cs;
	svm->vmcb->save.ss = nested_vmcb->save.ss;
	svm->vmcb->save.ds = nested_vmcb->save.ds;
	svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
	svm->vmcb->save.idtr = nested_vmcb->save.idtr;
	kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
	svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
	svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
	svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
	kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
	kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
	kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);

	/* In case we don't even reach vcpu_run, the fields are not updated */
	svm->vmcb->save.rax = nested_vmcb->save.rax;
	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
	svm->vmcb->save.rip = nested_vmcb->save.rip;
	svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
	svm->vcpu.arch.dr6  = nested_vmcb->save.dr6;
	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
}

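/*
 * Note: in the merged int_ctl below, the V_INTR_MASKING bit and the vGIF
 * bits (enable and value) are taken from the host (hsave) control area,
 * while everything else comes from the control area L1 provided.
 */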
static void nested_prepare_vmcb_control(struct vcpu_svm *svm)
{
	const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;

	if (nested_npt_enabled(svm))
		nested_svm_init_mmu_context(&svm->vcpu);

	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
		svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;

	svm->vmcb->control.int_ctl =
		(svm->nested.ctl.int_ctl & ~mask) |
		(svm->nested.hsave->control.int_ctl & mask);

	svm->vmcb->control.virt_ext      = svm->nested.ctl.virt_ext;
	svm->vmcb->control.int_vector    = svm->nested.ctl.int_vector;
	svm->vmcb->control.int_state     = svm->nested.ctl.int_state;
	svm->vmcb->control.event_inj     = svm->nested.ctl.event_inj;
	svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;

	svm->vmcb->control.pause_filter_count  = svm->nested.ctl.pause_filter_count;
	svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;

	/* Enter Guest-Mode */
	enter_guest_mode(&svm->vcpu);

	/*
	 * Merge guest and host intercepts - must be called with vcpu in
	 * guest-mode to take effect here.
	 */
	recalc_intercepts(svm);

	vmcb_mark_all_dirty(svm->vmcb);
}

int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
			 struct vmcb *nested_vmcb)
{
	int ret;

	svm->nested.vmcb = vmcb_gpa;
	load_nested_vmcb_control(svm, &nested_vmcb->control);
	nested_prepare_vmcb_save(svm, nested_vmcb);
	nested_prepare_vmcb_control(svm);

	ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3,
				  nested_npt_enabled(svm));
	if (ret)
		return ret;

	svm_set_gif(svm, true);

	return 0;
}

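/*
 * Emulate VMRUN: map L1's VMCB, validate it, stash L1 (host) state into
 * svm->nested.hsave, switch the active VMCB over to L2 state via
 * enter_svm_guest_mode(), and finally merge the MSR permission bitmaps.
 * On any consistency failure the exit code is set to SVM_EXIT_ERR so L1
 * sees a failed VMRUN.
 */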
int nested_svm_vmrun(struct vcpu_svm *svm)
{
	int ret;
	struct vmcb *nested_vmcb;
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;
	u64 vmcb_gpa;

	if (is_smm(&svm->vcpu)) {
		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
		return 1;
	}

	vmcb_gpa = svm->vmcb->save.rax;
	ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
	if (ret == -EINVAL) {
		kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	} else if (ret) {
		return kvm_skip_emulated_instruction(&svm->vcpu);
	}

	ret = kvm_skip_emulated_instruction(&svm->vcpu);

	nested_vmcb = map.hva;

	if (!nested_vmcb_checks(svm, nested_vmcb)) {
		nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
		nested_vmcb->control.exit_code_hi = 0;
		nested_vmcb->control.exit_info_1  = 0;
		nested_vmcb->control.exit_info_2  = 0;
		goto out;
	}

	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
			       nested_vmcb->save.rip,
			       nested_vmcb->control.int_ctl,
			       nested_vmcb->control.event_inj,
			       nested_vmcb->control.nested_ctl);

	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
				    nested_vmcb->control.intercept_cr >> 16,
				    nested_vmcb->control.intercept_exceptions,
				    nested_vmcb->control.intercept);

	/* Clear internal status */
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	/*
	 * Save the old vmcb, so we don't need to pick what we save, but can
	 * restore everything when a VMEXIT occurs
	 */
	hsave->save.es     = vmcb->save.es;
	hsave->save.cs     = vmcb->save.cs;
	hsave->save.ss     = vmcb->save.ss;
	hsave->save.ds     = vmcb->save.ds;
	hsave->save.gdtr   = vmcb->save.gdtr;
	hsave->save.idtr   = vmcb->save.idtr;
	hsave->save.efer   = svm->vcpu.arch.efer;
	hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
	hsave->save.cr4    = svm->vcpu.arch.cr4;
	hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
	hsave->save.rip    = kvm_rip_read(&svm->vcpu);
	hsave->save.rsp    = vmcb->save.rsp;
	hsave->save.rax    = vmcb->save.rax;
	if (npt_enabled)
		hsave->save.cr3 = vmcb->save.cr3;
	else
		hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);

	copy_vmcb_control_area(&hsave->control, &vmcb->control);

	svm->nested.nested_run_pending = 1;

	if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb))
		goto out_exit_err;

	if (nested_svm_vmrun_msrpm(svm))
		goto out;

out_exit_err:
	svm->nested.nested_run_pending = 0;

	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
	svm->vmcb->control.exit_code_hi = 0;
	svm->vmcb->control.exit_info_1  = 0;
	svm->vmcb->control.exit_info_2  = 0;

	nested_svm_vmexit(svm);

out:
	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	return ret;
}

void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
{
	to_vmcb->save.fs = from_vmcb->save.fs;
	to_vmcb->save.gs = from_vmcb->save.gs;
	to_vmcb->save.tr = from_vmcb->save.tr;
	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
	to_vmcb->save.star = from_vmcb->save.star;
	to_vmcb->save.lstar = from_vmcb->save.lstar;
	to_vmcb->save.cstar = from_vmcb->save.cstar;
	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
}

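/*
 * Emulate #VMEXIT: copy the current (L2) state back into L1's VMCB,
 * restore the L1 control and save state that was stashed in hsave at
 * VMRUN time, and switch the MMU back to L1's paging mode.
 */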
int nested_svm_vmexit(struct vcpu_svm *svm)
{
	int rc;
	struct vmcb *nested_vmcb;
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
	struct kvm_host_map map;

	rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
	if (rc) {
		if (rc == -EINVAL)
			kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	nested_vmcb = map.hva;

	/* Exit Guest-Mode */
	leave_guest_mode(&svm->vcpu);
	svm->nested.vmcb = 0;
	WARN_ON_ONCE(svm->nested.nested_run_pending);

	/* in case we halted in L2 */
	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;

	/* Give the current vmcb to the guest */
	svm_set_gif(svm, false);

	nested_vmcb->save.es     = vmcb->save.es;
	nested_vmcb->save.cs     = vmcb->save.cs;
	nested_vmcb->save.ss     = vmcb->save.ss;
	nested_vmcb->save.ds     = vmcb->save.ds;
	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
	nested_vmcb->save.idtr   = vmcb->save.idtr;
	nested_vmcb->save.efer   = svm->vcpu.arch.efer;
	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
	nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
	nested_vmcb->save.cr2    = vmcb->save.cr2;
	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
	nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
	nested_vmcb->save.rip    = kvm_rip_read(&svm->vcpu);
	nested_vmcb->save.rsp    = kvm_rsp_read(&svm->vcpu);
	nested_vmcb->save.rax    = kvm_rax_read(&svm->vcpu);
	nested_vmcb->save.dr7    = vmcb->save.dr7;
	nested_vmcb->save.dr6    = svm->vcpu.arch.dr6;
	nested_vmcb->save.cpl    = vmcb->save.cpl;

	nested_vmcb->control.int_state    = vmcb->control.int_state;
	nested_vmcb->control.exit_code    = vmcb->control.exit_code;
	nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
	nested_vmcb->control.exit_info_1  = vmcb->control.exit_info_1;
	nested_vmcb->control.exit_info_2  = vmcb->control.exit_info_2;

	if (nested_vmcb->control.exit_code != SVM_EXIT_ERR)
		nested_vmcb_save_pending_event(svm, nested_vmcb);

	if (svm->nrips_enabled)
		nested_vmcb->control.next_rip = vmcb->control.next_rip;

	nested_vmcb->control.int_ctl       = svm->nested.ctl.int_ctl;
	nested_vmcb->control.tlb_ctl       = svm->nested.ctl.tlb_ctl;
	nested_vmcb->control.event_inj     = svm->nested.ctl.event_inj;
	nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;

	nested_vmcb->control.pause_filter_count =
		svm->vmcb->control.pause_filter_count;
	nested_vmcb->control.pause_filter_thresh =
		svm->vmcb->control.pause_filter_thresh;

	/* Restore the original control entries */
	copy_vmcb_control_area(&vmcb->control, &hsave->control);

	svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
		svm->vcpu.arch.l1_tsc_offset;

	svm->nested.ctl.nested_cr3 = 0;

	/* Restore selected save entries */
	svm->vmcb->save.es = hsave->save.es;
	svm->vmcb->save.cs = hsave->save.cs;
	svm->vmcb->save.ss = hsave->save.ss;
	svm->vmcb->save.ds = hsave->save.ds;
	svm->vmcb->save.gdtr = hsave->save.gdtr;
	svm->vmcb->save.idtr = hsave->save.idtr;
	kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
	svm_set_efer(&svm->vcpu, hsave->save.efer);
	svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
	svm_set_cr4(&svm->vcpu, hsave->save.cr4);
	kvm_rax_write(&svm->vcpu, hsave->save.rax);
	kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
	kvm_rip_write(&svm->vcpu, hsave->save.rip);
	svm->vmcb->save.dr7 = 0;
	svm->vmcb->save.cpl = 0;
	svm->vmcb->control.exit_int_info = 0;

	vmcb_mark_all_dirty(svm->vmcb);

	trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code,
				       nested_vmcb->control.exit_info_1,
				       nested_vmcb->control.exit_info_2,
				       nested_vmcb->control.exit_int_info,
				       nested_vmcb->control.exit_int_info_err,
				       KVM_ISA_SVM);

	kvm_vcpu_unmap(&svm->vcpu, &map, true);

	nested_svm_uninit_mmu_context(&svm->vcpu);

	rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false);
	if (rc)
		return 1;

	if (npt_enabled)
		svm->vmcb->save.cr3 = hsave->save.cr3;

	/*
	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
	 * doesn't end up in L1.
	 */
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(&svm->vcpu);
	kvm_clear_interrupt_queue(&svm->vcpu);

	return 0;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void svm_leave_nested(struct vcpu_svm *svm)
{
	if (is_guest_mode(&svm->vcpu)) {
		struct vmcb *hsave = svm->nested.hsave;
		struct vmcb *vmcb = svm->vmcb;

		svm->nested.nested_run_pending = 0;
		leave_guest_mode(&svm->vcpu);
		copy_vmcb_control_area(&vmcb->control, &hsave->control);
		nested_svm_uninit_mmu_context(&svm->vcpu);
	}
}

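/*
 * In the MSR permission bitmap, every MSR is tracked by a pair of bits:
 * the low bit of the pair intercepts reads, the high bit intercepts
 * writes.  svm_msrpm_offset() returns the bitmap offset in 32-bit units,
 * so a 4-byte read of L1's bitmap covers the 16 MSRs that share that
 * word; the mask below selects the relevant read or write bit.
 */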
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
	u32 offset, msr, value;
	int write, mask;

	if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT)))
		return NESTED_EXIT_HOST;

	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	offset = svm_msrpm_offset(msr);
	write  = svm->vmcb->control.exit_info_1 & 1;
	mask   = 1 << ((2 * (msr & 0xf)) + write);

	if (offset == MSR_INVALID)
		return NESTED_EXIT_DONE;

	/* Offset is in 32 bit units but need in 8 bit units */
	offset *= 4;

	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
		return NESTED_EXIT_DONE;

	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

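/*
 * The I/O permission bitmap uses one bit per port.  An access of 'size'
 * bytes starting at 'port' may straddle a byte boundary, in which case
 * two bytes of L1's IOPM have to be read to build the mask.
 */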
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
	unsigned port, size, iopm_len;
	u16 val, mask;
	u8 start_bit;
	u64 gpa;

	if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
		return NESTED_EXIT_HOST;

	port = svm->vmcb->control.exit_info_1 >> 16;
	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
		SVM_IOIO_SIZE_SHIFT;
	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
	start_bit = port % 8;
	iopm_len = (start_bit + size > 8) ? 2 : 1;
	mask = (0xf >> (4 - size)) << start_bit;
	val = 0;

	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
		return NESTED_EXIT_DONE;

	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}

static int nested_svm_intercept(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;
	int vmexit = NESTED_EXIT_HOST;

	switch (exit_code) {
	case SVM_EXIT_MSR:
		vmexit = nested_svm_exit_handled_msr(svm);
		break;
	case SVM_EXIT_IOIO:
		vmexit = nested_svm_intercept_ioio(svm);
		break;
	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
		u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
		if (svm->nested.ctl.intercept_cr & bit)
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
		u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
		if (svm->nested.ctl.intercept_dr & bit)
			vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		/*
		 * Host-intercepted exceptions have been checked already in
		 * nested_svm_exit_special.  There is nothing to do here,
		 * the vmexit is injected by svm_check_nested_events.
		 */
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	case SVM_EXIT_ERR: {
		vmexit = NESTED_EXIT_DONE;
		break;
	}
	default: {
		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
		if (svm->nested.ctl.intercept & exit_bits)
			vmexit = NESTED_EXIT_DONE;
	}
	}

	return vmexit;
}

int nested_svm_exit_handled(struct vcpu_svm *svm)
{
	int vmexit;

	vmexit = nested_svm_intercept(svm);

	if (vmexit == NESTED_EXIT_DONE)
		nested_svm_vmexit(svm);

	return vmexit;
}

int nested_svm_check_permissions(struct vcpu_svm *svm)
{
	if (!(svm->vcpu.arch.efer & EFER_SVME) ||
	    !is_paging(&svm->vcpu)) {
		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
		return 1;
	}

	if (svm->vmcb->save.cpl) {
		kvm_inject_gp(&svm->vcpu, 0);
		return 1;
	}

	return 0;
}

static bool nested_exit_on_exception(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;

	return (svm->nested.ctl.intercept_exceptions & (1 << nr));
}

static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
	unsigned int nr = svm->vcpu.arch.exception.nr;

	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
	svm->vmcb->control.exit_code_hi = 0;

	if (svm->vcpu.arch.exception.has_error_code)
		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;

	/*
	 * EXITINFO2 is undefined for all exception intercepts other
	 * than #PF.
	 */
	if (nr == PF_VECTOR) {
		if (svm->vcpu.arch.exception.nested_apf)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
		else if (svm->vcpu.arch.exception.has_payload)
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
		else
			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
	} else if (nr == DB_VECTOR) {
		/* See inject_pending_event.  */
		kvm_deliver_exception_payload(&svm->vcpu);
		if (svm->vcpu.arch.dr7 & DR7_GD) {
			svm->vcpu.arch.dr7 &= ~DR7_GD;
			kvm_update_dr7(&svm->vcpu);
		}
	} else
		WARN_ON(svm->vcpu.arch.exception.has_payload);

	nested_svm_vmexit(svm);
}

static void nested_svm_smi(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code   = SVM_EXIT_SMI;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

static void nested_svm_nmi(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code   = SVM_EXIT_NMI;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

static void nested_svm_intr(struct vcpu_svm *svm)
{
	trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);

	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
	return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT));
}

static void nested_svm_init(struct vcpu_svm *svm)
{
	svm->vmcb->control.exit_code   = SVM_EXIT_INIT;
	svm->vmcb->control.exit_info_1 = 0;
	svm->vmcb->control.exit_info_2 = 0;

	nested_svm_vmexit(svm);
}

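/*
 * Events are considered in priority order: INIT, then pending exceptions,
 * then SMI, NMI and finally external interrupts.  Delivery to L1 is
 * deferred (-EBUSY) while an event still has to be reinjected or a nested
 * VMRUN is pending.
 */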
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool block_nested_events =
		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (lapic_in_kernel(vcpu) &&
	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_init(svm))
			return 0;
		nested_svm_init(svm);
		return 0;
	}

	if (vcpu->arch.exception.pending) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_exception(svm))
			return 0;
		nested_svm_inject_exception_vmexit(svm);
		return 0;
	}

	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_smi(svm))
			return 0;
		nested_svm_smi(svm);
		return 0;
	}

	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_nmi(svm))
			return 0;
		nested_svm_nmi(svm);
		return 0;
	}

	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		if (!nested_exit_on_intr(svm))
			return 0;
		nested_svm_intr(svm);
		return 0;
	}

	return 0;
}

int nested_svm_exit_special(struct vcpu_svm *svm)
{
	u32 exit_code = svm->vmcb->control.exit_code;

	switch (exit_code) {
	case SVM_EXIT_INTR:
	case SVM_EXIT_NMI:
	case SVM_EXIT_NPF:
		return NESTED_EXIT_HOST;
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);

		if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits)
			return NESTED_EXIT_HOST;
		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
			 svm->vcpu.arch.apf.host_apf_flags)
			/* Trap async PF even if not shadowing */
			return NESTED_EXIT_HOST;

		break;
	}
	default:
		break;
	}

	return NESTED_EXIT_CONTINUE;
}

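/*
 * The nested state blob exchanged with userspace is the generic
 * struct kvm_nested_state header followed by a single VMCB-sized data
 * area: L2's control area (svm->nested.ctl) plus the saved L1 state
 * (hsave->save).  svm_set_nested_state() consumes the same layout.
 */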
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_svm *svm;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_SVM,
		.size = sizeof(kvm_state),
	};
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];

	if (!vcpu)
		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;

	svm = to_svm(vcpu);

	if (user_data_size < kvm_state.size)
		goto out;

	/* First fill in the header and copy it out.  */
	if (is_guest_mode(vcpu)) {
		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb;
		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

		if (svm->nested.nested_run_pending)
			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
	}

	if (gif_set(svm))
		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!is_guest_mode(vcpu))
		goto out;

	/*
	 * Copy over the full size of the VMCB rather than just the size
	 * of the structs.
	 */
	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
			 sizeof(user_vmcb->control)))
		return -EFAULT;
	if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save,
			 sizeof(user_vmcb->save)))
		return -EFAULT;

out:
	return kvm_state.size;
}

static int svm_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];
	struct vmcb_control_area ctl;
	struct vmcb_save_area save;
	u32 cr0;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
		return -EINVAL;

	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
				 KVM_STATE_NESTED_RUN_PENDING |
				 KVM_STATE_NESTED_GIF_SET))
		return -EINVAL;

	/*
	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
	 */
	if (!(vcpu->arch.efer & EFER_SVME)) {
		/* GIF=1 and no guest mode are required if SVME=0.  */
		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
			return -EINVAL;
	}

	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
		svm_leave_nested(svm);
		goto out_set_gif;
	}

	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
		return -EINVAL;
	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
		return -EINVAL;
	if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
		return -EFAULT;
	if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
		return -EFAULT;

	if (!nested_vmcb_check_controls(&ctl))
		return -EINVAL;

	/*
	 * Processor state contains L2 state.  Check that it is
	 * valid for guest mode (see nested_vmcb_checks).
	 */
	cr0 = kvm_read_cr0(vcpu);
	if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
		return -EINVAL;

	/*
	 * Validate host state saved from before VMRUN (see
	 * nested_svm_check_permissions).
	 * TODO: validate reserved bits for all saved state.
	 */
	if (!(save.cr0 & X86_CR0_PG))
		return -EINVAL;

	/*
	 * All checks done, we can enter guest mode.  L1 control fields
	 * come from the nested save state.  Guest state is already
	 * in the registers, the save area of the nested state instead
	 * contains saved L1 state.
	 */
	copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
	hsave->save = save;

	svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
	load_nested_vmcb_control(svm, &ctl);
	nested_prepare_vmcb_control(svm);

out_set_gif:
	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
	return 0;
}

struct kvm_x86_nested_ops svm_nested_ops = {
	.check_events = svm_check_nested_events,
	.get_state = svm_get_nested_state,
	.set_state = svm_set_nested_state,
};