// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 * cpuid support routines
 *
 * derived from arch/x86/kvm/x86.c
 *
 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
 * Copyright IBM Corporation, 2008
 */
#include <linux/kvm_host.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/sched/stat.h>

#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
#include <asm/sgx.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
#include "trace.h"
#include "pmu.h"
/*
 * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
 * aligned to sizeof(unsigned long) because it's not accessed via bitops.
 */
u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
	int feature_bit = 0;
	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;

	xstate_bv &= XFEATURE_MASK_EXTEND;
	while (xstate_bv) {
		if (xstate_bv & 0x1) {
			u32 eax, ebx, ecx, edx, offset;

			cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
			/* The standard format enumerates each offset in EBX. */
			offset = compacted ? ret : ebx;
			ret = max(ret, offset + eax);
		}

		xstate_bv >>= 1;
		feature_bit++;
	}

	return ret;
}
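/*
 * Example, with illustrative numbers: if CPUID.0xD.2 (AVX state) reports
 * offset EBX = 576 and size EAX = 256, the standard-format size is at
 * least 576 + 256 = 832 bytes.  In the compacted format the running
 * offset 'ret' is used instead of EBX, so enabled components simply pack
 * back-to-back after the legacy region and XSAVE header.
 */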
#define F feature_bit
#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *e;
	int i;

	for (i = 0; i < nent; i++) {
		e = &entries[i];

		if (e->function == function &&
		    (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
			return e;
	}

	return NULL;
}
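/*
 * Lookup example: with leaf 0x7 installed as index-significant, a query
 * for (function = 7, index = 1) matches only the CPUID.0x7.0x1 entry,
 * whereas an entry without KVM_CPUID_FLAG_SIGNIFCANT_INDEX (the
 * misspelled but ABI-frozen UAPI name) matches on function alone.
 */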
static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
{
	struct kvm_cpuid_entry2 *best;

	/*
	 * The existing code assumes virtual address is 48-bit or 57-bit in the
	 * canonical address checks; exit if it is ever changed.
	 */
	best = cpuid_entry2_find(entries, nent, 0x80000008, 0);
	if (best) {
		int vaddr_bits = (best->eax & 0xff00) >> 8;

		if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
			return -EINVAL;
	}

	return 0;
}
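/*
 * E.g. a guest CPUID.0x80000008 advertising 48 (4-level paging) or 57
 * (LA57) virtual address bits passes the check above; any other non-zero
 * width is rejected because KVM's canonical-address checks implement
 * only those two widths.
 */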
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);

	/*
	 * Save the feature bitmap to avoid a CPUID lookup for every PV
	 * operation.
	 */
	if (best)
		vcpu->arch.pv_cpuid.features = best->eax;
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 1, 0);
	if (best) {
		/* Update OSXSAVE bit */
		if (boot_cpu_has(X86_FEATURE_XSAVE))
			cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
					   kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));

		cpuid_entry_change(best, X86_FEATURE_APIC,
				   vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
	}

	best = kvm_find_cpuid_entry(vcpu, 7, 0);
	if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
		cpuid_entry_change(best, X86_FEATURE_OSPKE,
				   kvm_read_cr4_bits(vcpu, X86_CR4_PKE));

	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
	if (best)
		best->ebx = xstate_required_size(vcpu->arch.xcr0, false);

	best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
	if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
		     cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);

	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
	if (kvm_hlt_in_guest(vcpu->kvm) && best &&
	    (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
		best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);

	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
		best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
		if (best)
			cpuid_entry_change(best, X86_FEATURE_MWAIT,
					   vcpu->arch.ia32_misc_enable_msr &
					   MSR_IA32_MISC_ENABLE_MWAIT);
	}
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 1, 0);
	if (best && apic) {
		if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
			apic->lapic_timer.timer_mode_mask = 3 << 17;
		else
			apic->lapic_timer.timer_mode_mask = 1 << 17;

		kvm_apic_set_version(vcpu);
	}

	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
	if (!best)
		vcpu->arch.guest_supported_xcr0 = 0;
	else
		vcpu->arch.guest_supported_xcr0 =
			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;

	/*
	 * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
	 * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
	 * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
	 * at the time of EENTER, thus adjust the allowed XFRM by the guest's
	 * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
	 * '1' even on CPUs that don't support XSAVE.
	 */
	best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
	if (best) {
		best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
		best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
		best->ecx |= XFEATURE_MASK_FPSSE;
	}

	kvm_update_pv_runtime(vcpu);

	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);

	kvm_pmu_refresh(vcpu);
	vcpu->arch.cr4_guest_rsvd_bits =
	    __cr4_reserved_bits(guest_cpuid_has, vcpu);

	kvm_hv_set_cpuid(vcpu);

	/* Invoke the vendor callback only after the above state is updated. */
	static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);

	/*
	 * Except for the MMU, which needs to be reset after any vendor
	 * specific adjustments to the reserved GPA bits.
	 */
	kvm_mmu_after_set_cpuid(vcpu);
}
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
	if (!best || best->eax < 0x80000008)
		goto not_found;
	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
	if (best)
		return best->eax & 0xff;
not_found:
	return 36;
}
/*
 * This "raw" version returns the reserved GPA bits without any adjustments for
 * encryption technologies that usurp bits.  The raw mask should be used if and
 * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
 */
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
{
	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
}
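/*
 * E.g. with a 48-bit guest MAXPHYADDR, rsvd_bits(48, 63) yields the mask
 * 0xffff000000000000: any GPA with one of those bits set is reserved
 * from the guest's perspective.
 */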
static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
			 int nent)
{
	int r;

	r = kvm_check_cpuid(e2, nent);
	if (r)
		return r;

	kvfree(vcpu->arch.cpuid_entries);
	vcpu->arch.cpuid_entries = e2;
	vcpu->arch.cpuid_nent = nent;

	kvm_update_cpuid_runtime(vcpu);
	kvm_vcpu_after_set_cpuid(vcpu);

	return 0;
}
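/*
 * Note the ordering above: the new entries are installed before
 * kvm_update_cpuid_runtime() and kvm_vcpu_after_set_cpuid() run, so both
 * helpers observe, and may rewrite, the freshly installed array.
 */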
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
			     struct kvm_cpuid *cpuid,
			     struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *e = NULL;
	struct kvm_cpuid_entry2 *e2 = NULL;

	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	if (cpuid->nent) {
		e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
		if (IS_ERR(e))
			return PTR_ERR(e);

		e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
		if (!e2) {
			r = -ENOMEM;
			goto out_free_cpuid;
		}
	}
	for (i = 0; i < cpuid->nent; i++) {
		e2[i].function = e[i].function;
		e2[i].eax = e[i].eax;
		e2[i].ebx = e[i].ebx;
		e2[i].ecx = e[i].ecx;
		e2[i].edx = e[i].edx;
		e2[i].index = 0;
		e2[i].flags = 0;
		e2[i].padding[0] = 0;
		e2[i].padding[1] = 0;
		e2[i].padding[2] = 0;
	}

	r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
	if (r)
		kvfree(e2);

out_free_cpuid:
	kvfree(e);

	return r;
}
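/*
 * The legacy struct kvm_cpuid_entry has no index or flags fields, so the
 * conversion above installs every leaf with index 0 and no flags, i.e.
 * legacy userspace cannot define per-index sub-leaves.
 */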
int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
			      struct kvm_cpuid2 *cpuid,
			      struct kvm_cpuid_entry2 __user *entries)
{
	struct kvm_cpuid_entry2 *e2 = NULL;
	int r;

	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	if (cpuid->nent) {
		e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
		if (IS_ERR(e2))
			return PTR_ERR(e2);
	}

	r = kvm_set_cpuid(vcpu, e2, cpuid->nent);
	if (r)
		kvfree(e2);

	return r;
}
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
			      struct kvm_cpuid2 *cpuid,
			      struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	r = 0;

out:
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}
/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
{
	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
	struct kvm_cpuid_entry2 entry;

	reverse_cpuid_check(leaf);

	cpuid_count(cpuid.function, cpuid.index,
		    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);

	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}
static __always_inline
void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
{
	/* Use kvm_cpu_cap_mask for non-scattered leafs. */
	BUILD_BUG_ON(leaf < NCAPINTS);

	kvm_cpu_caps[leaf] = mask;

	__kvm_cpu_cap_mask(leaf);
}
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
	/* Use kvm_cpu_cap_init_scattered for scattered leafs. */
	BUILD_BUG_ON(leaf >= NCAPINTS);

	kvm_cpu_caps[leaf] &= mask;

	__kvm_cpu_cap_mask(leaf);
}
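/*
 * Example: kvm_cpu_cap_mask(CPUID_7_0_EBX, F(SMEP) | F(SMAP)) first
 * restricts the cached leaf to the listed features, then
 * __kvm_cpu_cap_mask() ANDs in the raw host CPUID, so a feature is
 * ultimately advertised only if both KVM and the CPU support it.
 */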
void kvm_set_cpu_caps(void)
{
#ifdef CONFIG_X86_64
	unsigned int f_gbpages = F(GBPAGES);
	unsigned int f_lm = F(LM);
#else
	unsigned int f_gbpages = 0;
	unsigned int f_lm = 0;
#endif

	memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
	BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) >
		     sizeof(boot_cpu_data.x86_capability));
	memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
	       sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)));
	kvm_cpu_cap_mask(CPUID_1_ECX,
		/*
		 * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
		 * advertised to guests via CPUID!
		 */
		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
		0 /* DS-CPL, VMX, SMX, EST */ |
		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
		F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
		F(F16C) | F(RDRAND)
	);
	/* KVM emulates x2apic in software irrespective of host support. */
	kvm_cpu_cap_set(X86_FEATURE_X2APIC);
	kvm_cpu_cap_mask(CPUID_1_EDX,
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
		0 /* Reserved, DS, ACPI */ | F(MMX) |
		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
		0 /* HTT, TM, Reserved, PBE */
	);
	kvm_cpu_cap_mask(CPUID_7_0_EBX,
		F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
		F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
		F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
		F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
		F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
		F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
		F(AVX512VL)
	);
	kvm_cpu_cap_mask(CPUID_7_ECX,
		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ |
		F(SGX_LC) | F(BUS_LOCK_DETECT)
	);
	/* Set LA57 based on hardware capability. */
	if (cpuid_ecx(7) & F(LA57))
		kvm_cpu_cap_set(X86_FEATURE_LA57);

	/*
	 * PKU not yet implemented for shadow paging and requires OSPKE
	 * to be set on the host.  Clear it if that is not the case.
	 */
	if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
		kvm_cpu_cap_clear(X86_FEATURE_PKU);
	kvm_cpu_cap_mask(CPUID_7_EDX,
		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
	);

	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
	kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
	kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);

	if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
	if (boot_cpu_has(X86_FEATURE_STIBP))
		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
	kvm_cpu_cap_mask(CPUID_7_1_EAX,
		F(AVX_VNNI) | F(AVX512_BF16)
	);

	kvm_cpu_cap_mask(CPUID_D_1_EAX,
		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
	);

	kvm_cpu_cap_init_scattered(CPUID_12_EAX,
		SF(SGX1) | SF(SGX2)
	);
	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
		F(TOPOEXT) | F(PERFCTR_CORE)
	);
	kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* Reserved */ |
		F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
		F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
	);

	if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
		kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
	kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
		F(CLZERO) | F(XSAVEERPTR) |
		F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
		F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
	);
	/*
	 * AMD has separate bits for each SPEC_CTRL bit.
	 * arch/x86/kernel/cpu/bugs.c is kind enough to
	 * record that in cpufeatures so use them.
	 */
	if (boot_cpu_has(X86_FEATURE_IBPB))
		kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
	if (boot_cpu_has(X86_FEATURE_IBRS))
		kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
	if (boot_cpu_has(X86_FEATURE_STIBP))
		kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
		kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);

	/*
	 * The preference is to use the SPEC_CTRL MSR instead of the
	 * VIRT_SPEC MSR.
	 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
	    !boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
	/*
	 * Hide all SVM features by default, SVM will set the cap bits for
	 * features it emulates and/or exposes for L1.
	 */
	kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);

	kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
		0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
		F(SME_COHERENT)
	);
	kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
		F(PMM) | F(PMM_EN)
	);
	/*
	 * Hide RDTSCP and RDPID if either feature is reported as supported but
	 * probing MSR_TSC_AUX failed.  This is purely a sanity check and
	 * should never happen, but the guest will likely crash if RDTSCP or
	 * RDPID is misreported, and KVM has botched MSR_TSC_AUX emulation in
	 * the past.  For example, the sanity check may fire if this instance
	 * of KVM is running as L1 on top of an older, broken KVM.
	 */
	if (WARN_ON((kvm_cpu_cap_has(X86_FEATURE_RDTSCP) ||
		     kvm_cpu_cap_has(X86_FEATURE_RDPID)) &&
		     !kvm_is_supported_user_return_msr(MSR_TSC_AUX))) {
		kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
		kvm_cpu_cap_clear(X86_FEATURE_RDPID);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);
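/*
 * kvm_cpu_caps mirrors boot_cpu_data.x86_capability for the first
 * NCAPINTS words and appends NKVMCAPINTS KVM-only words, hence the
 * BUILD_BUG_ON() and the partial memcpy() in kvm_set_cpu_caps().
 */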
struct kvm_cpuid_array {
	struct kvm_cpuid_entry2 *entries;
	int maxnent;
	int nent;
};
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
					      u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *entry;

	if (array->nent >= array->maxnent)
		return NULL;

	entry = &array->entries[array->nent++];

	entry->function = function;
	entry->index = index;
	entry->flags = 0;

	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);

	switch (function) {
	case 4:
	case 7:
	case 0xb:
	case 0xd:
	case 0xf:
	case 0x10:
	case 0x12:
	case 0x14:
	case 0x17:
	case 0x18:
	case 0x1f:
	case 0x8000001d:
		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		break;
	}

	return entry;
}
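/*
 * do_host_cpuid() snapshots the host's raw CPUID output; __do_cpuid_func()
 * then clamps or overrides individual registers (e.g. via
 * cpuid_entry_override()) before the entry is reported to userspace.
 */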
static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
{
	struct kvm_cpuid_entry2 *entry;

	if (array->nent >= array->maxnent)
		return -E2BIG;

	entry = &array->entries[array->nent];
	entry->function = func;
	entry->index = 0;
	entry->flags = 0;

	switch (func) {
	case 0:
		entry->eax = 7;
		++array->nent;
		break;
	case 1:
		entry->ecx = F(MOVBE);
		++array->nent;
		break;
	case 7:
		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		entry->eax = 0;
		if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
			entry->ecx = F(RDPID);
		++array->nent;
		break;
	default:
		break;
	}

	return 0;
}
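/*
 * KVM_GET_EMULATED_CPUID reports only features KVM can emulate in
 * software when hardware lacks them, e.g. MOVBE above, and RDPID, which
 * KVM can emulate via RDTSCP's MSR_TSC_AUX when RDPID itself is absent.
 */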
static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
{
	struct kvm_cpuid_entry2 *entry;
	int r, i, max_idx;

	/* all calls to cpuid_count() should be made on the same cpu */
	get_cpu();

	r = -E2BIG;

	entry = do_host_cpuid(array, function, 0);
	if (!entry)
		goto out;

	switch (function) {
	case 0:
		/* Limited to the highest leaf implemented in KVM. */
		entry->eax = min(entry->eax, 0x1fU);
		break;
	case 1:
		cpuid_entry_override(entry, CPUID_1_EDX);
		cpuid_entry_override(entry, CPUID_1_ECX);
		break;
	case 2:
		/*
		 * On ancient CPUs, function 2 entries are STATEFUL.  That is,
		 * CPUID(function=2, index=0) may return different results each
		 * time, with the least-significant byte in EAX enumerating the
		 * number of times software should do CPUID(2, 0).
		 *
		 * Modern CPUs, i.e. every CPU KVM has *ever* run on, are less
		 * idiotic.  Intel's SDM states that EAX & 0xff "will always
		 * return 01H. Software should ignore this value and not
		 * interpret it as an informational descriptor", while AMD's
		 * APM states that CPUID(2) is reserved.
		 *
		 * WARN if a frankenstein CPU that supports virtualization and
		 * a stateful CPUID.0x2 is encountered.
		 */
		WARN_ON_ONCE((entry->eax & 0xff) > 1);
		break;
	/* functions 4 and 0x8000001d have additional index. */
	case 4:
	case 0x8000001d:
		/*
		 * Read entries until the cache type in the previous entry is
		 * zero, i.e. indicates an invalid entry.
		 */
		for (i = 1; entry->eax & 0x1f; ++i) {
			entry = do_host_cpuid(array, function, i);
			if (!entry)
				goto out;
		}
		break;
	case 6: /* Thermal management */
		entry->eax = 0x4; /* allow ARAT */
		entry->ebx = 0;
		entry->ecx = 0;
		entry->edx = 0;
		break;
	/* function 7 has additional index. */
	case 7:
		entry->eax = min(entry->eax, 1u);
		cpuid_entry_override(entry, CPUID_7_0_EBX);
		cpuid_entry_override(entry, CPUID_7_ECX);
		cpuid_entry_override(entry, CPUID_7_EDX);

		/* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */
		if (entry->eax == 1) {
			entry = do_host_cpuid(array, function, 1);
			if (!entry)
				goto out;

			cpuid_entry_override(entry, CPUID_7_1_EAX);
			entry->ebx = 0;
			entry->ecx = 0;
			entry->edx = 0;
		}
		break;
	case 0xa: { /* Architectural Performance Monitoring */
		struct x86_pmu_capability cap;
		union cpuid10_eax eax;
		union cpuid10_edx edx;

		perf_get_x86_pmu_capability(&cap);

		/*
		 * Only support guest architectural pmu on a host
		 * with architectural pmu.
		 */
		if (!cap.version)
			memset(&cap, 0, sizeof(cap));

		eax.split.version_id = min(cap.version, 2);
		eax.split.num_counters = cap.num_counters_gp;
		eax.split.bit_width = cap.bit_width_gp;
		eax.split.mask_length = cap.events_mask_len;

		edx.split.num_counters_fixed =
			min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
		edx.split.bit_width_fixed = cap.bit_width_fixed;
		edx.split.anythread_deprecated = 1;
		edx.split.reserved1 = 0;
		edx.split.reserved2 = 0;

		entry->eax = eax.full;
		entry->ebx = cap.events_mask;
		entry->ecx = 0;
		entry->edx = edx.full;
		break;
	}
	/*
	 * Per Intel's SDM, the 0x1f is a superset of 0xb,
	 * thus they can be handled by common code.
	 */
	case 0x1f:
	case 0xb:
		/*
		 * Populate entries until the level type (ECX[15:8]) of the
		 * previous entry is zero.  Note, CPUID EAX.{0x1f,0xb}.0 is
		 * the starting entry, filled by the primary do_host_cpuid().
		 */
		for (i = 1; entry->ecx & 0xff00; ++i) {
			entry = do_host_cpuid(array, function, i);
			if (!entry)
				goto out;
		}
		break;
	case 0xd:
		entry->eax &= supported_xcr0;
		entry->ebx = xstate_required_size(supported_xcr0, false);
		entry->ecx = entry->ebx;
		entry->edx &= supported_xcr0 >> 32;
		if (!supported_xcr0)
			break;

		entry = do_host_cpuid(array, function, 1);
		if (!entry)
			goto out;

		cpuid_entry_override(entry, CPUID_D_1_EAX);
		if (entry->eax & (F(XSAVES)|F(XSAVEC)))
			entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
							  true);
		else {
			WARN_ON_ONCE(supported_xss != 0);
			entry->ebx = 0;
		}
		entry->ecx &= supported_xss;
		entry->edx &= supported_xss >> 32;

		for (i = 2; i < 64; ++i) {
			bool s_state;

			if (supported_xcr0 & BIT_ULL(i))
				s_state = false;
			else if (supported_xss & BIT_ULL(i))
				s_state = true;
			else
				continue;

			entry = do_host_cpuid(array, function, i);
			if (!entry)
				goto out;

			/*
			 * The supported check above should have filtered out
			 * invalid sub-leafs.  Only valid sub-leafs should
			 * reach this point, and they should have a non-zero
			 * save state size.  Furthermore, check whether the
			 * processor agrees with supported_xcr0/supported_xss
			 * on whether this is an XCR0- or IA32_XSS-managed area.
			 */
			if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
				--array->nent;
				continue;
			}
			entry->edx = 0;
		}
		break;
	case 0x12:
		/* Intel SGX */
		if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
			break;
		}

		/*
		 * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
		 * and max enclave sizes.  The SGX sub-features and MISCSELECT
		 * are restricted by kernel and KVM capabilities (like most
		 * feature flags), while enclave size is unrestricted.
		 */
		cpuid_entry_override(entry, CPUID_12_EAX);
		entry->ebx &= SGX_MISC_EXINFO;

		entry = do_host_cpuid(array, function, 1);
		if (!entry)
			goto out;

		/*
		 * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
		 * feature flags.  Advertise all supported flags, including
		 * privileged attributes that require explicit opt-in from
		 * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
		 * expected to derive it from supported XCR0.
		 */
		entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
			      SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
			      SGX_ATTR_KSS;
		entry->ebx &= 0;
		break;
	/* Intel PT */
	case 0x14:
		if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
			break;
		}

		for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
			if (!do_host_cpuid(array, function, i))
				goto out;
		}
		break;
	case KVM_CPUID_SIGNATURE: {
		static const char signature[12] = "KVMKVMKVM\0\0";
		const u32 *sigptr = (const u32 *)signature;

		entry->eax = KVM_CPUID_FEATURES;
		entry->ebx = sigptr[0];
		entry->ecx = sigptr[1];
		entry->edx = sigptr[2];
		break;
	}
	case KVM_CPUID_FEATURES:
		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
			     (1 << KVM_FEATURE_ASYNC_PF) |
			     (1 << KVM_FEATURE_PV_EOI) |
			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
			     (1 << KVM_FEATURE_PV_UNHALT) |
			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
			     (1 << KVM_FEATURE_PV_SEND_IPI) |
			     (1 << KVM_FEATURE_POLL_CONTROL) |
			     (1 << KVM_FEATURE_PV_SCHED_YIELD) |
			     (1 << KVM_FEATURE_ASYNC_PF_INT);

		if (sched_info_on())
			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);

		entry->ebx = 0;
		entry->ecx = 0;
		entry->edx = 0;
		break;
	case 0x80000000:
		entry->eax = min(entry->eax, 0x8000001f);
		break;
	case 0x80000001:
		cpuid_entry_override(entry, CPUID_8000_0001_EDX);
		cpuid_entry_override(entry, CPUID_8000_0001_ECX);
		break;
	case 0x80000006:
		/* L2 cache and TLB: pass through host info. */
		break;
	case 0x80000007: /* Advanced power management */
		/* invariant TSC is CPUID.80000007H:EDX[8] */
		entry->edx &= (1 << 8);
		/* mask against host */
		entry->edx &= boot_cpu_data.x86_power;
		entry->eax = entry->ebx = entry->ecx = 0;
		break;
	case 0x80000008: {
		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
		unsigned phys_as = entry->eax & 0xff;

		/*
		 * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
		 * the guest operates in the same PA space as the host, i.e.
		 * reductions in MAXPHYADDR for memory encryption affect shadow
		 * paging, too.
		 *
		 * If TDP is enabled but an explicit guest MAXPHYADDR is not
		 * provided, use the raw bare metal MAXPHYADDR as reductions to
		 * the HPAs do not affect GPAs.
		 */
		if (!tdp_enabled)
			g_phys_as = boot_cpu_data.x86_phys_bits;
		else if (!g_phys_as)
			g_phys_as = phys_as;

		entry->eax = g_phys_as | (virt_as << 8);
		entry->edx = 0;
		cpuid_entry_override(entry, CPUID_8000_0008_EBX);
		break;
	}
	case 0x8000000A:
		if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
			break;
		}
		entry->eax = 1; /* SVM revision 1 */
		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
				   ASID emulation to nested SVM */
		entry->ecx = 0; /* Reserved */
		cpuid_entry_override(entry, CPUID_8000_000A_EDX);
		break;
	case 0x80000019:
		entry->ecx = entry->edx = 0;
		break;
	case 0x8000001a:
	case 0x8000001e:
		break;
	case 0x8000001F:
		if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
		} else {
			cpuid_entry_override(entry, CPUID_8000_001F_EAX);

			/*
			 * Enumerate '0' for "PA bits reduction", the adjusted
			 * MAXPHYADDR is enumerated directly (see 0x80000008).
			 */
			entry->ebx &= ~GENMASK(11, 6);
		}
		break;
	/* Add support for Centaur's CPUID instruction. */
	case 0xC0000000:
		/* Just support up to 0xC0000004 now. */
		entry->eax = min(entry->eax, 0xC0000004);
		break;
	case 0xC0000001:
		cpuid_entry_override(entry, CPUID_C000_0001_EDX);
		break;
	case 3: /* Processor serial number */
	case 5: /* MONITOR/MWAIT */
	case 0xC0000002:
	case 0xC0000003:
	case 0xC0000004:
	default:
		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
		break;
	}

	r = 0;

out:
	put_cpu();

	return r;
}
static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
			 unsigned int type)
{
	if (type == KVM_GET_EMULATED_CPUID)
		return __do_cpuid_func_emulated(array, func);

	return __do_cpuid_func(array, func);
}
#define CENTAUR_CPUID_SIGNATURE 0xC0000000
static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
			  unsigned int type)
{
	u32 limit;
	int r;

	if (func == CENTAUR_CPUID_SIGNATURE &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
		return 0;

	r = do_cpuid_func(array, func, type);
	if (r)
		return r;

	limit = array->entries[array->nent - 1].eax;
	for (func = func + 1; func <= limit; ++func) {
		r = do_cpuid_func(array, func, type);
		if (r)
			break;
	}

	return r;
}
static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
				 __u32 num_entries, unsigned int ioctl_type)
{
	int i;
	__u32 pad[3];

	if (ioctl_type != KVM_GET_EMULATED_CPUID)
		return false;

	/*
	 * We want to make sure that ->padding is being passed clean from
	 * userspace in case we want to use it for something in the future.
	 *
	 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
	 * have to give ourselves satisfied only with the emulated side. /me
	 * sad.
	 */
	for (i = 0; i < num_entries; i++) {
		if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
			return true;

		if (pad[0] || pad[1] || pad[2])
			return true;
	}
	return false;
}
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
			    struct kvm_cpuid_entry2 __user *entries,
			    unsigned int type)
{
	static const u32 funcs[] = {
		0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
	};

	struct kvm_cpuid_array array = {
		.nent = 0,
	};
	int r, i;

	if (cpuid->nent < 1)
		return -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		cpuid->nent = KVM_MAX_CPUID_ENTRIES;

	if (sanity_check_entries(entries, cpuid->nent, type))
		return -EINVAL;

	array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
					   cpuid->nent));
	if (!array.entries)
		return -ENOMEM;

	array.maxnent = cpuid->nent;

	for (i = 0; i < ARRAY_SIZE(funcs); i++) {
		r = get_cpuid_func(&array, funcs[i], type);
		if (r)
			goto out_free;
	}
	cpuid->nent = array.nent;

	if (copy_to_user(entries, array.entries,
			 array.nent * sizeof(struct kvm_cpuid_entry2)))
		r = -EFAULT;

out_free:
	vfree(array.entries);
	return r;
}
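/*
 * Typical userspace flow (illustrative, not part of this file): call
 * KVM_GET_SUPPORTED_CPUID with a sized struct kvm_cpuid2 buffer, retry
 * with a larger buffer on -E2BIG, optionally trim the result, then pass
 * it to KVM_SET_CPUID2 on each vCPU.
 */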
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
					      u32 function, u32 index)
{
	return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
				 function, index);
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
 * Intel CPUID semantics treats any query for an out-of-range leaf as if the
 * highest basic leaf (i.e. CPUID.0H:EAX) were requested.  AMD CPUID semantics
 * returns all zeroes for any undefined leaf, whether or not the leaf is in
 * range.  Centaur/VIA follows Intel semantics.
 *
 * A leaf is considered out-of-range if its function is higher than the maximum
 * supported leaf of its associated class or if its associated class does not
 * exist.
 *
 * There are three primary classes to be considered, with their respective
 * ranges described as "<base> - <top>[,<base2> - <top2>]" inclusive.  A primary
 * class exists if a guest CPUID entry for its <base> leaf exists.  For a given
 * class, CPUID.<base>.EAX contains the max supported leaf for the class.
 *
 *  - Basic:      0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff
 *  - Hypervisor: 0x40000000 - 0x4fffffff
 *  - Extended:   0x80000000 - 0xbfffffff
 *  - Centaur:    0xc0000000 - 0xcfffffff
 *
 * The Hypervisor class is further subdivided into sub-classes that each act as
 * their own independent class associated with a 0x100 byte range.  E.g. if Qemu
 * is advertising support for both HyperV and KVM, the resulting Hypervisor
 * CPUID sub-classes are:
 *
 *  - HyperV:     0x40000000 - 0x400000ff
 *  - KVM:        0x40000100 - 0x400001ff
 */
static struct kvm_cpuid_entry2 *
get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
{
	struct kvm_cpuid_entry2 *basic, *class;
	u32 function = *fn_ptr;

	basic = kvm_find_cpuid_entry(vcpu, 0, 0);
	if (!basic)
		return NULL;

	if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) ||
	    is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx))
		return NULL;

	if (function >= 0x40000000 && function <= 0x4fffffff)
		class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0);
	else if (function >= 0xc0000000)
		class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0);
	else
		class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);

	if (class && function <= class->eax)
		return NULL;

	/*
	 * Leaf specific adjustments are also applied when redirecting to the
	 * max basic entry, e.g. if the max basic leaf is 0xb but there is no
	 * entry for CPUID.0xb.index (see below), then the output value for EDX
	 * needs to be pulled from CPUID.0xb.1.
	 */
	*fn_ptr = basic->eax;

	/*
	 * The class does not exist or the requested function is out of range;
	 * the effective CPUID entry is the max basic leaf.  Note, the index of
	 * the original requested leaf is observed!
	 */
	return kvm_find_cpuid_entry(vcpu, basic->eax, index);
}
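/*
 * Example: if the guest's max basic leaf (CPUID.0.EAX) is 0xd and a
 * non-AMD guest queries leaf 0x1234 with ECX = 2, the values returned
 * come from the guest's CPUID.0xd.2 entry, matching Intel's
 * out-of-range behavior.
 */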
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
	       u32 *ecx, u32 *edx, bool exact_only)
{
	u32 orig_function = *eax, function = *eax, index = *ecx;
	struct kvm_cpuid_entry2 *entry;
	bool exact, used_max_basic = false;

	entry = kvm_find_cpuid_entry(vcpu, function, index);
	exact = !!entry;

	if (!entry && !exact_only) {
		entry = get_out_of_range_cpuid_entry(vcpu, &function, index);
		used_max_basic = !!entry;
	}

	if (entry) {
		*eax = entry->eax;
		*ebx = entry->ebx;
		*ecx = entry->ecx;
		*edx = entry->edx;
		if (function == 7 && index == 0) {
			u64 data;

			if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
			    (data & TSX_CTRL_CPUID_CLEAR))
				*ebx &= ~(F(RTM) | F(HLE));
		}
	} else {
		*eax = *ebx = *ecx = *edx = 0;
		/*
		 * When leaf 0BH or 1FH is defined, CL is pass-through
		 * and EDX is always the x2APIC ID, even for undefined
		 * subleaves.  Index 1 will exist iff the leaf is
		 * implemented, so we pass through CL iff leaf 1
		 * exists.  EDX can be copied from any existing index.
		 */
		if (function == 0xb || function == 0x1f) {
			entry = kvm_find_cpuid_entry(vcpu, function, 1);
			if (entry) {
				*ecx = index & 0xff;
				*edx = entry->edx;
			}
		}
	}
	trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact,
			used_max_basic);
	return exact;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
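/*
 * Note the TSX quirk in kvm_cpuid() above: when the vCPU's
 * MSR_IA32_TSX_CTRL has TSX_CTRL_CPUID_CLEAR set, RTM and HLE are masked
 * out of CPUID.0x7.0.EBX at read time rather than being removed from the
 * stored entries, mirroring how the MSR behaves on bare metal.
 */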
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
	u32 eax, ebx, ecx, edx;

	if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
		return 1;

	eax = kvm_rax_read(vcpu);
	ecx = kvm_rcx_read(vcpu);
	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
	kvm_rax_write(vcpu, eax);
	kvm_rbx_write(vcpu, ebx);
	kvm_rcx_write(vcpu, ecx);
	kvm_rdx_write(vcpu, edx);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);