target/i386/kvm/kvm.c
1 /*
2 * QEMU KVM support
3 *
4 * Copyright (C) 2006-2008 Qumranet Technologies
5 * Copyright IBM, Corp. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
11 * See the COPYING file in the top-level directory.
12 *
13 */
14
15 #include "qemu/osdep.h"
16 #include "qapi/qapi-events-run-state.h"
17 #include "qapi/error.h"
18 #include <sys/ioctl.h>
19 #include <sys/utsname.h>
20
21 #include <linux/kvm.h>
22 #include "standard-headers/asm-x86/kvm_para.h"
23
24 #include "cpu.h"
25 #include "host-cpu.h"
26 #include "sysemu/sysemu.h"
27 #include "sysemu/hw_accel.h"
28 #include "sysemu/kvm_int.h"
29 #include "sysemu/runstate.h"
30 #include "kvm_i386.h"
31 #include "sev.h"
32 #include "hyperv.h"
33 #include "hyperv-proto.h"
34
35 #include "exec/gdbstub.h"
36 #include "qemu/host-utils.h"
37 #include "qemu/main-loop.h"
38 #include "qemu/config-file.h"
39 #include "qemu/error-report.h"
40 #include "qemu/memalign.h"
41 #include "hw/i386/x86.h"
42 #include "hw/i386/apic.h"
43 #include "hw/i386/apic_internal.h"
44 #include "hw/i386/apic-msidef.h"
45 #include "hw/i386/intel_iommu.h"
46 #include "hw/i386/x86-iommu.h"
47 #include "hw/i386/e820_memory_layout.h"
48
49 #include "hw/pci/pci.h"
50 #include "hw/pci/msi.h"
51 #include "hw/pci/msix.h"
52 #include "migration/blocker.h"
53 #include "exec/memattrs.h"
54 #include "trace.h"
55
56 //#define DEBUG_KVM
57
58 #ifdef DEBUG_KVM
59 #define DPRINTF(fmt, ...) \
60 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
61 #else
62 #define DPRINTF(fmt, ...) \
63 do { } while (0)
64 #endif
65
66 /* From arch/x86/kvm/lapic.h */
67 #define KVM_APIC_BUS_CYCLE_NS 1
68 #define KVM_APIC_BUS_FREQUENCY (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
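/* With the 1 ns bus cycle above this evaluates to 1000000000 Hz, i.e. 1 GHz. */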
69
70 #define MSR_KVM_WALL_CLOCK 0x11
71 #define MSR_KVM_SYSTEM_TIME 0x12
72
73 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
74 * 255 kvm_msr_entry structs */
75 #define MSR_BUF_SIZE 4096
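/* (struct kvm_msrs is 8 bytes: u32 nmsrs + u32 pad; each struct kvm_msr_entry
 * is 16 bytes, so 8 + 255 * 16 = 4088 <= 4096.) */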
76
77 static void kvm_init_msrs(X86CPU *cpu);
78
79 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
80 KVM_CAP_INFO(SET_TSS_ADDR),
81 KVM_CAP_INFO(EXT_CPUID),
82 KVM_CAP_INFO(MP_STATE),
83 KVM_CAP_LAST_INFO
84 };
85
86 static bool has_msr_star;
87 static bool has_msr_hsave_pa;
88 static bool has_msr_tsc_aux;
89 static bool has_msr_tsc_adjust;
90 static bool has_msr_tsc_deadline;
91 static bool has_msr_feature_control;
92 static bool has_msr_misc_enable;
93 static bool has_msr_smbase;
94 static bool has_msr_bndcfgs;
95 static int lm_capable_kernel;
96 static bool has_msr_hv_hypercall;
97 static bool has_msr_hv_crash;
98 static bool has_msr_hv_reset;
99 static bool has_msr_hv_vpindex;
100 static bool hv_vpindex_settable;
101 static bool has_msr_hv_runtime;
102 static bool has_msr_hv_synic;
103 static bool has_msr_hv_stimer;
104 static bool has_msr_hv_frequencies;
105 static bool has_msr_hv_reenlightenment;
106 static bool has_msr_xss;
107 static bool has_msr_umwait;
108 static bool has_msr_spec_ctrl;
109 static bool has_tsc_scale_msr;
110 static bool has_msr_tsx_ctrl;
111 static bool has_msr_virt_ssbd;
112 static bool has_msr_smi_count;
113 static bool has_msr_arch_capabs;
114 static bool has_msr_core_capabs;
115 static bool has_msr_vmx_vmfunc;
116 static bool has_msr_ucode_rev;
117 static bool has_msr_vmx_procbased_ctls2;
118 static bool has_msr_perf_capabs;
119 static bool has_msr_pkrs;
120
121 static uint32_t has_architectural_pmu_version;
122 static uint32_t num_architectural_pmu_gp_counters;
123 static uint32_t num_architectural_pmu_fixed_counters;
124
125 static int has_xsave;
126 static int has_xcrs;
127 static int has_pit_state2;
128 static int has_sregs2;
129 static int has_exception_payload;
130
131 static bool has_msr_mcg_ext_ctl;
132
133 static struct kvm_cpuid2 *cpuid_cache;
134 static struct kvm_cpuid2 *hv_cpuid_cache;
135 static struct kvm_msr_list *kvm_feature_msrs;
136
137 #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
138 static RateLimit bus_lock_ratelimit_ctrl;
139
140 int kvm_has_pit_state2(void)
141 {
142 return has_pit_state2;
143 }
144
145 bool kvm_has_smm(void)
146 {
147 return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
148 }
149
150 bool kvm_has_adjust_clock_stable(void)
151 {
152 int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
153
154 return (ret == KVM_CLOCK_TSC_STABLE);
155 }
156
157 bool kvm_has_adjust_clock(void)
158 {
159 return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
160 }
161
162 bool kvm_has_exception_payload(void)
163 {
164 return has_exception_payload;
165 }
166
167 static bool kvm_x2apic_api_set_flags(uint64_t flags)
168 {
169 KVMState *s = KVM_STATE(current_accel());
170
171 return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
172 }
173
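/*
 * Memoization helper: the first call evaluates 'fn' and stores the result in
 * '_result'; any later call hits the 'return _result' inside the macro, which
 * returns the cached value directly from the *enclosing* function (see
 * kvm_enable_x2apic() below for the intended usage).
 */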
174 #define MEMORIZE(fn, _result) \
175 ({ \
176 static bool _memorized; \
177 \
178 if (_memorized) { \
179 return _result; \
180 } \
181 _memorized = true; \
182 _result = fn; \
183 })
184
185 static bool has_x2apic_api;
186
187 bool kvm_has_x2apic_api(void)
188 {
189 return has_x2apic_api;
190 }
191
192 bool kvm_enable_x2apic(void)
193 {
194 return MEMORIZE(
195 kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
196 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
197 has_x2apic_api);
198 }
199
200 bool kvm_hv_vpindex_settable(void)
201 {
202 return hv_vpindex_settable;
203 }
204
205 static int kvm_get_tsc(CPUState *cs)
206 {
207 X86CPU *cpu = X86_CPU(cs);
208 CPUX86State *env = &cpu->env;
209 struct {
210 struct kvm_msrs info;
211 struct kvm_msr_entry entries[1];
212 } msr_data = {};
213 int ret;
214
215 if (env->tsc_valid) {
216 return 0;
217 }
218
219 memset(&msr_data, 0, sizeof(msr_data));
220 msr_data.info.nmsrs = 1;
221 msr_data.entries[0].index = MSR_IA32_TSC;
222 env->tsc_valid = !runstate_is_running();
223
224 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
225 if (ret < 0) {
226 return ret;
227 }
228
229 assert(ret == 1);
230 env->tsc = msr_data.entries[0].data;
231 return 0;
232 }
233
234 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
235 {
236 kvm_get_tsc(cpu);
237 }
238
239 void kvm_synchronize_all_tsc(void)
240 {
241 CPUState *cpu;
242
243 if (kvm_enabled()) {
244 CPU_FOREACH(cpu) {
245 run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
246 }
247 }
248 }
249
250 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
251 {
252 struct kvm_cpuid2 *cpuid;
253 int r, size;
254
255 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
256 cpuid = g_malloc0(size);
257 cpuid->nent = max;
258 r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
259 if (r == 0 && cpuid->nent >= max) {
260 r = -E2BIG;
261 }
262 if (r < 0) {
263 if (r == -E2BIG) {
264 g_free(cpuid);
265 return NULL;
266 } else {
267 fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
268 strerror(-r));
269 exit(1);
270 }
271 }
272 return cpuid;
273 }
274
275 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
276 * for all entries.
277 */
278 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
279 {
280 struct kvm_cpuid2 *cpuid;
281 int max = 1;
282
283 if (cpuid_cache != NULL) {
284 return cpuid_cache;
285 }
286 while ((cpuid = try_get_cpuid(s, max)) == NULL) {
287 max *= 2;
288 }
289 cpuid_cache = cpuid;
290 return cpuid;
291 }
292
293 static bool host_tsx_broken(void)
294 {
295 int family, model, stepping;
296 char vendor[CPUID_VENDOR_SZ + 1];
297
298 host_cpu_vendor_fms(vendor, &family, &model, &stepping);
299
300 /* Check if we are running on a Haswell host known to have broken TSX */
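/* (Family 6 models 60, 63, 69 and 70 are the Haswell desktop, -E/EP, -ULT
 * and Crystal Well parts affected by the TSX erratum; model 63 is fixed
 * from stepping 4 onwards, hence the stepping check.) */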
301 return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
302 (family == 6) &&
303 ((model == 63 && stepping < 4) ||
304 model == 60 || model == 69 || model == 70);
305 }
306
307 /* Return the value of a specific register of the given cpuid entry
308 */
309 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
310 {
311 uint32_t ret = 0;
312 switch (reg) {
313 case R_EAX:
314 ret = entry->eax;
315 break;
316 case R_EBX:
317 ret = entry->ebx;
318 break;
319 case R_ECX:
320 ret = entry->ecx;
321 break;
322 case R_EDX:
323 ret = entry->edx;
324 break;
325 }
326 return ret;
327 }
328
329 /* Find the matching entry for function/index in a kvm_cpuid2 struct
330 */
331 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
332 uint32_t function,
333 uint32_t index)
334 {
335 int i;
336 for (i = 0; i < cpuid->nent; ++i) {
337 if (cpuid->entries[i].function == function &&
338 cpuid->entries[i].index == index) {
339 return &cpuid->entries[i];
340 }
341 }
342 /* not found: */
343 return NULL;
344 }
345
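/*
 * Return the KVM-supported bits of the given CPUID register, with the fixups
 * applied below.  Typical use, as done for the VMX feature MSRs later in this
 * file:
 *
 *     if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
 *         CPUID_EXT2_RDTSCP) {
 *         ...
 *     }
 */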
346 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
347 uint32_t index, int reg)
348 {
349 struct kvm_cpuid2 *cpuid;
350 uint32_t ret = 0;
351 uint32_t cpuid_1_edx;
352
353 cpuid = get_supported_cpuid(s);
354
355 struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
356 if (entry) {
357 ret = cpuid_entry_get_reg(entry, reg);
358 }
359
360 /* Fixups for the data returned by KVM, below */
361
362 if (function == 1 && reg == R_EDX) {
363 /* KVM before 2.6.30 misreports the following features */
364 ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
365 } else if (function == 1 && reg == R_ECX) {
366 /* We can set the hypervisor flag, even if KVM does not return it on
367 * GET_SUPPORTED_CPUID
368 */
369 ret |= CPUID_EXT_HYPERVISOR;
370 /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
371 * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
372 * and the irqchip is in the kernel.
373 */
374 if (kvm_irqchip_in_kernel() &&
375 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
376 ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
377 }
378
379 /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
380 * without the in-kernel irqchip
381 */
382 if (!kvm_irqchip_in_kernel()) {
383 ret &= ~CPUID_EXT_X2APIC;
384 }
385
386 if (enable_cpu_pm) {
387 int disable_exits = kvm_check_extension(s,
388 KVM_CAP_X86_DISABLE_EXITS);
389
390 if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
391 ret |= CPUID_EXT_MONITOR;
392 }
393 }
394 } else if (function == 6 && reg == R_EAX) {
395 ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
396 } else if (function == 7 && index == 0 && reg == R_EBX) {
397 if (host_tsx_broken()) {
398 ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
399 }
400 } else if (function == 7 && index == 0 && reg == R_EDX) {
401 /*
402 * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
403 * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
404 * returned by KVM_GET_MSR_INDEX_LIST.
405 */
406 if (!has_msr_arch_capabs) {
407 ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
408 }
409 } else if (function == 0x80000001 && reg == R_ECX) {
410 /*
411 * It's safe to enable TOPOEXT even if it's not returned by
412 * GET_SUPPORTED_CPUID. Unconditionally enabling TOPOEXT here allows
413 * us to keep CPU models including TOPOEXT runnable on older kernels.
414 */
415 ret |= CPUID_EXT3_TOPOEXT;
416 } else if (function == 0x80000001 && reg == R_EDX) {
417 /* On Intel, kvm returns cpuid according to the Intel spec,
418 * so add missing bits according to the AMD spec:
419 */
420 cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
421 ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
422 } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
423 /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
424 * be enabled without the in-kernel irqchip
425 */
426 if (!kvm_irqchip_in_kernel()) {
427 ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
428 }
429 if (kvm_irqchip_is_split()) {
430 ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
431 }
432 } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
433 ret |= 1U << KVM_HINTS_REALTIME;
434 }
435
436 return ret;
437 }
438
439 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
440 {
441 struct {
442 struct kvm_msrs info;
443 struct kvm_msr_entry entries[1];
444 } msr_data = {};
445 uint64_t value;
446 uint32_t ret, can_be_one, must_be_one;
447
448 if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
449 return 0;
450 }
451
452 /* Check if the requested MSR is a supported feature MSR */
453 int i;
454 for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
455 if (kvm_feature_msrs->indices[i] == index) {
456 break;
457 }
458 if (i == kvm_feature_msrs->nmsrs) {
459 return 0; /* if the feature MSR is not supported, simply return 0 */
460 }
461
462 msr_data.info.nmsrs = 1;
463 msr_data.entries[0].index = index;
464
465 ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
466 if (ret != 1) {
467 error_report("KVM get MSR (index=0x%x) feature failed, %s",
468 index, strerror(-ret));
469 exit(1);
470 }
471
472 value = msr_data.entries[0].data;
473 switch (index) {
474 case MSR_IA32_VMX_PROCBASED_CTLS2:
475 if (!has_msr_vmx_procbased_ctls2) {
476 /* KVM forgot to add these bits for some time, do this ourselves. */
477 if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
478 CPUID_XSAVE_XSAVES) {
479 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
480 }
481 if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
482 CPUID_EXT_RDRAND) {
483 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
484 }
485 if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
486 CPUID_7_0_EBX_INVPCID) {
487 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
488 }
489 if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
490 CPUID_7_0_EBX_RDSEED) {
491 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
492 }
493 if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
494 CPUID_EXT2_RDTSCP) {
495 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
496 }
497 }
498 /* fall through */
499 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
500 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
501 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
502 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
503 /*
504 * Return true for bits that can be one, but do not have to be one.
505 * The SDM tells us which bits could have a "must be one" setting,
506 * so we can do the opposite transformation in make_vmx_msr_value.
507 */
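/*
 * Example: if value == 0x000000ff00000021ULL, then
 *   must_be_one = 0x00000021 (low 32 bits),
 *   can_be_one  = 0x000000ff (high 32 bits),
 * and we return 0x000000de: the bits that may be 1 but are not forced to 1.
 */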
508 must_be_one = (uint32_t)value;
509 can_be_one = (uint32_t)(value >> 32);
510 return can_be_one & ~must_be_one;
511
512 default:
513 return value;
514 }
515 }
516
517 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
518 int *max_banks)
519 {
520 int r;
521
522 r = kvm_check_extension(s, KVM_CAP_MCE);
523 if (r > 0) {
524 *max_banks = r;
525 return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
526 }
527 return -ENOSYS;
528 }
529
530 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
531 {
532 CPUState *cs = CPU(cpu);
533 CPUX86State *env = &cpu->env;
534 uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
535 MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
536 uint64_t mcg_status = MCG_STATUS_MCIP;
537 int flags = 0;
538
539 if (code == BUS_MCEERR_AR) {
540 status |= MCI_STATUS_AR | 0x134;
541 mcg_status |= MCG_STATUS_EIPV;
542 } else {
543 status |= 0xc0;
544 mcg_status |= MCG_STATUS_RIPV;
545 }
546
547 flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
548 /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
549 * guest kernel into env->mcg_ext_ctl.
550 */
551 cpu_synchronize_state(cs);
552 if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
553 mcg_status |= MCG_STATUS_LMCE;
554 flags = 0;
555 }
556
557 cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
558 (MCM_ADDR_PHYS << 6) | 0xc, flags);
559 }
560
561 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
562 {
563 MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
564
565 qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
566 &mff);
567 }
568
569 static void hardware_memory_error(void *host_addr)
570 {
571 emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
572 error_report("QEMU got Hardware memory error at addr %p", host_addr);
573 exit(1);
574 }
575
576 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
577 {
578 X86CPU *cpu = X86_CPU(c);
579 CPUX86State *env = &cpu->env;
580 ram_addr_t ram_addr;
581 hwaddr paddr;
582
583 /* If we get an action required MCE, it has been injected by KVM
584 * while the VM was running. An action optional MCE instead should
585 * be coming from the main thread, which qemu_init_sigbus identifies
586 * as the "early kill" thread.
587 */
588 assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
589
590 if ((env->mcg_cap & MCG_SER_P) && addr) {
591 ram_addr = qemu_ram_addr_from_host(addr);
592 if (ram_addr != RAM_ADDR_INVALID &&
593 kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
594 kvm_hwpoison_page_add(ram_addr);
595 kvm_mce_inject(cpu, paddr, code);
596
597 /*
598 * Use different logging severity based on error type.
599 * If there is additional MCE reporting on the hypervisor, QEMU VA
600 * could be another source to identify the PA and MCE details.
601 */
602 if (code == BUS_MCEERR_AR) {
603 error_report("Guest MCE Memory Error at QEMU addr %p and "
604 "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
605 addr, paddr, "BUS_MCEERR_AR");
606 } else {
607 warn_report("Guest MCE Memory Error at QEMU addr %p and "
608 "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
609 addr, paddr, "BUS_MCEERR_AO");
610 }
611
612 return;
613 }
614
615 if (code == BUS_MCEERR_AO) {
616 warn_report("Hardware memory error at addr %p of type %s "
617 "for memory used by QEMU itself instead of guest system!",
618 addr, "BUS_MCEERR_AO");
619 }
620 }
621
622 if (code == BUS_MCEERR_AR) {
623 hardware_memory_error(addr);
624 }
625
626 /* Hope we are lucky for AO MCE, just notify an event */
627 emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
628 }
629
630 static void kvm_reset_exception(CPUX86State *env)
631 {
632 env->exception_nr = -1;
633 env->exception_pending = 0;
634 env->exception_injected = 0;
635 env->exception_has_payload = false;
636 env->exception_payload = 0;
637 }
638
639 static void kvm_queue_exception(CPUX86State *env,
640 int32_t exception_nr,
641 uint8_t exception_has_payload,
642 uint64_t exception_payload)
643 {
644 assert(env->exception_nr == -1);
645 assert(!env->exception_pending);
646 assert(!env->exception_injected);
647 assert(!env->exception_has_payload);
648
649 env->exception_nr = exception_nr;
650
651 if (has_exception_payload) {
652 env->exception_pending = 1;
653
654 env->exception_has_payload = exception_has_payload;
655 env->exception_payload = exception_payload;
656 } else {
657 env->exception_injected = 1;
658
659 if (exception_nr == EXCP01_DB) {
660 assert(exception_has_payload);
661 env->dr[6] = exception_payload;
662 } else if (exception_nr == EXCP0E_PAGE) {
663 assert(exception_has_payload);
664 env->cr[2] = exception_payload;
665 } else {
666 assert(!exception_has_payload);
667 }
668 }
669 }
670
671 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
672 {
673 CPUX86State *env = &cpu->env;
674
675 if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
676 unsigned int bank, bank_num = env->mcg_cap & 0xff;
677 struct kvm_x86_mce mce;
678
679 kvm_reset_exception(env);
680
681 /*
682 * There must be at least one bank in use if an MCE is pending.
683 * Find it and use its values for the event injection.
684 */
685 for (bank = 0; bank < bank_num; bank++) {
686 if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
687 break;
688 }
689 }
690 assert(bank < bank_num);
691
692 mce.bank = bank;
693 mce.status = env->mce_banks[bank * 4 + 1];
694 mce.mcg_status = env->mcg_status;
695 mce.addr = env->mce_banks[bank * 4 + 2];
696 mce.misc = env->mce_banks[bank * 4 + 3];
697
698 return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
699 }
700 return 0;
701 }
702
703 static void cpu_update_state(void *opaque, bool running, RunState state)
704 {
705 CPUX86State *env = opaque;
706
707 if (running) {
708 env->tsc_valid = false;
709 }
710 }
711
712 unsigned long kvm_arch_vcpu_id(CPUState *cs)
713 {
714 X86CPU *cpu = X86_CPU(cs);
715 return cpu->apic_id;
716 }
717
718 #ifndef KVM_CPUID_SIGNATURE_NEXT
719 #define KVM_CPUID_SIGNATURE_NEXT 0x40000100
720 #endif
721
722 static bool hyperv_enabled(X86CPU *cpu)
723 {
724 return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
725 ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
726 cpu->hyperv_features || cpu->hyperv_passthrough);
727 }
728
729 /*
730 * Check whether target_freq is within conservative
731 * ntp correctable bounds (250ppm) of freq
732 */
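/*
 * Example: with freq = 2500000 kHz (2.5 GHz) the 250 ppm window is
 * +/- (2500000 * 250 / 1000000) = +/- 625 kHz, so any target_freq in
 * [2499375, 2500625] kHz is accepted (integer division, so the window
 * rounds down for low frequencies).
 */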
733 static inline bool freq_within_bounds(int freq, int target_freq)
734 {
735 int max_freq = freq + (freq * 250 / 1000000);
736 int min_freq = freq - (freq * 250 / 1000000);
737
738 if (target_freq >= min_freq && target_freq <= max_freq) {
739 return true;
740 }
741
742 return false;
743 }
744
745 static int kvm_arch_set_tsc_khz(CPUState *cs)
746 {
747 X86CPU *cpu = X86_CPU(cs);
748 CPUX86State *env = &cpu->env;
749 int r, cur_freq;
750 bool set_ioctl = false;
751
752 if (!env->tsc_khz) {
753 return 0;
754 }
755
756 cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
757 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
758
759 /*
760 * If TSC scaling is supported, attempt to set TSC frequency.
761 */
762 if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
763 set_ioctl = true;
764 }
765
766 /*
767 * If desired TSC frequency is within bounds of NTP correction,
768 * attempt to set TSC frequency.
769 */
770 if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
771 set_ioctl = true;
772 }
773
774 r = set_ioctl ?
775 kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
776 -ENOTSUP;
777
778 if (r < 0) {
779 /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
780 * TSC frequency doesn't match the one we want.
781 */
782 cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
783 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
784 -ENOTSUP;
785 if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
786 warn_report("TSC frequency mismatch between "
787 "VM (%" PRId64 " kHz) and host (%d kHz), "
788 "and TSC scaling unavailable",
789 env->tsc_khz, cur_freq);
790 return r;
791 }
792 }
793
794 return 0;
795 }
796
797 static bool tsc_is_stable_and_known(CPUX86State *env)
798 {
799 if (!env->tsc_khz) {
800 return false;
801 }
802 return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
803 || env->user_tsc_khz;
804 }
805
806 static struct {
807 const char *desc;
808 struct {
809 uint32_t func;
810 int reg;
811 uint32_t bits;
812 } flags[2];
813 uint64_t dependencies;
814 } kvm_hyperv_properties[] = {
815 [HYPERV_FEAT_RELAXED] = {
816 .desc = "relaxed timing (hv-relaxed)",
817 .flags = {
818 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
819 .bits = HV_RELAXED_TIMING_RECOMMENDED}
820 }
821 },
822 [HYPERV_FEAT_VAPIC] = {
823 .desc = "virtual APIC (hv-vapic)",
824 .flags = {
825 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
826 .bits = HV_APIC_ACCESS_AVAILABLE}
827 }
828 },
829 [HYPERV_FEAT_TIME] = {
830 .desc = "clocksources (hv-time)",
831 .flags = {
832 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
833 .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
834 }
835 },
836 [HYPERV_FEAT_CRASH] = {
837 .desc = "crash MSRs (hv-crash)",
838 .flags = {
839 {.func = HV_CPUID_FEATURES, .reg = R_EDX,
840 .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
841 }
842 },
843 [HYPERV_FEAT_RESET] = {
844 .desc = "reset MSR (hv-reset)",
845 .flags = {
846 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
847 .bits = HV_RESET_AVAILABLE}
848 }
849 },
850 [HYPERV_FEAT_VPINDEX] = {
851 .desc = "VP_INDEX MSR (hv-vpindex)",
852 .flags = {
853 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
854 .bits = HV_VP_INDEX_AVAILABLE}
855 }
856 },
857 [HYPERV_FEAT_RUNTIME] = {
858 .desc = "VP_RUNTIME MSR (hv-runtime)",
859 .flags = {
860 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
861 .bits = HV_VP_RUNTIME_AVAILABLE}
862 }
863 },
864 [HYPERV_FEAT_SYNIC] = {
865 .desc = "synthetic interrupt controller (hv-synic)",
866 .flags = {
867 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
868 .bits = HV_SYNIC_AVAILABLE}
869 }
870 },
871 [HYPERV_FEAT_STIMER] = {
872 .desc = "synthetic timers (hv-stimer)",
873 .flags = {
874 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
875 .bits = HV_SYNTIMERS_AVAILABLE}
876 },
877 .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
878 },
879 [HYPERV_FEAT_FREQUENCIES] = {
880 .desc = "frequency MSRs (hv-frequencies)",
881 .flags = {
882 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
883 .bits = HV_ACCESS_FREQUENCY_MSRS},
884 {.func = HV_CPUID_FEATURES, .reg = R_EDX,
885 .bits = HV_FREQUENCY_MSRS_AVAILABLE}
886 }
887 },
888 [HYPERV_FEAT_REENLIGHTENMENT] = {
889 .desc = "reenlightenment MSRs (hv-reenlightenment)",
890 .flags = {
891 {.func = HV_CPUID_FEATURES, .reg = R_EAX,
892 .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
893 }
894 },
895 [HYPERV_FEAT_TLBFLUSH] = {
896 .desc = "paravirtualized TLB flush (hv-tlbflush)",
897 .flags = {
898 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
899 .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
900 HV_EX_PROCESSOR_MASKS_RECOMMENDED}
901 },
902 .dependencies = BIT(HYPERV_FEAT_VPINDEX)
903 },
904 [HYPERV_FEAT_EVMCS] = {
905 .desc = "enlightened VMCS (hv-evmcs)",
906 .flags = {
907 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
908 .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
909 },
910 .dependencies = BIT(HYPERV_FEAT_VAPIC)
911 },
912 [HYPERV_FEAT_IPI] = {
913 .desc = "paravirtualized IPI (hv-ipi)",
914 .flags = {
915 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
916 .bits = HV_CLUSTER_IPI_RECOMMENDED |
917 HV_EX_PROCESSOR_MASKS_RECOMMENDED}
918 },
919 .dependencies = BIT(HYPERV_FEAT_VPINDEX)
920 },
921 [HYPERV_FEAT_STIMER_DIRECT] = {
922 .desc = "direct mode synthetic timers (hv-stimer-direct)",
923 .flags = {
924 {.func = HV_CPUID_FEATURES, .reg = R_EDX,
925 .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
926 },
927 .dependencies = BIT(HYPERV_FEAT_STIMER)
928 },
929 [HYPERV_FEAT_AVIC] = {
930 .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
931 .flags = {
932 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
933 .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
934 }
935 },
936 };
937
938 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
939 bool do_sys_ioctl)
940 {
941 struct kvm_cpuid2 *cpuid;
942 int r, size;
943
944 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
945 cpuid = g_malloc0(size);
946 cpuid->nent = max;
947
948 if (do_sys_ioctl) {
949 r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
950 } else {
951 r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
952 }
953 if (r == 0 && cpuid->nent >= max) {
954 r = -E2BIG;
955 }
956 if (r < 0) {
957 if (r == -E2BIG) {
958 g_free(cpuid);
959 return NULL;
960 } else {
961 fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
962 strerror(-r));
963 exit(1);
964 }
965 }
966 return cpuid;
967 }
968
969 /*
970 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
971 * for all entries.
972 */
973 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
974 {
975 struct kvm_cpuid2 *cpuid;
976 /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
977 int max = 10;
978 int i;
979 bool do_sys_ioctl;
980
981 do_sys_ioctl =
982 kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
983
984 /*
985 * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
986 * unsupported; kvm_hyperv_expand_features() checks for that.
987 */
988 assert(do_sys_ioctl || cs->kvm_state);
989
990 /*
991 * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
992 * -E2BIG; however, it doesn't report back the right size. Keep increasing
993 * it and retrying until we succeed.
994 */
995 while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
996 max++;
997 }
998
999 /*
1000 * KVM_GET_SUPPORTED_HV_CPUID does not set the EVMCS CPUID bit before
1001 * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled, but we want to get the
1002 * information early, so just check for the capability and set the bit
1003 * manually.
1004 */
1005 if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
1006 KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1007 for (i = 0; i < cpuid->nent; i++) {
1008 if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
1009 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1010 }
1011 }
1012 }
1013
1014 return cpuid;
1015 }
1016
1017 /*
1018 * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
1019 * leaves from KVM_CAP_HYPERV* and present MSRs data.
1020 */
1021 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
1022 {
1023 X86CPU *cpu = X86_CPU(cs);
1024 struct kvm_cpuid2 *cpuid;
1025 struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
1026
1027 /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
1028 cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
1029 cpuid->nent = 2;
1030
1031 /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
1032 entry_feat = &cpuid->entries[0];
1033 entry_feat->function = HV_CPUID_FEATURES;
1034
1035 entry_recomm = &cpuid->entries[1];
1036 entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
1037 entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
1038
1039 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
1040 entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
1041 entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
1042 entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1043 entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
1044 entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1045 }
1046
1047 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1048 entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1049 entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1050 }
1051
1052 if (has_msr_hv_frequencies) {
1053 entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1054 entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1055 }
1056
1057 if (has_msr_hv_crash) {
1058 entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1059 }
1060
1061 if (has_msr_hv_reenlightenment) {
1062 entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1063 }
1064
1065 if (has_msr_hv_reset) {
1066 entry_feat->eax |= HV_RESET_AVAILABLE;
1067 }
1068
1069 if (has_msr_hv_vpindex) {
1070 entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1071 }
1072
1073 if (has_msr_hv_runtime) {
1074 entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1075 }
1076
1077 if (has_msr_hv_synic) {
1078 unsigned int cap = cpu->hyperv_synic_kvm_only ?
1079 KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1080
1081 if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1082 entry_feat->eax |= HV_SYNIC_AVAILABLE;
1083 }
1084 }
1085
1086 if (has_msr_hv_stimer) {
1087 entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1088 }
1089
1090 if (kvm_check_extension(cs->kvm_state,
1091 KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1092 entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1093 entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1094 }
1095
1096 if (kvm_check_extension(cs->kvm_state,
1097 KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1098 entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1099 }
1100
1101 if (kvm_check_extension(cs->kvm_state,
1102 KVM_CAP_HYPERV_SEND_IPI) > 0) {
1103 entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1104 entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1105 }
1106
1107 return cpuid;
1108 }
1109
1110 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
1111 {
1112 struct kvm_cpuid_entry2 *entry;
1113 struct kvm_cpuid2 *cpuid;
1114
1115 if (hv_cpuid_cache) {
1116 cpuid = hv_cpuid_cache;
1117 } else {
1118 if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1119 cpuid = get_supported_hv_cpuid(cs);
1120 } else {
1121 /*
1122 * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
1123 * before KVM context is created but this is only done when
1124 * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
1125 * KVM_CAP_HYPERV_CPUID.
1126 */
1127 assert(cs->kvm_state);
1128
1129 cpuid = get_supported_hv_cpuid_legacy(cs);
1130 }
1131 hv_cpuid_cache = cpuid;
1132 }
1133
1134 if (!cpuid) {
1135 return 0;
1136 }
1137
1138 entry = cpuid_find_entry(cpuid, func, 0);
1139 if (!entry) {
1140 return 0;
1141 }
1142
1143 return cpuid_entry_get_reg(entry, reg);
1144 }
1145
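/*
 * A feature counts as supported only if every bit of every flags[] entry is
 * present on the host: e.g. hv-frequencies above needs HV_ACCESS_FREQUENCY_MSRS
 * in HV_CPUID_FEATURES.EAX *and* HV_FREQUENCY_MSRS_AVAILABLE in
 * HV_CPUID_FEATURES.EDX.
 */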
1146 static bool hyperv_feature_supported(CPUState *cs, int feature)
1147 {
1148 uint32_t func, bits;
1149 int i, reg;
1150
1151 for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1152
1153 func = kvm_hyperv_properties[feature].flags[i].func;
1154 reg = kvm_hyperv_properties[feature].flags[i].reg;
1155 bits = kvm_hyperv_properties[feature].flags[i].bits;
1156
1157 if (!func) {
1158 continue;
1159 }
1160
1161 if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
1162 return false;
1163 }
1164 }
1165
1166 return true;
1167 }
1168
1169 /* Checks that all feature dependencies are enabled */
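/* (e.g. hv-stimer requires both hv-synic and hv-time, per the .dependencies
 * masks in kvm_hyperv_properties[] above) */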
1170 static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
1171 {
1172 uint64_t deps;
1173 int dep_feat;
1174
1175 deps = kvm_hyperv_properties[feature].dependencies;
1176 while (deps) {
1177 dep_feat = ctz64(deps);
1178 if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1179 error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1180 kvm_hyperv_properties[feature].desc,
1181 kvm_hyperv_properties[dep_feat].desc);
1182 return false;
1183 }
1184 deps &= ~(1ull << dep_feat);
1185 }
1186
1187 return true;
1188 }
1189
1190 static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
1191 {
1192 X86CPU *cpu = X86_CPU(cs);
1193 uint32_t r = 0;
1194 int i, j;
1195
1196 for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
1197 if (!hyperv_feat_enabled(cpu, i)) {
1198 continue;
1199 }
1200
1201 for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
1202 if (kvm_hyperv_properties[i].flags[j].func != func) {
1203 continue;
1204 }
1205 if (kvm_hyperv_properties[i].flags[j].reg != reg) {
1206 continue;
1207 }
1208
1209 r |= kvm_hyperv_properties[i].flags[j].bits;
1210 }
1211 }
1212
1213 return r;
1214 }
1215
1216 /*
1217 * Expand Hyper-V CPU features. In particular, check that all the requested
1218 * features are supported by the host and that the configuration is sane
1219 * (i.e. all the required dependencies are included). Also, this takes care
1220 * of 'hv_passthrough' mode and fills the environment with all supported
1221 * Hyper-V features.
1222 */
1223 bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
1224 {
1225 CPUState *cs = CPU(cpu);
1226 Error *local_err = NULL;
1227 int feat;
1228
1229 if (!hyperv_enabled(cpu)) {
1230 return true;
}
1231
1232 /*
1233 * When kvm_hyperv_expand_features is called at CPU feature expansion
1234 * time, per-CPU kvm_state is not available yet, so we can only proceed
1235 * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1236 */
1237 if (!cs->kvm_state &&
1238 !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID)) {
1239 return true;
}
1240
1241 if (cpu->hyperv_passthrough) {
1242 cpu->hyperv_vendor_id[0] =
1243 hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
1244 cpu->hyperv_vendor_id[1] =
1245 hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
1246 cpu->hyperv_vendor_id[2] =
1247 hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
1248 cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
1249 sizeof(cpu->hyperv_vendor_id) + 1);
1250 memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
1251 sizeof(cpu->hyperv_vendor_id));
1252 cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
1253
1254 cpu->hyperv_interface_id[0] =
1255 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
1256 cpu->hyperv_interface_id[1] =
1257 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
1258 cpu->hyperv_interface_id[2] =
1259 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
1260 cpu->hyperv_interface_id[3] =
1261 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
1262
1263 cpu->hyperv_ver_id_build =
1264 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
1265 cpu->hyperv_ver_id_major =
1266 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
1267 cpu->hyperv_ver_id_minor =
1268 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
1269 cpu->hyperv_ver_id_sp =
1270 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
1271 cpu->hyperv_ver_id_sb =
1272 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
1273 cpu->hyperv_ver_id_sn =
1274 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
1275
1276 cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
1277 R_EAX);
1278 cpu->hyperv_limits[0] =
1279 hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
1280 cpu->hyperv_limits[1] =
1281 hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
1282 cpu->hyperv_limits[2] =
1283 hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
1284
1285 cpu->hyperv_spinlock_attempts =
1286 hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
1287
1288 /*
1289 * Mark feature as enabled in 'cpu->hyperv_features' as
1290 * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
1291 */
1292 for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1293 if (hyperv_feature_supported(cs, feat)) {
1294 cpu->hyperv_features |= BIT(feat);
1295 }
1296 }
1297 } else {
1298 /* Check features availability and dependencies */
1299 for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1300 /* If the feature was not requested, skip it. */
1301 if (!hyperv_feat_enabled(cpu, feat)) {
1302 continue;
1303 }
1304
1305 /* Check if the feature is supported by KVM */
1306 if (!hyperv_feature_supported(cs, feat)) {
1307 error_setg(errp, "Hyper-V %s is not supported by kernel",
1308 kvm_hyperv_properties[feat].desc);
1309 return false;
1310 }
1311
1312 /* Check dependencies */
1313 if (!hv_feature_check_deps(cpu, feat, &local_err)) {
1314 error_propagate(errp, local_err);
1315 return false;
1316 }
1317 }
1318 }
1319
1320 /* Additional dependencies not covered by kvm_hyperv_properties[] */
1321 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1322 !cpu->hyperv_synic_kvm_only &&
1323 !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1324 error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1325 kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1326 kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1327 return false;
1328 }
1329
1330 return true;
1331 }
1332
1333 /*
1334 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
1335 */
1336 static int hyperv_fill_cpuids(CPUState *cs,
1337 struct kvm_cpuid_entry2 *cpuid_ent)
1338 {
1339 X86CPU *cpu = X86_CPU(cs);
1340 struct kvm_cpuid_entry2 *c;
1341 uint32_t cpuid_i = 0;
1342
1343 c = &cpuid_ent[cpuid_i++];
1344 c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1345 c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1346 HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1347 c->ebx = cpu->hyperv_vendor_id[0];
1348 c->ecx = cpu->hyperv_vendor_id[1];
1349 c->edx = cpu->hyperv_vendor_id[2];
1350
1351 c = &cpuid_ent[cpuid_i++];
1352 c->function = HV_CPUID_INTERFACE;
1353 c->eax = cpu->hyperv_interface_id[0];
1354 c->ebx = cpu->hyperv_interface_id[1];
1355 c->ecx = cpu->hyperv_interface_id[2];
1356 c->edx = cpu->hyperv_interface_id[3];
1357
1358 c = &cpuid_ent[cpuid_i++];
1359 c->function = HV_CPUID_VERSION;
1360 c->eax = cpu->hyperv_ver_id_build;
1361 c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
1362 cpu->hyperv_ver_id_minor;
1363 c->ecx = cpu->hyperv_ver_id_sp;
1364 c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
1365 (cpu->hyperv_ver_id_sn & 0xffffff);
1366
1367 c = &cpuid_ent[cpuid_i++];
1368 c->function = HV_CPUID_FEATURES;
1369 c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
1370 c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
1371 c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
1372
1373 /* Unconditionally required with any Hyper-V enlightenment */
1374 c->eax |= HV_HYPERCALL_AVAILABLE;
1375
1376 /* SynIC and Vmbus devices require messages/signals hypercalls */
1377 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1378 !cpu->hyperv_synic_kvm_only) {
1379 c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
1380 }
1381
1382
1383 /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1384 c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1385
1386 c = &cpuid_ent[cpuid_i++];
1387 c->function = HV_CPUID_ENLIGHTMENT_INFO;
1388 c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
1389 c->ebx = cpu->hyperv_spinlock_attempts;
1390
1391 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1392 !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
1393 c->eax |= HV_APIC_ACCESS_RECOMMENDED;
1394 }
1395
1396 if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1397 c->eax |= HV_NO_NONARCH_CORESHARING;
1398 } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1399 c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
1400 HV_NO_NONARCH_CORESHARING;
1401 }
1402
1403 c = &cpuid_ent[cpuid_i++];
1404 c->function = HV_CPUID_IMPLEMENT_LIMITS;
1405 c->eax = cpu->hv_max_vps;
1406 c->ebx = cpu->hyperv_limits[0];
1407 c->ecx = cpu->hyperv_limits[1];
1408 c->edx = cpu->hyperv_limits[2];
1409
1410 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1411 uint32_t function;
1412
1413 /* Create zeroed 0x40000006..0x40000009 leaves */
1414 for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1415 function < HV_CPUID_NESTED_FEATURES; function++) {
1416 c = &cpuid_ent[cpuid_i++];
1417 c->function = function;
1418 }
1419
1420 c = &cpuid_ent[cpuid_i++];
1421 c->function = HV_CPUID_NESTED_FEATURES;
1422 c->eax = cpu->hyperv_nested[0];
1423 }
1424
1425 return cpuid_i;
1426 }
1427
1428 static Error *hv_passthrough_mig_blocker;
1429 static Error *hv_no_nonarch_cs_mig_blocker;
1430
1431 /* Checks that the exposed eVMCS version range is supported by KVM */
1432 static bool evmcs_version_supported(uint16_t evmcs_version,
1433 uint16_t supported_evmcs_version)
1434 {
1435 uint8_t min_version = evmcs_version & 0xff;
1436 uint8_t max_version = evmcs_version >> 8;
1437 uint8_t min_supported_version = supported_evmcs_version & 0xff;
1438 uint8_t max_supported_version = supported_evmcs_version >> 8;
1439
1440 return (min_version >= min_supported_version) &&
1441 (max_version <= max_supported_version);
1442 }
1443
1444 #define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
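/* (low byte = minimum, high byte = maximum supported eVMCS version, matching
 * the decoding in evmcs_version_supported() above; ((1 << 8) | 1) therefore
 * advertises exactly the range [1..1]) */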
1445
1446 static int hyperv_init_vcpu(X86CPU *cpu)
1447 {
1448 CPUState *cs = CPU(cpu);
1449 Error *local_err = NULL;
1450 int ret;
1451
1452 if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1453 error_setg(&hv_passthrough_mig_blocker,
1454 "'hv-passthrough' CPU flag prevents migration, use explicit"
1455 " set of hv-* flags instead");
1456 ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
1457 if (ret < 0) {
1458 error_report_err(local_err);
1459 return ret;
1460 }
1461 }
1462
1463 if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
1464 hv_no_nonarch_cs_mig_blocker == NULL) {
1465 error_setg(&hv_no_nonarch_cs_mig_blocker,
1466 "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
1467 " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
1468 " make sure SMT is disabled and/or that vCPUs are properly"
1469 " pinned)");
1470 ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
1471 if (ret < 0) {
1472 error_report_err(local_err);
1473 return ret;
1474 }
1475 }
1476
1477 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1478 /*
1479 * the kernel doesn't support setting vp_index; assert that its value
1480 * is in sync
1481 */
1482 struct {
1483 struct kvm_msrs info;
1484 struct kvm_msr_entry entries[1];
1485 } msr_data = {
1486 .info.nmsrs = 1,
1487 .entries[0].index = HV_X64_MSR_VP_INDEX,
1488 };
1489
1490 ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
1491 if (ret < 0) {
1492 return ret;
1493 }
1494 assert(ret == 1);
1495
1496 if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) {
1497 error_report("kernel's vp_index != QEMU's vp_index");
1498 return -ENXIO;
1499 }
1500 }
1501
1502 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1503 uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1504 KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1505 ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1506 if (ret < 0) {
1507 error_report("failed to turn on HyperV SynIC in KVM: %s",
1508 strerror(-ret));
1509 return ret;
1510 }
1511
1512 if (!cpu->hyperv_synic_kvm_only) {
1513 ret = hyperv_x86_synic_add(cpu);
1514 if (ret < 0) {
1515 error_report("failed to create HyperV SynIC: %s",
1516 strerror(-ret));
1517 return ret;
1518 }
1519 }
1520 }
1521
1522 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1523 uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
1524 uint16_t supported_evmcs_version;
1525
1526 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1527 (uintptr_t)&supported_evmcs_version);
1528
1529 /*
1530 * KVM is required to support EVMCS ver.1 as that's what the 'hv-evmcs'
1531 * option sets. Note: we hardcode the maximum supported eVMCS version
1532 * to '1' as well, so the 'hv-evmcs' feature stays migratable even when
1533 * (and if) ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will
1534 * then have to be added.
1535 */
1536 if (ret < 0) {
1537 error_report("Hyper-V %s is not supported by kernel",
1538 kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1539 return ret;
1540 }
1541
1542 if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
1543 error_report("eVMCS version range [%d..%d] is not supported by "
1544 "kernel (supported: [%d..%d])", evmcs_version & 0xff,
1545 evmcs_version >> 8, supported_evmcs_version & 0xff,
1546 supported_evmcs_version >> 8);
1547 return -ENOTSUP;
1548 }
1549
1550 cpu->hyperv_nested[0] = evmcs_version;
1551 }
1552
1553 if (cpu->hyperv_enforce_cpuid) {
1554 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
1555 if (ret < 0) {
1556 error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
1557 strerror(-ret));
1558 return ret;
1559 }
1560 }
1561
1562 return 0;
1563 }
1564
1565 static Error *invtsc_mig_blocker;
1566
1567 #define KVM_MAX_CPUID_ENTRIES 100
1568
1569 int kvm_arch_init_vcpu(CPUState *cs)
1570 {
1571 struct {
1572 struct kvm_cpuid2 cpuid;
1573 struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
1574 } cpuid_data;
1575 /*
1576 * The kernel defines these structs with padding fields so there
1577 * should be no extra padding in our cpuid_data struct.
1578 */
1579 QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
1580 sizeof(struct kvm_cpuid2) +
1581 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
1582
1583 X86CPU *cpu = X86_CPU(cs);
1584 CPUX86State *env = &cpu->env;
1585 uint32_t limit, i, j, cpuid_i;
1586 uint32_t unused;
1587 struct kvm_cpuid_entry2 *c;
1588 uint32_t signature[3];
1589 int kvm_base = KVM_CPUID_SIGNATURE;
1590 int max_nested_state_len;
1591 int r;
1592 Error *local_err = NULL;
1593
1594 memset(&cpuid_data, 0, sizeof(cpuid_data));
1595
1596 cpuid_i = 0;
1597
1598 r = kvm_arch_set_tsc_khz(cs);
1599 if (r < 0) {
1600 return r;
1601 }
1602
1603 /* The vCPU's TSC frequency is either specified by the user, or follows
1604 * the value used by KVM if the former is not present. In the latter
1605 * case, we query it from KVM and record it in env->tsc_khz, so that
1606 * the vCPU's TSC frequency can be migrated later via this field.
1607 */
1608 if (!env->tsc_khz) {
1609 r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
1610 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
1611 -ENOTSUP;
1612 if (r > 0) {
1613 env->tsc_khz = r;
1614 }
1615 }
1616
1617 env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
1618
1619 /*
1620 * kvm_hyperv_expand_features() is called here for the second time in case
1621 * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
1622 * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
1623 * check which Hyper-V enlightenments are supported and which are not, we
1624 * can still proceed and check/expand Hyper-V enlightenments here so legacy
1625 * behavior is preserved.
1626 */
1627 if (!kvm_hyperv_expand_features(cpu, &local_err)) {
1628 error_report_err(local_err);
1629 return -ENOSYS;
1630 }
1631
1632 if (hyperv_enabled(cpu)) {
1633 r = hyperv_init_vcpu(cpu);
1634 if (r) {
1635 return r;
1636 }
1637
1638 cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
1639 kvm_base = KVM_CPUID_SIGNATURE_NEXT;
1640 has_msr_hv_hypercall = true;
1641 }
1642
1643 if (cpu->expose_kvm) {
1644 memcpy(signature, "KVMKVMKVM\0\0\0", 12);
1645 c = &cpuid_data.entries[cpuid_i++];
1646 c->function = KVM_CPUID_SIGNATURE | kvm_base;
1647 c->eax = KVM_CPUID_FEATURES | kvm_base;
1648 c->ebx = signature[0];
1649 c->ecx = signature[1];
1650 c->edx = signature[2];
1651
1652 c = &cpuid_data.entries[cpuid_i++];
1653 c->function = KVM_CPUID_FEATURES | kvm_base;
1654 c->eax = env->features[FEAT_KVM];
1655 c->edx = env->features[FEAT_KVM_HINTS];
1656 }
1657
1658 cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1659
1660 if (cpu->kvm_pv_enforce_cpuid) {
1661 r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
1662 if (r < 0) {
1663 fprintf(stderr,
1664 "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
1665 strerror(-r));
1666 abort();
1667 }
1668 }
1669
1670 for (i = 0; i <= limit; i++) {
1671 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1672 fprintf(stderr, "unsupported level value: 0x%x\n", limit);
1673 abort();
1674 }
1675 c = &cpuid_data.entries[cpuid_i++];
1676
1677 switch (i) {
1678 case 2: {
1679 /* Keep reading function 2 till all the input is received */
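/* (CPUID.02H reports in AL how many times CPUID(2) must be executed to
 * retrieve all cache/TLB descriptors; KVM models this iteration with its
 * STATEFUL_FUNC flags.) */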
1680 int times;
1681
1682 c->function = i;
1683 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1684 KVM_CPUID_FLAG_STATE_READ_NEXT;
1685 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1686 times = c->eax & 0xff;
1687
1688 for (j = 1; j < times; ++j) {
1689 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1690 fprintf(stderr, "cpuid_data is full, no space for "
1691 "cpuid(eax:2):eax & 0xf = 0x%x\n", times);
1692 abort();
1693 }
1694 c = &cpuid_data.entries[cpuid_i++];
1695 c->function = i;
1696 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1697 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1698 }
1699 break;
1700 }
1701 case 0x1f:
1702 if (env->nr_dies < 2) {
1703 break;
1704 }
1705 /* fallthrough */
1706 case 4:
1707 case 0xb:
1708 case 0xd:
1709 for (j = 0; ; j++) {
1710 if (i == 0xd && j == 64) {
1711 break;
1712 }
1713
1714 if (i == 0x1f && j == 64) {
1715 break;
1716 }
1717
1718 c->function = i;
1719 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1720 c->index = j;
1721 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1722
1723 if (i == 4 && c->eax == 0) {
1724 break;
1725 }
1726 if (i == 0xb && !(c->ecx & 0xff00)) {
1727 break;
1728 }
1729 if (i == 0x1f && !(c->ecx & 0xff00)) {
1730 break;
1731 }
1732 if (i == 0xd && c->eax == 0) {
1733 continue;
1734 }
1735 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1736 fprintf(stderr, "cpuid_data is full, no space for "
1737 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1738 abort();
1739 }
1740 c = &cpuid_data.entries[cpuid_i++];
1741 }
1742 break;
1743 case 0x7:
1744 case 0x12:
1745 for (j = 0; ; j++) {
1746 c->function = i;
1747 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1748 c->index = j;
1749 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1750
1751 if (j > 1 && (c->eax & 0xf) != 1) {
1752 break;
1753 }
1754
1755 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1756 fprintf(stderr, "cpuid_data is full, no space for "
1757 "cpuid(eax:0x12,ecx:0x%x)\n", j);
1758 abort();
1759 }
1760 c = &cpuid_data.entries[cpuid_i++];
1761 }
1762 break;
1763 case 0x14: {
1764 uint32_t times;
1765
1766 c->function = i;
1767 c->index = 0;
1768 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1769 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1770 times = c->eax;
1771
1772 for (j = 1; j <= times; ++j) {
1773 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1774 fprintf(stderr, "cpuid_data is full, no space for "
1775 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1776 abort();
1777 }
1778 c = &cpuid_data.entries[cpuid_i++];
1779 c->function = i;
1780 c->index = j;
1781 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1782 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1783 }
1784 break;
1785 }
1786 default:
1787 c->function = i;
1788 c->flags = 0;
1789 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1790 if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1791 /*
1792 * KVM already returns all zeroes if a CPUID entry is missing,
1793 * so we can omit it and avoid hitting KVM's 80-entry limit.
1794 */
1795 cpuid_i--;
1796 }
1797 break;
1798 }
1799 }
1800
1801 if (limit >= 0x0a) {
1802 uint32_t eax, edx;
1803
1804 cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1805
1806 has_architectural_pmu_version = eax & 0xff;
1807 if (has_architectural_pmu_version > 0) {
1808 num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1809
1810 /* Shouldn't be more than 32, since that's the number of bits
1811 * available in EBX to tell us _which_ counters are available.
1812 * Play it safe.
1813 */
1814 if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1815 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1816 }
1817
1818 if (has_architectural_pmu_version > 1) {
1819 num_architectural_pmu_fixed_counters = edx & 0x1f;
1820
1821 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1822 num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1823 }
1824 }
1825 }
1826 }
1827
1828 cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1829
1830 for (i = 0x80000000; i <= limit; i++) {
1831 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1832 fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
1833 abort();
1834 }
1835 c = &cpuid_data.entries[cpuid_i++];
1836
1837 switch (i) {
1838 case 0x8000001d:
1839 /* Query for all AMD cache information leaves */
1840 for (j = 0; ; j++) {
1841 c->function = i;
1842 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1843 c->index = j;
1844 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1845
1846 if (c->eax == 0) {
1847 break;
1848 }
1849 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1850 fprintf(stderr, "cpuid_data is full, no space for "
1851 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1852 abort();
1853 }
1854 c = &cpuid_data.entries[cpuid_i++];
1855 }
1856 break;
1857 default:
1858 c->function = i;
1859 c->flags = 0;
1860 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1861 if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1862 /*
1863 * KVM already returns all zeroes if a CPUID entry is missing,
1864 * so we can omit it and avoid hitting KVM's 80-entry limit.
1865 */
1866 cpuid_i--;
1867 }
1868 break;
1869 }
1870 }
1871
1872 /* Call Centaur's CPUID instructions if they are supported. */
1873 if (env->cpuid_xlevel2 > 0) {
1874 cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1875
1876 for (i = 0xC0000000; i <= limit; i++) {
1877 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1878 fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
1879 abort();
1880 }
1881 c = &cpuid_data.entries[cpuid_i++];
1882
1883 c->function = i;
1884 c->flags = 0;
1885 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1886 }
1887 }
1888
1889 cpuid_data.cpuid.nent = cpuid_i;
1890
1891 if (((env->cpuid_version >> 8)&0xF) >= 6
1892 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
1893 (CPUID_MCE | CPUID_MCA)
1894 && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
1895 uint64_t mcg_cap, unsupported_caps;
1896 int banks;
1897 int ret;
1898
1899 ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
1900 if (ret < 0) {
1901 fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
1902 return ret;
1903 }
1904
1905 if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
1906 error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
1907 (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
1908 return -ENOTSUP;
1909 }
1910
1911 unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
1912 if (unsupported_caps) {
1913 if (unsupported_caps & MCG_LMCE_P) {
1914 error_report("kvm: LMCE not supported");
1915 return -ENOTSUP;
1916 }
1917 warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
1918 unsupported_caps);
1919 }
1920
1921 env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
1922 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
1923 if (ret < 0) {
1924 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
1925 return ret;
1926 }
1927 }
1928
1929 cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
1930
1931 c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
1932 if (c) {
1933 has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
1934 !!(c->ecx & CPUID_EXT_SMX);
1935 }
1936
1937 c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
1938 if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
1939 has_msr_feature_control = true;
1940 }
1941
1942 if (env->mcg_cap & MCG_LMCE_P) {
1943 has_msr_mcg_ext_ctl = has_msr_feature_control = true;
1944 }
1945
1946 if (!env->user_tsc_khz) {
1947 if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
1948 invtsc_mig_blocker == NULL) {
1949 error_setg(&invtsc_mig_blocker,
1950 "State blocked by non-migratable CPU device"
1951 " (invtsc flag)");
1952 r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
1953 if (r < 0) {
1954 error_report_err(local_err);
1955 return r;
1956 }
1957 }
1958 }
1959
1960 if (cpu->vmware_cpuid_freq
1961 /* Guests depend on 0x40000000 to detect this feature, so only expose
1962 * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
1963 && cpu->expose_kvm
1964 && kvm_base == KVM_CPUID_SIGNATURE
1965 /* TSC clock must be stable and known for this feature. */
1966 && tsc_is_stable_and_known(env)) {
1967
1968 c = &cpuid_data.entries[cpuid_i++];
1969 c->function = KVM_CPUID_SIGNATURE | 0x10;
1970 c->eax = env->tsc_khz;
1971 c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
1972 c->ecx = c->edx = 0;
1973
1974 c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
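        /*
         * The hypervisor signature leaf reports the largest supported
         * hypervisor leaf in EAX; raise it so guests can discover that
         * leaf 0x40000010 is valid.
         */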
1975 c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
1976 }
1977
1978 cpuid_data.cpuid.nent = cpuid_i;
1979
1980 cpuid_data.cpuid.padding = 0;
1981 r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
1982 if (r) {
1983 goto fail;
1984 }
1985
1986 if (has_xsave) {
1987 env->xsave_buf_len = sizeof(struct kvm_xsave);
1988 env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
1989 memset(env->xsave_buf, 0, env->xsave_buf_len);
1990
1991 /*
1992 * The allocated storage must be large enough for all of the
1993 * possible XSAVE state components.
1994 */
1995 assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX)
1996 <= env->xsave_buf_len);
1997 }
1998
1999 max_nested_state_len = kvm_max_nested_state_length();
2000 if (max_nested_state_len > 0) {
2001 assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
2002
2003 if (cpu_has_vmx(env) || cpu_has_svm(env)) {
2004 struct kvm_vmx_nested_state_hdr *vmx_hdr;
2005
2006 env->nested_state = g_malloc0(max_nested_state_len);
2007 env->nested_state->size = max_nested_state_len;
2008
2009 if (cpu_has_vmx(env)) {
2010 env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
2011 vmx_hdr = &env->nested_state->hdr.vmx;
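                /*
                 * -1 marks these as absent: the vCPU starts outside VMX
                 * operation, with no VMXON region and no current VMCS.
                 */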
2012 vmx_hdr->vmxon_pa = -1ull;
2013 vmx_hdr->vmcs12_pa = -1ull;
2014 } else {
2015 env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
2016 }
2017 }
2018 }
2019
2020 cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
2021
2022 if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
2023 has_msr_tsc_aux = false;
2024 }
2025
2026 kvm_init_msrs(cpu);
2027
2028 return 0;
2029
2030 fail:
2031 migrate_del_blocker(invtsc_mig_blocker);
2032
2033 return r;
2034 }
2035
2036 int kvm_arch_destroy_vcpu(CPUState *cs)
2037 {
2038 X86CPU *cpu = X86_CPU(cs);
2039 CPUX86State *env = &cpu->env;
2040
2041 if (cpu->kvm_msr_buf) {
2042 g_free(cpu->kvm_msr_buf);
2043 cpu->kvm_msr_buf = NULL;
2044 }
2045
2046 if (env->nested_state) {
2047 g_free(env->nested_state);
2048 env->nested_state = NULL;
2049 }
2050
2051 qemu_del_vm_change_state_handler(cpu->vmsentry);
2052
2053 return 0;
2054 }
2055
2056 void kvm_arch_reset_vcpu(X86CPU *cpu)
2057 {
2058 CPUX86State *env = &cpu->env;
2059
2060 env->xcr0 = 1;
2061 if (kvm_irqchip_in_kernel()) {
2062 env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
2063 KVM_MP_STATE_UNINITIALIZED;
2064 } else {
2065 env->mp_state = KVM_MP_STATE_RUNNABLE;
2066 }
2067
2068 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2069 int i;
2070 for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
2071 env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
2072 }
2073
2074 hyperv_x86_synic_reset(cpu);
2075 }
2076 /* enabled by default */
2077 env->poll_control_msr = 1;
2078
2079 sev_es_set_reset_vector(CPU(cpu));
2080 }
2081
2082 void kvm_arch_do_init_vcpu(X86CPU *cpu)
2083 {
2084 CPUX86State *env = &cpu->env;
2085
2086 /* APs get directly into wait-for-SIPI state. */
2087 if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
2088 env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
2089 }
2090 }
2091
2092 static int kvm_get_supported_feature_msrs(KVMState *s)
2093 {
2094 int ret = 0;
2095
2096 if (kvm_feature_msrs != NULL) {
2097 return 0;
2098 }
2099
2100 if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
2101 return 0;
2102 }
2103
2104 struct kvm_msr_list msr_list;
2105
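    /*
     * First call with nmsrs == 0: KVM fails it with E2BIG but fills in how
     * many feature MSRs it knows about, so the real buffer can be sized.
     */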
2106 msr_list.nmsrs = 0;
2107 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
2108 if (ret < 0 && ret != -E2BIG) {
2109 error_report("Fetch KVM feature MSR list failed: %s",
2110 strerror(-ret));
2111 return ret;
2112 }
2113
2114 assert(msr_list.nmsrs > 0);
2115 kvm_feature_msrs = (struct kvm_msr_list *) \
2116 g_malloc0(sizeof(msr_list) +
2117 msr_list.nmsrs * sizeof(msr_list.indices[0]));
2118
2119 kvm_feature_msrs->nmsrs = msr_list.nmsrs;
2120 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
2121
2122 if (ret < 0) {
2123 error_report("Fetch KVM feature MSR list failed: %s",
2124 strerror(-ret));
2125 g_free(kvm_feature_msrs);
2126 kvm_feature_msrs = NULL;
2127 return ret;
2128 }
2129
2130 return 0;
2131 }
2132
2133 static int kvm_get_supported_msrs(KVMState *s)
2134 {
2135 int ret = 0;
2136 struct kvm_msr_list msr_list, *kvm_msr_list;
2137
2138 /*
2139 * Obtain MSR list from KVM. These are the MSRs that we must
2140 * save/restore.
2141 */
2142 msr_list.nmsrs = 0;
2143 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
2144 if (ret < 0 && ret != -E2BIG) {
2145 return ret;
2146 }
2147 /*
2148 * Old kernel modules had a bug and could write beyond the provided
2149 * memory. Allocate at least 1K to be safe.
2150 */
2151 kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
2152 msr_list.nmsrs *
2153 sizeof(msr_list.indices[0])));
2154
2155 kvm_msr_list->nmsrs = msr_list.nmsrs;
2156 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2157 if (ret >= 0) {
2158 int i;
2159
2160 for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2161 switch (kvm_msr_list->indices[i]) {
2162 case MSR_STAR:
2163 has_msr_star = true;
2164 break;
2165 case MSR_VM_HSAVE_PA:
2166 has_msr_hsave_pa = true;
2167 break;
2168 case MSR_TSC_AUX:
2169 has_msr_tsc_aux = true;
2170 break;
2171 case MSR_TSC_ADJUST:
2172 has_msr_tsc_adjust = true;
2173 break;
2174 case MSR_IA32_TSCDEADLINE:
2175 has_msr_tsc_deadline = true;
2176 break;
2177 case MSR_IA32_SMBASE:
2178 has_msr_smbase = true;
2179 break;
2180 case MSR_SMI_COUNT:
2181 has_msr_smi_count = true;
2182 break;
2183 case MSR_IA32_MISC_ENABLE:
2184 has_msr_misc_enable = true;
2185 break;
2186 case MSR_IA32_BNDCFGS:
2187 has_msr_bndcfgs = true;
2188 break;
2189 case MSR_IA32_XSS:
2190 has_msr_xss = true;
2191 break;
2192 case MSR_IA32_UMWAIT_CONTROL:
2193 has_msr_umwait = true;
2194 break;
2195 case HV_X64_MSR_CRASH_CTL:
2196 has_msr_hv_crash = true;
2197 break;
2198 case HV_X64_MSR_RESET:
2199 has_msr_hv_reset = true;
2200 break;
2201 case HV_X64_MSR_VP_INDEX:
2202 has_msr_hv_vpindex = true;
2203 break;
2204 case HV_X64_MSR_VP_RUNTIME:
2205 has_msr_hv_runtime = true;
2206 break;
2207 case HV_X64_MSR_SCONTROL:
2208 has_msr_hv_synic = true;
2209 break;
2210 case HV_X64_MSR_STIMER0_CONFIG:
2211 has_msr_hv_stimer = true;
2212 break;
2213 case HV_X64_MSR_TSC_FREQUENCY:
2214 has_msr_hv_frequencies = true;
2215 break;
2216 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2217 has_msr_hv_reenlightenment = true;
2218 break;
2219 case MSR_IA32_SPEC_CTRL:
2220 has_msr_spec_ctrl = true;
2221 break;
2222 case MSR_AMD64_TSC_RATIO:
2223 has_tsc_scale_msr = true;
2224 break;
2225 case MSR_IA32_TSX_CTRL:
2226 has_msr_tsx_ctrl = true;
2227 break;
2228 case MSR_VIRT_SSBD:
2229 has_msr_virt_ssbd = true;
2230 break;
2231 case MSR_IA32_ARCH_CAPABILITIES:
2232 has_msr_arch_capabs = true;
2233 break;
2234 case MSR_IA32_CORE_CAPABILITY:
2235 has_msr_core_capabs = true;
2236 break;
2237 case MSR_IA32_PERF_CAPABILITIES:
2238 has_msr_perf_capabs = true;
2239 break;
2240 case MSR_IA32_VMX_VMFUNC:
2241 has_msr_vmx_vmfunc = true;
2242 break;
2243 case MSR_IA32_UCODE_REV:
2244 has_msr_ucode_rev = true;
2245 break;
2246 case MSR_IA32_VMX_PROCBASED_CTLS2:
2247 has_msr_vmx_procbased_ctls2 = true;
2248 break;
2249 case MSR_IA32_PKRS:
2250 has_msr_pkrs = true;
2251 break;
2252 }
2253 }
2254 }
2255
2256 g_free(kvm_msr_list);
2257
2258 return ret;
2259 }
2260
2261 static Notifier smram_machine_done;
2262 static KVMMemoryListener smram_listener;
2263 static AddressSpace smram_address_space;
2264 static MemoryRegion smram_as_root;
2265 static MemoryRegion smram_as_mem;
2266
2267 static void register_smram_listener(Notifier *n, void *unused)
2268 {
2269 MemoryRegion *smram =
2270 (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2271
2272 /* Outer container... */
2273 memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2274 memory_region_set_enabled(&smram_as_root, true);
2275
2276 /* ... with two regions inside: normal system memory with low
2277 * priority, and...
2278 */
2279 memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2280 get_system_memory(), 0, ~0ull);
2281 memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2282 memory_region_set_enabled(&smram_as_mem, true);
2283
2284 if (smram) {
2285 /* ... SMRAM with higher priority */
2286 memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2287 memory_region_set_enabled(smram, true);
2288 }
2289
2290 address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
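    /*
     * Address space 1 is the one KVM uses for SMM on x86; registering this
     * extra listener mirrors the SMRAM view into those memslots.
     */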
2291 kvm_memory_listener_register(kvm_state, &smram_listener,
2292 &smram_address_space, 1, "kvm-smram");
2293 }
2294
2295 int kvm_arch_init(MachineState *ms, KVMState *s)
2296 {
2297 uint64_t identity_base = 0xfffbc000;
2298 uint64_t shadow_mem;
2299 int ret;
2300 struct utsname utsname;
2301 Error *local_err = NULL;
2302
2303 /*
2304 * Initialize SEV context, if required
2305 *
2306 * If no memory encryption is requested (ms->cgs == NULL) this is
2307 * a no-op.
2308 *
2309 * It's also a no-op if a non-SEV confidential guest support
2310 * mechanism is selected. SEV is the only mechanism available to
2311 * select on x86 at present, so this doesn't arise, but if new
2312 * mechanisms are supported in future (e.g. TDX), they'll need
2313 * their own initialization either here or elsewhere.
2314 */
2315 ret = sev_kvm_init(ms->cgs, &local_err);
2316 if (ret < 0) {
2317 error_report_err(local_err);
2318 return ret;
2319 }
2320
2321 if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
2322 error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
2323 return -ENOTSUP;
2324 }
2325
2326 has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
2327 has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2328 has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
2329 has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
2330
2331 hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2332
2333 has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2334 if (has_exception_payload) {
2335 ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2336 if (ret < 0) {
2337 error_report("kvm: Failed to enable exception payload cap: %s",
2338 strerror(-ret));
2339 return ret;
2340 }
2341 }
2342
2343 ret = kvm_get_supported_msrs(s);
2344 if (ret < 0) {
2345 return ret;
2346 }
2347
2348 kvm_get_supported_feature_msrs(s);
2349
2350 uname(&utsname);
2351 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2352
2353 /*
2354 * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2355 * In order to use vm86 mode, an EPT identity map and a TSS are needed.
2356 * Since these must be part of guest physical memory, we need to allocate
2357 * them, both by setting their start addresses in the kernel and by
2358 * creating a corresponding e820 entry. We need 4 pages before the BIOS.
2359 *
2360 * Older KVM versions may not support setting the identity map base. In
2361 * that case we need to stick with the default, i.e. a 256K maximum BIOS
2362 * size.
2363 */
2364 if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
2365 /* Allows up to 16M BIOSes. */
2366 identity_base = 0xfeffc000;
2367
2368 ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2369 if (ret < 0) {
2370 return ret;
2371 }
2372 }
2373
2374 /* Set TSS base one page after EPT identity map. */
2375 ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2376 if (ret < 0) {
2377 return ret;
2378 }
2379
2380 /* Tell fw_cfg to notify the BIOS to reserve the range. */
2381 ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2382 if (ret < 0) {
2383 fprintf(stderr, "e820_add_entry() table is full\n");
2384 return ret;
2385 }
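    /*
     * The 16K reserved above covers the single identity-map page plus the
     * three pages that KVM_SET_TSS_ADDR claims for the TSS.
     */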
2386
2387 shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2388 if (shadow_mem != -1) {
2389 shadow_mem /= 4096;
2390 ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2391 if (ret < 0) {
2392 return ret;
2393 }
2394 }
2395
2396 if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2397 object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2398 x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2399 smram_machine_done.notify = register_smram_listener;
2400 qemu_add_machine_init_done_notifier(&smram_machine_done);
2401 }
2402
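    /*
     * When CPU power management is handed to the guest, ask KVM to stop
     * intercepting the idle/pause instructions listed below so the guest can
     * manage power states without vmexits.
     */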
2403 if (enable_cpu_pm) {
2404 int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2405 int ret;
2406
2407 /* Work around a kernel header typo. TODO: fix the header and drop this. */
2408 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2409 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2410 #endif
2411 if (disable_exits) {
2412 disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2413 KVM_X86_DISABLE_EXITS_HLT |
2414 KVM_X86_DISABLE_EXITS_PAUSE |
2415 KVM_X86_DISABLE_EXITS_CSTATE);
2416 }
2417
2418 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2419 disable_exits);
2420 if (ret < 0) {
2421 error_report("kvm: guest stopping CPU not supported: %s",
2422 strerror(-ret));
2423 }
2424 }
2425
2426 if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
2427 X86MachineState *x86ms = X86_MACHINE(ms);
2428
2429 if (x86ms->bus_lock_ratelimit > 0) {
2430 ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
2431 if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
2432 error_report("kvm: bus lock detection unsupported");
2433 return -ENOTSUP;
2434 }
2435 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
2436 KVM_BUS_LOCK_DETECTION_EXIT);
2437 if (ret < 0) {
2438 error_report("kvm: Failed to enable bus lock detection cap: %s",
2439 strerror(-ret));
2440 return ret;
2441 }
2442 ratelimit_init(&bus_lock_ratelimit_ctrl);
2443 ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
2444 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
2445 }
2446 }
2447
2448 return 0;
2449 }
2450
2451 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2452 {
2453 lhs->selector = rhs->selector;
2454 lhs->base = rhs->base;
2455 lhs->limit = rhs->limit;
2456 lhs->type = 3;
2457 lhs->present = 1;
2458 lhs->dpl = 3;
2459 lhs->db = 0;
2460 lhs->s = 1;
2461 lhs->l = 0;
2462 lhs->g = 0;
2463 lhs->avl = 0;
2464 lhs->unusable = 0;
2465 }
2466
2467 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2468 {
2469 unsigned flags = rhs->flags;
2470 lhs->selector = rhs->selector;
2471 lhs->base = rhs->base;
2472 lhs->limit = rhs->limit;
2473 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2474 lhs->present = (flags & DESC_P_MASK) != 0;
2475 lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2476 lhs->db = (flags >> DESC_B_SHIFT) & 1;
2477 lhs->s = (flags & DESC_S_MASK) != 0;
2478 lhs->l = (flags >> DESC_L_SHIFT) & 1;
2479 lhs->g = (flags & DESC_G_MASK) != 0;
2480 lhs->avl = (flags & DESC_AVL_MASK) != 0;
2481 lhs->unusable = !lhs->present;
2482 lhs->padding = 0;
2483 }
2484
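/*
 * Inverse of set_seg(): fold a kvm_segment back into QEMU's cached descriptor
 * flags. Segments that KVM marks as unusable are reported as not present.
 */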
2485 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2486 {
2487 lhs->selector = rhs->selector;
2488 lhs->base = rhs->base;
2489 lhs->limit = rhs->limit;
2490 lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2491 ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2492 (rhs->dpl << DESC_DPL_SHIFT) |
2493 (rhs->db << DESC_B_SHIFT) |
2494 (rhs->s * DESC_S_MASK) |
2495 (rhs->l << DESC_L_SHIFT) |
2496 (rhs->g * DESC_G_MASK) |
2497 (rhs->avl * DESC_AVL_MASK);
2498 }
2499
2500 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2501 {
2502 if (set) {
2503 *kvm_reg = *qemu_reg;
2504 } else {
2505 *qemu_reg = *kvm_reg;
2506 }
2507 }
2508
2509 static int kvm_getput_regs(X86CPU *cpu, int set)
2510 {
2511 CPUX86State *env = &cpu->env;
2512 struct kvm_regs regs;
2513 int ret = 0;
2514
2515 if (!set) {
2516 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2517 if (ret < 0) {
2518 return ret;
2519 }
2520 }
2521
2522 kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2523 kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2524 kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2525 kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2526 kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2527 kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2528 kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2529 kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2530 #ifdef TARGET_X86_64
2531 kvm_getput_reg(&regs.r8, &env->regs[8], set);
2532 kvm_getput_reg(&regs.r9, &env->regs[9], set);
2533 kvm_getput_reg(&regs.r10, &env->regs[10], set);
2534 kvm_getput_reg(&regs.r11, &env->regs[11], set);
2535 kvm_getput_reg(&regs.r12, &env->regs[12], set);
2536 kvm_getput_reg(&regs.r13, &env->regs[13], set);
2537 kvm_getput_reg(&regs.r14, &env->regs[14], set);
2538 kvm_getput_reg(&regs.r15, &env->regs[15], set);
2539 #endif
2540
2541 kvm_getput_reg(&regs.rflags, &env->eflags, set);
2542 kvm_getput_reg(&regs.rip, &env->eip, set);
2543
2544 if (set) {
2545 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2546 }
2547
2548 return ret;
2549 }
2550
2551 static int kvm_put_fpu(X86CPU *cpu)
2552 {
2553 CPUX86State *env = &cpu->env;
2554 struct kvm_fpu fpu;
2555 int i;
2556
2557 memset(&fpu, 0, sizeof fpu);
2558 fpu.fsw = env->fpus & ~(7 << 11);
2559 fpu.fsw |= (env->fpstt & 7) << 11;
2560 fpu.fcw = env->fpuc;
2561 fpu.last_opcode = env->fpop;
2562 fpu.last_ip = env->fpip;
2563 fpu.last_dp = env->fpdp;
2564 for (i = 0; i < 8; ++i) {
2565 fpu.ftwx |= (!env->fptags[i]) << i;
2566 }
2567 memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
2568 for (i = 0; i < CPU_NB_REGS; i++) {
2569 stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
2570 stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
2571 }
2572 fpu.mxcsr = env->mxcsr;
2573
2574 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
2575 }
2576
2577 static int kvm_put_xsave(X86CPU *cpu)
2578 {
2579 CPUX86State *env = &cpu->env;
2580 void *xsave = env->xsave_buf;
2581
2582 if (!has_xsave) {
2583 return kvm_put_fpu(cpu);
2584 }
2585 x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
2586
2587 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2588 }
2589
2590 static int kvm_put_xcrs(X86CPU *cpu)
2591 {
2592 CPUX86State *env = &cpu->env;
2593 struct kvm_xcrs xcrs = {};
2594
2595 if (!has_xcrs) {
2596 return 0;
2597 }
2598
2599 xcrs.nr_xcrs = 1;
2600 xcrs.flags = 0;
2601 xcrs.xcrs[0].xcr = 0;
2602 xcrs.xcrs[0].value = env->xcr0;
2603 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2604 }
2605
2606 static int kvm_put_sregs(X86CPU *cpu)
2607 {
2608 CPUX86State *env = &cpu->env;
2609 struct kvm_sregs sregs;
2610
2611 /*
2612 * The interrupt_bitmap is ignored because KVM_SET_SREGS is
2613 * always followed by KVM_SET_VCPU_EVENTS.
2614 */
2615 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2616
2617 if ((env->eflags & VM_MASK)) {
2618 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2619 set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2620 set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2621 set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2622 set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2623 set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2624 } else {
2625 set_seg(&sregs.cs, &env->segs[R_CS]);
2626 set_seg(&sregs.ds, &env->segs[R_DS]);
2627 set_seg(&sregs.es, &env->segs[R_ES]);
2628 set_seg(&sregs.fs, &env->segs[R_FS]);
2629 set_seg(&sregs.gs, &env->segs[R_GS]);
2630 set_seg(&sregs.ss, &env->segs[R_SS]);
2631 }
2632
2633 set_seg(&sregs.tr, &env->tr);
2634 set_seg(&sregs.ldt, &env->ldt);
2635
2636 sregs.idt.limit = env->idt.limit;
2637 sregs.idt.base = env->idt.base;
2638 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2639 sregs.gdt.limit = env->gdt.limit;
2640 sregs.gdt.base = env->gdt.base;
2641 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2642
2643 sregs.cr0 = env->cr[0];
2644 sregs.cr2 = env->cr[2];
2645 sregs.cr3 = env->cr[3];
2646 sregs.cr4 = env->cr[4];
2647
2648 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2649 sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2650
2651 sregs.efer = env->efer;
2652
2653 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2654 }
2655
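/*
 * Same as kvm_put_sregs(), but the SREGS2 interface can additionally hand the
 * PAE PDPTEs to KVM directly, so they are restored verbatim rather than being
 * re-read from guest memory.
 */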
2656 static int kvm_put_sregs2(X86CPU *cpu)
2657 {
2658 CPUX86State *env = &cpu->env;
2659 struct kvm_sregs2 sregs;
2660 int i;
2661
2662 sregs.flags = 0;
2663
2664 if ((env->eflags & VM_MASK)) {
2665 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2666 set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2667 set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2668 set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2669 set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2670 set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2671 } else {
2672 set_seg(&sregs.cs, &env->segs[R_CS]);
2673 set_seg(&sregs.ds, &env->segs[R_DS]);
2674 set_seg(&sregs.es, &env->segs[R_ES]);
2675 set_seg(&sregs.fs, &env->segs[R_FS]);
2676 set_seg(&sregs.gs, &env->segs[R_GS]);
2677 set_seg(&sregs.ss, &env->segs[R_SS]);
2678 }
2679
2680 set_seg(&sregs.tr, &env->tr);
2681 set_seg(&sregs.ldt, &env->ldt);
2682
2683 sregs.idt.limit = env->idt.limit;
2684 sregs.idt.base = env->idt.base;
2685 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2686 sregs.gdt.limit = env->gdt.limit;
2687 sregs.gdt.base = env->gdt.base;
2688 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2689
2690 sregs.cr0 = env->cr[0];
2691 sregs.cr2 = env->cr[2];
2692 sregs.cr3 = env->cr[3];
2693 sregs.cr4 = env->cr[4];
2694
2695 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2696 sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2697
2698 sregs.efer = env->efer;
2699
2700 if (env->pdptrs_valid) {
2701 for (i = 0; i < 4; i++) {
2702 sregs.pdptrs[i] = env->pdptrs[i];
2703 }
2704 sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
2705 }
2706
2707 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs);
2708 }
2709
2710
2711 static void kvm_msr_buf_reset(X86CPU *cpu)
2712 {
2713 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
2714 }
2715
2716 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
2717 {
2718 struct kvm_msrs *msrs = cpu->kvm_msr_buf;
2719 void *limit = ((void *)msrs) + MSR_BUF_SIZE;
2720 struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
2721
2722 assert((void *)(entry + 1) <= limit);
2723
2724 entry->index = index;
2725 entry->reserved = 0;
2726 entry->data = value;
2727 msrs->nmsrs++;
2728 }
2729
2730 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
2731 {
2732 kvm_msr_buf_reset(cpu);
2733 kvm_msr_entry_add(cpu, index, value);
2734
2735 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2736 }
2737
2738 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
2739 {
2740 int ret;
2741
2742 ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
2743 assert(ret == 1);
2744 }
2745
2746 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
2747 {
2748 CPUX86State *env = &cpu->env;
2749 int ret;
2750
2751 if (!has_msr_tsc_deadline) {
2752 return 0;
2753 }
2754
2755 ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
2756 if (ret < 0) {
2757 return ret;
2758 }
2759
2760 assert(ret == 1);
2761 return 0;
2762 }
2763
2764 /*
2765 * Provide a separate write service for the feature control MSR in order to
2766 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
2767 * before writing any other state because forcibly leaving nested mode
2768 * invalidates the VCPU state.
2769 */
2770 static int kvm_put_msr_feature_control(X86CPU *cpu)
2771 {
2772 int ret;
2773
2774 if (!has_msr_feature_control) {
2775 return 0;
2776 }
2777
2778 ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
2779 cpu->env.msr_ia32_feature_control);
2780 if (ret < 0) {
2781 return ret;
2782 }
2783
2784 assert(ret == 1);
2785 return 0;
2786 }
2787
2788 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
2789 {
2790 uint32_t default1, can_be_one, can_be_zero;
2791 uint32_t must_be_one;
2792
2793 switch (index) {
2794 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2795 default1 = 0x00000016;
2796 break;
2797 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2798 default1 = 0x0401e172;
2799 break;
2800 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2801 default1 = 0x000011ff;
2802 break;
2803 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2804 default1 = 0x00036dff;
2805 break;
2806 case MSR_IA32_VMX_PROCBASED_CTLS2:
2807 default1 = 0;
2808 break;
2809 default:
2810 abort();
2811 }
2812
2813 /* If a feature bit is set, the control can be either set or clear.
2814 * Otherwise the control is fixed to the corresponding bit in default1.
2815 */
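    /*
     * Illustrative example: for MSR_IA32_VMX_TRUE_PINBASED_CTLS, default1 is
     * 0x16. With features == 0 the result is 0x0000001600000016 (the default1
     * bits both "can be" and "must be" one). Adding a feature bit such as
     * 0x80 sets it only in the upper half, i.e. the guest may set it but is
     * not forced to.
     */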
2816 can_be_one = features | default1;
2817 can_be_zero = features | ~default1;
2818 must_be_one = ~can_be_zero;
2819
2820 /*
2821 * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
2822 * Bit 32:63 -> 1 if the control bit can be one.
2823 */
2824 return must_be_one | (((uint64_t)can_be_one) << 32);
2825 }
2826
2827 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
2828 {
2829 uint64_t kvm_vmx_basic =
2830 kvm_arch_get_supported_msr_feature(kvm_state,
2831 MSR_IA32_VMX_BASIC);
2832
2833 if (!kvm_vmx_basic) {
2834 /* If the kernel doesn't support the VMX feature (kvm_intel.nested=0),
2835 * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
2836 */
2837 return;
2838 }
2839
2840 uint64_t kvm_vmx_misc =
2841 kvm_arch_get_supported_msr_feature(kvm_state,
2842 MSR_IA32_VMX_MISC);
2843 uint64_t kvm_vmx_ept_vpid =
2844 kvm_arch_get_supported_msr_feature(kvm_state,
2845 MSR_IA32_VMX_EPT_VPID_CAP);
2846
2847 /*
2848 * If the guest is 64-bit, a value of 1 is allowed for the host address
2849 * space size vmexit control.
2850 */
2851 uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
2852 ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
2853
2854 /*
2855 * Bits 0-30, 32-44 and 50-53 come from the host. KVM should
2856 * not change them for backwards compatibility.
2857 */
2858 uint64_t fixed_vmx_basic = kvm_vmx_basic &
2859 (MSR_VMX_BASIC_VMCS_REVISION_MASK |
2860 MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
2861 MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
2862
2863 /*
2864 * Same for bits 0-4 and 25-27. Bits 16-24 (CR3 target count) can
2865 * change in the future but are always zero for now; clear them to be
2866 * future-proof. Bits 32-63 in theory could change, though KVM does
2867 * not support dual-monitor treatment and probably never will; mask
2868 * them out as well.
2869 */
2870 uint64_t fixed_vmx_misc = kvm_vmx_misc &
2871 (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
2872 MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
2873
2874 /*
2875 * EPT memory types should not change either, so we do not bother
2876 * adding features for them.
2877 */
2878 uint64_t fixed_vmx_ept_mask =
2879 (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
2880 MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
2881 uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
2882
2883 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
2884 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
2885 f[FEAT_VMX_PROCBASED_CTLS]));
2886 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
2887 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
2888 f[FEAT_VMX_PINBASED_CTLS]));
2889 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
2890 make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
2891 f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
2892 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
2893 make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
2894 f[FEAT_VMX_ENTRY_CTLS]));
2895 kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
2896 make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
2897 f[FEAT_VMX_SECONDARY_CTLS]));
2898 kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
2899 f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
2900 kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
2901 f[FEAT_VMX_BASIC] | fixed_vmx_basic);
2902 kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
2903 f[FEAT_VMX_MISC] | fixed_vmx_misc);
2904 if (has_msr_vmx_vmfunc) {
2905 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
2906 }
2907
2908 /*
2909 * Just to be safe, write these with constant values. The CRn_FIXED1
2910 * MSRs are generated by KVM based on the vCPU's CPUID.
2911 */
2912 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
2913 CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
2914 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
2915 CR4_VMXE_MASK);
2916
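    /*
     * VMCS_ENUM reports the highest VMCS field index in bits 9:1; 0x32
     * corresponds to the TSC multiplier field (encoding 0x2032) and 0x2E to
     * the VMX preemption timer (encoding 0x482E).
     */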
2917 if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
2918 /* TSC multiplier (0x2032). */
2919 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
2920 } else {
2921 /* Preemption timer (0x482E). */
2922 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
2923 }
2924 }
2925
2926 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
2927 {
2928 uint64_t kvm_perf_cap =
2929 kvm_arch_get_supported_msr_feature(kvm_state,
2930 MSR_IA32_PERF_CAPABILITIES);
2931
2932 if (kvm_perf_cap) {
2933 kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
2934 kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
2935 }
2936 }
2937
2938 static int kvm_buf_set_msrs(X86CPU *cpu)
2939 {
2940 int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2941 if (ret < 0) {
2942 return ret;
2943 }
2944
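    /*
     * KVM_SET_MSRS returns the number of entries it processed, so a short
     * count identifies the first MSR the kernel refused to set.
     */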
2945 if (ret < cpu->kvm_msr_buf->nmsrs) {
2946 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2947 error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
2948 (uint32_t)e->index, (uint64_t)e->data);
2949 }
2950
2951 assert(ret == cpu->kvm_msr_buf->nmsrs);
2952 return 0;
2953 }
2954
2955 static void kvm_init_msrs(X86CPU *cpu)
2956 {
2957 CPUX86State *env = &cpu->env;
2958
2959 kvm_msr_buf_reset(cpu);
2960 if (has_msr_arch_capabs) {
2961 kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
2962 env->features[FEAT_ARCH_CAPABILITIES]);
2963 }
2964
2965 if (has_msr_core_capabs) {
2966 kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
2967 env->features[FEAT_CORE_CAPABILITY]);
2968 }
2969
2970 if (has_msr_perf_capabs && cpu->enable_pmu) {
2971 kvm_msr_entry_add_perf(cpu, env->features);
2972 }
2973
2974 if (has_msr_ucode_rev) {
2975 kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
2976 }
2977
2978 /*
2979 * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
2980 * all kernels with MSR features should have them.
2981 */
2982 if (kvm_feature_msrs && cpu_has_vmx(env)) {
2983 kvm_msr_entry_add_vmx(cpu, env->features);
2984 }
2985
2986 assert(kvm_buf_set_msrs(cpu) == 0);
2987 }
2988
2989 static int kvm_put_msrs(X86CPU *cpu, int level)
2990 {
2991 CPUX86State *env = &cpu->env;
2992 int i;
2993
2994 kvm_msr_buf_reset(cpu);
2995
2996 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
2997 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
2998 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
2999 kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
3000 if (has_msr_star) {
3001 kvm_msr_entry_add(cpu, MSR_STAR, env->star);
3002 }
3003 if (has_msr_hsave_pa) {
3004 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
3005 }
3006 if (has_msr_tsc_aux) {
3007 kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
3008 }
3009 if (has_msr_tsc_adjust) {
3010 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
3011 }
3012 if (has_msr_misc_enable) {
3013 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
3014 env->msr_ia32_misc_enable);
3015 }
3016 if (has_msr_smbase) {
3017 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
3018 }
3019 if (has_msr_smi_count) {
3020 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
3021 }
3022 if (has_msr_pkrs) {
3023 kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
3024 }
3025 if (has_msr_bndcfgs) {
3026 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
3027 }
3028 if (has_msr_xss) {
3029 kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
3030 }
3031 if (has_msr_umwait) {
3032 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
3033 }
3034 if (has_msr_spec_ctrl) {
3035 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
3036 }
3037 if (has_tsc_scale_msr) {
3038 kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
3039 }
3040
3041 if (has_msr_tsx_ctrl) {
3042 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
3043 }
3044 if (has_msr_virt_ssbd) {
3045 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
3046 }
3047
3048 #ifdef TARGET_X86_64
3049 if (lm_capable_kernel) {
3050 kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
3051 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
3052 kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
3053 kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
3054 }
3055 #endif
3056
3057 /*
3058 * The following MSRs have side effects on the guest or are too heavy
3059 * for normal writeback. Limit them to reset or full state updates.
3060 */
3061 if (level >= KVM_PUT_RESET_STATE) {
3062 kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
3063 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
3064 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
3065 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3066 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
3067 }
3068 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3069 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
3070 }
3071 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3072 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
3073 }
3074 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3075 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
3076 }
3077
3078 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3079 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
3080 }
3081
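        /*
         * Entries in the MSR buffer are applied in order, so stopping the
         * PMU, loading the counters and re-enabling it can all be batched
         * into the single KVM_SET_MSRS call issued at the end.
         */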
3082 if (has_architectural_pmu_version > 0) {
3083 if (has_architectural_pmu_version > 1) {
3084 /* Stop the counter. */
3085 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3086 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3087 }
3088
3089 /* Set the counter values. */
3090 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3091 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
3092 env->msr_fixed_counters[i]);
3093 }
3094 for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3095 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
3096 env->msr_gp_counters[i]);
3097 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
3098 env->msr_gp_evtsel[i]);
3099 }
3100 if (has_architectural_pmu_version > 1) {
3101 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
3102 env->msr_global_status);
3103 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
3104 env->msr_global_ovf_ctrl);
3105
3106 /* Now start the PMU. */
3107 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
3108 env->msr_fixed_ctr_ctrl);
3109 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
3110 env->msr_global_ctrl);
3111 }
3112 }
3113 /*
3114 * Hyper-V partition-wide MSRs: to avoid clearing them on CPU hot-add,
3115 * only sync them to KVM on the first CPU.
3116 */
3117 if (current_cpu == first_cpu) {
3118 if (has_msr_hv_hypercall) {
3119 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
3120 env->msr_hv_guest_os_id);
3121 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
3122 env->msr_hv_hypercall);
3123 }
3124 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3125 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
3126 env->msr_hv_tsc);
3127 }
3128 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3129 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
3130 env->msr_hv_reenlightenment_control);
3131 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
3132 env->msr_hv_tsc_emulation_control);
3133 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
3134 env->msr_hv_tsc_emulation_status);
3135 }
3136 }
3137 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3138 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
3139 env->msr_hv_vapic);
3140 }
3141 if (has_msr_hv_crash) {
3142 int j;
3143
3144 for (j = 0; j < HV_CRASH_PARAMS; j++)
3145 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
3146 env->msr_hv_crash_params[j]);
3147
3148 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
3149 }
3150 if (has_msr_hv_runtime) {
3151 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
3152 }
3153 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
3154 && hv_vpindex_settable) {
3155 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
3156 hyperv_vp_index(CPU(cpu)));
3157 }
3158 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3159 int j;
3160
3161 kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
3162
3163 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
3164 env->msr_hv_synic_control);
3165 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
3166 env->msr_hv_synic_evt_page);
3167 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
3168 env->msr_hv_synic_msg_page);
3169
3170 for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
3171 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
3172 env->msr_hv_synic_sint[j]);
3173 }
3174 }
3175 if (has_msr_hv_stimer) {
3176 int j;
3177
3178 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
3179 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
3180 env->msr_hv_stimer_config[j]);
3181 }
3182
3183 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
3184 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
3185 env->msr_hv_stimer_count[j]);
3186 }
3187 }
3188 if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3189 uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
3190
3191 kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
3192 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
3193 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
3194 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
3195 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
3196 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
3197 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
3198 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
3199 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
3200 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
3201 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
3202 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
3203 for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3204 /* The CPU GPs if we write to a bit above the physical limit of
3205 * the host CPU (and KVM emulates that).
3206 */
3207 uint64_t mask = env->mtrr_var[i].mask;
3208 mask &= phys_mask;
3209
3210 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
3211 env->mtrr_var[i].base);
3212 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
3213 }
3214 }
3215 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3216 int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
3217 0x14, 1, R_EAX) & 0x7;
3218
3219 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
3220 env->msr_rtit_ctrl);
3221 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
3222 env->msr_rtit_status);
3223 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
3224 env->msr_rtit_output_base);
3225 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
3226 env->msr_rtit_output_mask);
3227 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
3228 env->msr_rtit_cr3_match);
3229 for (i = 0; i < addr_num; i++) {
3230 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
3231 env->msr_rtit_addrs[i]);
3232 }
3233 }
3234
3235 if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3236 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
3237 env->msr_ia32_sgxlepubkeyhash[0]);
3238 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
3239 env->msr_ia32_sgxlepubkeyhash[1]);
3240 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
3241 env->msr_ia32_sgxlepubkeyhash[2]);
3242 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
3243 env->msr_ia32_sgxlepubkeyhash[3]);
3244 }
3245
3246 /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3247 * kvm_put_msr_feature_control. */
3248 }
3249
3250 if (env->mcg_cap) {
3251 int i;
3252
3253 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3254 kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3255 if (has_msr_mcg_ext_ctl) {
3256 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3257 }
3258 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3259 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3260 }
3261 }
3262
3263 return kvm_buf_set_msrs(cpu);
3264 }
3265
3266
3267 static int kvm_get_fpu(X86CPU *cpu)
3268 {
3269 CPUX86State *env = &cpu->env;
3270 struct kvm_fpu fpu;
3271 int i, ret;
3272
3273 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
3274 if (ret < 0) {
3275 return ret;
3276 }
3277
3278 env->fpstt = (fpu.fsw >> 11) & 7;
3279 env->fpus = fpu.fsw;
3280 env->fpuc = fpu.fcw;
3281 env->fpop = fpu.last_opcode;
3282 env->fpip = fpu.last_ip;
3283 env->fpdp = fpu.last_dp;
3284 for (i = 0; i < 8; ++i) {
3285 env->fptags[i] = !((fpu.ftwx >> i) & 1);
3286 }
3287 memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
3288 for (i = 0; i < CPU_NB_REGS; i++) {
3289 env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
3290 env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
3291 }
3292 env->mxcsr = fpu.mxcsr;
3293
3294 return 0;
3295 }
3296
3297 static int kvm_get_xsave(X86CPU *cpu)
3298 {
3299 CPUX86State *env = &cpu->env;
3300 void *xsave = env->xsave_buf;
3301 int ret;
3302
3303 if (!has_xsave) {
3304 return kvm_get_fpu(cpu);
3305 }
3306
3307 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
3308 if (ret < 0) {
3309 return ret;
3310 }
3311 x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
3312
3313 return 0;
3314 }
3315
3316 static int kvm_get_xcrs(X86CPU *cpu)
3317 {
3318 CPUX86State *env = &cpu->env;
3319 int i, ret;
3320 struct kvm_xcrs xcrs;
3321
3322 if (!has_xcrs) {
3323 return 0;
3324 }
3325
3326 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3327 if (ret < 0) {
3328 return ret;
3329 }
3330
3331 for (i = 0; i < xcrs.nr_xcrs; i++) {
3332 /* Only support xcr0 now */
3333 if (xcrs.xcrs[i].xcr == 0) {
3334 env->xcr0 = xcrs.xcrs[i].value;
3335 break;
3336 }
3337 }
3338 return 0;
3339 }
3340
3341 static int kvm_get_sregs(X86CPU *cpu)
3342 {
3343 CPUX86State *env = &cpu->env;
3344 struct kvm_sregs sregs;
3345 int ret;
3346
3347 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3348 if (ret < 0) {
3349 return ret;
3350 }
3351
3352 /*
3353 * The interrupt_bitmap is ignored because KVM_GET_SREGS is
3354 * always preceded by KVM_GET_VCPU_EVENTS.
3355 */
3356
3357 get_seg(&env->segs[R_CS], &sregs.cs);
3358 get_seg(&env->segs[R_DS], &sregs.ds);
3359 get_seg(&env->segs[R_ES], &sregs.es);
3360 get_seg(&env->segs[R_FS], &sregs.fs);
3361 get_seg(&env->segs[R_GS], &sregs.gs);
3362 get_seg(&env->segs[R_SS], &sregs.ss);
3363
3364 get_seg(&env->tr, &sregs.tr);
3365 get_seg(&env->ldt, &sregs.ldt);
3366
3367 env->idt.limit = sregs.idt.limit;
3368 env->idt.base = sregs.idt.base;
3369 env->gdt.limit = sregs.gdt.limit;
3370 env->gdt.base = sregs.gdt.base;
3371
3372 env->cr[0] = sregs.cr0;
3373 env->cr[2] = sregs.cr2;
3374 env->cr[3] = sregs.cr3;
3375 env->cr[4] = sregs.cr4;
3376
3377 env->efer = sregs.efer;
3378
3379 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3380 x86_update_hflags(env);
3381
3382 return 0;
3383 }
3384
3385 static int kvm_get_sregs2(X86CPU *cpu)
3386 {
3387 CPUX86State *env = &cpu->env;
3388 struct kvm_sregs2 sregs;
3389 int i, ret;
3390
3391 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
3392 if (ret < 0) {
3393 return ret;
3394 }
3395
3396 get_seg(&env->segs[R_CS], &sregs.cs);
3397 get_seg(&env->segs[R_DS], &sregs.ds);
3398 get_seg(&env->segs[R_ES], &sregs.es);
3399 get_seg(&env->segs[R_FS], &sregs.fs);
3400 get_seg(&env->segs[R_GS], &sregs.gs);
3401 get_seg(&env->segs[R_SS], &sregs.ss);
3402
3403 get_seg(&env->tr, &sregs.tr);
3404 get_seg(&env->ldt, &sregs.ldt);
3405
3406 env->idt.limit = sregs.idt.limit;
3407 env->idt.base = sregs.idt.base;
3408 env->gdt.limit = sregs.gdt.limit;
3409 env->gdt.base = sregs.gdt.base;
3410
3411 env->cr[0] = sregs.cr0;
3412 env->cr[2] = sregs.cr2;
3413 env->cr[3] = sregs.cr3;
3414 env->cr[4] = sregs.cr4;
3415
3416 env->efer = sregs.efer;
3417
3418 env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
3419
3420 if (env->pdptrs_valid) {
3421 for (i = 0; i < 4; i++) {
3422 env->pdptrs[i] = sregs.pdptrs[i];
3423 }
3424 }
3425
3426 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3427 x86_update_hflags(env);
3428
3429 return 0;
3430 }
3431
3432 static int kvm_get_msrs(X86CPU *cpu)
3433 {
3434 CPUX86State *env = &cpu->env;
3435 struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3436 int ret, i;
3437 uint64_t mtrr_top_bits;
3438
3439 kvm_msr_buf_reset(cpu);
3440
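    /*
     * Build the list of MSR indices to read; the values added here are
     * placeholders that KVM overwrites when KVM_GET_MSRS returns.
     */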
3441 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3442 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3443 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3444 kvm_msr_entry_add(cpu, MSR_PAT, 0);
3445 if (has_msr_star) {
3446 kvm_msr_entry_add(cpu, MSR_STAR, 0);
3447 }
3448 if (has_msr_hsave_pa) {
3449 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3450 }
3451 if (has_msr_tsc_aux) {
3452 kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3453 }
3454 if (has_msr_tsc_adjust) {
3455 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3456 }
3457 if (has_msr_tsc_deadline) {
3458 kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3459 }
3460 if (has_msr_misc_enable) {
3461 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3462 }
3463 if (has_msr_smbase) {
3464 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3465 }
3466 if (has_msr_smi_count) {
3467 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3468 }
3469 if (has_msr_feature_control) {
3470 kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3471 }
3472 if (has_msr_pkrs) {
3473 kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
3474 }
3475 if (has_msr_bndcfgs) {
3476 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3477 }
3478 if (has_msr_xss) {
3479 kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3480 }
3481 if (has_msr_umwait) {
3482 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3483 }
3484 if (has_msr_spec_ctrl) {
3485 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3486 }
3487 if (has_tsc_scale_msr) {
3488 kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
3489 }
3490
3491 if (has_msr_tsx_ctrl) {
3492 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3493 }
3494 if (has_msr_virt_ssbd) {
3495 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3496 }
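    /*
     * While the guest runs the TSC keeps moving, so it is re-read on every
     * sync; once the VM is stopped, a single read stays valid until the
     * guest runs again.
     */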
3497 if (!env->tsc_valid) {
3498 kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3499 env->tsc_valid = !runstate_is_running();
3500 }
3501
3502 #ifdef TARGET_X86_64
3503 if (lm_capable_kernel) {
3504 kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3505 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3506 kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3507 kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3508 }
3509 #endif
3510 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3511 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3512 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3513 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3514 }
3515 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3516 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3517 }
3518 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3519 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3520 }
3521 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3522 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3523 }
3524 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3525 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3526 }
3527 if (has_architectural_pmu_version > 0) {
3528 if (has_architectural_pmu_version > 1) {
3529 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3530 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3531 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3532 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3533 }
3534 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3535 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3536 }
3537 for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3538 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3539 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3540 }
3541 }
3542
3543 if (env->mcg_cap) {
3544 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3545 kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3546 if (has_msr_mcg_ext_ctl) {
3547 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3548 }
3549 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3550 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3551 }
3552 }
3553
3554 if (has_msr_hv_hypercall) {
3555 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3556 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3557 }
3558 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3559 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3560 }
3561 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3562 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3563 }
3564 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3565 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3566 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3567 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3568 }
3569 if (has_msr_hv_crash) {
3570 int j;
3571
3572 for (j = 0; j < HV_CRASH_PARAMS; j++) {
3573 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3574 }
3575 }
3576 if (has_msr_hv_runtime) {
3577 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3578 }
3579 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3580 uint32_t msr;
3581
3582 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3583 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3584 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3585 for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3586 kvm_msr_entry_add(cpu, msr, 0);
3587 }
3588 }
3589 if (has_msr_hv_stimer) {
3590 uint32_t msr;
3591
3592 for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3593 msr++) {
3594 kvm_msr_entry_add(cpu, msr, 0);
3595 }
3596 }
3597 if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3598 kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3599 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3600 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3601 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3602 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3603 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3604 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3605 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3606 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
3607 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
3608 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
3609 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
3610 for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3611 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
3612 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
3613 }
3614 }
3615
3616 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3617 int addr_num =
3618 kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
3619
3620 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
3621 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
3622 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
3623 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
3624 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
3625 for (i = 0; i < addr_num; i++) {
3626 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
3627 }
3628 }
3629
3630 if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3631 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
3632 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
3633 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
3634 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
3635 }
3636
3637 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
3638 if (ret < 0) {
3639 return ret;
3640 }
3641
3642 if (ret < cpu->kvm_msr_buf->nmsrs) {
3643 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3644 error_report("error: failed to get MSR 0x%" PRIx32,
3645 (uint32_t)e->index);
3646 }
3647
3648 assert(ret == cpu->kvm_msr_buf->nmsrs);
3649 /*
3650 * MTRR masks: Each mask consists of 5 parts
3651 * a 10..0: must be zero
3652 * b 11 : valid bit
3653 * c n-1..12: actual mask bits
3654 * d 51..n: reserved, must be zero
3655 * e 63..52: reserved, must be zero
3656 *
3657 * 'n' is the number of physical bits supported by the CPU and is
3658 * apparently always <= 52. We know our 'n' but don't know what
3659 * the destination's 'n' is; it might be smaller, in which case
3660 * it masks (c) on loading. It might be larger, in which case
3661 * we fill 'd' so that d..c is consistent irrespective of the 'n'
3662 * we're migrating to.
3663 */
3664
3665 if (cpu->fill_mtrr_mask) {
3666 QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
3667 assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
3668 mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
3669 } else {
3670 mtrr_top_bits = 0;
3671 }
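    /*
     * Example: with cpu->phys_bits == 46, mtrr_top_bits covers bits 46..51,
     * i.e. region (d) of the mask layout described above.
     */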
3672
3673 for (i = 0; i < ret; i++) {
3674 uint32_t index = msrs[i].index;
3675 switch (index) {
3676 case MSR_IA32_SYSENTER_CS:
3677 env->sysenter_cs = msrs[i].data;
3678 break;
3679 case MSR_IA32_SYSENTER_ESP:
3680 env->sysenter_esp = msrs[i].data;
3681 break;
3682 case MSR_IA32_SYSENTER_EIP:
3683 env->sysenter_eip = msrs[i].data;
3684 break;
3685 case MSR_PAT:
3686 env->pat = msrs[i].data;
3687 break;
3688 case MSR_STAR:
3689 env->star = msrs[i].data;
3690 break;
3691 #ifdef TARGET_X86_64
3692 case MSR_CSTAR:
3693 env->cstar = msrs[i].data;
3694 break;
3695 case MSR_KERNELGSBASE:
3696 env->kernelgsbase = msrs[i].data;
3697 break;
3698 case MSR_FMASK:
3699 env->fmask = msrs[i].data;
3700 break;
3701 case MSR_LSTAR:
3702 env->lstar = msrs[i].data;
3703 break;
3704 #endif
3705 case MSR_IA32_TSC:
3706 env->tsc = msrs[i].data;
3707 break;
3708 case MSR_TSC_AUX:
3709 env->tsc_aux = msrs[i].data;
3710 break;
3711 case MSR_TSC_ADJUST:
3712 env->tsc_adjust = msrs[i].data;
3713 break;
3714 case MSR_IA32_TSCDEADLINE:
3715 env->tsc_deadline = msrs[i].data;
3716 break;
3717 case MSR_VM_HSAVE_PA:
3718 env->vm_hsave = msrs[i].data;
3719 break;
3720 case MSR_KVM_SYSTEM_TIME:
3721 env->system_time_msr = msrs[i].data;
3722 break;
3723 case MSR_KVM_WALL_CLOCK:
3724 env->wall_clock_msr = msrs[i].data;
3725 break;
3726 case MSR_MCG_STATUS:
3727 env->mcg_status = msrs[i].data;
3728 break;
3729 case MSR_MCG_CTL:
3730 env->mcg_ctl = msrs[i].data;
3731 break;
3732 case MSR_MCG_EXT_CTL:
3733 env->mcg_ext_ctl = msrs[i].data;
3734 break;
3735 case MSR_IA32_MISC_ENABLE:
3736 env->msr_ia32_misc_enable = msrs[i].data;
3737 break;
3738 case MSR_IA32_SMBASE:
3739 env->smbase = msrs[i].data;
3740 break;
3741 case MSR_SMI_COUNT:
3742 env->msr_smi_count = msrs[i].data;
3743 break;
3744 case MSR_IA32_FEATURE_CONTROL:
3745 env->msr_ia32_feature_control = msrs[i].data;
3746 break;
3747 case MSR_IA32_BNDCFGS:
3748 env->msr_bndcfgs = msrs[i].data;
3749 break;
3750 case MSR_IA32_XSS:
3751 env->xss = msrs[i].data;
3752 break;
3753 case MSR_IA32_UMWAIT_CONTROL:
3754 env->umwait = msrs[i].data;
3755 break;
3756 case MSR_IA32_PKRS:
3757 env->pkrs = msrs[i].data;
3758 break;
3759 default:
3760 if (msrs[i].index >= MSR_MC0_CTL &&
3761 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
3762 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
3763 }
3764 break;
3765 case MSR_KVM_ASYNC_PF_EN:
3766 env->async_pf_en_msr = msrs[i].data;
3767 break;
3768 case MSR_KVM_ASYNC_PF_INT:
3769 env->async_pf_int_msr = msrs[i].data;
3770 break;
3771 case MSR_KVM_PV_EOI_EN:
3772 env->pv_eoi_en_msr = msrs[i].data;
3773 break;
3774 case MSR_KVM_STEAL_TIME:
3775 env->steal_time_msr = msrs[i].data;
3776 break;
3777 case MSR_KVM_POLL_CONTROL: {
3778 env->poll_control_msr = msrs[i].data;
3779 break;
3780 }
3781 case MSR_CORE_PERF_FIXED_CTR_CTRL:
3782 env->msr_fixed_ctr_ctrl = msrs[i].data;
3783 break;
3784 case MSR_CORE_PERF_GLOBAL_CTRL:
3785 env->msr_global_ctrl = msrs[i].data;
3786 break;
3787 case MSR_CORE_PERF_GLOBAL_STATUS:
3788 env->msr_global_status = msrs[i].data;
3789 break;
3790 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
3791 env->msr_global_ovf_ctrl = msrs[i].data;
3792 break;
3793 case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
3794 env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
3795 break;
3796 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
3797 env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
3798 break;
3799 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
3800 env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
3801 break;
3802 case HV_X64_MSR_HYPERCALL:
3803 env->msr_hv_hypercall = msrs[i].data;
3804 break;
3805 case HV_X64_MSR_GUEST_OS_ID:
3806 env->msr_hv_guest_os_id = msrs[i].data;
3807 break;
3808 case HV_X64_MSR_APIC_ASSIST_PAGE:
3809 env->msr_hv_vapic = msrs[i].data;
3810 break;
3811 case HV_X64_MSR_REFERENCE_TSC:
3812 env->msr_hv_tsc = msrs[i].data;
3813 break;
3814 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3815 env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
3816 break;
3817 case HV_X64_MSR_VP_RUNTIME:
3818 env->msr_hv_runtime = msrs[i].data;
3819 break;
3820 case HV_X64_MSR_SCONTROL:
3821 env->msr_hv_synic_control = msrs[i].data;
3822 break;
3823 case HV_X64_MSR_SIEFP:
3824 env->msr_hv_synic_evt_page = msrs[i].data;
3825 break;
3826 case HV_X64_MSR_SIMP:
3827 env->msr_hv_synic_msg_page = msrs[i].data;
3828 break;
3829 case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
3830 env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
3831 break;
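/*
 * The STIMERn_CONFIG and STIMERn_COUNT MSRs interleave (CONFIG, COUNT,
 * CONFIG, COUNT, ...), hence the division by 2 when computing the timer
 * index below.
 */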
3832 case HV_X64_MSR_STIMER0_CONFIG:
3833 case HV_X64_MSR_STIMER1_CONFIG:
3834 case HV_X64_MSR_STIMER2_CONFIG:
3835 case HV_X64_MSR_STIMER3_CONFIG:
3836 env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
3837 msrs[i].data;
3838 break;
3839 case HV_X64_MSR_STIMER0_COUNT:
3840 case HV_X64_MSR_STIMER1_COUNT:
3841 case HV_X64_MSR_STIMER2_COUNT:
3842 case HV_X64_MSR_STIMER3_COUNT:
3843 env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
3844 msrs[i].data;
3845 break;
3846 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3847 env->msr_hv_reenlightenment_control = msrs[i].data;
3848 break;
3849 case HV_X64_MSR_TSC_EMULATION_CONTROL:
3850 env->msr_hv_tsc_emulation_control = msrs[i].data;
3851 break;
3852 case HV_X64_MSR_TSC_EMULATION_STATUS:
3853 env->msr_hv_tsc_emulation_status = msrs[i].data;
3854 break;
3855 case MSR_MTRRdefType:
3856 env->mtrr_deftype = msrs[i].data;
3857 break;
3858 case MSR_MTRRfix64K_00000:
3859 env->mtrr_fixed[0] = msrs[i].data;
3860 break;
3861 case MSR_MTRRfix16K_80000:
3862 env->mtrr_fixed[1] = msrs[i].data;
3863 break;
3864 case MSR_MTRRfix16K_A0000:
3865 env->mtrr_fixed[2] = msrs[i].data;
3866 break;
3867 case MSR_MTRRfix4K_C0000:
3868 env->mtrr_fixed[3] = msrs[i].data;
3869 break;
3870 case MSR_MTRRfix4K_C8000:
3871 env->mtrr_fixed[4] = msrs[i].data;
3872 break;
3873 case MSR_MTRRfix4K_D0000:
3874 env->mtrr_fixed[5] = msrs[i].data;
3875 break;
3876 case MSR_MTRRfix4K_D8000:
3877 env->mtrr_fixed[6] = msrs[i].data;
3878 break;
3879 case MSR_MTRRfix4K_E0000:
3880 env->mtrr_fixed[7] = msrs[i].data;
3881 break;
3882 case MSR_MTRRfix4K_E8000:
3883 env->mtrr_fixed[8] = msrs[i].data;
3884 break;
3885 case MSR_MTRRfix4K_F0000:
3886 env->mtrr_fixed[9] = msrs[i].data;
3887 break;
3888 case MSR_MTRRfix4K_F8000:
3889 env->mtrr_fixed[10] = msrs[i].data;
3890 break;
3891 case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
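/* physBase and physMask MSRs alternate; odd MSR indices are the mask half. */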
3892 if (index & 1) {
3893 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
3894 mtrr_top_bits;
3895 } else {
3896 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
3897 }
3898 break;
3899 case MSR_IA32_SPEC_CTRL:
3900 env->spec_ctrl = msrs[i].data;
3901 break;
3902 case MSR_AMD64_TSC_RATIO:
3903 env->amd_tsc_scale_msr = msrs[i].data;
3904 break;
3905 case MSR_IA32_TSX_CTRL:
3906 env->tsx_ctrl = msrs[i].data;
3907 break;
3908 case MSR_VIRT_SSBD:
3909 env->virt_ssbd = msrs[i].data;
3910 break;
3911 case MSR_IA32_RTIT_CTL:
3912 env->msr_rtit_ctrl = msrs[i].data;
3913 break;
3914 case MSR_IA32_RTIT_STATUS:
3915 env->msr_rtit_status = msrs[i].data;
3916 break;
3917 case MSR_IA32_RTIT_OUTPUT_BASE:
3918 env->msr_rtit_output_base = msrs[i].data;
3919 break;
3920 case MSR_IA32_RTIT_OUTPUT_MASK:
3921 env->msr_rtit_output_mask = msrs[i].data;
3922 break;
3923 case MSR_IA32_RTIT_CR3_MATCH:
3924 env->msr_rtit_cr3_match = msrs[i].data;
3925 break;
3926 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
3927 env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
3928 break;
3929 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
3930 env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
3931 msrs[i].data;
3932 break;
3933 }
3934 }
3935
3936 return 0;
3937 }
3938
3939 static int kvm_put_mp_state(X86CPU *cpu)
3940 {
3941 struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
3942
3943 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
3944 }
3945
3946 static int kvm_get_mp_state(X86CPU *cpu)
3947 {
3948 CPUState *cs = CPU(cpu);
3949 CPUX86State *env = &cpu->env;
3950 struct kvm_mp_state mp_state;
3951 int ret;
3952
3953 ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
3954 if (ret < 0) {
3955 return ret;
3956 }
3957 env->mp_state = mp_state.mp_state;
3958 if (kvm_irqchip_in_kernel()) {
3959 cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
3960 }
3961 return 0;
3962 }
3963
3964 static int kvm_get_apic(X86CPU *cpu)
3965 {
3966 DeviceState *apic = cpu->apic_state;
3967 struct kvm_lapic_state kapic;
3968 int ret;
3969
3970 if (apic && kvm_irqchip_in_kernel()) {
3971 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
3972 if (ret < 0) {
3973 return ret;
3974 }
3975
3976 kvm_get_apic_state(apic, &kapic);
3977 }
3978 return 0;
3979 }
3980
3981 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
3982 {
3983 CPUState *cs = CPU(cpu);
3984 CPUX86State *env = &cpu->env;
3985 struct kvm_vcpu_events events = {};
3986
3987 if (!kvm_has_vcpu_events()) {
3988 return 0;
3989 }
3990
3991 events.flags = 0;
3992
3993 if (has_exception_payload) {
3994 events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
3995 events.exception.pending = env->exception_pending;
3996 events.exception_has_payload = env->exception_has_payload;
3997 events.exception_payload = env->exception_payload;
3998 }
3999 events.exception.nr = env->exception_nr;
4000 events.exception.injected = env->exception_injected;
4001 events.exception.has_error_code = env->has_error_code;
4002 events.exception.error_code = env->error_code;
4003
4004 events.interrupt.injected = (env->interrupt_injected >= 0);
4005 events.interrupt.nr = env->interrupt_injected;
4006 events.interrupt.soft = env->soft_interrupt;
4007
4008 events.nmi.injected = env->nmi_injected;
4009 events.nmi.pending = env->nmi_pending;
4010 events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
4011
4012 events.sipi_vector = env->sipi_vector;
4013
4014 if (has_msr_smbase) {
4015 events.smi.smm = !!(env->hflags & HF_SMM_MASK);
4016 events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
4017 if (kvm_irqchip_in_kernel()) {
4018 /* As soon as these are moved to the kernel, remove them
4019 * from cs->interrupt_request.
4020 */
4021 events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
4022 events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
4023 cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
4024 } else {
4025 /* Keep these in cs->interrupt_request. */
4026 events.smi.pending = 0;
4027 events.smi.latched_init = 0;
4028 }
4029 /* Stop SMI delivery on old machine types to avoid a reboot
4030 * on an incoming migration of an old VM.
4031 */
4032 if (!cpu->kvm_no_smi_migration) {
4033 events.flags |= KVM_VCPUEVENT_VALID_SMM;
4034 }
4035 }
4036
4037 if (level >= KVM_PUT_RESET_STATE) {
4038 events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
4039 if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
4040 events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
4041 }
4042 }
4043
4044 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
4045 }
4046
4047 static int kvm_get_vcpu_events(X86CPU *cpu)
4048 {
4049 CPUX86State *env = &cpu->env;
4050 struct kvm_vcpu_events events;
4051 int ret;
4052
4053 if (!kvm_has_vcpu_events()) {
4054 return 0;
4055 }
4056
4057 memset(&events, 0, sizeof(events));
4058 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
4059 if (ret < 0) {
4060 return ret;
4061 }
4062
4063 if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4064 env->exception_pending = events.exception.pending;
4065 env->exception_has_payload = events.exception_has_payload;
4066 env->exception_payload = events.exception_payload;
4067 } else {
4068 env->exception_pending = 0;
4069 env->exception_has_payload = false;
4070 }
4071 env->exception_injected = events.exception.injected;
4072 env->exception_nr =
4073 (env->exception_pending || env->exception_injected) ?
4074 events.exception.nr : -1;
4075 env->has_error_code = events.exception.has_error_code;
4076 env->error_code = events.exception.error_code;
4077
4078 env->interrupt_injected =
4079 events.interrupt.injected ? events.interrupt.nr : -1;
4080 env->soft_interrupt = events.interrupt.soft;
4081
4082 env->nmi_injected = events.nmi.injected;
4083 env->nmi_pending = events.nmi.pending;
4084 if (events.nmi.masked) {
4085 env->hflags2 |= HF2_NMI_MASK;
4086 } else {
4087 env->hflags2 &= ~HF2_NMI_MASK;
4088 }
4089
4090 if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
4091 if (events.smi.smm) {
4092 env->hflags |= HF_SMM_MASK;
4093 } else {
4094 env->hflags &= ~HF_SMM_MASK;
4095 }
4096 if (events.smi.pending) {
4097 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4098 } else {
4099 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4100 }
4101 if (events.smi.smm_inside_nmi) {
4102 env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
4103 } else {
4104 env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
4105 }
4106 if (events.smi.latched_init) {
4107 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4108 } else {
4109 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4110 }
4111 }
4112
4113 env->sipi_vector = events.sipi_vector;
4114
4115 return 0;
4116 }
4117
4118 static int kvm_guest_debug_workarounds(X86CPU *cpu)
4119 {
4120 CPUState *cs = CPU(cpu);
4121 CPUX86State *env = &cpu->env;
4122 int ret = 0;
4123 unsigned long reinject_trap = 0;
4124
4125 if (!kvm_has_vcpu_events()) {
4126 if (env->exception_nr == EXCP01_DB) {
4127 reinject_trap = KVM_GUESTDBG_INJECT_DB;
4128 } else if (env->exception_injected == EXCP03_INT3) {
4129 reinject_trap = KVM_GUESTDBG_INJECT_BP;
4130 }
4131 kvm_reset_exception(env);
4132 }
4133
4134 /*
4135 * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
4136 * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
4137 * by updating the debug state once again if single-stepping is on.
4138 * Another reason to call kvm_update_guest_debug here is a pending debug
4139 * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
4140 * reinject them via SET_GUEST_DEBUG.
4141 */
4142 if (reinject_trap ||
4143 (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
4144 ret = kvm_update_guest_debug(cs, reinject_trap);
4145 }
4146 return ret;
4147 }
4148
4149 static int kvm_put_debugregs(X86CPU *cpu)
4150 {
4151 CPUX86State *env = &cpu->env;
4152 struct kvm_debugregs dbgregs;
4153 int i;
4154
4155 if (!kvm_has_debugregs()) {
4156 return 0;
4157 }
4158
4159 memset(&dbgregs, 0, sizeof(dbgregs));
4160 for (i = 0; i < 4; i++) {
4161 dbgregs.db[i] = env->dr[i];
4162 }
4163 dbgregs.dr6 = env->dr[6];
4164 dbgregs.dr7 = env->dr[7];
4165 dbgregs.flags = 0;
4166
4167 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
4168 }
4169
4170 static int kvm_get_debugregs(X86CPU *cpu)
4171 {
4172 CPUX86State *env = &cpu->env;
4173 struct kvm_debugregs dbgregs;
4174 int i, ret;
4175
4176 if (!kvm_has_debugregs()) {
4177 return 0;
4178 }
4179
4180 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
4181 if (ret < 0) {
4182 return ret;
4183 }
4184 for (i = 0; i < 4; i++) {
4185 env->dr[i] = dbgregs.db[i];
4186 }
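/* DR4 and DR5 are aliases of DR6 and DR7 (when CR4.DE is clear). */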
4187 env->dr[4] = env->dr[6] = dbgregs.dr6;
4188 env->dr[5] = env->dr[7] = dbgregs.dr7;
4189
4190 return 0;
4191 }
4192
4193 static int kvm_put_nested_state(X86CPU *cpu)
4194 {
4195 CPUX86State *env = &cpu->env;
4196 int max_nested_state_len = kvm_max_nested_state_length();
4197
4198 if (!env->nested_state) {
4199 return 0;
4200 }
4201
4202 /*
4203 * Copy flags that are affected by reset from env->hflags and env->hflags2.
4204 */
4205 if (env->hflags & HF_GUEST_MASK) {
4206 env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
4207 } else {
4208 env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
4209 }
4210
4211 /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
4212 if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
4213 env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
4214 } else {
4215 env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
4216 }
4217
4218 assert(env->nested_state->size <= max_nested_state_len);
4219 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
4220 }
4221
4222 static int kvm_get_nested_state(X86CPU *cpu)
4223 {
4224 CPUX86State *env = &cpu->env;
4225 int max_nested_state_len = kvm_max_nested_state_length();
4226 int ret;
4227
4228 if (!env->nested_state) {
4229 return 0;
4230 }
4231
4232 /*
4233 * It is possible that migration restored a smaller size into
4234 * nested_state->hdr.size than what our kernel supports.
4235 * We preserve the migration origin's nested_state->hdr.size for
4236 * the call to KVM_SET_NESTED_STATE, but want our next call
4237 * to KVM_GET_NESTED_STATE to use the maximum size our kernel supports.
4238 */
4239 env->nested_state->size = max_nested_state_len;
4240
4241 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
4242 if (ret < 0) {
4243 return ret;
4244 }
4245
4246 /*
4247 * Copy flags that are affected by reset to env->hflags and env->hflags2.
4248 */
4249 if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
4250 env->hflags |= HF_GUEST_MASK;
4251 } else {
4252 env->hflags &= ~HF_GUEST_MASK;
4253 }
4254
4255 /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
4256 if (cpu_has_svm(env)) {
4257 if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
4258 env->hflags2 |= HF2_GIF_MASK;
4259 } else {
4260 env->hflags2 &= ~HF2_GIF_MASK;
4261 }
4262 }
4263
4264 return ret;
4265 }
4266
4267 int kvm_arch_put_registers(CPUState *cpu, int level)
4268 {
4269 X86CPU *x86_cpu = X86_CPU(cpu);
4270 int ret;
4271
4272 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
4273
4274 /* must be before kvm_put_nested_state so that EFER.SVME is set */
4275 ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
4276 if (ret < 0) {
4277 return ret;
4278 }
4279
4280 if (level >= KVM_PUT_RESET_STATE) {
4281 ret = kvm_put_nested_state(x86_cpu);
4282 if (ret < 0) {
4283 return ret;
4284 }
4285
4286 ret = kvm_put_msr_feature_control(x86_cpu);
4287 if (ret < 0) {
4288 return ret;
4289 }
4290 }
4291
4292 if (level == KVM_PUT_FULL_STATE) {
4293 /* We don't check for kvm_arch_set_tsc_khz() errors here,
4294 * because TSC frequency mismatch shouldn't abort migration,
4295 * unless the user explicitly asked for a more strict TSC
4296 * setting (e.g. using an explicit "tsc-freq" option).
4297 */
4298 kvm_arch_set_tsc_khz(cpu);
4299 }
4300
4301 ret = kvm_getput_regs(x86_cpu, 1);
4302 if (ret < 0) {
4303 return ret;
4304 }
4305 ret = kvm_put_xsave(x86_cpu);
4306 if (ret < 0) {
4307 return ret;
4308 }
4309 ret = kvm_put_xcrs(x86_cpu);
4310 if (ret < 0) {
4311 return ret;
4312 }
4313 /* must be before kvm_put_msrs */
4314 ret = kvm_inject_mce_oldstyle(x86_cpu);
4315 if (ret < 0) {
4316 return ret;
4317 }
4318 ret = kvm_put_msrs(x86_cpu, level);
4319 if (ret < 0) {
4320 return ret;
4321 }
4322 ret = kvm_put_vcpu_events(x86_cpu, level);
4323 if (ret < 0) {
4324 return ret;
4325 }
4326 if (level >= KVM_PUT_RESET_STATE) {
4327 ret = kvm_put_mp_state(x86_cpu);
4328 if (ret < 0) {
4329 return ret;
4330 }
4331 }
4332
4333 ret = kvm_put_tscdeadline_msr(x86_cpu);
4334 if (ret < 0) {
4335 return ret;
4336 }
4337 ret = kvm_put_debugregs(x86_cpu);
4338 if (ret < 0) {
4339 return ret;
4340 }
4341 /* must be last */
4342 ret = kvm_guest_debug_workarounds(x86_cpu);
4343 if (ret < 0) {
4344 return ret;
4345 }
4346 return 0;
4347 }
4348
4349 int kvm_arch_get_registers(CPUState *cs)
4350 {
4351 X86CPU *cpu = X86_CPU(cs);
4352 int ret;
4353
4354 assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
4355
4356 ret = kvm_get_vcpu_events(cpu);
4357 if (ret < 0) {
4358 goto out;
4359 }
4360 /*
4361 * KVM_GET_MP_STATE can modify CS and RIP, call it before
4362 * KVM_GET_REGS and KVM_GET_SREGS.
4363 */
4364 ret = kvm_get_mp_state(cpu);
4365 if (ret < 0) {
4366 goto out;
4367 }
4368 ret = kvm_getput_regs(cpu, 0);
4369 if (ret < 0) {
4370 goto out;
4371 }
4372 ret = kvm_get_xsave(cpu);
4373 if (ret < 0) {
4374 goto out;
4375 }
4376 ret = kvm_get_xcrs(cpu);
4377 if (ret < 0) {
4378 goto out;
4379 }
4380 ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
4381 if (ret < 0) {
4382 goto out;
4383 }
4384 ret = kvm_get_msrs(cpu);
4385 if (ret < 0) {
4386 goto out;
4387 }
4388 ret = kvm_get_apic(cpu);
4389 if (ret < 0) {
4390 goto out;
4391 }
4392 ret = kvm_get_debugregs(cpu);
4393 if (ret < 0) {
4394 goto out;
4395 }
4396 ret = kvm_get_nested_state(cpu);
4397 if (ret < 0) {
4398 goto out;
4399 }
4400 ret = 0;
4401 out:
4402 cpu_sync_bndcs_hflags(&cpu->env);
4403 return ret;
4404 }
4405
4406 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
4407 {
4408 X86CPU *x86_cpu = X86_CPU(cpu);
4409 CPUX86State *env = &x86_cpu->env;
4410 int ret;
4411
4412 /* Inject NMI */
4413 if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
4414 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
4415 qemu_mutex_lock_iothread();
4416 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
4417 qemu_mutex_unlock_iothread();
4418 DPRINTF("injected NMI\n");
4419 ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
4420 if (ret < 0) {
4421 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
4422 strerror(-ret));
4423 }
4424 }
4425 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
4426 qemu_mutex_lock_iothread();
4427 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
4428 qemu_mutex_unlock_iothread();
4429 DPRINTF("injected SMI\n");
4430 ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
4431 if (ret < 0) {
4432 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
4433 strerror(-ret));
4434 }
4435 }
4436 }
4437
4438 if (!kvm_pic_in_kernel()) {
4439 qemu_mutex_lock_iothread();
4440 }
4441
4442 /* Force the VCPU out of its inner loop to process any INIT requests
4443 * or (for userspace APIC, but it is cheap to combine the checks here)
4444 * pending TPR access reports.
4445 */
4446 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
4447 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
4448 !(env->hflags & HF_SMM_MASK)) {
4449 cpu->exit_request = 1;
4450 }
4451 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
4452 cpu->exit_request = 1;
4453 }
4454 }
4455
4456 if (!kvm_pic_in_kernel()) {
4457 /* Try to inject an interrupt if the guest can accept it */
4458 if (run->ready_for_interrupt_injection &&
4459 (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
4460 (env->eflags & IF_MASK)) {
4461 int irq;
4462
4463 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
4464 irq = cpu_get_pic_interrupt(env);
4465 if (irq >= 0) {
4466 struct kvm_interrupt intr;
4467
4468 intr.irq = irq;
4469 DPRINTF("injected interrupt %d\n", irq);
4470 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
4471 if (ret < 0) {
4472 fprintf(stderr,
4473 "KVM: injection failed, interrupt lost (%s)\n",
4474 strerror(-ret));
4475 }
4476 }
4477 }
4478
4479 /* If we have an interrupt but the guest is not ready to
4480 * receive it, request an interrupt window exit. This will
4481 * cause a return to userspace as soon as the guest is ready to
4482 * receive interrupts. */
4483 if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
4484 run->request_interrupt_window = 1;
4485 } else {
4486 run->request_interrupt_window = 0;
4487 }
4488
4489 DPRINTF("setting tpr\n");
4490 run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
4491
4492 qemu_mutex_unlock_iothread();
4493 }
4494 }
4495
4496 static void kvm_rate_limit_on_bus_lock(void)
4497 {
4498 uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
4499
4500 if (delay_ns) {
4501 g_usleep(delay_ns / SCALE_US);
4502 }
4503 }
4504
4505 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4506 {
4507 X86CPU *x86_cpu = X86_CPU(cpu);
4508 CPUX86State *env = &x86_cpu->env;
4509
4510 if (run->flags & KVM_RUN_X86_SMM) {
4511 env->hflags |= HF_SMM_MASK;
4512 } else {
4513 env->hflags &= ~HF_SMM_MASK;
4514 }
4515 if (run->if_flag) {
4516 env->eflags |= IF_MASK;
4517 } else {
4518 env->eflags &= ~IF_MASK;
4519 }
4520 if (run->flags & KVM_RUN_X86_BUS_LOCK) {
4521 kvm_rate_limit_on_bus_lock();
4522 }
4523
4524 /* We need to protect the apic state against concurrent accesses from
4525 * different threads in case the userspace irqchip is used. */
4526 if (!kvm_irqchip_in_kernel()) {
4527 qemu_mutex_lock_iothread();
4528 }
4529 cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4530 cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4531 if (!kvm_irqchip_in_kernel()) {
4532 qemu_mutex_unlock_iothread();
4533 }
4534 return cpu_get_mem_attrs(env);
4535 }
4536
4537 int kvm_arch_process_async_events(CPUState *cs)
4538 {
4539 X86CPU *cpu = X86_CPU(cs);
4540 CPUX86State *env = &cpu->env;
4541
4542 if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
4543 /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
4544 assert(env->mcg_cap);
4545
4546 cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
4547
4548 kvm_cpu_synchronize_state(cs);
4549
4550 if (env->exception_nr == EXCP08_DBLE) {
4551 /* this means triple fault */
4552 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4553 cs->exit_request = 1;
4554 return 0;
4555 }
4556 kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
4557 env->has_error_code = 0;
4558
4559 cs->halted = 0;
4560 if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
4561 env->mp_state = KVM_MP_STATE_RUNNABLE;
4562 }
4563 }
4564
4565 if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
4566 !(env->hflags & HF_SMM_MASK)) {
4567 kvm_cpu_synchronize_state(cs);
4568 do_cpu_init(cpu);
4569 }
4570
4571 if (kvm_irqchip_in_kernel()) {
4572 return 0;
4573 }
4574
4575 if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
4576 cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
4577 apic_poll_irq(cpu->apic_state);
4578 }
4579 if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4580 (env->eflags & IF_MASK)) ||
4581 (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4582 cs->halted = 0;
4583 }
4584 if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
4585 kvm_cpu_synchronize_state(cs);
4586 do_cpu_sipi(cpu);
4587 }
4588 if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
4589 cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
4590 kvm_cpu_synchronize_state(cs);
4591 apic_handle_tpr_access_report(cpu->apic_state, env->eip,
4592 env->tpr_access_type);
4593 }
4594
4595 return cs->halted;
4596 }
4597
4598 static int kvm_handle_halt(X86CPU *cpu)
4599 {
4600 CPUState *cs = CPU(cpu);
4601 CPUX86State *env = &cpu->env;
4602
4603 if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4604 (env->eflags & IF_MASK)) &&
4605 !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4606 cs->halted = 1;
4607 return EXCP_HLT;
4608 }
4609
4610 return 0;
4611 }
4612
4613 static int kvm_handle_tpr_access(X86CPU *cpu)
4614 {
4615 CPUState *cs = CPU(cpu);
4616 struct kvm_run *run = cs->kvm_run;
4617
4618 apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
4619 run->tpr_access.is_write ? TPR_ACCESS_WRITE
4620 : TPR_ACCESS_READ);
4621 return 1;
4622 }
4623
4624 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4625 {
4626 static const uint8_t int3 = 0xcc;
4627
4628 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
4629 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
4630 return -EINVAL;
4631 }
4632 return 0;
4633 }
4634
4635 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4636 {
4637 uint8_t int3;
4638
4639 if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
4640 return -EINVAL;
4641 }
4642 if (int3 != 0xcc) {
4643 return 0;
4644 }
4645 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
4646 return -EINVAL;
4647 }
4648 return 0;
4649 }
4650
4651 static struct {
4652 target_ulong addr;
4653 int len;
4654 int type;
4655 } hw_breakpoint[4];
4656
4657 static int nb_hw_breakpoint;
4658
4659 static int find_hw_breakpoint(target_ulong addr, int len, int type)
4660 {
4661 int n;
4662
4663 for (n = 0; n < nb_hw_breakpoint; n++) {
4664 if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
4665 (hw_breakpoint[n].len == len || len == -1)) {
4666 return n;
4667 }
4668 }
4669 return -1;
4670 }
4671
4672 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
4673 target_ulong len, int type)
4674 {
4675 switch (type) {
4676 case GDB_BREAKPOINT_HW:
4677 len = 1;
4678 break;
4679 case GDB_WATCHPOINT_WRITE:
4680 case GDB_WATCHPOINT_ACCESS:
4681 switch (len) {
4682 case 1:
4683 break;
4684 case 2:
4685 case 4:
4686 case 8:
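/* Power-of-two lengths must be naturally aligned,
 * e.g. a 4-byte watchpoint needs (addr & 3) == 0. */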
4687 if (addr & (len - 1)) {
4688 return -EINVAL;
4689 }
4690 break;
4691 default:
4692 return -EINVAL;
4693 }
4694 break;
4695 default:
4696 return -ENOSYS;
4697 }
4698
4699 if (nb_hw_breakpoint == 4) {
4700 return -ENOBUFS;
4701 }
4702 if (find_hw_breakpoint(addr, len, type) >= 0) {
4703 return -EEXIST;
4704 }
4705 hw_breakpoint[nb_hw_breakpoint].addr = addr;
4706 hw_breakpoint[nb_hw_breakpoint].len = len;
4707 hw_breakpoint[nb_hw_breakpoint].type = type;
4708 nb_hw_breakpoint++;
4709
4710 return 0;
4711 }
4712
4713 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
4714 target_ulong len, int type)
4715 {
4716 int n;
4717
4718 n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
4719 if (n < 0) {
4720 return -ENOENT;
4721 }
4722 nb_hw_breakpoint--;
4723 hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
4724
4725 return 0;
4726 }
4727
4728 void kvm_arch_remove_all_hw_breakpoints(void)
4729 {
4730 nb_hw_breakpoint = 0;
4731 }
4732
4733 static CPUWatchpoint hw_watchpoint;
4734
4735 static int kvm_handle_debug(X86CPU *cpu,
4736 struct kvm_debug_exit_arch *arch_info)
4737 {
4738 CPUState *cs = CPU(cpu);
4739 CPUX86State *env = &cpu->env;
4740 int ret = 0;
4741 int n;
4742
4743 if (arch_info->exception == EXCP01_DB) {
4744 if (arch_info->dr6 & DR6_BS) {
4745 if (cs->singlestep_enabled) {
4746 ret = EXCP_DEBUG;
4747 }
4748 } else {
4749 for (n = 0; n < 4; n++) {
4750 if (arch_info->dr6 & (1 << n)) {
4751 switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
4752 case 0x0:
4753 ret = EXCP_DEBUG;
4754 break;
4755 case 0x1:
4756 ret = EXCP_DEBUG;
4757 cs->watchpoint_hit = &hw_watchpoint;
4758 hw_watchpoint.vaddr = hw_breakpoint[n].addr;
4759 hw_watchpoint.flags = BP_MEM_WRITE;
4760 break;
4761 case 0x3:
4762 ret = EXCP_DEBUG;
4763 cs->watchpoint_hit = &hw_watchpoint;
4764 hw_watchpoint.vaddr = hw_breakpoint[n].addr;
4765 hw_watchpoint.flags = BP_MEM_ACCESS;
4766 break;
4767 }
4768 }
4769 }
4770 }
4771 } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
4772 ret = EXCP_DEBUG;
4773 }
4774 if (ret == 0) {
4775 cpu_synchronize_state(cs);
4776 assert(env->exception_nr == -1);
4777
4778 /* pass to guest */
4779 kvm_queue_exception(env, arch_info->exception,
4780 arch_info->exception == EXCP01_DB,
4781 arch_info->dr6);
4782 env->has_error_code = 0;
4783 }
4784
4785 return ret;
4786 }
4787
4788 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
4789 {
4790 const uint8_t type_code[] = {
4791 [GDB_BREAKPOINT_HW] = 0x0,
4792 [GDB_WATCHPOINT_WRITE] = 0x1,
4793 [GDB_WATCHPOINT_ACCESS] = 0x3
4794 };
4795 const uint8_t len_code[] = {
4796 [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
4797 };
4798 int n;
4799
4800 if (kvm_sw_breakpoints_active(cpu)) {
4801 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
4802 }
4803 if (nb_hw_breakpoint > 0) {
4804 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
4805 dbg->arch.debugreg[7] = 0x0600;
4806 for (n = 0; n < nb_hw_breakpoint; n++) {
4807 dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
4808 dbg->arch.debugreg[7] |= (2 << (n * 2)) |
4809 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
4810 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
4811 }
4812 }
4813 }
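/*
 * Illustrative DR7 encoding (hypothetical breakpoint set, not part of the
 * logic above): a single 4-byte GDB_WATCHPOINT_WRITE in slot 0 yields
 * debugreg[7] = 0x0600 | (2 << 0) | (0x1 << 16) | (0x3 << 18) = 0x000d0602,
 * i.e. G0 enabled, R/W0 = write, LEN0 = 4 bytes.
 */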
4814
4815 static bool has_sgx_provisioning;
4816
4817 static bool __kvm_enable_sgx_provisioning(KVMState *s)
4818 {
4819 int fd, ret;
4820
4821 if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
4822 return false;
4823 }
4824
4825 fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
4826 if (fd < 0) {
4827 return false;
4828 }
4829
4830 ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
4831 if (ret) {
4832 error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
4833 exit(1);
4834 }
4835 close(fd);
4836 return true;
4837 }
4838
4839 bool kvm_enable_sgx_provisioning(KVMState *s)
4840 {
4841 return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
4842 }
4843
4844 static bool host_supports_vmx(void)
4845 {
4846 uint32_t ecx, unused;
4847
4848 host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
4849 return ecx & CPUID_EXT_VMX;
4850 }
4851
4852 #define VMX_INVALID_GUEST_STATE 0x80000021
4853
4854 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
4855 {
4856 X86CPU *cpu = X86_CPU(cs);
4857 uint64_t code;
4858 int ret;
4859
4860 switch (run->exit_reason) {
4861 case KVM_EXIT_HLT:
4862 DPRINTF("handle_hlt\n");
4863 qemu_mutex_lock_iothread();
4864 ret = kvm_handle_halt(cpu);
4865 qemu_mutex_unlock_iothread();
4866 break;
4867 case KVM_EXIT_SET_TPR:
4868 ret = 0;
4869 break;
4870 case KVM_EXIT_TPR_ACCESS:
4871 qemu_mutex_lock_iothread();
4872 ret = kvm_handle_tpr_access(cpu);
4873 qemu_mutex_unlock_iothread();
4874 break;
4875 case KVM_EXIT_FAIL_ENTRY:
4876 code = run->fail_entry.hardware_entry_failure_reason;
4877 fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
4878 code);
4879 if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
4880 fprintf(stderr,
4881 "\nIf you're running a guest on an Intel machine without "
4882 "unrestricted mode\n"
4883 "support, the failure can be most likely due to the guest "
4884 "entering an invalid\n"
4885 "state for Intel VT. For example, the guest maybe running "
4886 "in big real mode\n"
4887 "which is not supported on less recent Intel processors."
4888 "\n\n");
4889 }
4890 ret = -1;
4891 break;
4892 case KVM_EXIT_EXCEPTION:
4893 fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
4894 run->ex.exception, run->ex.error_code);
4895 ret = -1;
4896 break;
4897 case KVM_EXIT_DEBUG:
4898 DPRINTF("kvm_exit_debug\n");
4899 qemu_mutex_lock_iothread();
4900 ret = kvm_handle_debug(cpu, &run->debug.arch);
4901 qemu_mutex_unlock_iothread();
4902 break;
4903 case KVM_EXIT_HYPERV:
4904 ret = kvm_hv_handle_exit(cpu, &run->hyperv);
4905 break;
4906 case KVM_EXIT_IOAPIC_EOI:
4907 ioapic_eoi_broadcast(run->eoi.vector);
4908 ret = 0;
4909 break;
4910 case KVM_EXIT_X86_BUS_LOCK:
4911 /* already handled in kvm_arch_post_run */
4912 ret = 0;
4913 break;
4914 default:
4915 fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
4916 ret = -1;
4917 break;
4918 }
4919
4920 return ret;
4921 }
4922
4923 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
4924 {
4925 X86CPU *cpu = X86_CPU(cs);
4926 CPUX86State *env = &cpu->env;
4927
4928 kvm_cpu_synchronize_state(cs);
4929 return !(env->cr[0] & CR0_PE_MASK) ||
4930 ((env->segs[R_CS].selector & 3) != 3);
4931 }
4932
4933 void kvm_arch_init_irq_routing(KVMState *s)
4934 {
4935 /* We know at this point that we're using the in-kernel
4936 * irqchip, so we can use irqfds, and on x86 we know
4937 * we can use msi via irqfd and GSI routing.
4938 */
4939 kvm_msi_via_irqfd_allowed = true;
4940 kvm_gsi_routing_allowed = true;
4941
4942 if (kvm_irqchip_is_split()) {
4943 KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
4944 int i;
4945
4946 /* If the ioapic is in QEMU and the lapics are in KVM, reserve
4947 MSI routes for signaling interrupts to the local apics. */
4948 for (i = 0; i < IOAPIC_NUM_PINS; i++) {
4949 if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
4950 error_report("Could not enable split IRQ mode.");
4951 exit(1);
4952 }
4953 }
4954 kvm_irqchip_commit_route_changes(&c);
4955 }
4956 }
4957
4958 int kvm_arch_irqchip_create(KVMState *s)
4959 {
4960 int ret;
4961 if (kvm_kernel_irqchip_split()) {
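/* The capability argument (24) is the number of IOAPIC pins reserved
 * for routing by the userspace IOAPIC; it matches IOAPIC_NUM_PINS. */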
4962 ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
4963 if (ret) {
4964 error_report("Could not enable split irqchip mode: %s",
4965 strerror(-ret));
4966 exit(1);
4967 } else {
4968 DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
4969 kvm_split_irqchip = true;
4970 return 1;
4971 }
4972 } else {
4973 return 0;
4974 }
4975 }
4976
4977 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
4978 {
4979 CPUX86State *env;
4980 uint64_t ext_id;
4981
4982 if (!first_cpu) {
4983 return address;
4984 }
4985 env = &X86_CPU(first_cpu)->env;
4986 if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
4987 return address;
4988 }
4989
4990 /*
4991 * If the remappable format bit is set, or the upper bits are
4992 * already set in address_hi, or the low extended bits aren't
4993 * there anyway, do nothing.
4994 */
4995 ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
4996 if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
4997 return address;
4998 }
4999
5000 address &= ~ext_id;
5001 address |= ext_id << 35;
5002 return address;
5003 }
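/*
 * Illustrative swizzle (hypothetical address, not taken from a real
 * device): for destination APIC ID 0x123 the guest encodes the low byte
 * 0x23 in address bits 19:12 and destination bits 14:8 (0x01) in address
 * bits 11:5, e.g. 0xfee23020.  With bit 4 (remappable format) clear and
 * address_hi empty, the function returns 0x00000100fee23000, moving the
 * extended bits up into address_hi bits 14:8.
 */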
5004
5005 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
5006 uint64_t address, uint32_t data, PCIDevice *dev)
5007 {
5008 X86IOMMUState *iommu = x86_iommu_get_default();
5009
5010 if (iommu) {
5011 X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
5012
5013 if (class->int_remap) {
5014 int ret;
5015 MSIMessage src, dst;
5016
5017 src.address = route->u.msi.address_hi;
5018 src.address <<= VTD_MSI_ADDR_HI_SHIFT;
5019 src.address |= route->u.msi.address_lo;
5020 src.data = route->u.msi.data;
5021
5022 ret = class->int_remap(iommu, &src, &dst, dev ? \
5023 pci_requester_id(dev) : \
5024 X86_IOMMU_SID_INVALID);
5025 if (ret) {
5026 trace_kvm_x86_fixup_msi_error(route->gsi);
5027 return 1;
5028 }
5029
5030 /*
5031 * Handle an untranslated compatibility-format interrupt with the
5032 * extended destination ID in the low bits 11-5. */
5033 dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
5034
5035 route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
5036 route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
5037 route->u.msi.data = dst.data;
5038 return 0;
5039 }
5040 }
5041
5042 address = kvm_swizzle_msi_ext_dest_id(address);
5043 route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
5044 route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
5045 return 0;
5046 }
5047
5048 typedef struct MSIRouteEntry MSIRouteEntry;
5049
5050 struct MSIRouteEntry {
5051 PCIDevice *dev; /* Device pointer */
5052 int vector; /* MSI/MSIX vector index */
5053 int virq; /* Virtual IRQ index */
5054 QLIST_ENTRY(MSIRouteEntry) list;
5055 };
5056
5057 /* List of used GSI routes */
5058 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
5059 QLIST_HEAD_INITIALIZER(msi_route_list);
5060
5061 static void kvm_update_msi_routes_all(void *private, bool global,
5062 uint32_t index, uint32_t mask)
5063 {
5064 int cnt = 0, vector;
5065 MSIRouteEntry *entry;
5066 MSIMessage msg;
5067 PCIDevice *dev;
5068
5069 /* TODO: explicit route update */
5070 QLIST_FOREACH(entry, &msi_route_list, list) {
5071 cnt++;
5072 vector = entry->vector;
5073 dev = entry->dev;
5074 if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
5075 msg = msix_get_message(dev, vector);
5076 } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
5077 msg = msi_get_message(dev, vector);
5078 } else {
5079 /*
5080 * Either MSI/MSIX is disabled for the device, or the
5081 * specific message was masked out. Skip this one.
5082 */
5083 continue;
5084 }
5085 kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
5086 }
5087 kvm_irqchip_commit_routes(kvm_state);
5088 trace_kvm_x86_update_msi_routes(cnt);
5089 }
5090
5091 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
5092 int vector, PCIDevice *dev)
5093 {
5094 static bool notify_list_inited = false;
5095 MSIRouteEntry *entry;
5096
5097 if (!dev) {
5098 /* These are (possibly) IOAPIC routes only used for split
5099 * kernel irqchip mode, while we only track
5100 * PCI devices here. */
5101 return 0;
5102 }
5103
5104 entry = g_new0(MSIRouteEntry, 1);
5105 entry->dev = dev;
5106 entry->vector = vector;
5107 entry->virq = route->gsi;
5108 QLIST_INSERT_HEAD(&msi_route_list, entry, list);
5109
5110 trace_kvm_x86_add_msi_route(route->gsi);
5111
5112 if (!notify_list_inited) {
5113 /* The first time we add a route, register ourselves on the
5114 * IOMMU's IEC notifier list if needed. */
5115 X86IOMMUState *iommu = x86_iommu_get_default();
5116 if (iommu) {
5117 x86_iommu_iec_register_notifier(iommu,
5118 kvm_update_msi_routes_all,
5119 NULL);
5120 }
5121 notify_list_inited = true;
5122 }
5123 return 0;
5124 }
5125
5126 int kvm_arch_release_virq_post(int virq)
5127 {
5128 MSIRouteEntry *entry, *next;
5129 QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
5130 if (entry->virq == virq) {
5131 trace_kvm_x86_remove_msi_route(virq);
5132 QLIST_REMOVE(entry, list);
5133 g_free(entry);
5134 break;
5135 }
5136 }
5137 return 0;
5138 }
5139
5140 int kvm_arch_msi_data_to_gsi(uint32_t data)
5141 {
5142 abort();
5143 }
5144
5145 bool kvm_has_waitpkg(void)
5146 {
5147 return has_msr_umwait;
5148 }
5149
5150 bool kvm_arch_cpu_check_are_resettable(void)
5151 {
5152 return !sev_es_enabled();
5153 }