/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/utsname.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>

#include "qemu-common.h"
#include "sysemu.h"
#include "kvm.h"
#include "kvm_i386.h"
#include "cpu.h"
#include "gdbstub.h"
#include "host-utils.h"
#include "hw/pc.h"
#include "hw/apic.h"
#include "ioport.h"
#include "hyperv.h"
#include "hw/pci.h"

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

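/* siginfo si_code values delivered with SIGBUS on a hardware memory error:
 * BUS_MCEERR_AR flags a synchronous, action-required fault, BUS_MCEERR_AO an
 * asynchronous, action-optional one (e.g. background scrubbing). Defined here
 * as a fallback for older headers that lack them.
 */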
#ifndef BUS_MCEERR_AR
#define BUS_MCEERR_AR 4
#endif
#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5
#endif

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_LAST_INFO
};

static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_deadline;
static bool has_msr_async_pf_en;
static bool has_msr_pv_eoi_en;
static bool has_msr_misc_enable;
static int lm_capable_kernel;

bool kvm_allows_irq0_override(void)
{
    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
}

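/* Probe-and-grow helper for KVM_GET_SUPPORTED_CPUID: returns NULL when a
 * buffer sized for 'max' entries is too small (the kernel reports E2BIG, or
 * fills it completely, which may mean more entries exist), so the caller can
 * retry with a larger buffer.
 */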
static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = (struct kvm_cpuid2 *)g_malloc0(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    return cpuid;
}

struct kvm_para_features {
    int cap;
    int feature;
} para_features[] = {
    { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
    { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
    { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
    { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
    { -1, -1 }
};

static int get_para_features(KVMState *s)
{
    int i, features = 0;

    for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
        if (kvm_check_extension(s, para_features[i].cap)) {
            features |= (1 << para_features[i].feature);
        }
    }

    return features;
}

/* Returns the value for a specific register on the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

/* Find matching entry for function/index on kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;
    bool found = false;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        found = true;
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
            kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
        }
    } else if (function == 0x80000001 && reg == R_EDX) {
        /* On Intel, kvm returns cpuid according to the Intel spec,
         * so add missing bits according to the AMD spec:
         */
        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    }

    g_free(cpuid);

    /* fallback for older kernels */
    if ((function == KVM_CPUID_FEATURES) && !found) {
        ret = get_para_features(s);
    }

    return ret;
}

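/* Pages reported poisoned by the hardware are collected on this list and
 * remapped on system reset (see kvm_unpoison_all below), so a rebooted guest
 * gets a usable page back instead of the stale poisoned mapping.
 */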
typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_malloc(sizeof(HWPoisonPage));
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}

static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
                                     int *max_banks)
{
    int r;

    r = kvm_check_extension(s, KVM_CAP_MCE);
    if (r > 0) {
        *max_banks = r;
        return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
    }
    return -ENOSYS;
}

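/* Build an MCi_STATUS value for the injected error. The magic numbers encode
 * MCA error signatures for the two SIGBUS flavours (0x134 should be the
 * recoverable data-load SRAR signature and 0xc0 the memory-scrub SRAO one,
 * per the Intel SDM machine-check tables); AR faults additionally mark EIPV,
 * AO events RIPV.
 */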
static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
{
    CPUX86State *env = &cpu->env;
    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
                      MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
    uint64_t mcg_status = MCG_STATUS_MCIP;

    if (code == BUS_MCEERR_AR) {
        status |= MCI_STATUS_AR | 0x134;
        mcg_status |= MCG_STATUS_EIPV;
    } else {
        status |= 0xc0;
        mcg_status |= MCG_STATUS_RIPV;
    }
    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
                       (MCM_ADDR_PHYS << 6) | 0xc,
                       cpu_x86_support_mca_broadcast(env) ?
                       MCE_INJECT_BROADCAST : 0);
}

static void hardware_memory_error(void)
{
    fprintf(stderr, "Hardware memory error!\n");
    exit(1);
}

int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
{
    X86CPU *cpu = X86_CPU(c);
    CPUX86State *env = &cpu->env;
    ram_addr_t ram_addr;
    hwaddr paddr;

    if ((env->mcg_cap & MCG_SER_P) && addr
        && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) {
        if (qemu_ram_addr_from_host(addr, &ram_addr) ||
            !kvm_physical_memory_addr_from_host(env->kvm_state, addr, &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!\n");
            /* Hope we are lucky for AO MCE */
            if (code == BUS_MCEERR_AO) {
                return 0;
            } else {
                hardware_memory_error();
            }
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(cpu, paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

int kvm_arch_on_sigbus(int code, void *addr)
{
    if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
        ram_addr_t ram_addr;
        hwaddr paddr;

        /* Hope we are lucky for AO MCE */
        if (qemu_ram_addr_from_host(addr, &ram_addr) ||
            !kvm_physical_memory_addr_from_host(first_cpu->kvm_state, addr,
                                                &paddr)) {
            fprintf(stderr, "Hardware memory error for memory used by "
                    "QEMU itself instead of guest system!: %p\n", addr);
            return 0;
        }
        kvm_hwpoison_page_add(ram_addr);
        kvm_mce_inject(x86_env_get_cpu(first_cpu), paddr, code);
    } else {
        if (code == BUS_MCEERR_AO) {
            return 0;
        } else if (code == BUS_MCEERR_AR) {
            hardware_memory_error();
        } else {
            return 1;
        }
    }
    return 0;
}

static int kvm_inject_mce_oldstyle(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    if (!kvm_has_vcpu_events() && env->exception_injected == EXCP12_MCHK) {
        unsigned int bank, bank_num = env->mcg_cap & 0xff;
        struct kvm_x86_mce mce;

        env->exception_injected = -1;

        /*
         * There must be at least one bank in use if an MCE is pending.
         * Find it and use its values for the event injection.
         */
        for (bank = 0; bank < bank_num; bank++) {
            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
                break;
            }
        }
        assert(bank < bank_num);

        mce.bank = bank;
        mce.status = env->mce_banks[bank * 4 + 1];
        mce.mcg_status = env->mcg_status;
        mce.addr = env->mce_banks[bank * 4 + 2];
        mce.misc = env->mce_banks[bank * 4 + 3];

        return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
    }
    return 0;
}

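/* VM run-state notifier: once the VM starts running, the TSC value cached in
 * the CPU state is stale, so flag it invalid; kvm_get_msrs() only re-reads
 * and re-validates it while the VM is stopped.
 */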
static void cpu_update_state(void *opaque, int running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

int kvm_arch_init_vcpu(CPUState *cs)
{
    struct {
        struct kvm_cpuid2 cpuid;
        struct kvm_cpuid_entry2 entries[100];
    } QEMU_PACKED cpuid_data;
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint32_t limit, i, j, cpuid_i;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;
    uint32_t signature[3];
    int r;

    cpuid_i = 0;

    /* Paravirtualization CPUIDs */
    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_SIGNATURE;
    if (!hyperv_enabled()) {
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c->eax = 0;
    } else {
        memcpy(signature, "Microsoft Hv", 12);
        c->eax = HYPERV_CPUID_MIN;
    }
    c->ebx = signature[0];
    c->ecx = signature[1];
    c->edx = signature[2];

    c = &cpuid_data.entries[cpuid_i++];
    memset(c, 0, sizeof(*c));
    c->function = KVM_CPUID_FEATURES;
    c->eax = env->cpuid_kvm_features;

    if (hyperv_enabled()) {
        memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
        c->eax = signature[0];

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_VERSION;
        c->eax = 0x00001bbc;
        c->ebx = 0x00060001;

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_FEATURES;
        if (hyperv_relaxed_timing_enabled()) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
        }
        if (hyperv_vapic_recommended()) {
            c->eax |= HV_X64_MSR_HYPERCALL_AVAILABLE;
            c->eax |= HV_X64_MSR_APIC_ACCESS_AVAILABLE;
        }

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_ENLIGHTMENT_INFO;
        if (hyperv_relaxed_timing_enabled()) {
            c->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
        }
        if (hyperv_vapic_recommended()) {
            c->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
        }
        c->ebx = hyperv_get_spinlock_retries();

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = HYPERV_CPUID_IMPLEMENT_LIMITS;
        c->eax = 0x40;
        c->ebx = 0x40;

        c = &cpuid_data.entries[cpuid_i++];
        memset(c, 0, sizeof(*c));
        c->function = KVM_CPUID_SIGNATURE_NEXT;
        memcpy(signature, "KVMKVMKVM\0\0\0", 12);
        c->eax = 0;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];
    }

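    /* Note: at this point c still points at the most recently filled-in
     * leaf: KVM_CPUID_FEATURES when Hyper-V is off, but the relocated KVM
     * signature leaf (whose eax is 0) when Hyper-V is on.
     */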
    has_msr_async_pf_en = c->eax & (1 << KVM_FEATURE_ASYNC_PF);

    has_msr_pv_eoi_en = c->eax & (1 << KVM_FEATURE_PV_EOI);

    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        c = &cpuid_data.entries[cpuid_i++];

        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;

            for (j = 1; j < times; ++j) {
                c = &cpuid_data.entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
                if (i == 0xd && j == 64) {
                    break;
                }
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0) {
                    break;
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0xd && c->eax == 0) {
                    continue;
                }
                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            break;
        }
    }
    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        c = &cpuid_data.entries[cpuid_i++];

        c->function = i;
        c->flags = 0;
        cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
    }

    /* Call Centaur's CPUID instructions if they are supported. */
    if (env->cpuid_xlevel2 > 0) {
        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);

        for (i = 0xC0000000; i <= limit; i++) {
            c = &cpuid_data.entries[cpuid_i++];

            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
        }
    }

    cpuid_data.cpuid.nent = cpuid_i;

    if (((env->cpuid_version >> 8)&0xF) >= 6
        && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
        && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
        uint64_t mcg_cap;
        int banks;
        int ret;

        ret = kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks);
        if (ret < 0) {
            fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
            return ret;
        }

        if (banks > MCE_BANKS_DEF) {
            banks = MCE_BANKS_DEF;
        }
        mcg_cap &= MCE_CAP_DEF;
        mcg_cap |= banks;
        ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &mcg_cap);
        if (ret < 0) {
            fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
            return ret;
        }

        env->mcg_cap = mcg_cap;
    }

    qemu_add_vm_change_state_handler(cpu_update_state, env);

    cpuid_data.cpuid.padding = 0;
    r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
    if (r) {
        return r;
    }

    r = kvm_check_extension(env->kvm_state, KVM_CAP_TSC_CONTROL);
    if (r && env->tsc_khz) {
        r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
        if (r < 0) {
            fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
            return r;
        }
    }

    if (kvm_has_xsave()) {
        env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
    }

    return 0;
}

void kvm_arch_reset_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->exception_injected = -1;
    env->interrupt_injected = -1;
    env->xcr0 = 1;
    if (kvm_irqchip_in_kernel()) {
        env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
                                          KVM_MP_STATE_UNINITIALIZED;
    } else {
        env->mp_state = KVM_MP_STATE_RUNNABLE;
    }
}

static int kvm_get_supported_msrs(KVMState *s)
{
    static int kvm_supported_msrs;
    int ret = 0;

    /* first time */
    if (kvm_supported_msrs == 0) {
        struct kvm_msr_list msr_list, *kvm_msr_list;

        kvm_supported_msrs = -1;

        /* Obtain MSR list from KVM.  These are the MSRs that we must
         * save/restore */
        msr_list.nmsrs = 0;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
        if (ret < 0 && ret != -E2BIG) {
            return ret;
        }
        /* Old kernel modules had a bug and could write beyond the provided
           memory. Allocate at least a safe amount of 1K. */
        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
                                           msr_list.nmsrs *
                                           sizeof(msr_list.indices[0])));

        kvm_msr_list->nmsrs = msr_list.nmsrs;
        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
        if (ret >= 0) {
            int i;

            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
                if (kvm_msr_list->indices[i] == MSR_STAR) {
                    has_msr_star = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
                    has_msr_hsave_pa = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_TSCDEADLINE) {
                    has_msr_tsc_deadline = true;
                    continue;
                }
                if (kvm_msr_list->indices[i] == MSR_IA32_MISC_ENABLE) {
                    has_msr_misc_enable = true;
                    continue;
                }
            }
        }

        g_free(kvm_msr_list);
    }

    return ret;
}

int kvm_arch_init(KVMState *s)
{
    QemuOptsList *list = qemu_find_opts("machine");
    uint64_t identity_base = 0xfffbc000;
    uint64_t shadow_mem;
    int ret;
    struct utsname utsname;

    ret = kvm_get_supported_msrs(s);
    if (ret < 0) {
        return ret;
    }

    uname(&utsname);
    lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;

    /*
     * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
     * In order to use vm86 mode, an EPT identity map and a TSS are needed.
     * Since these must be part of guest physical memory, we need to allocate
     * them, both by setting their start addresses in the kernel and by
     * creating a corresponding e820 entry. We need 4 pages before the BIOS.
     *
     * Older KVM versions may not support setting the identity map base. In
     * that case we need to stick with the default, i.e. a 256K maximum BIOS
     * size.
     */
    if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
        /* Allows up to 16M BIOSes. */
        identity_base = 0xfeffc000;

        ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
        if (ret < 0) {
            return ret;
        }
    }

    /* Set TSS base one page after EPT identity map. */
    ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
    if (ret < 0) {
        return ret;
    }

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }
    qemu_register_reset(kvm_unpoison_all, NULL);

    if (!QTAILQ_EMPTY(&list->head)) {
        shadow_mem = qemu_opt_get_size(QTAILQ_FIRST(&list->head),
                                       "kvm_shadow_mem", -1);
        if (shadow_mem != -1) {
            shadow_mem /= 4096;
            ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
            if (ret < 0) {
                return ret;
            }
        }
    }
    return 0;
}

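/* Segments in vm86 mode always behave as writable, accessed data segments at
 * privilege level 3, so the fixed attribute bits below are forced rather than
 * taken from the cached descriptor flags.
 */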
static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = 3;
    lhs->present = 1;
    lhs->dpl = 3;
    lhs->db = 0;
    lhs->s = 1;
    lhs->l = 0;
    lhs->g = 0;
    lhs->avl = 0;
    lhs->unusable = 0;
}

static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
{
    unsigned flags = rhs->flags;
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
    lhs->present = (flags & DESC_P_MASK) != 0;
    lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
    lhs->db = (flags >> DESC_B_SHIFT) & 1;
    lhs->s = (flags & DESC_S_MASK) != 0;
    lhs->l = (flags >> DESC_L_SHIFT) & 1;
    lhs->g = (flags & DESC_G_MASK) != 0;
    lhs->avl = (flags & DESC_AVL_MASK) != 0;
    lhs->unusable = 0;
    lhs->padding = 0;
}

static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
{
    lhs->selector = rhs->selector;
    lhs->base = rhs->base;
    lhs->limit = rhs->limit;
    lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
                 (rhs->present * DESC_P_MASK) |
                 (rhs->dpl << DESC_DPL_SHIFT) |
                 (rhs->db << DESC_B_SHIFT) |
                 (rhs->s * DESC_S_MASK) |
                 (rhs->l << DESC_L_SHIFT) |
                 (rhs->g * DESC_G_MASK) |
                 (rhs->avl * DESC_AVL_MASK);
}

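/* One helper covers both transfer directions, selected by 'set', so the long
 * register list in kvm_getput_regs() below does not have to be written twice.
 */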
static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
{
    if (set) {
        *kvm_reg = *qemu_reg;
    } else {
        *qemu_reg = *kvm_reg;
    }
}

static int kvm_getput_regs(X86CPU *cpu, int set)
{
    CPUX86State *env = &cpu->env;
    struct kvm_regs regs;
    int ret = 0;

    if (!set) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
        if (ret < 0) {
            return ret;
        }
    }

    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
#ifdef TARGET_X86_64
    kvm_getput_reg(&regs.r8, &env->regs[8], set);
    kvm_getput_reg(&regs.r9, &env->regs[9], set);
    kvm_getput_reg(&regs.r10, &env->regs[10], set);
    kvm_getput_reg(&regs.r11, &env->regs[11], set);
    kvm_getput_reg(&regs.r12, &env->regs[12], set);
    kvm_getput_reg(&regs.r13, &env->regs[13], set);
    kvm_getput_reg(&regs.r14, &env->regs[14], set);
    kvm_getput_reg(&regs.r15, &env->regs[15], set);
#endif

    kvm_getput_reg(&regs.rflags, &env->eflags, set);
    kvm_getput_reg(&regs.rip, &env->eip, set);

    if (set) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
    }

    return ret;
}

static int kvm_put_fpu(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_fpu fpu;
    int i;

    memset(&fpu, 0, sizeof fpu);
    fpu.fsw = env->fpus & ~(7 << 11);
    fpu.fsw |= (env->fpstt & 7) << 11;
    fpu.fcw = env->fpuc;
    fpu.last_opcode = env->fpop;
    fpu.last_ip = env->fpip;
    fpu.last_dp = env->fpdp;
    for (i = 0; i < 8; ++i) {
        fpu.ftwx |= (!env->fptags[i]) << i;
    }
    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
    fpu.mxcsr = env->mxcsr;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
}

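/* Offsets into the uint32_t region[] array of struct kvm_xsave, i.e. in
 * 4-byte words, mirroring the hardware XSAVE area layout (legacy FXSAVE
 * image first, XSTATE_BV header at byte 512, AVX high halves afterwards).
 */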
#define XSAVE_FCW_FSW     0
#define XSAVE_FTW_FOP     1
#define XSAVE_CWD_RIP     2
#define XSAVE_CWD_RDP     4
#define XSAVE_MXCSR       6
#define XSAVE_ST_SPACE    8
#define XSAVE_XMM_SPACE   40
#define XSAVE_XSTATE_BV   128
#define XSAVE_YMMH_SPACE  144

static int kvm_put_xsave(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    uint16_t cwd, swd, twd;
    int i, r;

    if (!kvm_has_xsave()) {
        return kvm_put_fpu(cpu);
    }

    memset(xsave, 0, sizeof(struct kvm_xsave));
    twd = 0;
    swd = env->fpus & ~(7 << 11);
    swd |= (env->fpstt & 7) << 11;
    cwd = env->fpuc;
    for (i = 0; i < 8; ++i) {
        twd |= (!env->fptags[i]) << i;
    }
    xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
    xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
    memcpy(&xsave->region[XSAVE_CWD_RIP], &env->fpip, sizeof(env->fpip));
    memcpy(&xsave->region[XSAVE_CWD_RDP], &env->fpdp, sizeof(env->fpdp));
    memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
            sizeof env->fpregs);
    memcpy(&xsave->region[XSAVE_XMM_SPACE], env->xmm_regs,
            sizeof env->xmm_regs);
    xsave->region[XSAVE_MXCSR] = env->mxcsr;
    *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
    memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
            sizeof env->ymmh_regs);
    r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
    return r;
}

static int kvm_put_xcrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    xcrs.nr_xcrs = 1;
    xcrs.flags = 0;
    xcrs.xcrs[0].xcr = 0;
    xcrs.xcrs[0].value = env->xcr0;
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
}

static int kvm_put_sregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_sregs sregs;

    memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
    if (env->interrupt_injected >= 0) {
        sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
                (uint64_t)1 << (env->interrupt_injected % 64);
    }

    if ((env->eflags & VM_MASK)) {
        set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
        set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
        set_v8086_seg(&sregs.es, &env->segs[R_ES]);
        set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
        set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
        set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
    } else {
        set_seg(&sregs.cs, &env->segs[R_CS]);
        set_seg(&sregs.ds, &env->segs[R_DS]);
        set_seg(&sregs.es, &env->segs[R_ES]);
        set_seg(&sregs.fs, &env->segs[R_FS]);
        set_seg(&sregs.gs, &env->segs[R_GS]);
        set_seg(&sregs.ss, &env->segs[R_SS]);
    }

    set_seg(&sregs.tr, &env->tr);
    set_seg(&sregs.ldt, &env->ldt);

    sregs.idt.limit = env->idt.limit;
    sregs.idt.base = env->idt.base;
    memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
    sregs.gdt.limit = env->gdt.limit;
    sregs.gdt.base = env->gdt.base;
    memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);

    sregs.cr0 = env->cr[0];
    sregs.cr2 = env->cr[2];
    sregs.cr3 = env->cr[3];
    sregs.cr4 = env->cr[4];

    sregs.cr8 = cpu_get_apic_tpr(env->apic_state);
    sregs.apic_base = cpu_get_apic_base(env->apic_state);

    sregs.efer = env->efer;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
}

static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
                              uint32_t index, uint64_t value)
{
    entry->index = index;
    entry->data = value;
}

static int kvm_put_msrs(X86CPU *cpu, int level)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int n = 0;

    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
    kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat);
    if (has_msr_star) {
        kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
    }
    if (has_msr_hsave_pa) {
        kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
    }
    if (has_msr_tsc_deadline) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSCDEADLINE, env->tsc_deadline);
    }
    if (has_msr_misc_enable) {
        kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
                          env->msr_ia32_misc_enable);
    }
#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
        kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
        kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
        kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
    }
#endif
    if (level == KVM_PUT_FULL_STATE) {
        /*
         * KVM is not yet able to synchronize TSC values of multiple VCPUs on
         * writeback. Until this is fixed, we only write the offset to SMP
         * guests after migration, desynchronizing the VCPUs, but avoiding
         * huge jump-backs that would occur without any writeback at all.
         */
        if (smp_cpus == 1 || env->tsc != 0) {
            kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
        }
    }
    /*
     * The following paravirtual MSRs have side effects on the guest or are
     * too heavy for normal writeback. Limit them to reset or full state
     * updates.
     */
    if (level >= KVM_PUT_RESET_STATE) {
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
                          env->system_time_msr);
        kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
        if (has_msr_async_pf_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN,
                              env->async_pf_en_msr);
        }
        if (has_msr_pv_eoi_en) {
            kvm_msr_entry_set(&msrs[n++], MSR_KVM_PV_EOI_EN,
                              env->pv_eoi_en_msr);
        }
        if (hyperv_hypercall_available()) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_GUEST_OS_ID, 0);
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_HYPERCALL, 0);
        }
        if (hyperv_vapic_recommended()) {
            kvm_msr_entry_set(&msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
        }
    }
    if (env->mcg_cap) {
        int i;

        kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
        kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
        }
    }

    msr_data.info.nmsrs = n;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
}

static int kvm_get_fpu(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_fpu fpu;
    int i, ret;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
    if (ret < 0) {
        return ret;
    }

    env->fpstt = (fpu.fsw >> 11) & 7;
    env->fpus = fpu.fsw;
    env->fpuc = fpu.fcw;
    env->fpop = fpu.last_opcode;
    env->fpip = fpu.last_ip;
    env->fpdp = fpu.last_dp;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((fpu.ftwx >> i) & 1);
    }
    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
    env->mxcsr = fpu.mxcsr;

    return 0;
}

static int kvm_get_xsave(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_xsave* xsave = env->kvm_xsave_buf;
    int ret, i;
    uint16_t cwd, swd, twd;

    if (!kvm_has_xsave()) {
        return kvm_get_fpu(cpu);
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
    if (ret < 0) {
        return ret;
    }

    cwd = (uint16_t)xsave->region[XSAVE_FCW_FSW];
    swd = (uint16_t)(xsave->region[XSAVE_FCW_FSW] >> 16);
    twd = (uint16_t)xsave->region[XSAVE_FTW_FOP];
    env->fpop = (uint16_t)(xsave->region[XSAVE_FTW_FOP] >> 16);
    env->fpstt = (swd >> 11) & 7;
    env->fpus = swd;
    env->fpuc = cwd;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((twd >> i) & 1);
    }
    memcpy(&env->fpip, &xsave->region[XSAVE_CWD_RIP], sizeof(env->fpip));
    memcpy(&env->fpdp, &xsave->region[XSAVE_CWD_RDP], sizeof(env->fpdp));
    env->mxcsr = xsave->region[XSAVE_MXCSR];
    memcpy(env->fpregs, &xsave->region[XSAVE_ST_SPACE],
            sizeof env->fpregs);
    memcpy(env->xmm_regs, &xsave->region[XSAVE_XMM_SPACE],
            sizeof env->xmm_regs);
    env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
    memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
            sizeof env->ymmh_regs);
    return 0;
}

static int kvm_get_xcrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    int i, ret;
    struct kvm_xcrs xcrs;

    if (!kvm_has_xcrs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
    if (ret < 0) {
        return ret;
    }

    for (i = 0; i < xcrs.nr_xcrs; i++) {
        /* Only support xcr0 now */
        if (xcrs.xcrs[0].xcr == 0) {
            env->xcr0 = xcrs.xcrs[0].value;
            break;
        }
    }
    return 0;
}

static int kvm_get_sregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_sregs sregs;
    uint32_t hflags;
    int bit, i, ret;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
    if (ret < 0) {
        return ret;
    }

    /* There can only be one pending IRQ set in the bitmap at a time, so try
       to find it and save its number instead (-1 for none). */
    env->interrupt_injected = -1;
    for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
        if (sregs.interrupt_bitmap[i]) {
            bit = ctz64(sregs.interrupt_bitmap[i]);
            env->interrupt_injected = i * 64 + bit;
            break;
        }
    }

    get_seg(&env->segs[R_CS], &sregs.cs);
    get_seg(&env->segs[R_DS], &sregs.ds);
    get_seg(&env->segs[R_ES], &sregs.es);
    get_seg(&env->segs[R_FS], &sregs.fs);
    get_seg(&env->segs[R_GS], &sregs.gs);
    get_seg(&env->segs[R_SS], &sregs.ss);

    get_seg(&env->tr, &sregs.tr);
    get_seg(&env->ldt, &sregs.ldt);

    env->idt.limit = sregs.idt.limit;
    env->idt.base = sregs.idt.base;
    env->gdt.limit = sregs.gdt.limit;
    env->gdt.base = sregs.gdt.base;

    env->cr[0] = sregs.cr0;
    env->cr[2] = sregs.cr2;
    env->cr[3] = sregs.cr3;
    env->cr[4] = sregs.cr4;

    env->efer = sregs.efer;

    /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */

#define HFLAG_COPY_MASK \
    ~( HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
       HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
       HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
       HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)

    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
                (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
                (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);

    if (env->efer & MSR_EFER_LMA) {
        hflags |= HF_LMA_MASK;
    }

    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
    } else {
        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
                    (DESC_B_SHIFT - HF_CS32_SHIFT);
        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
                    (DESC_B_SHIFT - HF_SS32_SHIFT);
        if (!(env->cr[0] & CR0_PE_MASK) || (env->eflags & VM_MASK) ||
            !(hflags & HF_CS32_MASK)) {
            hflags |= HF_ADDSEG_MASK;
        } else {
            hflags |= ((env->segs[R_DS].base | env->segs[R_ES].base |
                        env->segs[R_SS].base) != 0) << HF_ADDSEG_SHIFT;
        }
    }
    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;

    return 0;
}

static int kvm_get_msrs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[100];
    } msr_data;
    struct kvm_msr_entry *msrs = msr_data.entries;
    int ret, i, n;

    n = 0;
    msrs[n++].index = MSR_IA32_SYSENTER_CS;
    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
    msrs[n++].index = MSR_PAT;
    if (has_msr_star) {
        msrs[n++].index = MSR_STAR;
    }
    if (has_msr_hsave_pa) {
        msrs[n++].index = MSR_VM_HSAVE_PA;
    }
    if (has_msr_tsc_deadline) {
        msrs[n++].index = MSR_IA32_TSCDEADLINE;
    }
    if (has_msr_misc_enable) {
        msrs[n++].index = MSR_IA32_MISC_ENABLE;
    }

    if (!env->tsc_valid) {
        msrs[n++].index = MSR_IA32_TSC;
        env->tsc_valid = !runstate_is_running();
    }

#ifdef TARGET_X86_64
    if (lm_capable_kernel) {
        msrs[n++].index = MSR_CSTAR;
        msrs[n++].index = MSR_KERNELGSBASE;
        msrs[n++].index = MSR_FMASK;
        msrs[n++].index = MSR_LSTAR;
    }
#endif
    msrs[n++].index = MSR_KVM_SYSTEM_TIME;
    msrs[n++].index = MSR_KVM_WALL_CLOCK;
    if (has_msr_async_pf_en) {
        msrs[n++].index = MSR_KVM_ASYNC_PF_EN;
    }
    if (has_msr_pv_eoi_en) {
        msrs[n++].index = MSR_KVM_PV_EOI_EN;
    }

    if (env->mcg_cap) {
        msrs[n++].index = MSR_MCG_STATUS;
        msrs[n++].index = MSR_MCG_CTL;
        for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
            msrs[n++].index = MSR_MC0_CTL + i;
        }
    }

    msr_data.info.nmsrs = n;
    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
    if (ret < 0) {
        return ret;
    }

    for (i = 0; i < ret; i++) {
        switch (msrs[i].index) {
        case MSR_IA32_SYSENTER_CS:
            env->sysenter_cs = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_ESP:
            env->sysenter_esp = msrs[i].data;
            break;
        case MSR_IA32_SYSENTER_EIP:
            env->sysenter_eip = msrs[i].data;
            break;
        case MSR_PAT:
            env->pat = msrs[i].data;
            break;
        case MSR_STAR:
            env->star = msrs[i].data;
            break;
#ifdef TARGET_X86_64
        case MSR_CSTAR:
            env->cstar = msrs[i].data;
            break;
        case MSR_KERNELGSBASE:
            env->kernelgsbase = msrs[i].data;
            break;
        case MSR_FMASK:
            env->fmask = msrs[i].data;
            break;
        case MSR_LSTAR:
            env->lstar = msrs[i].data;
            break;
#endif
        case MSR_IA32_TSC:
            env->tsc = msrs[i].data;
            break;
        case MSR_IA32_TSCDEADLINE:
            env->tsc_deadline = msrs[i].data;
            break;
        case MSR_VM_HSAVE_PA:
            env->vm_hsave = msrs[i].data;
            break;
        case MSR_KVM_SYSTEM_TIME:
            env->system_time_msr = msrs[i].data;
            break;
        case MSR_KVM_WALL_CLOCK:
            env->wall_clock_msr = msrs[i].data;
            break;
        case MSR_MCG_STATUS:
            env->mcg_status = msrs[i].data;
            break;
        case MSR_MCG_CTL:
            env->mcg_ctl = msrs[i].data;
            break;
        default:
            if (msrs[i].index >= MSR_MC0_CTL &&
                msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
                env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
            }
            break;
        case MSR_KVM_ASYNC_PF_EN:
            env->async_pf_en_msr = msrs[i].data;
            break;
        case MSR_KVM_PV_EOI_EN:
            env->pv_eoi_en_msr = msrs[i].data;
            break;
        }
    }

    return 0;
}

static int kvm_put_mp_state(X86CPU *cpu)
{
    struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
}

static int kvm_get_mp_state(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_mp_state mp_state;
    int ret;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MP_STATE, &mp_state);
    if (ret < 0) {
        return ret;
    }
    env->mp_state = mp_state.mp_state;
    if (kvm_irqchip_in_kernel()) {
        env->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
    }
    return 0;
}

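/* APIC state lives in the kernel only when the in-kernel irqchip is used;
 * otherwise QEMU's userspace APIC model is authoritative and there is
 * nothing to transfer in either direction.
 */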
static int kvm_get_apic(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    DeviceState *apic = env->apic_state;
    struct kvm_lapic_state kapic;
    int ret;

    if (apic && kvm_irqchip_in_kernel()) {
        ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
        if (ret < 0) {
            return ret;
        }

        kvm_get_apic_state(apic, &kapic);
    }
    return 0;
}

static int kvm_put_apic(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    DeviceState *apic = env->apic_state;
    struct kvm_lapic_state kapic;

    if (apic && kvm_irqchip_in_kernel()) {
        kvm_put_apic_state(apic, &kapic);

        return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_LAPIC, &kapic);
    }
    return 0;
}

static int kvm_put_vcpu_events(X86CPU *cpu, int level)
{
    CPUX86State *env = &cpu->env;
    struct kvm_vcpu_events events;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    events.exception.injected = (env->exception_injected >= 0);
    events.exception.nr = env->exception_injected;
    events.exception.has_error_code = env->has_error_code;
    events.exception.error_code = env->error_code;
    events.exception.pad = 0;

    events.interrupt.injected = (env->interrupt_injected >= 0);
    events.interrupt.nr = env->interrupt_injected;
    events.interrupt.soft = env->soft_interrupt;

    events.nmi.injected = env->nmi_injected;
    events.nmi.pending = env->nmi_pending;
    events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
    events.nmi.pad = 0;

    events.sipi_vector = env->sipi_vector;

    events.flags = 0;
    if (level >= KVM_PUT_RESET_STATE) {
        events.flags |=
            KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
    }

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
}

static int kvm_get_vcpu_events(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_vcpu_events events;
    int ret;

    if (!kvm_has_vcpu_events()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
    if (ret < 0) {
        return ret;
    }
    env->exception_injected =
        events.exception.injected ? events.exception.nr : -1;
    env->has_error_code = events.exception.has_error_code;
    env->error_code = events.exception.error_code;

    env->interrupt_injected =
        events.interrupt.injected ? events.interrupt.nr : -1;
    env->soft_interrupt = events.interrupt.soft;

    env->nmi_injected = events.nmi.injected;
    env->nmi_pending = events.nmi.pending;
    if (events.nmi.masked) {
        env->hflags2 |= HF2_NMI_MASK;
    } else {
        env->hflags2 &= ~HF2_NMI_MASK;
    }

    env->sipi_vector = events.sipi_vector;

    return 0;
}

static int kvm_guest_debug_workarounds(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    int ret = 0;
    unsigned long reinject_trap = 0;

    if (!kvm_has_vcpu_events()) {
        if (env->exception_injected == 1) {
            reinject_trap = KVM_GUESTDBG_INJECT_DB;
        } else if (env->exception_injected == 3) {
            reinject_trap = KVM_GUESTDBG_INJECT_BP;
        }
        env->exception_injected = -1;
    }

    /*
     * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
     * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
     * by updating the debug state once again if single-stepping is on.
     * Another reason to call kvm_update_guest_debug here is a pending debug
     * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
     * reinject them via SET_GUEST_DEBUG.
     */
    if (reinject_trap ||
        (!kvm_has_robust_singlestep() && env->singlestep_enabled)) {
        ret = kvm_update_guest_debug(env, reinject_trap);
    }
    return ret;
}

static int kvm_put_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    for (i = 0; i < 4; i++) {
        dbgregs.db[i] = env->dr[i];
    }
    dbgregs.dr6 = env->dr[6];
    dbgregs.dr7 = env->dr[7];
    dbgregs.flags = 0;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
}

static int kvm_get_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i, ret;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
    if (ret < 0) {
        return ret;
    }
    for (i = 0; i < 4; i++) {
        env->dr[i] = dbgregs.db[i];
    }
    env->dr[4] = env->dr[6] = dbgregs.dr6;
    env->dr[5] = env->dr[7] = dbgregs.dr7;

    return 0;
}

int kvm_arch_put_registers(CPUState *cpu, int level)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    int ret;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    ret = kvm_getput_regs(x86_cpu, 1);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xsave(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_xcrs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_sregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    /* must be before kvm_put_msrs */
    ret = kvm_inject_mce_oldstyle(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_msrs(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    if (level >= KVM_PUT_RESET_STATE) {
        ret = kvm_put_mp_state(x86_cpu);
        if (ret < 0) {
            return ret;
        }
        ret = kvm_put_apic(x86_cpu);
        if (ret < 0) {
            return ret;
        }
    }
    ret = kvm_put_vcpu_events(x86_cpu, level);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_put_debugregs(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    /* must be last */
    ret = kvm_guest_debug_workarounds(x86_cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

int kvm_arch_get_registers(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    int ret;

    assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));

    ret = kvm_getput_regs(cpu, 0);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_xsave(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_xcrs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_sregs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_msrs(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_mp_state(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_apic(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_vcpu_events(cpu);
    if (ret < 0) {
        return ret;
    }
    ret = kvm_get_debugregs(cpu);
    if (ret < 0) {
        return ret;
    }
    return 0;
}

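/* With the in-kernel irqchip, interrupt injection below is handled entirely
 * by KVM; userspace only queues NMIs here and, when the irqchip is emulated
 * in QEMU, feeds PIC interrupts through the interrupt window protocol.
 */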
void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int ret;

    /* Inject NMI */
    if (env->interrupt_request & CPU_INTERRUPT_NMI) {
        env->interrupt_request &= ~CPU_INTERRUPT_NMI;
        DPRINTF("injected NMI\n");
        ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
        if (ret < 0) {
            fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
                    strerror(-ret));
        }
    }

    if (!kvm_irqchip_in_kernel()) {
        /* Force the VCPU out of its inner loop to process any INIT requests
         * or pending TPR access reports. */
        if (env->interrupt_request &
            (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
            env->exit_request = 1;
        }

        /* Try to inject an interrupt if the guest can accept it */
        if (run->ready_for_interrupt_injection &&
            (env->interrupt_request & CPU_INTERRUPT_HARD) &&
            (env->eflags & IF_MASK)) {
            int irq;

            env->interrupt_request &= ~CPU_INTERRUPT_HARD;
            irq = cpu_get_pic_interrupt(env);
            if (irq >= 0) {
                struct kvm_interrupt intr;

                intr.irq = irq;
                DPRINTF("injected interrupt %d\n", irq);
                ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
                if (ret < 0) {
                    fprintf(stderr,
                            "KVM: injection failed, interrupt lost (%s)\n",
                            strerror(-ret));
                }
            }
        }

        /* If we have an interrupt but the guest is not ready to receive an
         * interrupt, request an interrupt window exit.  This will
         * cause a return to userspace as soon as the guest is ready to
         * receive interrupts. */
        if ((env->interrupt_request & CPU_INTERRUPT_HARD)) {
            run->request_interrupt_window = 1;
        } else {
            run->request_interrupt_window = 0;
        }

        DPRINTF("setting tpr\n");
        run->cr8 = cpu_get_apic_tpr(env->apic_state);
    }
}

void kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    if (run->if_flag) {
        env->eflags |= IF_MASK;
    } else {
        env->eflags &= ~IF_MASK;
    }
    cpu_set_apic_tpr(env->apic_state, run->cr8);
    cpu_set_apic_base(env->apic_state, run->apic_base);
}

int kvm_arch_process_async_events(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
        assert(env->mcg_cap);

        env->interrupt_request &= ~CPU_INTERRUPT_MCE;

        kvm_cpu_synchronize_state(env);

        if (env->exception_injected == EXCP08_DBLE) {
            /* An MCE on top of a pending double fault means a triple
             * fault: request a system reset. */
            qemu_system_reset_request();
            env->exit_request = 1;
            return 0;
        }
        env->exception_injected = EXCP12_MCHK;
        env->has_error_code = 0;

        env->halted = 0;
        if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
            env->mp_state = KVM_MP_STATE_RUNNABLE;
        }
    }

    if (kvm_irqchip_in_kernel()) {
        return 0;
    }

    if (env->interrupt_request & CPU_INTERRUPT_POLL) {
        env->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(env->apic_state);
    }
    if (((env->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (env->interrupt_request & CPU_INTERRUPT_NMI)) {
        env->halted = 0;
    }
    if (env->interrupt_request & CPU_INTERRUPT_INIT) {
        kvm_cpu_synchronize_state(env);
        do_cpu_init(cpu);
    }
    if (env->interrupt_request & CPU_INTERRUPT_SIPI) {
        kvm_cpu_synchronize_state(env);
        do_cpu_sipi(cpu);
    }
    if (env->interrupt_request & CPU_INTERRUPT_TPR) {
        env->interrupt_request &= ~CPU_INTERRUPT_TPR;
        kvm_cpu_synchronize_state(env);
        apic_handle_tpr_access_report(env->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return env->halted;
}

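/*
 * Editorial note (an assumption about the generic caller, not stated in this
 * file): the common KVM dispatch loop treats a nonzero return from this
 * function as EXCP_HLT, so the vCPU skips KVM_RUN while env->halted remains
 * set and no wakeup event is pending.
 */
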
static int kvm_handle_halt(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
        env->halted = 1;
        return EXCP_HLT;
    }

    return 0;
}

static int kvm_handle_tpr_access(CPUX86State *env)
{
    struct kvm_run *run = env->kvm_run;

    apic_handle_tpr_access_report(env->apic_state, run->tpr_access.rip,
                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
                                                           : TPR_ACCESS_READ);
    return 1;
}

int kvm_arch_insert_sw_breakpoint(CPUState *cpu, struct kvm_sw_breakpoint *bp)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    static const uint8_t int3 = 0xcc;

    if (cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&int3, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

int kvm_arch_remove_sw_breakpoint(CPUState *cpu, struct kvm_sw_breakpoint *bp)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint8_t int3;

    if (cpu_memory_rw_debug(env, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
        cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
        return -EINVAL;
    }
    return 0;
}

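/*
 * Editorial note: software breakpoints save the first byte of the target
 * instruction in bp->saved_insn and overwrite it with 0xcc (INT3).  Removal
 * verifies the byte is still 0xcc before restoring the saved one, so a guest
 * that rewrote the code page makes the removal fail with -EINVAL.
 */
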
static struct {
    target_ulong addr;
    int len;
    int type;
} hw_breakpoint[4];

static int nb_hw_breakpoint;

static int find_hw_breakpoint(target_ulong addr, int len, int type)
{
    int n;

    for (n = 0; n < nb_hw_breakpoint; n++) {
        if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
            (hw_breakpoint[n].len == len || len == -1)) {
            return n;
        }
    }
    return -1;
}

int kvm_arch_insert_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    switch (type) {
    case GDB_BREAKPOINT_HW:
        len = 1;
        break;
    case GDB_WATCHPOINT_WRITE:
    case GDB_WATCHPOINT_ACCESS:
        switch (len) {
        case 1:
            break;
        case 2:
        case 4:
        case 8:
            if (addr & (len - 1)) {
                return -EINVAL;
            }
            break;
        default:
            return -EINVAL;
        }
        break;
    default:
        return -ENOSYS;
    }

    if (nb_hw_breakpoint == 4) {
        return -ENOBUFS;
    }
    if (find_hw_breakpoint(addr, len, type) >= 0) {
        return -EEXIST;
    }
    hw_breakpoint[nb_hw_breakpoint].addr = addr;
    hw_breakpoint[nb_hw_breakpoint].len = len;
    hw_breakpoint[nb_hw_breakpoint].type = type;
    nb_hw_breakpoint++;

    return 0;
}

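/*
 * Editorial note: the four-entry table matches the four x86 debug address
 * registers DR0-DR3, hence the -ENOBUFS above.  The "addr & (len - 1)" check
 * mirrors the hardware requirement that a 2-, 4-, or 8-byte watchpoint be
 * naturally aligned.
 */
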
int kvm_arch_remove_hw_breakpoint(target_ulong addr,
                                  target_ulong len, int type)
{
    int n;

    n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
    if (n < 0) {
        return -ENOENT;
    }
    nb_hw_breakpoint--;
    hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];

    return 0;
}

void kvm_arch_remove_all_hw_breakpoints(void)
{
    nb_hw_breakpoint = 0;
}

static CPUWatchpoint hw_watchpoint;

static int kvm_handle_debug(CPUX86State *env,
                            struct kvm_debug_exit_arch *arch_info)
{
    int ret = 0;
    int n;

    if (arch_info->exception == 1) {
        if (arch_info->dr6 & (1 << 14)) {
            if (env->singlestep_enabled) {
                ret = EXCP_DEBUG;
            }
        } else {
            for (n = 0; n < 4; n++) {
                if (arch_info->dr6 & (1 << n)) {
                    switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
                    case 0x0:
                        ret = EXCP_DEBUG;
                        break;
                    case 0x1:
                        ret = EXCP_DEBUG;
                        env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_WRITE;
                        break;
                    case 0x3:
                        ret = EXCP_DEBUG;
                        env->watchpoint_hit = &hw_watchpoint;
                        hw_watchpoint.vaddr = hw_breakpoint[n].addr;
                        hw_watchpoint.flags = BP_MEM_ACCESS;
                        break;
                    }
                }
            }
        }
    } else if (kvm_find_sw_breakpoint(env, arch_info->pc)) {
        ret = EXCP_DEBUG;
    }
    if (ret == 0) {
        cpu_synchronize_state(env);
        assert(env->exception_injected == -1);

        /* pass to guest */
        env->exception_injected = arch_info->exception;
        env->has_error_code = 0;
    }

    return ret;
}

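/*
 * Editorial note on the #DB decoding above: DR6 bit 14 (BS) flags a
 * single-step trap, while DR6 bits 0-3 (B0-B3) identify which hardware
 * breakpoint fired.  The matching R/W field in DR7 (bits 16 + n*4) says
 * whether slot n was an execute (0x0), write (0x1), or read/write (0x3)
 * breakpoint, which is how the code distinguishes breakpoints from
 * watchpoints when reporting to gdb.
 */
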
void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    const uint8_t type_code[] = {
        [GDB_BREAKPOINT_HW] = 0x0,
        [GDB_WATCHPOINT_WRITE] = 0x1,
        [GDB_WATCHPOINT_ACCESS] = 0x3
    };
    const uint8_t len_code[] = {
        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
    };
    int n;

    if (kvm_sw_breakpoints_active(env)) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
    }
    if (nb_hw_breakpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        dbg->arch.debugreg[7] = 0x0600;
        for (n = 0; n < nb_hw_breakpoint; n++) {
            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
        }
    }
}

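/*
 * Editorial worked example: with a single 4-byte write watchpoint in slot 0,
 * the loop above builds DR7 = 0x0600 | (2 << 0) | (0x1 << 16) | (0x3 << 18),
 * i.e. G0 enabled (bit 1), R/W0 = write (bits 16-17), and LEN0 = 4 bytes
 * (bits 18-19).  The base value 0x0600 sets GE (bit 9) and reserved bit 10,
 * which architecturally must be 1.
 */
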
static bool host_supports_vmx(void)
{
    uint32_t ecx, unused;

    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
    return ecx & CPUID_EXT_VMX;
}

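/* Editorial note: CPUID_EXT_VMX is bit 5 of CPUID leaf 1's ECX output. */
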
#define VMX_INVALID_GUEST_STATE 0x80000021

int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t code;
    int ret;

    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        DPRINTF("handle_hlt\n");
        ret = kvm_handle_halt(cpu);
        break;
    case KVM_EXIT_SET_TPR:
        ret = 0;
        break;
    case KVM_EXIT_TPR_ACCESS:
        ret = kvm_handle_tpr_access(env);
        break;
    case KVM_EXIT_FAIL_ENTRY:
        code = run->fail_entry.hardware_entry_failure_reason;
        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
                code);
        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
            fprintf(stderr,
                    "\nIf you're running a guest on an Intel machine without "
                    "unrestricted mode\n"
                    "support, the failure is most likely due to the guest "
                    "entering an invalid\n"
                    "state for Intel VT. For example, the guest may be running "
                    "in big real mode,\n"
                    "which is not supported on older Intel processors."
                    "\n\n");
        }
        ret = -1;
        break;
    case KVM_EXIT_EXCEPTION:
        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
                run->ex.exception, run->ex.error_code);
        ret = -1;
        break;
    case KVM_EXIT_DEBUG:
        DPRINTF("kvm_exit_debug\n");
        ret = kvm_handle_debug(env, &run->debug.arch);
        break;
    default:
        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
        ret = -1;
        break;
    }

    return ret;
}

bool kvm_arch_stop_on_emulation_error(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    kvm_cpu_synchronize_state(env);
    return !(env->cr[0] & CR0_PE_MASK) ||
           ((env->segs[R_CS].selector & 3) != 3);
}

void kvm_arch_init_irq_routing(KVMState *s)
{
    if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
        /* If the kernel can't do IRQ routing, the interrupt source
         * override 0->2 cannot be set up as required by HPET.
         * So we have to disable it.
         */
        no_hpet = 1;
    }
    /* We know at this point that we're using the in-kernel
     * irqchip, so we can use irqfds, and on x86 we know
     * we can use MSI via irqfd and GSI routing.
     */
    kvm_irqfds_allowed = true;
    kvm_msi_via_irqfd_allowed = true;
    kvm_gsi_routing_allowed = true;
}

/* Classic KVM device assignment interface. Will remain x86 only. */
int kvm_device_pci_assign(KVMState *s, PCIHostDeviceAddress *dev_addr,
                          uint32_t flags, uint32_t *dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .segnr = dev_addr->domain,
        .busnr = dev_addr->bus,
        .devfn = PCI_DEVFN(dev_addr->slot, dev_addr->function),
        .flags = flags,
    };
    int ret;

    dev_data.assigned_dev_id =
        (dev_addr->domain << 16) | (dev_addr->bus << 8) | dev_data.devfn;

    ret = kvm_vm_ioctl(s, KVM_ASSIGN_PCI_DEVICE, &dev_data);
    if (ret < 0) {
        return ret;
    }

    *dev_id = dev_data.assigned_dev_id;

    return 0;
}

int kvm_device_pci_deassign(KVMState *s, uint32_t dev_id)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_PCI_DEVICE, &dev_data);
}

static int kvm_assign_irq_internal(KVMState *s, uint32_t dev_id,
                                   uint32_t irq_type, uint32_t guest_irq)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .guest_irq = guest_irq,
        .flags = irq_type,
    };

    if (kvm_check_extension(s, KVM_CAP_ASSIGN_DEV_IRQ)) {
        return kvm_vm_ioctl(s, KVM_ASSIGN_DEV_IRQ, &assigned_irq);
    } else {
        return kvm_vm_ioctl(s, KVM_ASSIGN_IRQ, &assigned_irq);
    }
}

int kvm_device_intx_assign(KVMState *s, uint32_t dev_id, bool use_host_msi,
                           uint32_t guest_irq)
{
    uint32_t irq_type = KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX);

    return kvm_assign_irq_internal(s, dev_id, irq_type, guest_irq);
}

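/*
 * Illustrative sketch (not part of the original file): how a caller might
 * combine the helpers above to hand a host PCI function to the guest with
 * legacy INTx.  The zero flags value and the bare error handling are
 * assumptions for illustration only.
 */
static int example_assign_with_intx(KVMState *s, PCIHostDeviceAddress *addr,
                                    uint32_t guest_irq)
{
    uint32_t dev_id;
    int ret;

    ret = kvm_device_pci_assign(s, addr, 0, &dev_id);
    if (ret < 0) {
        return ret;
    }
    /* Route the device's host INTx line to the guest IRQ. */
    return kvm_device_intx_assign(s, dev_id, false, guest_irq);
}
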
int kvm_device_intx_set_mask(KVMState *s, uint32_t dev_id, bool masked)
{
    struct kvm_assigned_pci_dev dev_data = {
        .assigned_dev_id = dev_id,
        .flags = masked ? KVM_DEV_ASSIGN_MASK_INTX : 0,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_INTX_MASK, &dev_data);
}

static int kvm_deassign_irq_internal(KVMState *s, uint32_t dev_id,
                                     uint32_t type)
{
    struct kvm_assigned_irq assigned_irq = {
        .assigned_dev_id = dev_id,
        .flags = type,
    };

    return kvm_vm_ioctl(s, KVM_DEASSIGN_DEV_IRQ, &assigned_irq);
}

int kvm_device_intx_deassign(KVMState *s, uint32_t dev_id, bool use_host_msi)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_INTX |
        (use_host_msi ? KVM_DEV_IRQ_HOST_MSI : KVM_DEV_IRQ_HOST_INTX));
}

int kvm_device_msi_assign(KVMState *s, uint32_t dev_id, int virq)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSI |
                                   KVM_DEV_IRQ_GUEST_MSI, virq);
}

int kvm_device_msi_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSI |
                                     KVM_DEV_IRQ_HOST_MSI);
}

bool kvm_device_msix_supported(KVMState *s)
{
    /* The kernel lacks a corresponding KVM_CAP, so we probe by calling
     * KVM_ASSIGN_SET_MSIX_NR with an invalid parameter. */
    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, NULL) == -EFAULT;
}

int kvm_device_msix_init_vectors(KVMState *s, uint32_t dev_id,
                                 uint32_t nr_vectors)
{
    struct kvm_assigned_msix_nr msix_nr = {
        .assigned_dev_id = dev_id,
        .entry_nr = nr_vectors,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_NR, &msix_nr);
}

int kvm_device_msix_set_vector(KVMState *s, uint32_t dev_id, uint32_t vector,
                               int virq)
{
    struct kvm_assigned_msix_entry msix_entry = {
        .assigned_dev_id = dev_id,
        .gsi = virq,
        .entry = vector,
    };

    return kvm_vm_ioctl(s, KVM_ASSIGN_SET_MSIX_ENTRY, &msix_entry);
}

int kvm_device_msix_assign(KVMState *s, uint32_t dev_id)
{
    return kvm_assign_irq_internal(s, dev_id, KVM_DEV_IRQ_HOST_MSIX |
                                   KVM_DEV_IRQ_GUEST_MSIX, 0);
}

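/*
 * Illustrative sketch (not part of the original file): the expected calling
 * order for MSI-X assignment is to declare the vector count, program each
 * vector's GSI, then enable MSI-X on the device.  The virqs[] array is an
 * assumed input, e.g. routes set up beforehand with
 * kvm_irqchip_add_msi_route().
 */
static int example_enable_msix(KVMState *s, uint32_t dev_id,
                               const int *virqs, uint32_t nr_vectors)
{
    uint32_t i;
    int ret;

    ret = kvm_device_msix_init_vectors(s, dev_id, nr_vectors);
    if (ret < 0) {
        return ret;
    }
    for (i = 0; i < nr_vectors; i++) {
        ret = kvm_device_msix_set_vector(s, dev_id, i, virqs[i]);
        if (ret < 0) {
            return ret;
        }
    }
    return kvm_device_msix_assign(s, dev_id);
}
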
int kvm_device_msix_deassign(KVMState *s, uint32_t dev_id)
{
    return kvm_deassign_irq_internal(s, dev_id, KVM_DEV_IRQ_GUEST_MSIX |
                                     KVM_DEV_IRQ_HOST_MSIX);
}