[qemu.git] / target-i386 / kvm.c  (commit: Add "broadcast" option for mce command)
1 /*
2 * QEMU KVM support
3 *
4 * Copyright (C) 2006-2008 Qumranet Technologies
5 * Copyright IBM, Corp. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
11 * See the COPYING file in the top-level directory.
12 *
13 */
14
15 #include <sys/types.h>
16 #include <sys/ioctl.h>
17 #include <sys/mman.h>
18 #include <sys/utsname.h>
19
20 #include <linux/kvm.h>
21
22 #include "qemu-common.h"
23 #include "sysemu.h"
24 #include "kvm.h"
25 #include "cpu.h"
26 #include "gdbstub.h"
27 #include "host-utils.h"
28 #include "hw/pc.h"
29 #include "hw/apic.h"
30 #include "ioport.h"
31 #include "kvm_x86.h"
32
33 #ifdef CONFIG_KVM_PARA
34 #include <linux/kvm_para.h>
35 #endif
36 //
37 //#define DEBUG_KVM
38
39 #ifdef DEBUG_KVM
40 #define DPRINTF(fmt, ...) \
41 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
42 #else
43 #define DPRINTF(fmt, ...) \
44 do { } while (0)
45 #endif
46
47 #define MSR_KVM_WALL_CLOCK 0x11
48 #define MSR_KVM_SYSTEM_TIME 0x12
49
50 #ifndef BUS_MCEERR_AR
51 #define BUS_MCEERR_AR 4
52 #endif
53 #ifndef BUS_MCEERR_AO
54 #define BUS_MCEERR_AO 5
55 #endif
56
57 static int lm_capable_kernel;
58
59 #ifdef KVM_CAP_EXT_CPUID
60
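/*
 * Ask the kernel for its supported CPUID entries, providing room for up to
 * 'max' entries.  Returns NULL (after freeing the buffer) if the table was
 * too small, so the caller can retry with a larger 'max'; exits on any
 * other error.
 */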
61 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
62 {
63 struct kvm_cpuid2 *cpuid;
64 int r, size;
65
66 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
67 cpuid = (struct kvm_cpuid2 *)qemu_mallocz(size);
68 cpuid->nent = max;
69 r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
70 if (r == 0 && cpuid->nent >= max) {
71 r = -E2BIG;
72 }
73 if (r < 0) {
74 if (r == -E2BIG) {
75 qemu_free(cpuid);
76 return NULL;
77 } else {
78 fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
79 strerror(-r));
80 exit(1);
81 }
82 }
83 return cpuid;
84 }
85
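/*
 * Look up the host-supported value of one CPUID register for the given
 * function/index.  For leaf 1 EDX the result is patched with the bits that
 * pre-2.6.30 kernels misreport; for leaf 0x80000001 EDX the Intel-only
 * answer is widened with the leaf 1 EDX bits that AMD also defines there.
 */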
86 uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
87 uint32_t index, int reg)
88 {
89 struct kvm_cpuid2 *cpuid;
90 int i, max;
91 uint32_t ret = 0;
92 uint32_t cpuid_1_edx;
93
94 if (!kvm_check_extension(env->kvm_state, KVM_CAP_EXT_CPUID)) {
95 return -1U;
96 }
97
98 max = 1;
99 while ((cpuid = try_get_cpuid(env->kvm_state, max)) == NULL) {
100 max *= 2;
101 }
102
103 for (i = 0; i < cpuid->nent; ++i) {
104 if (cpuid->entries[i].function == function &&
105 cpuid->entries[i].index == index) {
106 switch (reg) {
107 case R_EAX:
108 ret = cpuid->entries[i].eax;
109 break;
110 case R_EBX:
111 ret = cpuid->entries[i].ebx;
112 break;
113 case R_ECX:
114 ret = cpuid->entries[i].ecx;
115 break;
116 case R_EDX:
117 ret = cpuid->entries[i].edx;
118 switch (function) {
119 case 1:
120 /* KVM before 2.6.30 misreports the following features */
121 ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
122 break;
123 case 0x80000001:
124 /* On Intel, kvm returns cpuid according to the Intel spec,
125 * so add missing bits according to the AMD spec:
126 */
127 cpuid_1_edx = kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
128 ret |= cpuid_1_edx & 0x183f7ff;
129 break;
130 }
131 break;
132 }
133 }
134 }
135
136 qemu_free(cpuid);
137
138 return ret;
139 }
140
141 #else
142
143 uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function,
144 uint32_t index, int reg)
145 {
146 return -1U;
147 }
148
149 #endif
150
151 #ifdef CONFIG_KVM_PARA
152 struct kvm_para_features {
153 int cap;
154 int feature;
155 } para_features[] = {
156 #ifdef KVM_CAP_CLOCKSOURCE
157 { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
158 #endif
159 #ifdef KVM_CAP_NOP_IO_DELAY
160 { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
161 #endif
162 #ifdef KVM_CAP_PV_MMU
163 { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
164 #endif
165 #ifdef KVM_CAP_ASYNC_PF
166 { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
167 #endif
168 { -1, -1 }
169 };
170
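/*
 * Build the KVM paravirtual feature bitmap (CPUID leaf KVM_CPUID_FEATURES)
 * from the capabilities the running kernel advertises.
 */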
171 static int get_para_features(CPUState *env)
172 {
173 int i, features = 0;
174
175 for (i = 0; i < ARRAY_SIZE(para_features) - 1; i++) {
176 if (kvm_check_extension(env->kvm_state, para_features[i].cap))
177 features |= (1 << para_features[i].feature);
178 }
179
180 return features;
181 }
182 #endif
183
184 #ifdef KVM_CAP_MCE
185 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
186 int *max_banks)
187 {
188 int r;
189
190 r = kvm_check_extension(s, KVM_CAP_MCE);
191 if (r > 0) {
192 *max_banks = r;
193 return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
194 }
195 return -ENOSYS;
196 }
197
198 static int kvm_setup_mce(CPUState *env, uint64_t *mcg_cap)
199 {
200 return kvm_vcpu_ioctl(env, KVM_X86_SETUP_MCE, mcg_cap);
201 }
202
203 static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
204 {
205 return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
206 }
207
208 static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
209 {
210 struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
211 int r;
212
213 kmsrs->nmsrs = n;
214 memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
215 r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
216 memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
217 free(kmsrs);
218 return r;
219 }
220
221 /* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
222 static int kvm_mce_in_exception(CPUState *env)
223 {
224 struct kvm_msr_entry msr_mcg_status = {
225 .index = MSR_MCG_STATUS,
226 };
227 int r;
228
229 r = kvm_get_msr(env, &msr_mcg_status, 1);
230 if (r == -1 || r == 0) {
231 return -1;
232 }
233 return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
234 }
235
236 struct kvm_x86_mce_data
237 {
238 CPUState *env;
239 struct kvm_x86_mce *mce;
240 int abort_on_error;
241 };
242
243 static void kvm_do_inject_x86_mce(void *_data)
244 {
245 struct kvm_x86_mce_data *data = _data;
246 int r;
247
248 /* If there is an MCE exception being processed, ignore this SRAO MCE */
249 if ((data->env->mcg_cap & MCG_SER_P) &&
250 !(data->mce->status & MCI_STATUS_AR)) {
251 r = kvm_mce_in_exception(data->env);
252 if (r == -1) {
253 fprintf(stderr, "Failed to get MCE status\n");
254 } else if (r) {
255 return;
256 }
257 }
258
259 r = kvm_set_mce(data->env, data->mce);
260 if (r < 0) {
261 perror("kvm_set_mce FAILED");
262 if (data->abort_on_error) {
263 abort();
264 }
265 }
266 }
267
268 static void kvm_mce_broadcast_rest(CPUState *env);
269 #endif
270
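/*
 * Inject a machine check into 'cenv'.  The MCE is delivered on the target
 * VCPU's own thread via run_on_cpu(); with MCE_BROADCAST set it is first
 * fanned out to all other VCPUs (see kvm_mce_broadcast_rest()).
 */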
271 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
272 uint64_t mcg_status, uint64_t addr, uint64_t misc,
273 int flag)
274 {
275 #ifdef KVM_CAP_MCE
276 struct kvm_x86_mce mce = {
277 .bank = bank,
278 .status = status,
279 .mcg_status = mcg_status,
280 .addr = addr,
281 .misc = misc,
282 };
283 struct kvm_x86_mce_data data = {
284 .env = cenv,
285 .mce = &mce,
286 };
287
288 if (!cenv->mcg_cap) {
289 fprintf(stderr, "MCE support is not enabled!\n");
290 return;
291 }
292
293 if (flag & MCE_BROADCAST) {
294 kvm_mce_broadcast_rest(cenv);
295 }
296
297 run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
298 #else
299 if (flag & ABORT_ON_ERROR) {
300 abort();
301 }
302 #endif
303 }
304
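/*
 * Build the CPUID table exposed to the guest: the KVM signature and feature
 * leaves, the standard and extended leaves reported by cpu_x86_cpuid()
 * (masked by what the host kernel supports), and, if the CPU model and
 * kernel allow it, MCE emulation via KVM_X86_SETUP_MCE.
 */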
305 int kvm_arch_init_vcpu(CPUState *env)
306 {
307 struct {
308 struct kvm_cpuid2 cpuid;
309 struct kvm_cpuid_entry2 entries[100];
310 } __attribute__((packed)) cpuid_data;
311 uint32_t limit, i, j, cpuid_i;
312 uint32_t unused;
313 struct kvm_cpuid_entry2 *c;
314 #ifdef KVM_CPUID_SIGNATURE
315 uint32_t signature[3];
316 #endif
317
318 env->mp_state = KVM_MP_STATE_RUNNABLE;
319
320 env->cpuid_features &= kvm_arch_get_supported_cpuid(env, 1, 0, R_EDX);
321
322 i = env->cpuid_ext_features & CPUID_EXT_HYPERVISOR;
323 env->cpuid_ext_features &= kvm_arch_get_supported_cpuid(env, 1, 0, R_ECX);
324 env->cpuid_ext_features |= i;
325
326 env->cpuid_ext2_features &= kvm_arch_get_supported_cpuid(env, 0x80000001,
327 0, R_EDX);
328 env->cpuid_ext3_features &= kvm_arch_get_supported_cpuid(env, 0x80000001,
329 0, R_ECX);
330 env->cpuid_svm_features &= kvm_arch_get_supported_cpuid(env, 0x8000000A,
331 0, R_EDX);
332
333
334 cpuid_i = 0;
335
336 #ifdef CONFIG_KVM_PARA
337 /* Paravirtualization CPUIDs */
338 memcpy(signature, "KVMKVMKVM\0\0\0", 12);
339 c = &cpuid_data.entries[cpuid_i++];
340 memset(c, 0, sizeof(*c));
341 c->function = KVM_CPUID_SIGNATURE;
342 c->eax = 0;
343 c->ebx = signature[0];
344 c->ecx = signature[1];
345 c->edx = signature[2];
346
347 c = &cpuid_data.entries[cpuid_i++];
348 memset(c, 0, sizeof(*c));
349 c->function = KVM_CPUID_FEATURES;
350 c->eax = env->cpuid_kvm_features & get_para_features(env);
351 #endif
352
353 cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
354
355 for (i = 0; i <= limit; i++) {
356 c = &cpuid_data.entries[cpuid_i++];
357
358 switch (i) {
359 case 2: {
360 /* Keep reading function 2 till all the input is received */
361 int times;
362
363 c->function = i;
364 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
365 KVM_CPUID_FLAG_STATE_READ_NEXT;
366 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
367 times = c->eax & 0xff;
368
369 for (j = 1; j < times; ++j) {
370 c = &cpuid_data.entries[cpuid_i++];
371 c->function = i;
372 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
373 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
374 }
375 break;
376 }
377 case 4:
378 case 0xb:
379 case 0xd:
380 for (j = 0; ; j++) {
381 c->function = i;
382 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
383 c->index = j;
384 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
385
386 if (i == 4 && c->eax == 0)
387 break;
388 if (i == 0xb && !(c->ecx & 0xff00))
389 break;
390 if (i == 0xd && c->eax == 0)
391 break;
392
393 c = &cpuid_data.entries[cpuid_i++];
394 }
395 break;
396 default:
397 c->function = i;
398 c->flags = 0;
399 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
400 break;
401 }
402 }
403 cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
404
405 for (i = 0x80000000; i <= limit; i++) {
406 c = &cpuid_data.entries[cpuid_i++];
407
408 c->function = i;
409 c->flags = 0;
410 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
411 }
412
413 cpuid_data.cpuid.nent = cpuid_i;
414
415 #ifdef KVM_CAP_MCE
416 if (((env->cpuid_version >> 8)&0xF) >= 6
417 && (env->cpuid_features&(CPUID_MCE|CPUID_MCA)) == (CPUID_MCE|CPUID_MCA)
418 && kvm_check_extension(env->kvm_state, KVM_CAP_MCE) > 0) {
419 uint64_t mcg_cap;
420 int banks;
421
422 if (kvm_get_mce_cap_supported(env->kvm_state, &mcg_cap, &banks))
423 perror("kvm_get_mce_cap_supported FAILED");
424 else {
425 if (banks > MCE_BANKS_DEF)
426 banks = MCE_BANKS_DEF;
427 mcg_cap &= MCE_CAP_DEF;
428 mcg_cap |= banks;
429 if (kvm_setup_mce(env, &mcg_cap))
430 perror("kvm_setup_mce FAILED");
431 else
432 env->mcg_cap = mcg_cap;
433 }
434 }
435 #endif
436
437 return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, &cpuid_data);
438 }
439
440 void kvm_arch_reset_vcpu(CPUState *env)
441 {
442 env->exception_injected = -1;
443 env->interrupt_injected = -1;
444 env->nmi_injected = 0;
445 env->nmi_pending = 0;
446 if (kvm_irqchip_in_kernel()) {
447 env->mp_state = cpu_is_bsp(env) ? KVM_MP_STATE_RUNNABLE :
448 KVM_MP_STATE_UNINITIALIZED;
449 } else {
450 env->mp_state = KVM_MP_STATE_RUNNABLE;
451 }
452 }
453
454 int has_msr_star;
455 int has_msr_hsave_pa;
456
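/*
 * Query KVM_GET_MSR_INDEX_LIST once and remember whether MSR_STAR and
 * MSR_VM_HSAVE_PA appear on the list of MSRs the kernel can save/restore.
 */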
457 static void kvm_supported_msrs(CPUState *env)
458 {
459 static int kvm_supported_msrs;
460 int ret;
461
462 /* first time */
463 if (kvm_supported_msrs == 0) {
464 struct kvm_msr_list msr_list, *kvm_msr_list;
465
466 kvm_supported_msrs = -1;
467
468 /* Obtain MSR list from KVM. These are the MSRs that we must
469 * save/restore */
470 msr_list.nmsrs = 0;
471 ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, &msr_list);
472 if (ret < 0 && ret != -E2BIG) {
473 return;
474 }
475 /* Old kernel modules had a bug and could write beyond the provided
476 memory. Allocate at least a safe amount of 1K. */
477 kvm_msr_list = qemu_mallocz(MAX(1024, sizeof(msr_list) +
478 msr_list.nmsrs *
479 sizeof(msr_list.indices[0])));
480
481 kvm_msr_list->nmsrs = msr_list.nmsrs;
482 ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
483 if (ret >= 0) {
484 int i;
485
486 for (i = 0; i < kvm_msr_list->nmsrs; i++) {
487 if (kvm_msr_list->indices[i] == MSR_STAR) {
488 has_msr_star = 1;
489 continue;
490 }
491 if (kvm_msr_list->indices[i] == MSR_VM_HSAVE_PA) {
492 has_msr_hsave_pa = 1;
493 continue;
494 }
495 }
496 }
497
498 free(kvm_msr_list);
499 }
500
501 return;
502 }
503
504 static int kvm_has_msr_hsave_pa(CPUState *env)
505 {
506 kvm_supported_msrs(env);
507 return has_msr_hsave_pa;
508 }
509
510 static int kvm_has_msr_star(CPUState *env)
511 {
512 kvm_supported_msrs(env);
513 return has_msr_star;
514 }
515
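/*
 * Place the EPT identity map page at 0xfffbc000, one page below the TSS
 * range set up in kvm_arch_init(), on kernels that support
 * KVM_CAP_SET_IDENTITY_MAP_ADDR.
 */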
516 static int kvm_init_identity_map_page(KVMState *s)
517 {
518 #ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
519 int ret;
520 uint64_t addr = 0xfffbc000;
521
522 if (!kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
523 return 0;
524 }
525
526 ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &addr);
527 if (ret < 0) {
528 fprintf(stderr, "kvm_set_identity_map_addr: %s\n", strerror(ret));
529 return ret;
530 }
531 #endif
532 return 0;
533 }
534
535 int kvm_arch_init(KVMState *s, int smp_cpus)
536 {
537 int ret;
538
539 struct utsname utsname;
540
541 uname(&utsname);
542 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
543
544 /* create vm86 tss. KVM uses vm86 mode to emulate 16-bit code
545 * directly. In order to use vm86 mode, a TSS is needed. Since this
546 * must be part of guest physical memory, we need to allocate it. Older
547 * versions of KVM just assumed that it would be at the end of physical
548 * memory but that doesn't work with more than 4GB of memory. We simply
549 * refuse to work with those older versions of KVM. */
550 ret = kvm_check_extension(s, KVM_CAP_SET_TSS_ADDR);
551 if (ret <= 0) {
552 fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n");
553 return ret;
554 }
555
556 	/* This address is 3 pages before the BIOS, and the BIOS should report it
557 	 * as unavailable memory. FIXME: need to ensure the e820 map deals with
558 * this?
559 */
560 /*
561 * Tell fw_cfg to notify the BIOS to reserve the range.
562 */
563 if (e820_add_entry(0xfffbc000, 0x4000, E820_RESERVED) < 0) {
564 perror("e820_add_entry() table is full");
565 exit(1);
566 }
567 ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, 0xfffbd000);
568 if (ret < 0) {
569 return ret;
570 }
571
572 return kvm_init_identity_map_page(s);
573 }
574
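/*
 * Segment conversion helpers: set_v8086_seg() and set_seg() translate a
 * QEMU SegmentCache into the kvm_segment layout (with fixed attributes for
 * virtual-8086 mode), and get_seg() performs the reverse translation.
 */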
575 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
576 {
577 lhs->selector = rhs->selector;
578 lhs->base = rhs->base;
579 lhs->limit = rhs->limit;
580 lhs->type = 3;
581 lhs->present = 1;
582 lhs->dpl = 3;
583 lhs->db = 0;
584 lhs->s = 1;
585 lhs->l = 0;
586 lhs->g = 0;
587 lhs->avl = 0;
588 lhs->unusable = 0;
589 }
590
591 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
592 {
593 unsigned flags = rhs->flags;
594 lhs->selector = rhs->selector;
595 lhs->base = rhs->base;
596 lhs->limit = rhs->limit;
597 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
598 lhs->present = (flags & DESC_P_MASK) != 0;
599 lhs->dpl = rhs->selector & 3;
600 lhs->db = (flags >> DESC_B_SHIFT) & 1;
601 lhs->s = (flags & DESC_S_MASK) != 0;
602 lhs->l = (flags >> DESC_L_SHIFT) & 1;
603 lhs->g = (flags & DESC_G_MASK) != 0;
604 lhs->avl = (flags & DESC_AVL_MASK) != 0;
605 lhs->unusable = 0;
606 }
607
608 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
609 {
610 lhs->selector = rhs->selector;
611 lhs->base = rhs->base;
612 lhs->limit = rhs->limit;
613 lhs->flags =
614 (rhs->type << DESC_TYPE_SHIFT)
615 | (rhs->present * DESC_P_MASK)
616 | (rhs->dpl << DESC_DPL_SHIFT)
617 | (rhs->db << DESC_B_SHIFT)
618 | (rhs->s * DESC_S_MASK)
619 | (rhs->l << DESC_L_SHIFT)
620 | (rhs->g * DESC_G_MASK)
621 | (rhs->avl * DESC_AVL_MASK);
622 }
623
624 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
625 {
626 if (set)
627 *kvm_reg = *qemu_reg;
628 else
629 *qemu_reg = *kvm_reg;
630 }
631
632 static int kvm_getput_regs(CPUState *env, int set)
633 {
634 struct kvm_regs regs;
635 int ret = 0;
636
637 if (!set) {
638 ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
639 if (ret < 0)
640 return ret;
641 }
642
643 kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
644 kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
645 kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
646 kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
647 kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
648 kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
649 kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
650 kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
651 #ifdef TARGET_X86_64
652 kvm_getput_reg(&regs.r8, &env->regs[8], set);
653 kvm_getput_reg(&regs.r9, &env->regs[9], set);
654 kvm_getput_reg(&regs.r10, &env->regs[10], set);
655 kvm_getput_reg(&regs.r11, &env->regs[11], set);
656 kvm_getput_reg(&regs.r12, &env->regs[12], set);
657 kvm_getput_reg(&regs.r13, &env->regs[13], set);
658 kvm_getput_reg(&regs.r14, &env->regs[14], set);
659 kvm_getput_reg(&regs.r15, &env->regs[15], set);
660 #endif
661
662 kvm_getput_reg(&regs.rflags, &env->eflags, set);
663 kvm_getput_reg(&regs.rip, &env->eip, set);
664
665 if (set)
666 ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, &regs);
667
668 return ret;
669 }
670
671 static int kvm_put_fpu(CPUState *env)
672 {
673 struct kvm_fpu fpu;
674 int i;
675
676 memset(&fpu, 0, sizeof fpu);
677 fpu.fsw = env->fpus & ~(7 << 11);
678 fpu.fsw |= (env->fpstt & 7) << 11;
679 fpu.fcw = env->fpuc;
680 for (i = 0; i < 8; ++i)
681 fpu.ftwx |= (!env->fptags[i]) << i;
682 memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
683 memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
684 fpu.mxcsr = env->mxcsr;
685
686 return kvm_vcpu_ioctl(env, KVM_SET_FPU, &fpu);
687 }
688
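/*
 * Offsets into kvm_xsave.region[], in 32-bit words, mirroring the hardware
 * XSAVE area layout: the legacy FXSAVE header and register space, the
 * XSTATE_BV header field, and the AVX high halves of the YMM registers.
 */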
689 #ifdef KVM_CAP_XSAVE
690 #define XSAVE_CWD_RIP 2
691 #define XSAVE_CWD_RDP 4
692 #define XSAVE_MXCSR 6
693 #define XSAVE_ST_SPACE 8
694 #define XSAVE_XMM_SPACE 40
695 #define XSAVE_XSTATE_BV 128
696 #define XSAVE_YMMH_SPACE 144
697 #endif
698
699 static int kvm_put_xsave(CPUState *env)
700 {
701 #ifdef KVM_CAP_XSAVE
702 int i, r;
703 struct kvm_xsave* xsave;
704 uint16_t cwd, swd, twd, fop;
705
706 if (!kvm_has_xsave())
707 return kvm_put_fpu(env);
708
709 xsave = qemu_memalign(4096, sizeof(struct kvm_xsave));
710 memset(xsave, 0, sizeof(struct kvm_xsave));
711 cwd = swd = twd = fop = 0;
712 swd = env->fpus & ~(7 << 11);
713 swd |= (env->fpstt & 7) << 11;
714 cwd = env->fpuc;
715 for (i = 0; i < 8; ++i)
716 twd |= (!env->fptags[i]) << i;
717 xsave->region[0] = (uint32_t)(swd << 16) + cwd;
718 xsave->region[1] = (uint32_t)(fop << 16) + twd;
719 memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
720 sizeof env->fpregs);
721 memcpy(&xsave->region[XSAVE_XMM_SPACE], env->xmm_regs,
722 sizeof env->xmm_regs);
723 xsave->region[XSAVE_MXCSR] = env->mxcsr;
724 *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
725 memcpy(&xsave->region[XSAVE_YMMH_SPACE], env->ymmh_regs,
726 sizeof env->ymmh_regs);
727 r = kvm_vcpu_ioctl(env, KVM_SET_XSAVE, xsave);
728 qemu_free(xsave);
729 return r;
730 #else
731 return kvm_put_fpu(env);
732 #endif
733 }
734
735 static int kvm_put_xcrs(CPUState *env)
736 {
737 #ifdef KVM_CAP_XCRS
738 struct kvm_xcrs xcrs;
739
740 if (!kvm_has_xcrs())
741 return 0;
742
743 xcrs.nr_xcrs = 1;
744 xcrs.flags = 0;
745 xcrs.xcrs[0].xcr = 0;
746 xcrs.xcrs[0].value = env->xcr0;
747 return kvm_vcpu_ioctl(env, KVM_SET_XCRS, &xcrs);
748 #else
749 return 0;
750 #endif
751 }
752
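/*
 * Write the special registers: segment and descriptor-table state (using
 * the fixed virtual-8086 attributes when EFLAGS.VM is set), control
 * registers, APIC base/TPR and EFER, plus the single pending-interrupt bit
 * KVM expects in interrupt_bitmap.
 */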
753 static int kvm_put_sregs(CPUState *env)
754 {
755 struct kvm_sregs sregs;
756
757 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
758 if (env->interrupt_injected >= 0) {
759 sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
760 (uint64_t)1 << (env->interrupt_injected % 64);
761 }
762
763 if ((env->eflags & VM_MASK)) {
764 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
765 set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
766 set_v8086_seg(&sregs.es, &env->segs[R_ES]);
767 set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
768 set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
769 set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
770 } else {
771 set_seg(&sregs.cs, &env->segs[R_CS]);
772 set_seg(&sregs.ds, &env->segs[R_DS]);
773 set_seg(&sregs.es, &env->segs[R_ES]);
774 set_seg(&sregs.fs, &env->segs[R_FS]);
775 set_seg(&sregs.gs, &env->segs[R_GS]);
776 set_seg(&sregs.ss, &env->segs[R_SS]);
777
778 if (env->cr[0] & CR0_PE_MASK) {
779 /* force ss cpl to cs cpl */
780 sregs.ss.selector = (sregs.ss.selector & ~3) |
781 (sregs.cs.selector & 3);
782 sregs.ss.dpl = sregs.ss.selector & 3;
783 }
784 }
785
786 set_seg(&sregs.tr, &env->tr);
787 set_seg(&sregs.ldt, &env->ldt);
788
789 sregs.idt.limit = env->idt.limit;
790 sregs.idt.base = env->idt.base;
791 sregs.gdt.limit = env->gdt.limit;
792 sregs.gdt.base = env->gdt.base;
793
794 sregs.cr0 = env->cr[0];
795 sregs.cr2 = env->cr[2];
796 sregs.cr3 = env->cr[3];
797 sregs.cr4 = env->cr[4];
798
799 sregs.cr8 = cpu_get_apic_tpr(env->apic_state);
800 sregs.apic_base = cpu_get_apic_base(env->apic_state);
801
802 sregs.efer = env->efer;
803
804 return kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs);
805 }
806
807 static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
808 uint32_t index, uint64_t value)
809 {
810 entry->index = index;
811 entry->data = value;
812 }
813
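/*
 * Write the guest MSRs.  The base set (SYSENTER, STAR/LSTAR/CSTAR, etc.) is
 * always transferred; full-state writes additionally include the TSC and
 * kvmclock MSRs and, when MCE is enabled, MCG_CTL and the MCE bank
 * registers.
 */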
814 static int kvm_put_msrs(CPUState *env, int level)
815 {
816 struct {
817 struct kvm_msrs info;
818 struct kvm_msr_entry entries[100];
819 } msr_data;
820 struct kvm_msr_entry *msrs = msr_data.entries;
821 int n = 0;
822
823 kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
824 kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
825 kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
826 if (kvm_has_msr_star(env))
827 kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
828 if (kvm_has_msr_hsave_pa(env))
829 kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
830 #ifdef TARGET_X86_64
831 if (lm_capable_kernel) {
832 kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
833 kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
834 kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
835 kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
836 }
837 #endif
838 if (level == KVM_PUT_FULL_STATE) {
839 /*
840 	 * KVM is not yet able to synchronize the TSC values of multiple VCPUs on
841 * writeback. Until this is fixed, we only write the offset to SMP
842 * guests after migration, desynchronizing the VCPUs, but avoiding
843 * huge jump-backs that would occur without any writeback at all.
844 */
845 if (smp_cpus == 1 || env->tsc != 0) {
846 kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
847 }
848 kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
849 env->system_time_msr);
850 kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
851 #ifdef KVM_CAP_ASYNC_PF
852 kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
853 #endif
854 }
855 #ifdef KVM_CAP_MCE
856 if (env->mcg_cap) {
857 int i;
858 if (level == KVM_PUT_RESET_STATE)
859 kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
860 else if (level == KVM_PUT_FULL_STATE) {
861 kvm_msr_entry_set(&msrs[n++], MSR_MCG_STATUS, env->mcg_status);
862 kvm_msr_entry_set(&msrs[n++], MSR_MCG_CTL, env->mcg_ctl);
863 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
864 kvm_msr_entry_set(&msrs[n++], MSR_MC0_CTL + i, env->mce_banks[i]);
865 }
866 }
867 #endif
868
869 msr_data.info.nmsrs = n;
870
871 return kvm_vcpu_ioctl(env, KVM_SET_MSRS, &msr_data);
872
873 }
874
875
876 static int kvm_get_fpu(CPUState *env)
877 {
878 struct kvm_fpu fpu;
879 int i, ret;
880
881 ret = kvm_vcpu_ioctl(env, KVM_GET_FPU, &fpu);
882 if (ret < 0)
883 return ret;
884
885 env->fpstt = (fpu.fsw >> 11) & 7;
886 env->fpus = fpu.fsw;
887 env->fpuc = fpu.fcw;
888 for (i = 0; i < 8; ++i)
889 env->fptags[i] = !((fpu.ftwx >> i) & 1);
890 memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
891 memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
892 env->mxcsr = fpu.mxcsr;
893
894 return 0;
895 }
896
897 static int kvm_get_xsave(CPUState *env)
898 {
899 #ifdef KVM_CAP_XSAVE
900 struct kvm_xsave* xsave;
901 int ret, i;
902 uint16_t cwd, swd, twd, fop;
903
904 if (!kvm_has_xsave())
905 return kvm_get_fpu(env);
906
907 xsave = qemu_memalign(4096, sizeof(struct kvm_xsave));
908 ret = kvm_vcpu_ioctl(env, KVM_GET_XSAVE, xsave);
909 if (ret < 0) {
910 qemu_free(xsave);
911 return ret;
912 }
913
914 cwd = (uint16_t)xsave->region[0];
915 swd = (uint16_t)(xsave->region[0] >> 16);
916 twd = (uint16_t)xsave->region[1];
917 fop = (uint16_t)(xsave->region[1] >> 16);
918 env->fpstt = (swd >> 11) & 7;
919 env->fpus = swd;
920 env->fpuc = cwd;
921 for (i = 0; i < 8; ++i)
922 env->fptags[i] = !((twd >> i) & 1);
923 env->mxcsr = xsave->region[XSAVE_MXCSR];
924 memcpy(env->fpregs, &xsave->region[XSAVE_ST_SPACE],
925 sizeof env->fpregs);
926 memcpy(env->xmm_regs, &xsave->region[XSAVE_XMM_SPACE],
927 sizeof env->xmm_regs);
928 env->xstate_bv = *(uint64_t *)&xsave->region[XSAVE_XSTATE_BV];
929 memcpy(env->ymmh_regs, &xsave->region[XSAVE_YMMH_SPACE],
930 sizeof env->ymmh_regs);
931 qemu_free(xsave);
932 return 0;
933 #else
934 return kvm_get_fpu(env);
935 #endif
936 }
937
938 static int kvm_get_xcrs(CPUState *env)
939 {
940 #ifdef KVM_CAP_XCRS
941 int i, ret;
942 struct kvm_xcrs xcrs;
943
944 if (!kvm_has_xcrs())
945 return 0;
946
947 ret = kvm_vcpu_ioctl(env, KVM_GET_XCRS, &xcrs);
948 if (ret < 0)
949 return ret;
950
951 for (i = 0; i < xcrs.nr_xcrs; i++)
952 /* Only support xcr0 now */
953 if (xcrs.xcrs[0].xcr == 0) {
954 env->xcr0 = xcrs.xcrs[0].value;
955 break;
956 }
957 return 0;
958 #else
959 return 0;
960 #endif
961 }
962
963 static int kvm_get_sregs(CPUState *env)
964 {
965 struct kvm_sregs sregs;
966 uint32_t hflags;
967 int bit, i, ret;
968
969 ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
970 if (ret < 0)
971 return ret;
972
973 /* There can only be one pending IRQ set in the bitmap at a time, so try
974 to find it and save its number instead (-1 for none). */
975 env->interrupt_injected = -1;
976 for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
977 if (sregs.interrupt_bitmap[i]) {
978 bit = ctz64(sregs.interrupt_bitmap[i]);
979 env->interrupt_injected = i * 64 + bit;
980 break;
981 }
982 }
983
984 get_seg(&env->segs[R_CS], &sregs.cs);
985 get_seg(&env->segs[R_DS], &sregs.ds);
986 get_seg(&env->segs[R_ES], &sregs.es);
987 get_seg(&env->segs[R_FS], &sregs.fs);
988 get_seg(&env->segs[R_GS], &sregs.gs);
989 get_seg(&env->segs[R_SS], &sregs.ss);
990
991 get_seg(&env->tr, &sregs.tr);
992 get_seg(&env->ldt, &sregs.ldt);
993
994 env->idt.limit = sregs.idt.limit;
995 env->idt.base = sregs.idt.base;
996 env->gdt.limit = sregs.gdt.limit;
997 env->gdt.base = sregs.gdt.base;
998
999 env->cr[0] = sregs.cr0;
1000 env->cr[2] = sregs.cr2;
1001 env->cr[3] = sregs.cr3;
1002 env->cr[4] = sregs.cr4;
1003
1004 cpu_set_apic_base(env->apic_state, sregs.apic_base);
1005
1006 env->efer = sregs.efer;
1007 //cpu_set_apic_tpr(env->apic_state, sregs.cr8);
1008
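    /* Recompute env->hflags from the segment descriptors, control registers
     * and EFER that were just read back; only the bits selected by
     * HFLAG_COPY_MASK keep their previous value. */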
1009 #define HFLAG_COPY_MASK ~( \
1010 HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
1011 HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
1012 HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
1013 HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
1014
1015
1016
1017 hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
1018 hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
1019 hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
1020 (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
1021 hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
1022 hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
1023 (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);
1024
1025 if (env->efer & MSR_EFER_LMA) {
1026 hflags |= HF_LMA_MASK;
1027 }
1028
1029 if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
1030 hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
1031 } else {
1032 hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
1033 (DESC_B_SHIFT - HF_CS32_SHIFT);
1034 hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
1035 (DESC_B_SHIFT - HF_SS32_SHIFT);
1036 if (!(env->cr[0] & CR0_PE_MASK) ||
1037 (env->eflags & VM_MASK) ||
1038 !(hflags & HF_CS32_MASK)) {
1039 hflags |= HF_ADDSEG_MASK;
1040 } else {
1041 hflags |= ((env->segs[R_DS].base |
1042 env->segs[R_ES].base |
1043 env->segs[R_SS].base) != 0) <<
1044 HF_ADDSEG_SHIFT;
1045 }
1046 }
1047 env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
1048
1049 return 0;
1050 }
1051
1052 static int kvm_get_msrs(CPUState *env)
1053 {
1054 struct {
1055 struct kvm_msrs info;
1056 struct kvm_msr_entry entries[100];
1057 } msr_data;
1058 struct kvm_msr_entry *msrs = msr_data.entries;
1059 int ret, i, n;
1060
1061 n = 0;
1062 msrs[n++].index = MSR_IA32_SYSENTER_CS;
1063 msrs[n++].index = MSR_IA32_SYSENTER_ESP;
1064 msrs[n++].index = MSR_IA32_SYSENTER_EIP;
1065 if (kvm_has_msr_star(env))
1066 msrs[n++].index = MSR_STAR;
1067 if (kvm_has_msr_hsave_pa(env))
1068 msrs[n++].index = MSR_VM_HSAVE_PA;
1069 msrs[n++].index = MSR_IA32_TSC;
1070 #ifdef TARGET_X86_64
1071 if (lm_capable_kernel) {
1072 msrs[n++].index = MSR_CSTAR;
1073 msrs[n++].index = MSR_KERNELGSBASE;
1074 msrs[n++].index = MSR_FMASK;
1075 msrs[n++].index = MSR_LSTAR;
1076 }
1077 #endif
1078 msrs[n++].index = MSR_KVM_SYSTEM_TIME;
1079 msrs[n++].index = MSR_KVM_WALL_CLOCK;
1080 #ifdef KVM_CAP_ASYNC_PF
1081 msrs[n++].index = MSR_KVM_ASYNC_PF_EN;
1082 #endif
1083
1084 #ifdef KVM_CAP_MCE
1085 if (env->mcg_cap) {
1086 msrs[n++].index = MSR_MCG_STATUS;
1087 msrs[n++].index = MSR_MCG_CTL;
1088 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++)
1089 msrs[n++].index = MSR_MC0_CTL + i;
1090 }
1091 #endif
1092
1093 msr_data.info.nmsrs = n;
1094 ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
1095 if (ret < 0)
1096 return ret;
1097
1098 for (i = 0; i < ret; i++) {
1099 switch (msrs[i].index) {
1100 case MSR_IA32_SYSENTER_CS:
1101 env->sysenter_cs = msrs[i].data;
1102 break;
1103 case MSR_IA32_SYSENTER_ESP:
1104 env->sysenter_esp = msrs[i].data;
1105 break;
1106 case MSR_IA32_SYSENTER_EIP:
1107 env->sysenter_eip = msrs[i].data;
1108 break;
1109 case MSR_STAR:
1110 env->star = msrs[i].data;
1111 break;
1112 #ifdef TARGET_X86_64
1113 case MSR_CSTAR:
1114 env->cstar = msrs[i].data;
1115 break;
1116 case MSR_KERNELGSBASE:
1117 env->kernelgsbase = msrs[i].data;
1118 break;
1119 case MSR_FMASK:
1120 env->fmask = msrs[i].data;
1121 break;
1122 case MSR_LSTAR:
1123 env->lstar = msrs[i].data;
1124 break;
1125 #endif
1126 case MSR_IA32_TSC:
1127 env->tsc = msrs[i].data;
1128 break;
1129 case MSR_VM_HSAVE_PA:
1130 env->vm_hsave = msrs[i].data;
1131 break;
1132 case MSR_KVM_SYSTEM_TIME:
1133 env->system_time_msr = msrs[i].data;
1134 break;
1135 case MSR_KVM_WALL_CLOCK:
1136 env->wall_clock_msr = msrs[i].data;
1137 break;
1138 #ifdef KVM_CAP_MCE
1139 case MSR_MCG_STATUS:
1140 env->mcg_status = msrs[i].data;
1141 break;
1142 case MSR_MCG_CTL:
1143 env->mcg_ctl = msrs[i].data;
1144 break;
1145 #endif
1146 default:
1147 #ifdef KVM_CAP_MCE
1148 if (msrs[i].index >= MSR_MC0_CTL &&
1149 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
1150 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
1151 }
1152 #endif
1153 break;
1154 #ifdef KVM_CAP_ASYNC_PF
1155 case MSR_KVM_ASYNC_PF_EN:
1156 env->async_pf_en_msr = msrs[i].data;
1157 break;
1158 #endif
1159 }
1160 }
1161
1162 return 0;
1163 }
1164
1165 static int kvm_put_mp_state(CPUState *env)
1166 {
1167 struct kvm_mp_state mp_state = { .mp_state = env->mp_state };
1168
1169 return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, &mp_state);
1170 }
1171
1172 static int kvm_get_mp_state(CPUState *env)
1173 {
1174 struct kvm_mp_state mp_state;
1175 int ret;
1176
1177 ret = kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, &mp_state);
1178 if (ret < 0) {
1179 return ret;
1180 }
1181 env->mp_state = mp_state.mp_state;
1182 return 0;
1183 }
1184
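/*
 * Transfer the pending exception/interrupt/NMI injection state to the
 * kernel.  For reset and full-state writes the NMI-pending and SIPI-vector
 * fields are also marked valid so the kernel replaces them instead of
 * keeping its own.
 */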
1185 static int kvm_put_vcpu_events(CPUState *env, int level)
1186 {
1187 #ifdef KVM_CAP_VCPU_EVENTS
1188 struct kvm_vcpu_events events;
1189
1190 if (!kvm_has_vcpu_events()) {
1191 return 0;
1192 }
1193
1194 events.exception.injected = (env->exception_injected >= 0);
1195 events.exception.nr = env->exception_injected;
1196 events.exception.has_error_code = env->has_error_code;
1197 events.exception.error_code = env->error_code;
1198
1199 events.interrupt.injected = (env->interrupt_injected >= 0);
1200 events.interrupt.nr = env->interrupt_injected;
1201 events.interrupt.soft = env->soft_interrupt;
1202
1203 events.nmi.injected = env->nmi_injected;
1204 events.nmi.pending = env->nmi_pending;
1205 events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
1206
1207 events.sipi_vector = env->sipi_vector;
1208
1209 events.flags = 0;
1210 if (level >= KVM_PUT_RESET_STATE) {
1211 events.flags |=
1212 KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR;
1213 }
1214
1215 return kvm_vcpu_ioctl(env, KVM_SET_VCPU_EVENTS, &events);
1216 #else
1217 return 0;
1218 #endif
1219 }
1220
1221 static int kvm_get_vcpu_events(CPUState *env)
1222 {
1223 #ifdef KVM_CAP_VCPU_EVENTS
1224 struct kvm_vcpu_events events;
1225 int ret;
1226
1227 if (!kvm_has_vcpu_events()) {
1228 return 0;
1229 }
1230
1231 ret = kvm_vcpu_ioctl(env, KVM_GET_VCPU_EVENTS, &events);
1232 if (ret < 0) {
1233 return ret;
1234 }
1235 env->exception_injected =
1236 events.exception.injected ? events.exception.nr : -1;
1237 env->has_error_code = events.exception.has_error_code;
1238 env->error_code = events.exception.error_code;
1239
1240 env->interrupt_injected =
1241 events.interrupt.injected ? events.interrupt.nr : -1;
1242 env->soft_interrupt = events.interrupt.soft;
1243
1244 env->nmi_injected = events.nmi.injected;
1245 env->nmi_pending = events.nmi.pending;
1246 if (events.nmi.masked) {
1247 env->hflags2 |= HF2_NMI_MASK;
1248 } else {
1249 env->hflags2 &= ~HF2_NMI_MASK;
1250 }
1251
1252 env->sipi_vector = events.sipi_vector;
1253 #endif
1254
1255 return 0;
1256 }
1257
1258 static int kvm_guest_debug_workarounds(CPUState *env)
1259 {
1260 int ret = 0;
1261 #ifdef KVM_CAP_SET_GUEST_DEBUG
1262 unsigned long reinject_trap = 0;
1263
1264 if (!kvm_has_vcpu_events()) {
1265 if (env->exception_injected == 1) {
1266 reinject_trap = KVM_GUESTDBG_INJECT_DB;
1267 } else if (env->exception_injected == 3) {
1268 reinject_trap = KVM_GUESTDBG_INJECT_BP;
1269 }
1270 env->exception_injected = -1;
1271 }
1272
1273 /*
1274 * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
1275 * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
1276 * by updating the debug state once again if single-stepping is on.
1277 * Another reason to call kvm_update_guest_debug here is a pending debug
1278 	 * trap raised by the guest. On kernels without SET_VCPU_EVENTS we have to
1279 * reinject them via SET_GUEST_DEBUG.
1280 */
1281 if (reinject_trap ||
1282 (!kvm_has_robust_singlestep() && env->singlestep_enabled)) {
1283 ret = kvm_update_guest_debug(env, reinject_trap);
1284 }
1285 #endif /* KVM_CAP_SET_GUEST_DEBUG */
1286 return ret;
1287 }
1288
1289 static int kvm_put_debugregs(CPUState *env)
1290 {
1291 #ifdef KVM_CAP_DEBUGREGS
1292 struct kvm_debugregs dbgregs;
1293 int i;
1294
1295 if (!kvm_has_debugregs()) {
1296 return 0;
1297 }
1298
1299 for (i = 0; i < 4; i++) {
1300 dbgregs.db[i] = env->dr[i];
1301 }
1302 dbgregs.dr6 = env->dr[6];
1303 dbgregs.dr7 = env->dr[7];
1304 dbgregs.flags = 0;
1305
1306 return kvm_vcpu_ioctl(env, KVM_SET_DEBUGREGS, &dbgregs);
1307 #else
1308 return 0;
1309 #endif
1310 }
1311
1312 static int kvm_get_debugregs(CPUState *env)
1313 {
1314 #ifdef KVM_CAP_DEBUGREGS
1315 struct kvm_debugregs dbgregs;
1316 int i, ret;
1317
1318 if (!kvm_has_debugregs()) {
1319 return 0;
1320 }
1321
1322 ret = kvm_vcpu_ioctl(env, KVM_GET_DEBUGREGS, &dbgregs);
1323 if (ret < 0) {
1324 return ret;
1325 }
1326 for (i = 0; i < 4; i++) {
1327 env->dr[i] = dbgregs.db[i];
1328 }
1329 env->dr[4] = env->dr[6] = dbgregs.dr6;
1330 env->dr[5] = env->dr[7] = dbgregs.dr7;
1331 #endif
1332
1333 return 0;
1334 }
1335
1336 int kvm_arch_put_registers(CPUState *env, int level)
1337 {
1338 int ret;
1339
1340 assert(cpu_is_stopped(env) || qemu_cpu_self(env));
1341
1342 ret = kvm_getput_regs(env, 1);
1343 if (ret < 0)
1344 return ret;
1345
1346 ret = kvm_put_xsave(env);
1347 if (ret < 0)
1348 return ret;
1349
1350 ret = kvm_put_xcrs(env);
1351 if (ret < 0)
1352 return ret;
1353
1354 ret = kvm_put_sregs(env);
1355 if (ret < 0)
1356 return ret;
1357
1358 ret = kvm_put_msrs(env, level);
1359 if (ret < 0)
1360 return ret;
1361
1362 if (level >= KVM_PUT_RESET_STATE) {
1363 ret = kvm_put_mp_state(env);
1364 if (ret < 0)
1365 return ret;
1366 }
1367
1368 ret = kvm_put_vcpu_events(env, level);
1369 if (ret < 0)
1370 return ret;
1371
1372 /* must be last */
1373 ret = kvm_guest_debug_workarounds(env);
1374 if (ret < 0)
1375 return ret;
1376
1377 ret = kvm_put_debugregs(env);
1378 if (ret < 0)
1379 return ret;
1380
1381 return 0;
1382 }
1383
1384 int kvm_arch_get_registers(CPUState *env)
1385 {
1386 int ret;
1387
1388 assert(cpu_is_stopped(env) || qemu_cpu_self(env));
1389
1390 ret = kvm_getput_regs(env, 0);
1391 if (ret < 0)
1392 return ret;
1393
1394 ret = kvm_get_xsave(env);
1395 if (ret < 0)
1396 return ret;
1397
1398 ret = kvm_get_xcrs(env);
1399 if (ret < 0)
1400 return ret;
1401
1402 ret = kvm_get_sregs(env);
1403 if (ret < 0)
1404 return ret;
1405
1406 ret = kvm_get_msrs(env);
1407 if (ret < 0)
1408 return ret;
1409
1410 ret = kvm_get_mp_state(env);
1411 if (ret < 0)
1412 return ret;
1413
1414 ret = kvm_get_vcpu_events(env);
1415 if (ret < 0)
1416 return ret;
1417
1418 ret = kvm_get_debugregs(env);
1419 if (ret < 0)
1420 return ret;
1421
1422 return 0;
1423 }
1424
1425 int kvm_arch_pre_run(CPUState *env, struct kvm_run *run)
1426 {
1427 /* Inject NMI */
1428 if (env->interrupt_request & CPU_INTERRUPT_NMI) {
1429 env->interrupt_request &= ~CPU_INTERRUPT_NMI;
1430 DPRINTF("injected NMI\n");
1431 kvm_vcpu_ioctl(env, KVM_NMI);
1432 }
1433
1434 /* Try to inject an interrupt if the guest can accept it */
1435 if (run->ready_for_interrupt_injection &&
1436 (env->interrupt_request & CPU_INTERRUPT_HARD) &&
1437 (env->eflags & IF_MASK)) {
1438 int irq;
1439
1440 env->interrupt_request &= ~CPU_INTERRUPT_HARD;
1441 irq = cpu_get_pic_interrupt(env);
1442 if (irq >= 0) {
1443 struct kvm_interrupt intr;
1444 intr.irq = irq;
1445 /* FIXME: errors */
1446 DPRINTF("injected interrupt %d\n", irq);
1447 kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
1448 }
1449 }
1450
1451 /* If we have an interrupt but the guest is not ready to receive an
1452 * interrupt, request an interrupt window exit. This will
1453 * cause a return to userspace as soon as the guest is ready to
1454 * receive interrupts. */
1455 if ((env->interrupt_request & CPU_INTERRUPT_HARD))
1456 run->request_interrupt_window = 1;
1457 else
1458 run->request_interrupt_window = 0;
1459
1460 DPRINTF("setting tpr\n");
1461 run->cr8 = cpu_get_apic_tpr(env->apic_state);
1462
1463 return 0;
1464 }
1465
1466 int kvm_arch_post_run(CPUState *env, struct kvm_run *run)
1467 {
1468 if (run->if_flag)
1469 env->eflags |= IF_MASK;
1470 else
1471 env->eflags &= ~IF_MASK;
1472
1473 cpu_set_apic_tpr(env->apic_state, run->cr8);
1474 cpu_set_apic_base(env->apic_state, run->apic_base);
1475
1476 return 0;
1477 }
1478
1479 int kvm_arch_process_irqchip_events(CPUState *env)
1480 {
1481 if (env->interrupt_request & CPU_INTERRUPT_INIT) {
1482 kvm_cpu_synchronize_state(env);
1483 do_cpu_init(env);
1484 env->exception_index = EXCP_HALTED;
1485 }
1486
1487 if (env->interrupt_request & CPU_INTERRUPT_SIPI) {
1488 kvm_cpu_synchronize_state(env);
1489 do_cpu_sipi(env);
1490 }
1491
1492 return env->halted;
1493 }
1494
1495 static int kvm_handle_halt(CPUState *env)
1496 {
1497 if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
1498 (env->eflags & IF_MASK)) &&
1499 !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
1500 env->halted = 1;
1501 env->exception_index = EXCP_HLT;
1502 return 0;
1503 }
1504
1505 return 1;
1506 }
1507
1508 int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run)
1509 {
1510 int ret = 0;
1511
1512 switch (run->exit_reason) {
1513 case KVM_EXIT_HLT:
1514 DPRINTF("handle_hlt\n");
1515 ret = kvm_handle_halt(env);
1516 break;
1517 }
1518
1519 return ret;
1520 }
1521
1522 #ifdef KVM_CAP_SET_GUEST_DEBUG
1523 int kvm_arch_insert_sw_breakpoint(CPUState *env, struct kvm_sw_breakpoint *bp)
1524 {
1525 static const uint8_t int3 = 0xcc;
1526
1527 if (cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
1528 cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&int3, 1, 1))
1529 return -EINVAL;
1530 return 0;
1531 }
1532
1533 int kvm_arch_remove_sw_breakpoint(CPUState *env, struct kvm_sw_breakpoint *bp)
1534 {
1535 uint8_t int3;
1536
1537 if (cpu_memory_rw_debug(env, bp->pc, &int3, 1, 0) || int3 != 0xcc ||
1538 cpu_memory_rw_debug(env, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1))
1539 return -EINVAL;
1540 return 0;
1541 }
1542
1543 static struct {
1544 target_ulong addr;
1545 int len;
1546 int type;
1547 } hw_breakpoint[4];
1548
1549 static int nb_hw_breakpoint;
1550
1551 static int find_hw_breakpoint(target_ulong addr, int len, int type)
1552 {
1553 int n;
1554
1555 for (n = 0; n < nb_hw_breakpoint; n++)
1556 if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
1557 (hw_breakpoint[n].len == len || len == -1))
1558 return n;
1559 return -1;
1560 }
1561
1562 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
1563 target_ulong len, int type)
1564 {
1565 switch (type) {
1566 case GDB_BREAKPOINT_HW:
1567 len = 1;
1568 break;
1569 case GDB_WATCHPOINT_WRITE:
1570 case GDB_WATCHPOINT_ACCESS:
1571 switch (len) {
1572 case 1:
1573 break;
1574 case 2:
1575 case 4:
1576 case 8:
1577 if (addr & (len - 1))
1578 return -EINVAL;
1579 break;
1580 default:
1581 return -EINVAL;
1582 }
1583 break;
1584 default:
1585 return -ENOSYS;
1586 }
1587
1588 if (nb_hw_breakpoint == 4)
1589 return -ENOBUFS;
1590
1591 if (find_hw_breakpoint(addr, len, type) >= 0)
1592 return -EEXIST;
1593
1594 hw_breakpoint[nb_hw_breakpoint].addr = addr;
1595 hw_breakpoint[nb_hw_breakpoint].len = len;
1596 hw_breakpoint[nb_hw_breakpoint].type = type;
1597 nb_hw_breakpoint++;
1598
1599 return 0;
1600 }
1601
1602 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
1603 target_ulong len, int type)
1604 {
1605 int n;
1606
1607 n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
1608 if (n < 0)
1609 return -ENOENT;
1610
1611 nb_hw_breakpoint--;
1612 hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
1613
1614 return 0;
1615 }
1616
1617 void kvm_arch_remove_all_hw_breakpoints(void)
1618 {
1619 nb_hw_breakpoint = 0;
1620 }
1621
1622 static CPUWatchpoint hw_watchpoint;
1623
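/*
 * Handle a #DB exit.  DR6 bit 14 (BS) indicates a single-step trap, which we
 * consume only if single-stepping was requested; DR6 bits 0-3 identify which
 * hardware breakpoint fired, with the matching DR7 type field telling us
 * whether it was an execution breakpoint or a write/access watchpoint.
 * Anything we did not cause is re-injected into the guest.
 */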
1624 int kvm_arch_debug(struct kvm_debug_exit_arch *arch_info)
1625 {
1626 int handle = 0;
1627 int n;
1628
1629 if (arch_info->exception == 1) {
1630 if (arch_info->dr6 & (1 << 14)) {
1631 if (cpu_single_env->singlestep_enabled)
1632 handle = 1;
1633 } else {
1634 for (n = 0; n < 4; n++)
1635 if (arch_info->dr6 & (1 << n))
1636 switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
1637 case 0x0:
1638 handle = 1;
1639 break;
1640 case 0x1:
1641 handle = 1;
1642 cpu_single_env->watchpoint_hit = &hw_watchpoint;
1643 hw_watchpoint.vaddr = hw_breakpoint[n].addr;
1644 hw_watchpoint.flags = BP_MEM_WRITE;
1645 break;
1646 case 0x3:
1647 handle = 1;
1648 cpu_single_env->watchpoint_hit = &hw_watchpoint;
1649 hw_watchpoint.vaddr = hw_breakpoint[n].addr;
1650 hw_watchpoint.flags = BP_MEM_ACCESS;
1651 break;
1652 }
1653 }
1654 } else if (kvm_find_sw_breakpoint(cpu_single_env, arch_info->pc))
1655 handle = 1;
1656
1657 if (!handle) {
1658 cpu_synchronize_state(cpu_single_env);
1659 assert(cpu_single_env->exception_injected == -1);
1660
1661 cpu_single_env->exception_injected = arch_info->exception;
1662 cpu_single_env->has_error_code = 0;
1663 }
1664
1665 return handle;
1666 }
1667
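/*
 * Translate QEMU's breakpoint/watchpoint list into the KVM guest-debug
 * interface: DR0-DR3 hold the addresses, and DR7 gets the global-enable bit
 * for each slot (2 << (n * 2)) plus the type and length fields at bits
 * 16 + n*4 and 18 + n*4.  The 0x0600 seed sets the GE bit and the
 * architecturally reserved always-one bit of DR7.
 */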
1668 void kvm_arch_update_guest_debug(CPUState *env, struct kvm_guest_debug *dbg)
1669 {
1670 const uint8_t type_code[] = {
1671 [GDB_BREAKPOINT_HW] = 0x0,
1672 [GDB_WATCHPOINT_WRITE] = 0x1,
1673 [GDB_WATCHPOINT_ACCESS] = 0x3
1674 };
1675 const uint8_t len_code[] = {
1676 [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
1677 };
1678 int n;
1679
1680 if (kvm_sw_breakpoints_active(env))
1681 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
1682
1683 if (nb_hw_breakpoint > 0) {
1684 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
1685 dbg->arch.debugreg[7] = 0x0600;
1686 for (n = 0; n < nb_hw_breakpoint; n++) {
1687 dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
1688 dbg->arch.debugreg[7] |= (2 << (n * 2)) |
1689 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
1690 (len_code[hw_breakpoint[n].len] << (18 + n*4));
1691 }
1692 }
1693 /* Legal xcr0 for loading */
1694 env->xcr0 = 1;
1695 }
1696 #endif /* KVM_CAP_SET_GUEST_DEBUG */
1697
1698 bool kvm_arch_stop_on_emulation_error(CPUState *env)
1699 {
1700 return !(env->cr[0] & CR0_PE_MASK) ||
1701 ((env->segs[R_CS].selector & 3) != 3);
1702 }
1703
1704 static void hardware_memory_error(void)
1705 {
1706 fprintf(stderr, "Hardware memory error!\n");
1707 exit(1);
1708 }
1709
1710 #ifdef KVM_CAP_MCE
1711 static void kvm_mce_broadcast_rest(CPUState *env)
1712 {
1713 CPUState *cenv;
1714 int family, model, cpuver = env->cpuid_version;
1715
1716 family = (cpuver >> 8) & 0xf;
1717 model = ((cpuver >> 12) & 0xf0) + ((cpuver >> 4) & 0xf);
1718
1719 	/* Broadcast the MCA signal for family 06H, model 0EH (and newer) processors */
1720 if ((family == 6 && model >= 14) || family > 6) {
1721 for (cenv = first_cpu; cenv != NULL; cenv = cenv->next_cpu) {
1722 if (cenv == env) {
1723 continue;
1724 }
1725 kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
1726 MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0,
1727 ABORT_ON_ERROR);
1728 }
1729 }
1730 }
1731 #endif
1732
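/*
 * SIGBUS delivered on a VCPU thread: when the guest supports software error
 * recovery (MCG_SER_P), convert a host BUS_MCEERR_AR/AO into a simulated
 * Intel SRAR/SRAO machine check on bank 9, translating the faulting host
 * address into a guest physical address first.  Errors in memory that
 * belongs to QEMU itself (no guest mapping) are fatal for AR and ignored
 * for AO.
 */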
1733 int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
1734 {
1735 #if defined(KVM_CAP_MCE)
1736 struct kvm_x86_mce mce = {
1737 .bank = 9,
1738 };
1739 void *vaddr;
1740 ram_addr_t ram_addr;
1741 target_phys_addr_t paddr;
1742 int r;
1743
1744 if ((env->mcg_cap & MCG_SER_P) && addr
1745 && (code == BUS_MCEERR_AR
1746 || code == BUS_MCEERR_AO)) {
1747 if (code == BUS_MCEERR_AR) {
1748 /* Fake an Intel architectural Data Load SRAR UCR */
1749 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1750 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1751 | MCI_STATUS_AR | 0x134;
1752 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1753 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
1754 } else {
1755 /*
1756 	 * If there is an MCE exception being processed, ignore
1757 * this SRAO MCE
1758 */
1759 r = kvm_mce_in_exception(env);
1760 if (r == -1) {
1761 fprintf(stderr, "Failed to get MCE status\n");
1762 } else if (r) {
1763 return 0;
1764 }
1765 /* Fake an Intel architectural Memory scrubbing UCR */
1766 mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1767 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1768 | 0xc0;
1769 mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
1770 mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
1771 }
1772 vaddr = (void *)addr;
1773 if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
1774 !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
1775 fprintf(stderr, "Hardware memory error for memory used by "
1776 "QEMU itself instead of guest system!\n");
1777 /* Hope we are lucky for AO MCE */
1778 if (code == BUS_MCEERR_AO) {
1779 return 0;
1780 } else {
1781 hardware_memory_error();
1782 }
1783 }
1784 mce.addr = paddr;
1785 r = kvm_set_mce(env, &mce);
1786 if (r < 0) {
1787 fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
1788 abort();
1789 }
1790 kvm_mce_broadcast_rest(env);
1791 } else
1792 #endif
1793 {
1794 if (code == BUS_MCEERR_AO) {
1795 return 0;
1796 } else if (code == BUS_MCEERR_AR) {
1797 hardware_memory_error();
1798 } else {
1799 return 1;
1800 }
1801 }
1802 return 0;
1803 }
1804
1805 int kvm_on_sigbus(int code, void *addr)
1806 {
1807 #if defined(KVM_CAP_MCE)
1808 if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
1809 uint64_t status;
1810 void *vaddr;
1811 ram_addr_t ram_addr;
1812 target_phys_addr_t paddr;
1813
1814 /* Hope we are lucky for AO MCE */
1815 vaddr = addr;
1816 if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
1817 !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
1818 fprintf(stderr, "Hardware memory error for memory used by "
1819 "QEMU itself instead of guest system!: %p\n", addr);
1820 return 0;
1821 }
1822 status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
1823 | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
1824 | 0xc0;
1825 kvm_inject_x86_mce(first_cpu, 9, status,
1826 MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
1827 (MCM_ADDR_PHYS << 6) | 0xc, ABORT_ON_ERROR);
1828 kvm_mce_broadcast_rest(first_cpu);
1829 } else
1830 #endif
1831 {
1832 if (code == BUS_MCEERR_AO) {
1833 return 0;
1834 } else if (code == BUS_MCEERR_AR) {
1835 hardware_memory_error();
1836 } else {
1837 return 1;
1838 }
1839 }
1840 return 0;
1841 }