2 * QEMU Windows Hypervisor Platform accelerator (WHPX)
4 * Copyright Microsoft Corp. 2017
6 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7 * See the COPYING file in the top-level directory.
11 #include "qemu/osdep.h"
13 #include "exec/address-spaces.h"
14 #include "exec/ioport.h"
15 #include "exec/gdbstub.h"
16 #include "qemu/accel.h"
17 #include "sysemu/whpx.h"
18 #include "sysemu/cpus.h"
19 #include "sysemu/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/i386/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
34 #include <WinHvPlatform.h>
35 #include <WinHvEmulation.h>
37 #define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
39 static const WHV_REGISTER_NAME whpx_register_names
[] = {
41 /* X64 General purpose registers */
61 /* X64 Segment registers */
71 /* X64 Table registers */
75 /* X64 Control Registers */
82 /* X64 Debug Registers */
92 /* X64 Floating Point and Vector Registers */
109 WHvX64RegisterFpMmx0
,
110 WHvX64RegisterFpMmx1
,
111 WHvX64RegisterFpMmx2
,
112 WHvX64RegisterFpMmx3
,
113 WHvX64RegisterFpMmx4
,
114 WHvX64RegisterFpMmx5
,
115 WHvX64RegisterFpMmx6
,
116 WHvX64RegisterFpMmx7
,
117 WHvX64RegisterFpControlStatus
,
118 WHvX64RegisterXmmControlStatus
,
123 WHvX64RegisterKernelGsBase
,
125 WHvX64RegisterApicBase
,
126 /* WHvX64RegisterPat, */
127 WHvX64RegisterSysenterCs
,
128 WHvX64RegisterSysenterEip
,
129 WHvX64RegisterSysenterEsp
,
134 WHvX64RegisterSfmask
,
137 /* Interrupt / Event Registers */
139 * WHvRegisterPendingInterruption,
140 * WHvRegisterInterruptState,
141 * WHvRegisterPendingEvent0,
142 * WHvRegisterPendingEvent1
143 * WHvX64RegisterDeliverabilityNotifications,
147 struct whpx_register_set
{
148 WHV_REGISTER_VALUE values
[RTL_NUMBER_OF(whpx_register_names
)];
152 * The current implementation of instruction stepping sets the TF flag
153 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
156 * This approach has a few limitations:
157 * 1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158 * along with the other flags, possibly restoring it later. It would
159 * result in another INT1 when the flags are restored, triggering
160 * a stop in gdb that could be cleared by doing another step.
162 * Stepping over a POPF/LAHF instruction will let it overwrite the
163 * TF flags, ending the stepping mode.
165 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166 * or anything that could result in a page fault) will save the flags
167 * to the stack, clear the TF flag, and let the guest execute the
168 * handler. Normally, the guest will restore the original flags,
169 * that will continue single-stepping.
171 * 3. Debuggers running on the guest may wish to set TF to do instruction
172 * stepping. INT1 events generated by it would be intercepted by us,
173 * as long as the gdb is connected to QEMU.
175 * In practice this means that:
176 * 1. Stepping through flags-modifying instructions may cause gdb to
177 * continue or stop in unexpected places. This will be fully recoverable
178 * and will not crash the target.
180 * 2. Stepping over an instruction that triggers an exception will step
181 * over the exception handler, not into it.
183 * 3. Debugging the guest via gdb, while running debugger on the guest
184 * at the same time may lead to unexpected effects. Removing all
185 * breakpoints set via QEMU will prevent any further interference
186 * with the guest-level debuggers.
188 * The limitations can be addressed as shown below:
189 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190 * stepping through them. The exact semantics of the instructions is
191 * defined in the "Combined Volume Set of Intel 64 and IA-32
192 * Architectures Software Developer's Manuals", however it involves a
193 * fair amount of corner cases due to compatibility with real mode,
194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes.
196 * 2. We could step into the guest's exception handlers using the following
198 * a. Temporarily enable catching of all exception types via
199 * whpx_set_exception_exit_bitmap().
200 * b. Once an exception is intercepted, read the IDT/GDT and locate
201 * the original handler.
202 * c. Patch the original handler, injecting an INT3 at the beginning.
203 * d. Update the exception exit bitmap to only catch the
204 * WHvX64ExceptionTypeBreakpointTrap exception.
205 * e. Let the affected CPU run in the exclusive mode.
206 * f. Restore the original handler and the exception exit bitmap.
207 * Note that handling all corner cases related to IDT/GDT is harder
208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
211 * 3. In order to properly support guest-level debugging in parallel with
212 * the QEMU-level debugging, we would need to be able to pass some INT1
213 * events to the guest. This could be done via the following methods:
214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215 * it seems to only work for interrupts and not software
217 * b. Locating and patching the original handler by parsing IDT/GDT.
218 * This involves relatively complex logic outlined in the previous
220 * c. Emulating the exception invocation (i.e. manually updating RIP,
221 * RFLAGS, and pushing the old values to stack). This is even more
222 * complicated than the previous option, since it involves checking
223 * CPL, gate attributes, and doing various adjustments depending
224 * on the current CPU mode, whether the CPL is changing, etc.
226 typedef enum WhpxStepMode
{
228 /* Halt other VCPUs */
233 WHV_EMULATOR_HANDLE emulator
;
234 bool window_registered
;
236 bool ready_for_pic_interrupt
;
239 bool interruption_pending
;
241 /* Must be the last field as it may have a tail */
242 WHV_RUN_VP_EXIT_CONTEXT exit_ctx
;
245 static bool whpx_allowed
;
246 static bool whp_dispatch_initialized
;
247 static HMODULE hWinHvPlatform
, hWinHvEmulation
;
248 static uint32_t max_vcpu_index
;
249 static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap
;
251 struct whpx_state whpx_global
;
252 struct WHPDispatch whp_dispatch
;
254 static bool whpx_has_xsave(void)
256 return whpx_xsave_cap
.XsaveSupport
;
263 static struct whpx_vcpu
*get_whpx_vcpu(CPUState
*cpu
)
265 return (struct whpx_vcpu
*)cpu
->hax_vcpu
;
268 static WHV_X64_SEGMENT_REGISTER
whpx_seg_q2h(const SegmentCache
*qs
, int v86
,
271 WHV_X64_SEGMENT_REGISTER hs
;
272 unsigned flags
= qs
->flags
;
275 hs
.Limit
= qs
->limit
;
276 hs
.Selector
= qs
->selector
;
282 hs
.DescriptorPrivilegeLevel
= 3;
283 hs
.NonSystemSegment
= 1;
286 hs
.Attributes
= (flags
>> DESC_TYPE_SHIFT
);
289 /* hs.Base &= 0xfffff; */
296 static SegmentCache
whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER
*hs
)
301 qs
.limit
= hs
->Limit
;
302 qs
.selector
= hs
->Selector
;
304 qs
.flags
= ((uint32_t)hs
->Attributes
) << DESC_TYPE_SHIFT
;
309 /* X64 Extended Control Registers */
310 static void whpx_set_xcrs(CPUState
*cpu
)
312 CPUX86State
*env
= cpu
->env_ptr
;
314 struct whpx_state
*whpx
= &whpx_global
;
315 WHV_REGISTER_VALUE xcr0
;
316 WHV_REGISTER_NAME xcr0_name
= WHvX64RegisterXCr0
;
318 if (!whpx_has_xsave()) {
322 /* Only xcr0 is supported by the hypervisor currently */
323 xcr0
.Reg64
= env
->xcr0
;
324 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
325 whpx
->partition
, cpu
->cpu_index
, &xcr0_name
, 1, &xcr0
);
327 error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr
);
331 static int whpx_set_tsc(CPUState
*cpu
)
333 CPUX86State
*env
= cpu
->env_ptr
;
334 WHV_REGISTER_NAME tsc_reg
= WHvX64RegisterTsc
;
335 WHV_REGISTER_VALUE tsc_val
;
337 struct whpx_state
*whpx
= &whpx_global
;
340 * Suspend the partition prior to setting the TSC to reduce the variance
341 * in TSC across vCPUs. When the first vCPU runs post suspend, the
342 * partition is automatically resumed.
344 if (whp_dispatch
.WHvSuspendPartitionTime
) {
347 * Unable to suspend partition while setting TSC is not a fatal
348 * error. It just increases the likelihood of TSC variance between
349 * vCPUs and some guest OS are able to handle that just fine.
351 hr
= whp_dispatch
.WHvSuspendPartitionTime(whpx
->partition
);
353 warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr
);
357 tsc_val
.Reg64
= env
->tsc
;
358 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
359 whpx
->partition
, cpu
->cpu_index
, &tsc_reg
, 1, &tsc_val
);
361 error_report("WHPX: Failed to set TSC, hr=%08lx", hr
);
369 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
370 * however, they use a slightly different encoding. Specifically:
372 * APIC.TPR[bits 7:4] = CR8[bits 3:0]
374 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
375 * and IA-32 Architectures Software Developer's Manual.
377 * The functions below translate the value of CR8 to TPR and vice versa.
/*
 * Convert an APIC TPR value to its CR8 encoding.
 *
 * Per the mapping documented above (Intel SDM Vol. 3, 10.8.6.1):
 *   APIC.TPR[bits 7:4] = CR8[bits 3:0]
 * so CR8 is simply the TPR shifted right by four bits.
 *
 * NOTE(review): body reconstructed from the documented bit mapping; the
 * original statement was lost in extraction — confirm against upstream.
 */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}
/*
 * Convert a CR8 value to its APIC TPR encoding.
 *
 * Inverse of whpx_apic_tpr_to_cr8(): since APIC.TPR[7:4] = CR8[3:0]
 * (Intel SDM Vol. 3, 10.8.6.1), the TPR is CR8 shifted left by four.
 *
 * NOTE(review): body reconstructed from the documented bit mapping; the
 * original statement was lost in extraction — confirm against upstream.
 */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}
390 static void whpx_set_registers(CPUState
*cpu
, int level
)
392 struct whpx_state
*whpx
= &whpx_global
;
393 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
394 CPUX86State
*env
= cpu
->env_ptr
;
395 X86CPU
*x86_cpu
= X86_CPU(cpu
);
396 struct whpx_register_set vcxt
;
403 assert(cpu_is_stopped(cpu
) || qemu_cpu_is_self(cpu
));
406 * Following MSRs have side effects on the guest or are too heavy for
407 * runtime. Limit them to full state update.
409 if (level
>= WHPX_SET_RESET_STATE
) {
413 memset(&vcxt
, 0, sizeof(struct whpx_register_set
));
415 v86
= (env
->eflags
& VM_MASK
);
416 r86
= !(env
->cr
[0] & CR0_PE_MASK
);
418 vcpu
->tpr
= whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu
->apic_state
));
419 vcpu
->apic_base
= cpu_get_apic_base(x86_cpu
->apic_state
);
423 /* Indexes for first 16 registers match between HV and QEMU definitions */
425 for (idx
= 0; idx
< CPU_NB_REGS
; idx
+= 1) {
426 vcxt
.values
[idx
].Reg64
= (uint64_t)env
->regs
[idx
];
430 /* Same goes for RIP and RFLAGS */
431 assert(whpx_register_names
[idx
] == WHvX64RegisterRip
);
432 vcxt
.values
[idx
++].Reg64
= env
->eip
;
434 assert(whpx_register_names
[idx
] == WHvX64RegisterRflags
);
435 vcxt
.values
[idx
++].Reg64
= env
->eflags
;
437 /* Translate 6+4 segment registers. HV and QEMU order matches */
438 assert(idx
== WHvX64RegisterEs
);
439 for (i
= 0; i
< 6; i
+= 1, idx
+= 1) {
440 vcxt
.values
[idx
].Segment
= whpx_seg_q2h(&env
->segs
[i
], v86
, r86
);
443 assert(idx
== WHvX64RegisterLdtr
);
444 vcxt
.values
[idx
++].Segment
= whpx_seg_q2h(&env
->ldt
, 0, 0);
446 assert(idx
== WHvX64RegisterTr
);
447 vcxt
.values
[idx
++].Segment
= whpx_seg_q2h(&env
->tr
, 0, 0);
449 assert(idx
== WHvX64RegisterIdtr
);
450 vcxt
.values
[idx
].Table
.Base
= env
->idt
.base
;
451 vcxt
.values
[idx
].Table
.Limit
= env
->idt
.limit
;
454 assert(idx
== WHvX64RegisterGdtr
);
455 vcxt
.values
[idx
].Table
.Base
= env
->gdt
.base
;
456 vcxt
.values
[idx
].Table
.Limit
= env
->gdt
.limit
;
459 /* CR0, 2, 3, 4, 8 */
460 assert(whpx_register_names
[idx
] == WHvX64RegisterCr0
);
461 vcxt
.values
[idx
++].Reg64
= env
->cr
[0];
462 assert(whpx_register_names
[idx
] == WHvX64RegisterCr2
);
463 vcxt
.values
[idx
++].Reg64
= env
->cr
[2];
464 assert(whpx_register_names
[idx
] == WHvX64RegisterCr3
);
465 vcxt
.values
[idx
++].Reg64
= env
->cr
[3];
466 assert(whpx_register_names
[idx
] == WHvX64RegisterCr4
);
467 vcxt
.values
[idx
++].Reg64
= env
->cr
[4];
468 assert(whpx_register_names
[idx
] == WHvX64RegisterCr8
);
469 vcxt
.values
[idx
++].Reg64
= vcpu
->tpr
;
471 /* 8 Debug Registers - Skipped */
474 * Extended control registers needs to be handled separately depending
475 * on whether xsave is supported/enabled or not.
479 /* 16 XMM registers */
480 assert(whpx_register_names
[idx
] == WHvX64RegisterXmm0
);
482 for (i
= 0; i
< sizeof(env
->xmm_regs
) / sizeof(ZMMReg
); i
+= 1, idx
+= 1) {
483 vcxt
.values
[idx
].Reg128
.Low64
= env
->xmm_regs
[i
].ZMM_Q(0);
484 vcxt
.values
[idx
].Reg128
.High64
= env
->xmm_regs
[i
].ZMM_Q(1);
489 assert(whpx_register_names
[idx
] == WHvX64RegisterFpMmx0
);
490 for (i
= 0; i
< 8; i
+= 1, idx
+= 1) {
491 vcxt
.values
[idx
].Fp
.AsUINT128
.Low64
= env
->fpregs
[i
].mmx
.MMX_Q(0);
492 /* vcxt.values[idx].Fp.AsUINT128.High64 =
493 env->fpregs[i].mmx.MMX_Q(1);
497 /* FP control status register */
498 assert(whpx_register_names
[idx
] == WHvX64RegisterFpControlStatus
);
499 vcxt
.values
[idx
].FpControlStatus
.FpControl
= env
->fpuc
;
500 vcxt
.values
[idx
].FpControlStatus
.FpStatus
=
501 (env
->fpus
& ~0x3800) | (env
->fpstt
& 0x7) << 11;
502 vcxt
.values
[idx
].FpControlStatus
.FpTag
= 0;
503 for (i
= 0; i
< 8; ++i
) {
504 vcxt
.values
[idx
].FpControlStatus
.FpTag
|= (!env
->fptags
[i
]) << i
;
506 vcxt
.values
[idx
].FpControlStatus
.Reserved
= 0;
507 vcxt
.values
[idx
].FpControlStatus
.LastFpOp
= env
->fpop
;
508 vcxt
.values
[idx
].FpControlStatus
.LastFpRip
= env
->fpip
;
511 /* XMM control status register */
512 assert(whpx_register_names
[idx
] == WHvX64RegisterXmmControlStatus
);
513 vcxt
.values
[idx
].XmmControlStatus
.LastFpRdp
= 0;
514 vcxt
.values
[idx
].XmmControlStatus
.XmmStatusControl
= env
->mxcsr
;
515 vcxt
.values
[idx
].XmmControlStatus
.XmmStatusControlMask
= 0x0000ffff;
519 assert(whpx_register_names
[idx
] == WHvX64RegisterEfer
);
520 vcxt
.values
[idx
++].Reg64
= env
->efer
;
522 assert(whpx_register_names
[idx
] == WHvX64RegisterKernelGsBase
);
523 vcxt
.values
[idx
++].Reg64
= env
->kernelgsbase
;
526 assert(whpx_register_names
[idx
] == WHvX64RegisterApicBase
);
527 vcxt
.values
[idx
++].Reg64
= vcpu
->apic_base
;
529 /* WHvX64RegisterPat - Skipped */
531 assert(whpx_register_names
[idx
] == WHvX64RegisterSysenterCs
);
532 vcxt
.values
[idx
++].Reg64
= env
->sysenter_cs
;
533 assert(whpx_register_names
[idx
] == WHvX64RegisterSysenterEip
);
534 vcxt
.values
[idx
++].Reg64
= env
->sysenter_eip
;
535 assert(whpx_register_names
[idx
] == WHvX64RegisterSysenterEsp
);
536 vcxt
.values
[idx
++].Reg64
= env
->sysenter_esp
;
537 assert(whpx_register_names
[idx
] == WHvX64RegisterStar
);
538 vcxt
.values
[idx
++].Reg64
= env
->star
;
540 assert(whpx_register_names
[idx
] == WHvX64RegisterLstar
);
541 vcxt
.values
[idx
++].Reg64
= env
->lstar
;
542 assert(whpx_register_names
[idx
] == WHvX64RegisterCstar
);
543 vcxt
.values
[idx
++].Reg64
= env
->cstar
;
544 assert(whpx_register_names
[idx
] == WHvX64RegisterSfmask
);
545 vcxt
.values
[idx
++].Reg64
= env
->fmask
;
548 /* Interrupt / Event Registers - Skipped */
550 assert(idx
== RTL_NUMBER_OF(whpx_register_names
));
552 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
553 whpx
->partition
, cpu
->cpu_index
,
555 RTL_NUMBER_OF(whpx_register_names
),
559 error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
566 static int whpx_get_tsc(CPUState
*cpu
)
568 CPUX86State
*env
= cpu
->env_ptr
;
569 WHV_REGISTER_NAME tsc_reg
= WHvX64RegisterTsc
;
570 WHV_REGISTER_VALUE tsc_val
;
572 struct whpx_state
*whpx
= &whpx_global
;
574 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
575 whpx
->partition
, cpu
->cpu_index
, &tsc_reg
, 1, &tsc_val
);
577 error_report("WHPX: Failed to get TSC, hr=%08lx", hr
);
581 env
->tsc
= tsc_val
.Reg64
;
585 /* X64 Extended Control Registers */
586 static void whpx_get_xcrs(CPUState
*cpu
)
588 CPUX86State
*env
= cpu
->env_ptr
;
590 struct whpx_state
*whpx
= &whpx_global
;
591 WHV_REGISTER_VALUE xcr0
;
592 WHV_REGISTER_NAME xcr0_name
= WHvX64RegisterXCr0
;
594 if (!whpx_has_xsave()) {
598 /* Only xcr0 is supported by the hypervisor currently */
599 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
600 whpx
->partition
, cpu
->cpu_index
, &xcr0_name
, 1, &xcr0
);
602 error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr
);
606 env
->xcr0
= xcr0
.Reg64
;
609 static void whpx_get_registers(CPUState
*cpu
)
611 struct whpx_state
*whpx
= &whpx_global
;
612 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
613 CPUX86State
*env
= cpu
->env_ptr
;
614 X86CPU
*x86_cpu
= X86_CPU(cpu
);
615 struct whpx_register_set vcxt
;
616 uint64_t tpr
, apic_base
;
622 assert(cpu_is_stopped(cpu
) || qemu_cpu_is_self(cpu
));
624 if (!env
->tsc_valid
) {
626 env
->tsc_valid
= !runstate_is_running();
629 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
630 whpx
->partition
, cpu
->cpu_index
,
632 RTL_NUMBER_OF(whpx_register_names
),
635 error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
639 if (whpx_apic_in_platform()) {
641 * Fetch the TPR value from the emulated APIC. It may get overwritten
642 * below with the value from CR8 returned by
643 * WHvGetVirtualProcessorRegisters().
645 whpx_apic_get(x86_cpu
->apic_state
);
646 vcpu
->tpr
= whpx_apic_tpr_to_cr8(
647 cpu_get_apic_tpr(x86_cpu
->apic_state
));
652 /* Indexes for first 16 registers match between HV and QEMU definitions */
654 for (idx
= 0; idx
< CPU_NB_REGS
; idx
+= 1) {
655 env
->regs
[idx
] = vcxt
.values
[idx
].Reg64
;
659 /* Same goes for RIP and RFLAGS */
660 assert(whpx_register_names
[idx
] == WHvX64RegisterRip
);
661 env
->eip
= vcxt
.values
[idx
++].Reg64
;
662 assert(whpx_register_names
[idx
] == WHvX64RegisterRflags
);
663 env
->eflags
= vcxt
.values
[idx
++].Reg64
;
665 /* Translate 6+4 segment registers. HV and QEMU order matches */
666 assert(idx
== WHvX64RegisterEs
);
667 for (i
= 0; i
< 6; i
+= 1, idx
+= 1) {
668 env
->segs
[i
] = whpx_seg_h2q(&vcxt
.values
[idx
].Segment
);
671 assert(idx
== WHvX64RegisterLdtr
);
672 env
->ldt
= whpx_seg_h2q(&vcxt
.values
[idx
++].Segment
);
673 assert(idx
== WHvX64RegisterTr
);
674 env
->tr
= whpx_seg_h2q(&vcxt
.values
[idx
++].Segment
);
675 assert(idx
== WHvX64RegisterIdtr
);
676 env
->idt
.base
= vcxt
.values
[idx
].Table
.Base
;
677 env
->idt
.limit
= vcxt
.values
[idx
].Table
.Limit
;
679 assert(idx
== WHvX64RegisterGdtr
);
680 env
->gdt
.base
= vcxt
.values
[idx
].Table
.Base
;
681 env
->gdt
.limit
= vcxt
.values
[idx
].Table
.Limit
;
684 /* CR0, 2, 3, 4, 8 */
685 assert(whpx_register_names
[idx
] == WHvX64RegisterCr0
);
686 env
->cr
[0] = vcxt
.values
[idx
++].Reg64
;
687 assert(whpx_register_names
[idx
] == WHvX64RegisterCr2
);
688 env
->cr
[2] = vcxt
.values
[idx
++].Reg64
;
689 assert(whpx_register_names
[idx
] == WHvX64RegisterCr3
);
690 env
->cr
[3] = vcxt
.values
[idx
++].Reg64
;
691 assert(whpx_register_names
[idx
] == WHvX64RegisterCr4
);
692 env
->cr
[4] = vcxt
.values
[idx
++].Reg64
;
693 assert(whpx_register_names
[idx
] == WHvX64RegisterCr8
);
694 tpr
= vcxt
.values
[idx
++].Reg64
;
695 if (tpr
!= vcpu
->tpr
) {
697 cpu_set_apic_tpr(x86_cpu
->apic_state
, whpx_cr8_to_apic_tpr(tpr
));
700 /* 8 Debug Registers - Skipped */
703 * Extended control registers needs to be handled separately depending
704 * on whether xsave is supported/enabled or not.
708 /* 16 XMM registers */
709 assert(whpx_register_names
[idx
] == WHvX64RegisterXmm0
);
711 for (i
= 0; i
< sizeof(env
->xmm_regs
) / sizeof(ZMMReg
); i
+= 1, idx
+= 1) {
712 env
->xmm_regs
[i
].ZMM_Q(0) = vcxt
.values
[idx
].Reg128
.Low64
;
713 env
->xmm_regs
[i
].ZMM_Q(1) = vcxt
.values
[idx
].Reg128
.High64
;
718 assert(whpx_register_names
[idx
] == WHvX64RegisterFpMmx0
);
719 for (i
= 0; i
< 8; i
+= 1, idx
+= 1) {
720 env
->fpregs
[i
].mmx
.MMX_Q(0) = vcxt
.values
[idx
].Fp
.AsUINT128
.Low64
;
721 /* env->fpregs[i].mmx.MMX_Q(1) =
722 vcxt.values[idx].Fp.AsUINT128.High64;
726 /* FP control status register */
727 assert(whpx_register_names
[idx
] == WHvX64RegisterFpControlStatus
);
728 env
->fpuc
= vcxt
.values
[idx
].FpControlStatus
.FpControl
;
729 env
->fpstt
= (vcxt
.values
[idx
].FpControlStatus
.FpStatus
>> 11) & 0x7;
730 env
->fpus
= vcxt
.values
[idx
].FpControlStatus
.FpStatus
& ~0x3800;
731 for (i
= 0; i
< 8; ++i
) {
732 env
->fptags
[i
] = !((vcxt
.values
[idx
].FpControlStatus
.FpTag
>> i
) & 1);
734 env
->fpop
= vcxt
.values
[idx
].FpControlStatus
.LastFpOp
;
735 env
->fpip
= vcxt
.values
[idx
].FpControlStatus
.LastFpRip
;
738 /* XMM control status register */
739 assert(whpx_register_names
[idx
] == WHvX64RegisterXmmControlStatus
);
740 env
->mxcsr
= vcxt
.values
[idx
].XmmControlStatus
.XmmStatusControl
;
744 assert(whpx_register_names
[idx
] == WHvX64RegisterEfer
);
745 env
->efer
= vcxt
.values
[idx
++].Reg64
;
747 assert(whpx_register_names
[idx
] == WHvX64RegisterKernelGsBase
);
748 env
->kernelgsbase
= vcxt
.values
[idx
++].Reg64
;
751 assert(whpx_register_names
[idx
] == WHvX64RegisterApicBase
);
752 apic_base
= vcxt
.values
[idx
++].Reg64
;
753 if (apic_base
!= vcpu
->apic_base
) {
754 vcpu
->apic_base
= apic_base
;
755 cpu_set_apic_base(x86_cpu
->apic_state
, vcpu
->apic_base
);
758 /* WHvX64RegisterPat - Skipped */
760 assert(whpx_register_names
[idx
] == WHvX64RegisterSysenterCs
);
761 env
->sysenter_cs
= vcxt
.values
[idx
++].Reg64
;
762 assert(whpx_register_names
[idx
] == WHvX64RegisterSysenterEip
);
763 env
->sysenter_eip
= vcxt
.values
[idx
++].Reg64
;
764 assert(whpx_register_names
[idx
] == WHvX64RegisterSysenterEsp
);
765 env
->sysenter_esp
= vcxt
.values
[idx
++].Reg64
;
766 assert(whpx_register_names
[idx
] == WHvX64RegisterStar
);
767 env
->star
= vcxt
.values
[idx
++].Reg64
;
769 assert(whpx_register_names
[idx
] == WHvX64RegisterLstar
);
770 env
->lstar
= vcxt
.values
[idx
++].Reg64
;
771 assert(whpx_register_names
[idx
] == WHvX64RegisterCstar
);
772 env
->cstar
= vcxt
.values
[idx
++].Reg64
;
773 assert(whpx_register_names
[idx
] == WHvX64RegisterSfmask
);
774 env
->fmask
= vcxt
.values
[idx
++].Reg64
;
777 /* Interrupt / Event Registers - Skipped */
779 assert(idx
== RTL_NUMBER_OF(whpx_register_names
));
781 if (whpx_apic_in_platform()) {
782 whpx_apic_get(x86_cpu
->apic_state
);
785 x86_update_hflags(env
);
790 static HRESULT CALLBACK
whpx_emu_ioport_callback(
792 WHV_EMULATOR_IO_ACCESS_INFO
*IoAccess
)
794 MemTxAttrs attrs
= { 0 };
795 address_space_rw(&address_space_io
, IoAccess
->Port
, attrs
,
796 &IoAccess
->Data
, IoAccess
->AccessSize
,
797 IoAccess
->Direction
);
801 static HRESULT CALLBACK
whpx_emu_mmio_callback(
803 WHV_EMULATOR_MEMORY_ACCESS_INFO
*ma
)
805 cpu_physical_memory_rw(ma
->GpaAddress
, ma
->Data
, ma
->AccessSize
,
810 static HRESULT CALLBACK
whpx_emu_getreg_callback(
812 const WHV_REGISTER_NAME
*RegisterNames
,
813 UINT32 RegisterCount
,
814 WHV_REGISTER_VALUE
*RegisterValues
)
817 struct whpx_state
*whpx
= &whpx_global
;
818 CPUState
*cpu
= (CPUState
*)ctx
;
820 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
821 whpx
->partition
, cpu
->cpu_index
,
822 RegisterNames
, RegisterCount
,
825 error_report("WHPX: Failed to get virtual processor registers,"
832 static HRESULT CALLBACK
whpx_emu_setreg_callback(
834 const WHV_REGISTER_NAME
*RegisterNames
,
835 UINT32 RegisterCount
,
836 const WHV_REGISTER_VALUE
*RegisterValues
)
839 struct whpx_state
*whpx
= &whpx_global
;
840 CPUState
*cpu
= (CPUState
*)ctx
;
842 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
843 whpx
->partition
, cpu
->cpu_index
,
844 RegisterNames
, RegisterCount
,
847 error_report("WHPX: Failed to set virtual processor registers,"
852 * The emulator just successfully wrote the register state. We clear the
853 * dirty state so we avoid the double write on resume of the VP.
855 cpu
->vcpu_dirty
= false;
860 static HRESULT CALLBACK
whpx_emu_translate_callback(
862 WHV_GUEST_VIRTUAL_ADDRESS Gva
,
863 WHV_TRANSLATE_GVA_FLAGS TranslateFlags
,
864 WHV_TRANSLATE_GVA_RESULT_CODE
*TranslationResult
,
865 WHV_GUEST_PHYSICAL_ADDRESS
*Gpa
)
868 struct whpx_state
*whpx
= &whpx_global
;
869 CPUState
*cpu
= (CPUState
*)ctx
;
870 WHV_TRANSLATE_GVA_RESULT res
;
872 hr
= whp_dispatch
.WHvTranslateGva(whpx
->partition
, cpu
->cpu_index
,
873 Gva
, TranslateFlags
, &res
, Gpa
);
875 error_report("WHPX: Failed to translate GVA, hr=%08lx", hr
);
877 *TranslationResult
= res
.ResultCode
;
883 static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks
= {
884 .Size
= sizeof(WHV_EMULATOR_CALLBACKS
),
885 .WHvEmulatorIoPortCallback
= whpx_emu_ioport_callback
,
886 .WHvEmulatorMemoryCallback
= whpx_emu_mmio_callback
,
887 .WHvEmulatorGetVirtualProcessorRegisters
= whpx_emu_getreg_callback
,
888 .WHvEmulatorSetVirtualProcessorRegisters
= whpx_emu_setreg_callback
,
889 .WHvEmulatorTranslateGvaPage
= whpx_emu_translate_callback
,
892 static int whpx_handle_mmio(CPUState
*cpu
, WHV_MEMORY_ACCESS_CONTEXT
*ctx
)
895 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
896 WHV_EMULATOR_STATUS emu_status
;
898 hr
= whp_dispatch
.WHvEmulatorTryMmioEmulation(
900 &vcpu
->exit_ctx
.VpContext
, ctx
,
903 error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr
);
907 if (!emu_status
.EmulationSuccessful
) {
908 error_report("WHPX: Failed to emulate MMIO access with"
909 " EmulatorReturnStatus: %u", emu_status
.AsUINT32
);
916 static int whpx_handle_portio(CPUState
*cpu
,
917 WHV_X64_IO_PORT_ACCESS_CONTEXT
*ctx
)
920 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
921 WHV_EMULATOR_STATUS emu_status
;
923 hr
= whp_dispatch
.WHvEmulatorTryIoEmulation(
925 &vcpu
->exit_ctx
.VpContext
, ctx
,
928 error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr
);
932 if (!emu_status
.EmulationSuccessful
) {
933 error_report("WHPX: Failed to emulate PortIO access with"
934 " EmulatorReturnStatus: %u", emu_status
.AsUINT32
);
942 * Controls whether we should intercept various exceptions on the guest,
943 * namely breakpoint/single-step events.
945 * The 'exceptions' argument accepts a bitmask, e.g:
946 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
948 static HRESULT
whpx_set_exception_exit_bitmap(UINT64 exceptions
)
950 struct whpx_state
*whpx
= &whpx_global
;
951 WHV_PARTITION_PROPERTY prop
= { 0, };
954 if (exceptions
== whpx
->exception_exit_bitmap
) {
958 prop
.ExceptionExitBitmap
= exceptions
;
960 hr
= whp_dispatch
.WHvSetPartitionProperty(
962 WHvPartitionPropertyCodeExceptionExitBitmap
,
964 sizeof(WHV_PARTITION_PROPERTY
));
967 whpx
->exception_exit_bitmap
= exceptions
;
975 * This function is called before/after stepping over a single instruction.
976 * It will update the CPU registers to arm/disarm the instruction stepping
979 static HRESULT
whpx_vcpu_configure_single_stepping(CPUState
*cpu
,
981 uint64_t *exit_context_rflags
)
983 WHV_REGISTER_NAME reg_name
;
984 WHV_REGISTER_VALUE reg_value
;
986 struct whpx_state
*whpx
= &whpx_global
;
989 * If we are trying to step over a single instruction, we need to set the
990 * TF bit in rflags. Otherwise, clear it.
992 reg_name
= WHvX64RegisterRflags
;
993 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
1001 error_report("WHPX: Failed to get rflags, hr=%08lx", hr
);
1005 if (exit_context_rflags
) {
1006 assert(*exit_context_rflags
== reg_value
.Reg64
);
1010 /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
1011 reg_value
.Reg64
|= TF_MASK
;
1013 reg_value
.Reg64
&= ~TF_MASK
;
1016 if (exit_context_rflags
) {
1017 *exit_context_rflags
= reg_value
.Reg64
;
1020 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
1028 error_report("WHPX: Failed to set rflags,"
1034 reg_name
= WHvRegisterInterruptState
;
1035 reg_value
.Reg64
= 0;
1037 /* Suspend delivery of hardware interrupts during single-stepping. */
1038 reg_value
.InterruptState
.InterruptShadow
= set
!= 0;
1040 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
1048 error_report("WHPX: Failed to set InterruptState,"
1056 * We have just finished stepping over a single instruction,
1057 * and intercepted the INT1 generated by it.
1058 * We need to now hide the INT1 from the guest,
1059 * as it would not be expecting it.
1062 reg_name
= WHvX64RegisterPendingDebugException
;
1063 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
1071 error_report("WHPX: Failed to get pending debug exceptions,"
1076 if (reg_value
.PendingDebugException
.SingleStep
) {
1077 reg_value
.PendingDebugException
.SingleStep
= 0;
1079 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
1087 error_report("WHPX: Failed to clear pending debug exceptions,"
1098 /* Tries to find a breakpoint at the specified address. */
1099 static struct whpx_breakpoint
*whpx_lookup_breakpoint_by_addr(uint64_t address
)
1101 struct whpx_state
*whpx
= &whpx_global
;
1104 if (whpx
->breakpoints
.breakpoints
) {
1105 for (i
= 0; i
< whpx
->breakpoints
.breakpoints
->used
; i
++) {
1106 if (address
== whpx
->breakpoints
.breakpoints
->data
[i
].address
) {
1107 return &whpx
->breakpoints
.breakpoints
->data
[i
];
/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;
1124 * The WHPX QEMU backend implements breakpoints by writing the INT1
1125 * instruction into memory (ignoring the DRx registers). This raises a few
1126 * issues that need to be carefully handled:
1128 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1129 * at the same location, and later remove them in arbitrary order.
1130 * This should not cause memory corruption, and should only remove the
1131 * physical breakpoint instruction when the last QEMU breakpoint is gone.
1133 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1134 * physical location. Hence, physically adding/removing a breakpoint can
1135 * theoretically fail at any time. We need to keep track of it.
1137 * The function below rebuilds a list of low-level breakpoints (one per
1138 * address, tracking the original instruction and any errors) from the list of
1139 * high-level breakpoints (set via cpu_breakpoint_insert()).
1141 * In order to optimize performance, this function stores the list of
1142 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1143 * low-level ones, so that it won't be re-invoked until these breakpoints
1146 * Note that this function decides which breakpoints should be inserted into,
1147 * memory, but doesn't actually do it. The memory accessing is done in
1148 * whpx_apply_breakpoints().
1150 static void whpx_translate_cpu_breakpoints(
1151 struct whpx_breakpoints
*breakpoints
,
1153 int cpu_breakpoint_count
)
1156 int cpu_bp_index
= 0;
1158 breakpoints
->original_addresses
=
1159 g_renew(vaddr
, breakpoints
->original_addresses
, cpu_breakpoint_count
);
1161 breakpoints
->original_address_count
= cpu_breakpoint_count
;
1163 int max_breakpoints
= cpu_breakpoint_count
+
1164 (breakpoints
->breakpoints
? breakpoints
->breakpoints
->used
: 0);
1166 struct whpx_breakpoint_collection
*new_breakpoints
=
1167 (struct whpx_breakpoint_collection
*)g_malloc0(
1168 sizeof(struct whpx_breakpoint_collection
) +
1169 max_breakpoints
* sizeof(struct whpx_breakpoint
));
1171 new_breakpoints
->allocated
= max_breakpoints
;
1172 new_breakpoints
->used
= 0;
1175 * 1. Preserve all old breakpoints that could not be automatically
1176 * cleared when the CPU got stopped.
1178 if (breakpoints
->breakpoints
) {
1180 for (i
= 0; i
< breakpoints
->breakpoints
->used
; i
++) {
1181 if (breakpoints
->breakpoints
->data
[i
].state
!= WHPX_BP_CLEARED
) {
1182 new_breakpoints
->data
[new_breakpoints
->used
++] =
1183 breakpoints
->breakpoints
->data
[i
];
1188 /* 2. Map all CPU breakpoints to WHPX breakpoints */
1189 QTAILQ_FOREACH(bp
, &cpu
->breakpoints
, entry
) {
1193 /* This will be used to detect changed CPU breakpoints later. */
1194 breakpoints
->original_addresses
[cpu_bp_index
++] = bp
->pc
;
1196 for (i
= 0; i
< new_breakpoints
->used
; i
++) {
1198 * WARNING: This loop has O(N^2) complexity, where N is the
1199 * number of breakpoints. It should not be a bottleneck in
1200 * real-world scenarios, since it only needs to run once after
1201 * the breakpoints have been modified.
1202 * If this ever becomes a concern, it can be optimized by storing
1203 * high-level breakpoint objects in a tree or hash map.
1206 if (new_breakpoints
->data
[i
].address
== bp
->pc
) {
1207 /* There was already a breakpoint at this address. */
1208 if (new_breakpoints
->data
[i
].state
== WHPX_BP_CLEAR_PENDING
) {
1209 new_breakpoints
->data
[i
].state
= WHPX_BP_SET
;
1210 } else if (new_breakpoints
->data
[i
].state
== WHPX_BP_SET
) {
1211 new_breakpoints
->data
[i
].state
= WHPX_BP_SET_PENDING
;
1219 if (!found
&& new_breakpoints
->used
< new_breakpoints
->allocated
) {
1220 /* No WHPX breakpoint at this address. Create one. */
1221 new_breakpoints
->data
[new_breakpoints
->used
].address
= bp
->pc
;
1222 new_breakpoints
->data
[new_breakpoints
->used
].state
=
1223 WHPX_BP_SET_PENDING
;
1224 new_breakpoints
->used
++;
1229 * Free the previous breakpoint list. This can be optimized by keeping
1230 * it as shadow buffer for the next computation instead of freeing
1233 g_free(breakpoints
->breakpoints
);
1235 breakpoints
->breakpoints
= new_breakpoints
;
1239 * Physically inserts/removes the breakpoints by reading and writing the
1240 * physical memory, keeping a track of the failed attempts.
1242 * Passing resuming=true will try to set all previously unset breakpoints.
1243 * Passing resuming=false will remove all inserted ones.
1245 static void whpx_apply_breakpoints(
1246 struct whpx_breakpoint_collection
*breakpoints
,
1255 for (i
= 0; i
< breakpoints
->used
; i
++) {
1256 /* Decide what to do right now based on the last known state. */
1257 WhpxBreakpointState state
= breakpoints
->data
[i
].state
;
1259 case WHPX_BP_CLEARED
:
1261 state
= WHPX_BP_SET_PENDING
;
1264 case WHPX_BP_SET_PENDING
:
1266 state
= WHPX_BP_CLEARED
;
1271 state
= WHPX_BP_CLEAR_PENDING
;
1274 case WHPX_BP_CLEAR_PENDING
:
1276 state
= WHPX_BP_SET
;
1281 if (state
== WHPX_BP_SET_PENDING
) {
1282 /* Remember the original instruction. */
1283 rc
= cpu_memory_rw_debug(cpu
,
1284 breakpoints
->data
[i
].address
,
1285 &breakpoints
->data
[i
].original_instruction
,
1290 /* Write the breakpoint instruction. */
1291 rc
= cpu_memory_rw_debug(cpu
,
1292 breakpoints
->data
[i
].address
,
1293 (void *)&whpx_breakpoint_instruction
,
1299 state
= WHPX_BP_SET
;
1304 if (state
== WHPX_BP_CLEAR_PENDING
) {
1305 /* Restore the original instruction. */
1306 rc
= cpu_memory_rw_debug(cpu
,
1307 breakpoints
->data
[i
].address
,
1308 &breakpoints
->data
[i
].original_instruction
,
1313 state
= WHPX_BP_CLEARED
;
1317 breakpoints
->data
[i
].state
= state
;
1322 * This function is called when the a VCPU is about to start and no other
1323 * VCPUs have been started so far. Since the VCPU start order could be
1324 * arbitrary, it doesn't have to be VCPU#0.
1326 * It is used to commit the breakpoints into memory, and configure WHPX
1327 * to intercept debug exceptions.
1329 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1330 * more VCPUs are already running, so this is the best place to do it.
1332 static int whpx_first_vcpu_starting(CPUState
*cpu
)
1334 struct whpx_state
*whpx
= &whpx_global
;
1337 g_assert(qemu_mutex_iothread_locked());
1339 if (!QTAILQ_EMPTY(&cpu
->breakpoints
) ||
1340 (whpx
->breakpoints
.breakpoints
&&
1341 whpx
->breakpoints
.breakpoints
->used
)) {
1344 bool update_pending
= false;
1346 QTAILQ_FOREACH(bp
, &cpu
->breakpoints
, entry
) {
1347 if (i
>= whpx
->breakpoints
.original_address_count
||
1348 bp
->pc
!= whpx
->breakpoints
.original_addresses
[i
]) {
1349 update_pending
= true;
1355 if (i
!= whpx
->breakpoints
.original_address_count
) {
1356 update_pending
= true;
1359 if (update_pending
) {
1361 * The CPU breakpoints have changed since the last call to
1362 * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1363 * now be recomputed.
1365 whpx_translate_cpu_breakpoints(&whpx
->breakpoints
, cpu
, i
);
1368 /* Actually insert the breakpoints into the memory. */
1369 whpx_apply_breakpoints(whpx
->breakpoints
.breakpoints
, cpu
, true);
1372 uint64_t exception_mask
;
1373 if (whpx
->step_pending
||
1374 (whpx
->breakpoints
.breakpoints
&&
1375 whpx
->breakpoints
.breakpoints
->used
)) {
1377 * We are either attempting to single-step one or more CPUs, or
1378 * have one or more breakpoints enabled. Both require intercepting
1379 * the WHvX64ExceptionTypeBreakpointTrap exception.
1382 exception_mask
= 1UL << WHvX64ExceptionTypeDebugTrapOrFault
;
1384 /* Let the guest handle all exceptions. */
1388 hr
= whpx_set_exception_exit_bitmap(exception_mask
);
1389 if (!SUCCEEDED(hr
)) {
1390 error_report("WHPX: Failed to update exception exit mask,"
1399 * This function is called when the last VCPU has finished running.
1400 * It is used to remove any previously set breakpoints from memory.
1402 static int whpx_last_vcpu_stopping(CPUState
*cpu
)
1404 whpx_apply_breakpoints(whpx_global
.breakpoints
.breakpoints
, cpu
, false);
1408 /* Returns the address of the next instruction that is about to be executed. */
1409 static vaddr
whpx_vcpu_get_pc(CPUState
*cpu
, bool exit_context_valid
)
1411 if (cpu
->vcpu_dirty
) {
1412 /* The CPU registers have been modified by other parts of QEMU. */
1413 CPUArchState
*env
= (CPUArchState
*)(cpu
->env_ptr
);
1415 } else if (exit_context_valid
) {
1417 * The CPU registers have not been modified by neither other parts
1418 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1419 * This is the most common case.
1421 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
1422 return vcpu
->exit_ctx
.VpContext
.Rip
;
1425 * The CPU registers have been modified by a call to
1426 * WHvSetVirtualProcessorRegisters() and must be re-queried from
1429 WHV_REGISTER_VALUE reg_value
;
1430 WHV_REGISTER_NAME reg_name
= WHvX64RegisterRip
;
1432 struct whpx_state
*whpx
= &whpx_global
;
1434 hr
= whp_dispatch
.WHvGetVirtualProcessorRegisters(
1442 error_report("WHPX: Failed to get PC, hr=%08lx", hr
);
1446 return reg_value
.Reg64
;
1450 static int whpx_handle_halt(CPUState
*cpu
)
1452 CPUX86State
*env
= cpu
->env_ptr
;
1455 qemu_mutex_lock_iothread();
1456 if (!((cpu
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
1457 (env
->eflags
& IF_MASK
)) &&
1458 !(cpu
->interrupt_request
& CPU_INTERRUPT_NMI
)) {
1459 cpu
->exception_index
= EXCP_HLT
;
1463 qemu_mutex_unlock_iothread();
1468 static void whpx_vcpu_pre_run(CPUState
*cpu
)
1471 struct whpx_state
*whpx
= &whpx_global
;
1472 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
1473 CPUX86State
*env
= cpu
->env_ptr
;
1474 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1477 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int
;
1478 UINT32 reg_count
= 0;
1479 WHV_REGISTER_VALUE reg_values
[3];
1480 WHV_REGISTER_NAME reg_names
[3];
1482 memset(&new_int
, 0, sizeof(new_int
));
1483 memset(reg_values
, 0, sizeof(reg_values
));
1485 qemu_mutex_lock_iothread();
1488 if (!vcpu
->interruption_pending
&&
1489 cpu
->interrupt_request
& (CPU_INTERRUPT_NMI
| CPU_INTERRUPT_SMI
)) {
1490 if (cpu
->interrupt_request
& CPU_INTERRUPT_NMI
) {
1491 cpu
->interrupt_request
&= ~CPU_INTERRUPT_NMI
;
1492 vcpu
->interruptable
= false;
1493 new_int
.InterruptionType
= WHvX64PendingNmi
;
1494 new_int
.InterruptionPending
= 1;
1495 new_int
.InterruptionVector
= 2;
1497 if (cpu
->interrupt_request
& CPU_INTERRUPT_SMI
) {
1498 cpu
->interrupt_request
&= ~CPU_INTERRUPT_SMI
;
1503 * Force the VCPU out of its inner loop to process any INIT requests or
1504 * commit pending TPR access.
1506 if (cpu
->interrupt_request
& (CPU_INTERRUPT_INIT
| CPU_INTERRUPT_TPR
)) {
1507 if ((cpu
->interrupt_request
& CPU_INTERRUPT_INIT
) &&
1508 !(env
->hflags
& HF_SMM_MASK
)) {
1509 cpu
->exit_request
= 1;
1511 if (cpu
->interrupt_request
& CPU_INTERRUPT_TPR
) {
1512 cpu
->exit_request
= 1;
1516 /* Get pending hard interruption or replay one that was overwritten */
1517 if (!whpx_apic_in_platform()) {
1518 if (!vcpu
->interruption_pending
&&
1519 vcpu
->interruptable
&& (env
->eflags
& IF_MASK
)) {
1520 assert(!new_int
.InterruptionPending
);
1521 if (cpu
->interrupt_request
& CPU_INTERRUPT_HARD
) {
1522 cpu
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
1523 irq
= cpu_get_pic_interrupt(env
);
1525 new_int
.InterruptionType
= WHvX64PendingInterrupt
;
1526 new_int
.InterruptionPending
= 1;
1527 new_int
.InterruptionVector
= irq
;
1532 /* Setup interrupt state if new one was prepared */
1533 if (new_int
.InterruptionPending
) {
1534 reg_values
[reg_count
].PendingInterruption
= new_int
;
1535 reg_names
[reg_count
] = WHvRegisterPendingInterruption
;
1538 } else if (vcpu
->ready_for_pic_interrupt
&&
1539 (cpu
->interrupt_request
& CPU_INTERRUPT_HARD
)) {
1540 cpu
->interrupt_request
&= ~CPU_INTERRUPT_HARD
;
1541 irq
= cpu_get_pic_interrupt(env
);
1543 reg_names
[reg_count
] = WHvRegisterPendingEvent
;
1544 reg_values
[reg_count
].ExtIntEvent
= (WHV_X64_PENDING_EXT_INT_EVENT
)
1547 .EventType
= WHvX64PendingEventExtInt
,
1554 /* Sync the TPR to the CR8 if was modified during the intercept */
1555 tpr
= whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu
->apic_state
));
1556 if (tpr
!= vcpu
->tpr
) {
1558 reg_values
[reg_count
].Reg64
= tpr
;
1559 cpu
->exit_request
= 1;
1560 reg_names
[reg_count
] = WHvX64RegisterCr8
;
1564 /* Update the state of the interrupt delivery notification */
1565 if (!vcpu
->window_registered
&&
1566 cpu
->interrupt_request
& CPU_INTERRUPT_HARD
) {
1567 reg_values
[reg_count
].DeliverabilityNotifications
=
1568 (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER
) {
1569 .InterruptNotification
= 1
1571 vcpu
->window_registered
= 1;
1572 reg_names
[reg_count
] = WHvX64RegisterDeliverabilityNotifications
;
1576 qemu_mutex_unlock_iothread();
1577 vcpu
->ready_for_pic_interrupt
= false;
1580 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
1581 whpx
->partition
, cpu
->cpu_index
,
1582 reg_names
, reg_count
, reg_values
);
1584 error_report("WHPX: Failed to set interrupt state registers,"
1592 static void whpx_vcpu_post_run(CPUState
*cpu
)
1594 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
1595 CPUX86State
*env
= cpu
->env_ptr
;
1596 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1598 env
->eflags
= vcpu
->exit_ctx
.VpContext
.Rflags
;
1600 uint64_t tpr
= vcpu
->exit_ctx
.VpContext
.Cr8
;
1601 if (vcpu
->tpr
!= tpr
) {
1603 qemu_mutex_lock_iothread();
1604 cpu_set_apic_tpr(x86_cpu
->apic_state
, whpx_cr8_to_apic_tpr(vcpu
->tpr
));
1605 qemu_mutex_unlock_iothread();
1608 vcpu
->interruption_pending
=
1609 vcpu
->exit_ctx
.VpContext
.ExecutionState
.InterruptionPending
;
1611 vcpu
->interruptable
=
1612 !vcpu
->exit_ctx
.VpContext
.ExecutionState
.InterruptShadow
;
1617 static void whpx_vcpu_process_async_events(CPUState
*cpu
)
1619 CPUX86State
*env
= cpu
->env_ptr
;
1620 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1621 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
1623 if ((cpu
->interrupt_request
& CPU_INTERRUPT_INIT
) &&
1624 !(env
->hflags
& HF_SMM_MASK
)) {
1625 whpx_cpu_synchronize_state(cpu
);
1626 do_cpu_init(x86_cpu
);
1627 vcpu
->interruptable
= true;
1630 if (cpu
->interrupt_request
& CPU_INTERRUPT_POLL
) {
1631 cpu
->interrupt_request
&= ~CPU_INTERRUPT_POLL
;
1632 apic_poll_irq(x86_cpu
->apic_state
);
1635 if (((cpu
->interrupt_request
& CPU_INTERRUPT_HARD
) &&
1636 (env
->eflags
& IF_MASK
)) ||
1637 (cpu
->interrupt_request
& CPU_INTERRUPT_NMI
)) {
1638 cpu
->halted
= false;
1641 if (cpu
->interrupt_request
& CPU_INTERRUPT_SIPI
) {
1642 whpx_cpu_synchronize_state(cpu
);
1643 do_cpu_sipi(x86_cpu
);
1646 if (cpu
->interrupt_request
& CPU_INTERRUPT_TPR
) {
1647 cpu
->interrupt_request
&= ~CPU_INTERRUPT_TPR
;
1648 whpx_cpu_synchronize_state(cpu
);
1649 apic_handle_tpr_access_report(x86_cpu
->apic_state
, env
->eip
,
1650 env
->tpr_access_type
);
1656 static int whpx_vcpu_run(CPUState
*cpu
)
1659 struct whpx_state
*whpx
= &whpx_global
;
1660 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
1661 struct whpx_breakpoint
*stepped_over_bp
= NULL
;
1662 WhpxStepMode exclusive_step_mode
= WHPX_STEP_NONE
;
1665 g_assert(qemu_mutex_iothread_locked());
1667 if (whpx
->running_cpus
++ == 0) {
1668 /* Insert breakpoints into memory, update exception exit bitmap. */
1669 ret
= whpx_first_vcpu_starting(cpu
);
1675 if (whpx
->breakpoints
.breakpoints
&&
1676 whpx
->breakpoints
.breakpoints
->used
> 0)
1678 uint64_t pc
= whpx_vcpu_get_pc(cpu
, true);
1679 stepped_over_bp
= whpx_lookup_breakpoint_by_addr(pc
);
1680 if (stepped_over_bp
&& stepped_over_bp
->state
!= WHPX_BP_SET
) {
1681 stepped_over_bp
= NULL
;
1684 if (stepped_over_bp
) {
1686 * We are trying to run the instruction overwritten by an active
1687 * breakpoint. We will temporarily disable the breakpoint, suspend
1688 * other CPUs, and step over the instruction.
1690 exclusive_step_mode
= WHPX_STEP_EXCLUSIVE
;
1694 if (exclusive_step_mode
== WHPX_STEP_NONE
) {
1695 whpx_vcpu_process_async_events(cpu
);
1696 if (cpu
->halted
&& !whpx_apic_in_platform()) {
1697 cpu
->exception_index
= EXCP_HLT
;
1698 qatomic_set(&cpu
->exit_request
, false);
1703 qemu_mutex_unlock_iothread();
1705 if (exclusive_step_mode
!= WHPX_STEP_NONE
) {
1707 g_assert(cpu
== current_cpu
);
1708 g_assert(!cpu
->running
);
1709 cpu
->running
= true;
1711 hr
= whpx_set_exception_exit_bitmap(
1712 1UL << WHvX64ExceptionTypeDebugTrapOrFault
);
1713 if (!SUCCEEDED(hr
)) {
1714 error_report("WHPX: Failed to update exception exit mask, "
1719 if (stepped_over_bp
) {
1720 /* Temporarily disable the triggered breakpoint. */
1721 cpu_memory_rw_debug(cpu
,
1722 stepped_over_bp
->address
,
1723 &stepped_over_bp
->original_instruction
,
1728 cpu_exec_start(cpu
);
1732 if (cpu
->vcpu_dirty
) {
1733 whpx_set_registers(cpu
, WHPX_SET_RUNTIME_STATE
);
1734 cpu
->vcpu_dirty
= false;
1737 if (exclusive_step_mode
== WHPX_STEP_NONE
) {
1738 whpx_vcpu_pre_run(cpu
);
1740 if (qatomic_read(&cpu
->exit_request
)) {
1741 whpx_vcpu_kick(cpu
);
1745 if (exclusive_step_mode
!= WHPX_STEP_NONE
|| cpu
->singlestep_enabled
) {
1746 whpx_vcpu_configure_single_stepping(cpu
, true, NULL
);
1749 hr
= whp_dispatch
.WHvRunVirtualProcessor(
1750 whpx
->partition
, cpu
->cpu_index
,
1751 &vcpu
->exit_ctx
, sizeof(vcpu
->exit_ctx
));
1754 error_report("WHPX: Failed to exec a virtual processor,"
1760 if (exclusive_step_mode
!= WHPX_STEP_NONE
|| cpu
->singlestep_enabled
) {
1761 whpx_vcpu_configure_single_stepping(cpu
,
1763 &vcpu
->exit_ctx
.VpContext
.Rflags
);
1766 whpx_vcpu_post_run(cpu
);
1768 switch (vcpu
->exit_ctx
.ExitReason
) {
1769 case WHvRunVpExitReasonMemoryAccess
:
1770 ret
= whpx_handle_mmio(cpu
, &vcpu
->exit_ctx
.MemoryAccess
);
1773 case WHvRunVpExitReasonX64IoPortAccess
:
1774 ret
= whpx_handle_portio(cpu
, &vcpu
->exit_ctx
.IoPortAccess
);
1777 case WHvRunVpExitReasonX64InterruptWindow
:
1778 vcpu
->ready_for_pic_interrupt
= 1;
1779 vcpu
->window_registered
= 0;
1783 case WHvRunVpExitReasonX64ApicEoi
:
1784 assert(whpx_apic_in_platform());
1785 ioapic_eoi_broadcast(vcpu
->exit_ctx
.ApicEoi
.InterruptVector
);
1788 case WHvRunVpExitReasonX64Halt
:
1790 * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1793 ret
= whpx_handle_halt(cpu
);
1796 case WHvRunVpExitReasonX64ApicInitSipiTrap
: {
1797 WHV_INTERRUPT_CONTROL ipi
= {0};
1798 uint64_t icr
= vcpu
->exit_ctx
.ApicInitSipi
.ApicIcr
;
1799 uint32_t delivery_mode
=
1800 (icr
& APIC_ICR_DELIV_MOD
) >> APIC_ICR_DELIV_MOD_SHIFT
;
1801 int dest_shorthand
=
1802 (icr
& APIC_ICR_DEST_SHORT
) >> APIC_ICR_DEST_SHORT_SHIFT
;
1803 bool broadcast
= false;
1804 bool include_self
= false;
1807 /* We only registered for INIT and SIPI exits. */
1808 if ((delivery_mode
!= APIC_DM_INIT
) &&
1809 (delivery_mode
!= APIC_DM_SIPI
)) {
1811 "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1815 if (delivery_mode
== APIC_DM_INIT
) {
1816 ipi
.Type
= WHvX64InterruptTypeInit
;
1818 ipi
.Type
= WHvX64InterruptTypeSipi
;
1821 ipi
.DestinationMode
=
1822 ((icr
& APIC_ICR_DEST_MOD
) >> APIC_ICR_DEST_MOD_SHIFT
) ?
1823 WHvX64InterruptDestinationModeLogical
:
1824 WHvX64InterruptDestinationModePhysical
;
1827 ((icr
& APIC_ICR_TRIGGER_MOD
) >> APIC_ICR_TRIGGER_MOD_SHIFT
) ?
1828 WHvX64InterruptTriggerModeLevel
:
1829 WHvX64InterruptTriggerModeEdge
;
1831 ipi
.Vector
= icr
& APIC_VECTOR_MASK
;
1832 switch (dest_shorthand
) {
1833 /* no shorthand. Bits 56-63 contain the destination. */
1835 ipi
.Destination
= (icr
>> 56) & APIC_VECTOR_MASK
;
1836 hr
= whp_dispatch
.WHvRequestInterrupt(whpx
->partition
,
1839 error_report("WHPX: Failed to request interrupt hr=%08lx",
1847 include_self
= true;
1850 /* broadcast, including self */
1853 include_self
= true;
1856 /* broadcast, excluding self */
1862 if (!broadcast
&& !include_self
) {
1866 for (i
= 0; i
<= max_vcpu_index
; i
++) {
1867 if (i
== cpu
->cpu_index
&& !include_self
) {
1872 * Assuming that APIC Ids are identity mapped since
1873 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1874 * are not handled yet and the hypervisor doesn't allow the
1875 * guest to modify the APIC ID.
1877 ipi
.Destination
= i
;
1878 hr
= whp_dispatch
.WHvRequestInterrupt(whpx
->partition
,
1882 "WHPX: Failed to request SIPI for %d, hr=%08lx",
1890 case WHvRunVpExitReasonCanceled
:
1891 if (exclusive_step_mode
!= WHPX_STEP_NONE
) {
1893 * We are trying to step over a single instruction, and
1894 * likely got a request to stop from another thread.
1895 * Delay it until we are done stepping
1900 cpu
->exception_index
= EXCP_INTERRUPT
;
1904 case WHvRunVpExitReasonX64MsrAccess
: {
1905 WHV_REGISTER_VALUE reg_values
[3] = {0};
1906 WHV_REGISTER_NAME reg_names
[3];
1909 reg_names
[0] = WHvX64RegisterRip
;
1910 reg_names
[1] = WHvX64RegisterRax
;
1911 reg_names
[2] = WHvX64RegisterRdx
;
1913 reg_values
[0].Reg64
=
1914 vcpu
->exit_ctx
.VpContext
.Rip
+
1915 vcpu
->exit_ctx
.VpContext
.InstructionLength
;
1918 * For all unsupported MSR access we:
1922 reg_count
= vcpu
->exit_ctx
.MsrAccess
.AccessInfo
.IsWrite
?
1925 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
1928 reg_names
, reg_count
,
1932 error_report("WHPX: Failed to set MsrAccess state "
1933 " registers, hr=%08lx", hr
);
1938 case WHvRunVpExitReasonX64Cpuid
: {
1939 WHV_REGISTER_VALUE reg_values
[5];
1940 WHV_REGISTER_NAME reg_names
[5];
1941 UINT32 reg_count
= 5;
1942 UINT64 cpuid_fn
, rip
= 0, rax
= 0, rcx
= 0, rdx
= 0, rbx
= 0;
1943 X86CPU
*x86_cpu
= X86_CPU(cpu
);
1944 CPUX86State
*env
= &x86_cpu
->env
;
1946 memset(reg_values
, 0, sizeof(reg_values
));
1948 rip
= vcpu
->exit_ctx
.VpContext
.Rip
+
1949 vcpu
->exit_ctx
.VpContext
.InstructionLength
;
1950 cpuid_fn
= vcpu
->exit_ctx
.CpuidAccess
.Rax
;
1953 * Ideally, these should be supplied to the hypervisor during VCPU
1954 * initialization and it should be able to satisfy this request.
1955 * But, currently, WHPX doesn't support setting CPUID values in the
1956 * hypervisor once the partition has been setup, which is too late
1957 * since VCPUs are realized later. For now, use the values from
1958 * QEMU to satisfy these requests, until WHPX adds support for
1959 * being able to set these values in the hypervisor at runtime.
1961 cpu_x86_cpuid(env
, cpuid_fn
, 0, (UINT32
*)&rax
, (UINT32
*)&rbx
,
1962 (UINT32
*)&rcx
, (UINT32
*)&rdx
);
1965 /* Expose the vmware cpu frequency cpuid leaf */
1967 rbx
= rcx
= rdx
= 0;
1972 rbx
= env
->apic_bus_freq
/ 1000; /* Hz to KHz */
1977 /* Remove any support of OSVW */
1978 rcx
&= ~CPUID_EXT3_OSVW
;
1982 reg_names
[0] = WHvX64RegisterRip
;
1983 reg_names
[1] = WHvX64RegisterRax
;
1984 reg_names
[2] = WHvX64RegisterRcx
;
1985 reg_names
[3] = WHvX64RegisterRdx
;
1986 reg_names
[4] = WHvX64RegisterRbx
;
1988 reg_values
[0].Reg64
= rip
;
1989 reg_values
[1].Reg64
= rax
;
1990 reg_values
[2].Reg64
= rcx
;
1991 reg_values
[3].Reg64
= rdx
;
1992 reg_values
[4].Reg64
= rbx
;
1994 hr
= whp_dispatch
.WHvSetVirtualProcessorRegisters(
1995 whpx
->partition
, cpu
->cpu_index
,
2001 error_report("WHPX: Failed to set CpuidAccess state registers,"
2007 case WHvRunVpExitReasonException
:
2008 whpx_get_registers(cpu
);
2010 if ((vcpu
->exit_ctx
.VpException
.ExceptionType
==
2011 WHvX64ExceptionTypeDebugTrapOrFault
) &&
2012 (vcpu
->exit_ctx
.VpException
.InstructionByteCount
>= 1) &&
2013 (vcpu
->exit_ctx
.VpException
.InstructionBytes
[0] ==
2014 whpx_breakpoint_instruction
)) {
2015 /* Stopped at a software breakpoint. */
2016 cpu
->exception_index
= EXCP_DEBUG
;
2017 } else if ((vcpu
->exit_ctx
.VpException
.ExceptionType
==
2018 WHvX64ExceptionTypeDebugTrapOrFault
) &&
2019 !cpu
->singlestep_enabled
) {
2021 * Just finished stepping over a breakpoint, but the
2022 * gdb does not expect us to do single-stepping.
2023 * Don't do anything special.
2025 cpu
->exception_index
= EXCP_INTERRUPT
;
2027 /* Another exception or debug event. Report it to GDB. */
2028 cpu
->exception_index
= EXCP_DEBUG
;
2033 case WHvRunVpExitReasonNone
:
2034 case WHvRunVpExitReasonUnrecoverableException
:
2035 case WHvRunVpExitReasonInvalidVpRegisterValue
:
2036 case WHvRunVpExitReasonUnsupportedFeature
:
2038 error_report("WHPX: Unexpected VP exit code %d",
2039 vcpu
->exit_ctx
.ExitReason
);
2040 whpx_get_registers(cpu
);
2041 qemu_mutex_lock_iothread();
2042 qemu_system_guest_panicked(cpu_get_crash_info(cpu
));
2043 qemu_mutex_unlock_iothread();
2049 if (stepped_over_bp
) {
2050 /* Restore the breakpoint we stepped over */
2051 cpu_memory_rw_debug(cpu
,
2052 stepped_over_bp
->address
,
2053 (void *)&whpx_breakpoint_instruction
,
2058 if (exclusive_step_mode
!= WHPX_STEP_NONE
) {
2059 g_assert(cpu_in_exclusive_context(cpu
));
2060 cpu
->running
= false;
2063 exclusive_step_mode
= WHPX_STEP_NONE
;
2068 qemu_mutex_lock_iothread();
2071 if (--whpx
->running_cpus
== 0) {
2072 whpx_last_vcpu_stopping(cpu
);
2075 qatomic_set(&cpu
->exit_request
, false);
2080 static void do_whpx_cpu_synchronize_state(CPUState
*cpu
, run_on_cpu_data arg
)
2082 if (!cpu
->vcpu_dirty
) {
2083 whpx_get_registers(cpu
);
2084 cpu
->vcpu_dirty
= true;
2088 static void do_whpx_cpu_synchronize_post_reset(CPUState
*cpu
,
2089 run_on_cpu_data arg
)
2091 whpx_set_registers(cpu
, WHPX_SET_RESET_STATE
);
2092 cpu
->vcpu_dirty
= false;
2095 static void do_whpx_cpu_synchronize_post_init(CPUState
*cpu
,
2096 run_on_cpu_data arg
)
2098 whpx_set_registers(cpu
, WHPX_SET_FULL_STATE
);
2099 cpu
->vcpu_dirty
= false;
2102 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState
*cpu
,
2103 run_on_cpu_data arg
)
2105 cpu
->vcpu_dirty
= true;
2112 void whpx_cpu_synchronize_state(CPUState
*cpu
)
2114 if (!cpu
->vcpu_dirty
) {
2115 run_on_cpu(cpu
, do_whpx_cpu_synchronize_state
, RUN_ON_CPU_NULL
);
2119 void whpx_cpu_synchronize_post_reset(CPUState
*cpu
)
2121 run_on_cpu(cpu
, do_whpx_cpu_synchronize_post_reset
, RUN_ON_CPU_NULL
);
2124 void whpx_cpu_synchronize_post_init(CPUState
*cpu
)
2126 run_on_cpu(cpu
, do_whpx_cpu_synchronize_post_init
, RUN_ON_CPU_NULL
);
2129 void whpx_cpu_synchronize_pre_loadvm(CPUState
*cpu
)
2131 run_on_cpu(cpu
, do_whpx_cpu_synchronize_pre_loadvm
, RUN_ON_CPU_NULL
);
2134 void whpx_cpu_synchronize_pre_resume(bool step_pending
)
2136 whpx_global
.step_pending
= step_pending
;
2143 static Error
*whpx_migration_blocker
;
2145 static void whpx_cpu_update_state(void *opaque
, bool running
, RunState state
)
2147 CPUX86State
*env
= opaque
;
2150 env
->tsc_valid
= false;
2154 int whpx_init_vcpu(CPUState
*cpu
)
2157 struct whpx_state
*whpx
= &whpx_global
;
2158 struct whpx_vcpu
*vcpu
= NULL
;
2159 Error
*local_error
= NULL
;
2160 CPUX86State
*env
= cpu
->env_ptr
;
2161 X86CPU
*x86_cpu
= X86_CPU(cpu
);
2165 /* Add migration blockers for all unsupported features of the
2166 * Windows Hypervisor Platform
2168 if (whpx_migration_blocker
== NULL
) {
2169 error_setg(&whpx_migration_blocker
,
2170 "State blocked due to non-migratable CPUID feature support,"
2171 "dirty memory tracking support, and XSAVE/XRSTOR support");
2173 if (migrate_add_blocker(whpx_migration_blocker
, &local_error
) < 0) {
2174 error_report_err(local_error
);
2175 error_free(whpx_migration_blocker
);
2181 vcpu
= g_new0(struct whpx_vcpu
, 1);
2184 error_report("WHPX: Failed to allocte VCPU context.");
2189 hr
= whp_dispatch
.WHvEmulatorCreateEmulator(
2190 &whpx_emu_callbacks
,
2193 error_report("WHPX: Failed to setup instruction completion support,"
2199 hr
= whp_dispatch
.WHvCreateVirtualProcessor(
2200 whpx
->partition
, cpu
->cpu_index
, 0);
2202 error_report("WHPX: Failed to create a virtual processor,"
2204 whp_dispatch
.WHvEmulatorDestroyEmulator(vcpu
->emulator
);
2210 * vcpu's TSC frequency is either specified by user, or use the value
2211 * provided by Hyper-V if the former is not present. In the latter case, we
2212 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
2213 * frequency can be migrated later via this field.
2215 if (!env
->tsc_khz
) {
2216 hr
= whp_dispatch
.WHvGetCapability(
2217 WHvCapabilityCodeProcessorClockFrequency
, &freq
, sizeof(freq
),
2219 if (hr
!= WHV_E_UNKNOWN_CAPABILITY
) {
2221 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr
);
2223 env
->tsc_khz
= freq
/ 1000; /* Hz to KHz */
2228 env
->apic_bus_freq
= HYPERV_APIC_BUS_FREQUENCY
;
2229 hr
= whp_dispatch
.WHvGetCapability(
2230 WHvCapabilityCodeInterruptClockFrequency
, &freq
, sizeof(freq
), NULL
);
2231 if (hr
!= WHV_E_UNKNOWN_CAPABILITY
) {
2233 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr
);
2235 env
->apic_bus_freq
= freq
;
2240 * If the vmware cpuid frequency leaf option is set, and we have a valid
2241 * tsc value, trap the corresponding cpuid's.
2243 if (x86_cpu
->vmware_cpuid_freq
&& env
->tsc_khz
) {
2244 UINT32 cpuidExitList
[] = {1, 0x80000001, 0x40000000, 0x40000010};
2246 hr
= whp_dispatch
.WHvSetPartitionProperty(
2248 WHvPartitionPropertyCodeCpuidExitList
,
2250 RTL_NUMBER_OF(cpuidExitList
) * sizeof(UINT32
));
2253 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2260 vcpu
->interruptable
= true;
2261 cpu
->vcpu_dirty
= true;
2262 cpu
->hax_vcpu
= (struct hax_vcpu_state
*)vcpu
;
2263 max_vcpu_index
= max(max_vcpu_index
, cpu
->cpu_index
);
2264 qemu_add_vm_change_state_handler(whpx_cpu_update_state
, cpu
->env_ptr
);
2274 int whpx_vcpu_exec(CPUState
*cpu
)
2280 if (cpu
->exception_index
>= EXCP_INTERRUPT
) {
2281 ret
= cpu
->exception_index
;
2282 cpu
->exception_index
= -1;
2286 fatal
= whpx_vcpu_run(cpu
);
2289 error_report("WHPX: Failed to exec a virtual processor");
2297 void whpx_destroy_vcpu(CPUState
*cpu
)
2299 struct whpx_state
*whpx
= &whpx_global
;
2300 struct whpx_vcpu
*vcpu
= get_whpx_vcpu(cpu
);
2302 whp_dispatch
.WHvDeleteVirtualProcessor(whpx
->partition
, cpu
->cpu_index
);
2303 whp_dispatch
.WHvEmulatorDestroyEmulator(vcpu
->emulator
);
2304 g_free(cpu
->hax_vcpu
);
2308 void whpx_vcpu_kick(CPUState
*cpu
)
2310 struct whpx_state
*whpx
= &whpx_global
;
2311 whp_dispatch
.WHvCancelRunVirtualProcessor(
2312 whpx
->partition
, cpu
->cpu_index
, 0);
2319 static void whpx_update_mapping(hwaddr start_pa
, ram_addr_t size
,
2320 void *host_va
, int add
, int rom
,
2323 struct whpx_state
*whpx
= &whpx_global
;
2328 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2329 (void*)start_pa, (void*)size, host_va,
2330 (rom ? "ROM" : "RAM"), name);
2332 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
2333 (void*)start_pa, (void*)size, host_va, name);
2338 hr
= whp_dispatch
.WHvMapGpaRange(whpx
->partition
,
2342 (WHvMapGpaRangeFlagRead
|
2343 WHvMapGpaRangeFlagExecute
|
2344 (rom
? 0 : WHvMapGpaRangeFlagWrite
)));
2346 hr
= whp_dispatch
.WHvUnmapGpaRange(whpx
->partition
,
2352 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2353 " Host:%p, hr=%08lx",
2354 (add
? "MAP" : "UNMAP"), name
,
2355 (void *)(uintptr_t)start_pa
, (void *)size
, host_va
, hr
);
2359 static void whpx_process_section(MemoryRegionSection
*section
, int add
)
2361 MemoryRegion
*mr
= section
->mr
;
2362 hwaddr start_pa
= section
->offset_within_address_space
;
2363 ram_addr_t size
= int128_get64(section
->size
);
2367 if (!memory_region_is_ram(mr
)) {
2371 delta
= qemu_real_host_page_size() - (start_pa
& ~qemu_real_host_page_mask());
2372 delta
&= ~qemu_real_host_page_mask();
2378 size
&= qemu_real_host_page_mask();
2379 if (!size
|| (start_pa
& ~qemu_real_host_page_mask())) {
2383 host_va
= (uintptr_t)memory_region_get_ram_ptr(mr
)
2384 + section
->offset_within_region
+ delta
;
2386 whpx_update_mapping(start_pa
, size
, (void *)(uintptr_t)host_va
, add
,
2387 memory_region_is_rom(mr
), mr
->name
);
2390 static void whpx_region_add(MemoryListener
*listener
,
2391 MemoryRegionSection
*section
)
2393 memory_region_ref(section
->mr
);
2394 whpx_process_section(section
, 1);
2397 static void whpx_region_del(MemoryListener
*listener
,
2398 MemoryRegionSection
*section
)
2400 whpx_process_section(section
, 0);
2401 memory_region_unref(section
->mr
);
2404 static void whpx_transaction_begin(MemoryListener
*listener
)
2408 static void whpx_transaction_commit(MemoryListener
*listener
)
2412 static void whpx_log_sync(MemoryListener
*listener
,
2413 MemoryRegionSection
*section
)
2415 MemoryRegion
*mr
= section
->mr
;
2417 if (!memory_region_is_ram(mr
)) {
2421 memory_region_set_dirty(mr
, 0, int128_get64(section
->size
));
2424 static MemoryListener whpx_memory_listener
= {
2426 .begin
= whpx_transaction_begin
,
2427 .commit
= whpx_transaction_commit
,
2428 .region_add
= whpx_region_add
,
2429 .region_del
= whpx_region_del
,
2430 .log_sync
= whpx_log_sync
,
2434 static void whpx_memory_init(void)
2436 memory_listener_register(&whpx_memory_listener
, &address_space_memory
);
2440 * Load the functions from the given library, using the given handle. If a
2441 * handle is provided, it is used, otherwise the library is opened. The
2442 * handle will be updated on return with the opened one.
2444 static bool load_whp_dispatch_fns(HMODULE
*handle
,
2445 WHPFunctionList function_list
)
2447 HMODULE hLib
= *handle
;
2449 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2450 #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
2451 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2452 whp_dispatch.function_name = \
2453 (function_name ## _t)GetProcAddress(hLib, #function_name); \
2455 #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2456 whp_dispatch.function_name = \
2457 (function_name ## _t)GetProcAddress(hLib, #function_name); \
2458 if (!whp_dispatch.function_name) { \
2459 error_report("Could not load function %s", #function_name); \
2463 #define WHP_LOAD_LIB(lib_name, handle_lib) \
2464 if (!handle_lib) { \
2465 handle_lib = LoadLibrary(lib_name); \
2466 if (!handle_lib) { \
2467 error_report("Could not load library %s.", lib_name); \
2472 switch (function_list) {
2473 case WINHV_PLATFORM_FNS_DEFAULT
:
2474 WHP_LOAD_LIB(WINHV_PLATFORM_DLL
, hLib
)
2475 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD
)
2478 case WINHV_EMULATION_FNS_DEFAULT
:
2479 WHP_LOAD_LIB(WINHV_EMULATION_DLL
, hLib
)
2480 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD
)
2483 case WINHV_PLATFORM_FNS_SUPPLEMENTAL
:
2484 WHP_LOAD_LIB(WINHV_PLATFORM_DLL
, hLib
)
2485 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL
)
2500 static void whpx_set_kernel_irqchip(Object
*obj
, Visitor
*v
,
2501 const char *name
, void *opaque
,
2504 struct whpx_state
*whpx
= &whpx_global
;
2507 if (!visit_type_OnOffSplit(v
, name
, &mode
, errp
)) {
2512 case ON_OFF_SPLIT_ON
:
2513 whpx
->kernel_irqchip_allowed
= true;
2514 whpx
->kernel_irqchip_required
= true;
2517 case ON_OFF_SPLIT_OFF
:
2518 whpx
->kernel_irqchip_allowed
= false;
2519 whpx
->kernel_irqchip_required
= false;
2522 case ON_OFF_SPLIT_SPLIT
:
2523 error_setg(errp
, "WHPX: split irqchip currently not supported");
2524 error_append_hint(errp
,
2525 "Try without kernel-irqchip or with kernel-irqchip=on|off");
2530 * The value was checked in visit_type_OnOffSplit() above. If
2531 * we get here, then something is wrong in QEMU.
2541 static int whpx_accel_init(MachineState
*ms
)
2543 struct whpx_state
*whpx
;
2546 WHV_CAPABILITY whpx_cap
;
2547 UINT32 whpx_cap_size
;
2548 WHV_PARTITION_PROPERTY prop
;
2549 UINT32 cpuidExitList
[] = {1, 0x80000001};
2550 WHV_CAPABILITY_FEATURES features
= {0};
2552 whpx
= &whpx_global
;
2554 if (!init_whp_dispatch()) {
2559 whpx
->mem_quota
= ms
->ram_size
;
2561 hr
= whp_dispatch
.WHvGetCapability(
2562 WHvCapabilityCodeHypervisorPresent
, &whpx_cap
,
2563 sizeof(whpx_cap
), &whpx_cap_size
);
2564 if (FAILED(hr
) || !whpx_cap
.HypervisorPresent
) {
2565 error_report("WHPX: No accelerator found, hr=%08lx", hr
);
2570 hr
= whp_dispatch
.WHvGetCapability(
2571 WHvCapabilityCodeFeatures
, &features
, sizeof(features
), NULL
);
2573 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr
);
2578 hr
= whp_dispatch
.WHvCreatePartition(&whpx
->partition
);
2580 error_report("WHPX: Failed to create partition, hr=%08lx", hr
);
2586 * Query the XSAVE capability of the partition. Any error here is not
2589 hr
= whp_dispatch
.WHvGetPartitionProperty(
2591 WHvPartitionPropertyCodeProcessorXsaveFeatures
,
2593 sizeof(whpx_xsave_cap
),
2597 * Windows version which don't support this property will return with the
2598 * specific error code.
2600 if (FAILED(hr
) && hr
!= WHV_E_UNKNOWN_PROPERTY
) {
2601 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr
);
2604 if (!whpx_has_xsave()) {
2605 printf("WHPX: Partition is not XSAVE capable\n");
2608 memset(&prop
, 0, sizeof(WHV_PARTITION_PROPERTY
));
2609 prop
.ProcessorCount
= ms
->smp
.cpus
;
2610 hr
= whp_dispatch
.WHvSetPartitionProperty(
2612 WHvPartitionPropertyCodeProcessorCount
,
2614 sizeof(WHV_PARTITION_PROPERTY
));
2617 error_report("WHPX: Failed to set partition core count to %d,"
2618 " hr=%08lx", ms
->smp
.cores
, hr
);
2624 * Error out if WHP doesn't support apic emulation and user is requiring
2627 if (whpx
->kernel_irqchip_required
&& (!features
.LocalApicEmulation
||
2628 !whp_dispatch
.WHvSetVirtualProcessorInterruptControllerState2
)) {
2629 error_report("WHPX: kernel irqchip requested, but unavailable. "
2630 "Try without kernel-irqchip or with kernel-irqchip=off");
2635 if (whpx
->kernel_irqchip_allowed
&& features
.LocalApicEmulation
&&
2636 whp_dispatch
.WHvSetVirtualProcessorInterruptControllerState2
) {
2637 WHV_X64_LOCAL_APIC_EMULATION_MODE mode
=
2638 WHvX64LocalApicEmulationModeXApic
;
2639 printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2640 hr
= whp_dispatch
.WHvSetPartitionProperty(
2642 WHvPartitionPropertyCodeLocalApicEmulationMode
,
2646 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr
);
2647 if (whpx
->kernel_irqchip_required
) {
2648 error_report("WHPX: kernel irqchip requested, but unavailable");
2653 whpx
->apic_in_platform
= true;
2657 /* Register for MSR and CPUID exits */
2658 memset(&prop
, 0, sizeof(WHV_PARTITION_PROPERTY
));
2659 prop
.ExtendedVmExits
.X64MsrExit
= 1;
2660 prop
.ExtendedVmExits
.X64CpuidExit
= 1;
2661 prop
.ExtendedVmExits
.ExceptionExit
= 1;
2662 if (whpx_apic_in_platform()) {
2663 prop
.ExtendedVmExits
.X64ApicInitSipiExitTrap
= 1;
2666 hr
= whp_dispatch
.WHvSetPartitionProperty(
2668 WHvPartitionPropertyCodeExtendedVmExits
,
2670 sizeof(WHV_PARTITION_PROPERTY
));
2672 error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr
);
2677 hr
= whp_dispatch
.WHvSetPartitionProperty(
2679 WHvPartitionPropertyCodeCpuidExitList
,
2681 RTL_NUMBER_OF(cpuidExitList
) * sizeof(UINT32
));
2684 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2691 * We do not want to intercept any exceptions from the guest,
2692 * until we actually start debugging with gdb.
2694 whpx
->exception_exit_bitmap
= -1;
2695 hr
= whpx_set_exception_exit_bitmap(0);
2698 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr
);
2703 hr
= whp_dispatch
.WHvSetupPartition(whpx
->partition
);
2705 error_report("WHPX: Failed to setup partition, hr=%08lx", hr
);
2712 printf("Windows Hypervisor Platform accelerator is operational\n");
2717 if (NULL
!= whpx
->partition
) {
2718 whp_dispatch
.WHvDeletePartition(whpx
->partition
);
2719 whpx
->partition
= NULL
;
/* Return non-zero iff the WHPX accelerator is enabled for this VM. */
int whpx_enabled(void)
{
    return whpx_allowed;
}
/* True when APIC emulation was successfully enabled in the hypervisor. */
bool whpx_apic_in_platform(void) {
    return whpx_global.apic_in_platform;
}
2734 static void whpx_accel_class_init(ObjectClass
*oc
, void *data
)
2736 AccelClass
*ac
= ACCEL_CLASS(oc
);
2738 ac
->init_machine
= whpx_accel_init
;
2739 ac
->allowed
= &whpx_allowed
;
2741 object_class_property_add(oc
, "kernel-irqchip", "on|off|split",
2742 NULL
, whpx_set_kernel_irqchip
,
2744 object_class_property_set_description(oc
, "kernel-irqchip",
2745 "Configure WHPX in-kernel irqchip");
/*
 * Instance-init hook: reset the global WHPX state to zero and apply
 * defaults before any properties are parsed.
 */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(struct whpx_state));
    /* Turn on kernel-irqchip, by default */
    whpx->kernel_irqchip_allowed = true;
}
/* QOM type descriptor for the WHPX accelerator. */
static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};
/* Register the WHPX accelerator type with the QOM type system. */
static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}
2769 bool init_whp_dispatch(void)
2771 if (whp_dispatch_initialized
) {
2775 if (!load_whp_dispatch_fns(&hWinHvPlatform
, WINHV_PLATFORM_FNS_DEFAULT
)) {
2779 if (!load_whp_dispatch_fns(&hWinHvEmulation
, WINHV_EMULATION_FNS_DEFAULT
)) {
2783 assert(load_whp_dispatch_fns(&hWinHvPlatform
,
2784 WINHV_PLATFORM_FNS_SUPPLEMENTAL
));
2785 whp_dispatch_initialized
= true;
2789 if (hWinHvPlatform
) {
2790 FreeLibrary(hWinHvPlatform
);
2793 if (hWinHvEmulation
) {
2794 FreeLibrary(hWinHvEmulation
);
/* Run whpx_type_init() at QEMU module-init time. */
type_init(whpx_type_init);