/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"

#include "xen-compat.h"

#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif
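/*
 * Helper for accessing guest memory by virtual address: ask the kernel to
 * walk the guest page tables via KVM_TRANSLATE, and report how many bytes
 * remain in the page so callers can split copies that cross page
 * boundaries.
 */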
static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}
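/*
 * Copy to/from a guest virtual address range, one page at a time:
 * translate each page with kvm_gva_to_gpa() and transfer the bytes
 * through the physical address space.
 */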
static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }

        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}
static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}
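/*
 * Negotiate Xen HVM support with the kernel: check that the required
 * KVM_CAP_XEN_HVM capability bits are present, configure the hypercall
 * MSR, and opt in to KVM_XEN_HVM_CONFIG_EVTCHN_SEND where available so
 * the kernel can accelerate event channel delivery.
 */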
int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;
    return 0;
}
int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    return 0;
}
uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}
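/*
 * Hypercall handlers below follow a common convention: return true if the
 * hypercall was recognised (with the guest-visible return value stored in
 * exit->u.hcall.result), or false to fall through to the generic -ENOSYS
 * path in kvm_xen_handle_exit().
 */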
static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_supervisor_mode_kernel |
                         1 << XENFEAT_hvm_callback_vector;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}
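/*
 * Besides telling the kernel where the vcpu_info lives, cache a host
 * virtual mapping of it (holding a reference on the underlying memory
 * region) so that kvm_xen_maybe_deassert_callback() can inspect
 * evtchn_upcall_pending directly on the fast path.
 */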
static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}
static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}
void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}
void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}
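/*
 * Push the VIRQ_TIMER event channel port and any pending singleshot
 * expiry to the kernel, which handles the Xen timer when the EVTCHN_SEND
 * capability is present.
 */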
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    kvm_xen_set_vcpu_timer(cs);
}
int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }
    return 0;
}
static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}
static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    env->xen_singleshot_timer_ns = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
        kvm_xen_set_vcpu_timer(cs);
    }
}
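/*
 * The shared_info page embeds the vcpu_info structures for the first
 * XEN_LEGACY_MAX_VCPUS vCPUs, so mapping it also establishes each such
 * vCPU's default vcpu_info location until the guest explicitly registers
 * one elsewhere.
 */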
static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}
static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}
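/*
 * For a 32-bit guest the hypercall arguments use the compat layout, so
 * the struct is copied in via its compat definition and widened field by
 * field before use.
 */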
static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}
static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}
static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}
static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}
static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;

    switch (cmd) {
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
                                                 exit->u.hcall.params[0]);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}
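/*
 * An explicit VCPUOP_register_vcpu_info supersedes the default location
 * within the shared_info page. The update is pushed to the target vCPU
 * with async_run_on_cpu(), since the state lives in that vCPU's
 * CPUX86State.
 */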
static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}
static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}
static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}
static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *dest = qemu_get_cpu(vcpu_id);
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
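/*
 * Soft reset, e.g. for a guest kernel performing kexec: tear down event
 * channels and the callback GSI, reset each vCPU's Xen state, and unmap
 * the shared_info page; the new kernel is expected to re-register
 * everything it needs.
 */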
int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
     * deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    return 0;
}
static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}
static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}
static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}
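/*
 * Entry point for KVM_EXIT_XEN exits: keep QEMU's idea of the guest's
 * long mode in sync with the kernel's, dispatch the hypercall, and log
 * anything unhandled as unimplemented.
 */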
int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}
uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}
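/*
 * Push this vCPU's Xen state back into the kernel, e.g. after a vmstate
 * load. Only attributes which have actually been set are written, and
 * those depending on the EVTCHN_SEND capability are skipped when it is
 * absent.
 */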
int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        ret = kvm_xen_set_vcpu_timer(cs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}
int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}