/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_primary_console.h"
#include "hw/i386/kvm/xen_xenstore.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"
#include "hw/xen/interface/grant_table.h"

#include "xen-compat.h"

static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);
static int vcpuop_stop_singleshot_timer(CPUState *cs);

#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif

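/*
 * Translate a guest virtual address to a guest physical address by asking
 * the kernel to walk the guest page tables via the KVM_TRANSLATE ioctl.
 * Optionally returns in *len the number of bytes which can be accessed
 * contiguously from that GVA before the end of its page.
 */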
static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}

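/*
 * Copy between guest virtual memory and a QEMU buffer one page at a time,
 * translating each page separately since contiguous GVAs may map to
 * discontiguous GPAs.
 */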
static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }

        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}

static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}

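/*
 * Probe the KVM_CAP_XEN_HVM capabilities, program the hypercall page MSR,
 * and (where the kernel supports it) enable in-kernel event channel
 * delivery with KVM_XEN_HVM_CONFIG_EVTCHN_SEND.
 */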
int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later.
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
                         E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }

    /* The pages couldn't be overlaid until KVM was initialized */
    xen_primary_console_reset();
    xen_xenstore_reset();

    return 0;
}

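/*
 * Per-vCPU initialisation: register the Xen vCPU ID with the kernel, and
 * create the QEMU timers used to emulate VIRQ_TIMER in userspace on
 * kernels without EVTCHN_SEND support.
 */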
int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cpu);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cpu);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}

uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}

static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}

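/*
 * Tell the kernel where this vCPU's vcpu_info lives, and keep our own
 * host virtual mapping of it so that kvm_xen_maybe_deassert_callback()
 * can inspect evtchn_upcall_pending directly on its fast path.
 */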
static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

 out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}

static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}

void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}

bool kvm_xen_has_vcpu_callback_vector(void)
{
    CPUState *cs = qemu_get_cpu(0);

    return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
}

void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS |
                       (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}

/* Must always be called with xen_timers_lock held */
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
    kvm_xen_set_vcpu_timer(cs);
}

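/*
 * Bind a VIRQ to an event channel port on the given vCPU. Only one port
 * may be bound to each VIRQ; for VIRQ_TIMER the binding is also pushed to
 * the kernel when it is accelerating the timer hypercalls.
 */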
int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }

    return 0;
}

static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}

static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);

        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = 0;
        kvm_xen_set_vcpu_timer(cs);
    } else {
        vcpuop_stop_singleshot_timer(cs);
    }
}

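/*
 * Map the shared_info page at the given GFN and, as Xen does for the
 * first XEN_LEGACY_MAX_VCPUS vCPUs, point each vCPU's default vcpu_info
 * at the corresponding embedded slot within that page.
 */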
static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}

static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}

static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}

static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}

static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_STORE_PFN:
        hp.value = XEN_SPECIAL_PFN(XENSTORE);
        break;
    case HVM_PARAM_STORE_EVTCHN:
        hp.value = xen_xenstore_get_port();
        break;
    case HVM_PARAM_CONSOLE_PFN:
        hp.value = xen_primary_console_get_pfn();
        if (!hp.value) {
            err = -EINVAL;
        }
        break;
    case HVM_PARAM_CONSOLE_EVTCHN:
        hp.value = xen_primary_console_get_port();
        if (!hp.value) {
            err = -EINVAL;
        }
        break;
    default:
        return false;
    }

    if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
    }
out:
    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}

static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;

    switch (cmd) {
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    case HVMOP_get_param:
        return handle_get_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}

static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

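/*
 * Xen single-shot timer deadlines are absolute values of Xen system time;
 * this implementation uses the KVM clock for that, read with the
 * KVM_GET_CLOCK ioctl.
 */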
static uint64_t kvm_get_current_ns(void)
{
    struct kvm_clock_data data;
    int ret;

    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
    if (ret < 0) {
        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
        abort();
    }

    return data.clock;
}

static void xen_vcpu_singleshot_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);
    env->xen_singleshot_timer_ns = 0;
    qemu_mutex_unlock(&env->xen_timers_lock);
}

static void xen_vcpu_periodic_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];
    int64_t qemu_now;

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(env->xen_periodic_timer,
                 qemu_now + env->xen_periodic_timer_period);

    qemu_mutex_unlock(&env->xen_timers_lock);
}

static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
{
    CPUX86State *tenv = &X86_CPU(target)->env;
    int64_t qemu_now;

    timer_del(tenv->xen_periodic_timer);

    qemu_mutex_lock(&tenv->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
    tenv->xen_periodic_timer_period = period_ns;

    qemu_mutex_unlock(&tenv->xen_timers_lock);

    return 0;
}

#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
#define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))

static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_set_periodic_timer spt;

    qemu_build_assert(sizeof(spt) == 8);
    if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
        return -EFAULT;
    }

    if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
        return -EINVAL;
    }

    return do_set_periodic_timer(target, spt.period_ns);
}

static int vcpuop_stop_periodic_timer(CPUState *target)
{
    CPUX86State *tenv = &X86_CPU(target)->env;

    qemu_mutex_lock(&tenv->xen_timers_lock);

    timer_del(tenv->xen_periodic_timer);
    tenv->xen_periodic_timer_period = 0;

    qemu_mutex_unlock(&tenv->xen_timers_lock);

    return 0;
}

/*
 * Userspace handling of timer, for older kernels.
 * Must always be called with xen_timers_lock held.
 */
static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
                                   bool linux_wa)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    int64_t now = kvm_get_current_ns();
    int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    int64_t delta = timeout_abs - now;

    if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
                             (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
        /*
         * Xen has a 'Linux workaround' in do_set_timer_op() which checks
         * for negative absolute timeout values (caused by integer
         * overflow), and for values about 13 days in the future (2^50ns)
         * which would be caused by jiffies overflow. For those cases, it
         * sets the timeout 100ms in the future (not *too* soon, since if
         * a guest really did set a long timeout on purpose we don't want
         * to keep churning CPU time by waking it up).
         */
        delta = (100 * SCALE_MS);
        timeout_abs = now + delta;
    }

    timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
    env->xen_singleshot_timer_ns = now + delta;
    return 0;
}

static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
{
    struct vcpu_set_singleshot_timer sst = { 0 };

    /*
     * The struct is a uint64_t followed by a uint32_t. On 32-bit that
     * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
     * that get used are identical, and there's four bytes of padding
     * unused at the end. For true Xen compatibility we should attempt
     * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
     * if we can't get the padding too. But that's daft. Just copy what
     * we need.
     */
    qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
    qemu_build_assert(sizeof(sst) >= 12);

    if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
        return -EFAULT;
    }

    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);

    /*
     * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
     * The only guest that ever used it, got it wrong.
     * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
     */
    return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
}

static int vcpuop_stop_singleshot_timer(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_del(env->xen_singleshot_timer);
    env->xen_singleshot_timer_ns = 0;

    qemu_mutex_unlock(&env->xen_timers_lock);

    return 0;
}

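/*
 * The set_timer_op hypercall is the single-shot timer with the 'Linux
 * workaround' for absurd timeouts enabled; a timeout of zero cancels any
 * pending timer.
 */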
static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                       uint64_t timeout)
{
    int err;

    if (unlikely(timeout == 0)) {
        err = vcpuop_stop_singleshot_timer(CPU(cpu));
    } else {
        QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
        err = do_set_singleshot_timer(CPU(cpu), timeout, true);
    }
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
    int err;

    if (!dest) {
        err = -ENOENT;
        goto out;
    }

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;
    case VCPUOP_set_singleshot_timer: {
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_set_singleshot_timer(dest, arg);
        } else {
            err = -EINVAL;
        }
        break;
    }
    case VCPUOP_stop_singleshot_timer:
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_stop_singleshot_timer(dest);
        } else {
            err = -EINVAL;
        }
        break;
    case VCPUOP_set_periodic_timer: {
        err = vcpuop_set_periodic_timer(cs, dest, arg);
        break;
    }
    case VCPUOP_stop_periodic_timer:
        err = vcpuop_stop_periodic_timer(dest);
        break;

    default:
        return false;
    }

 out:
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_pirq: {
        struct evtchn_bind_pirq pirq;

        qemu_build_assert(sizeof(pirq) == 12);
        if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_pirq_op(&pirq);
        if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses
     * to deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_primary_console_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

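/*
 * SCHEDOP_shutdown maps the guest's shutdown reasons onto the
 * corresponding QEMU panic/reset/shutdown requests; soft reset instead
 * tears down the Xen emulation state.
 */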
static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                return -EFAULT;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /* In 32-bit mode, the 64-bit timer value is in two args. */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}

int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}

uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}

uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}

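/*
 * Push the per-vCPU Xen state (vcpu_info, time info, runstate area and
 * timers) back into the kernel, for instance after the vmstate has been
 * loaded on migration.
 */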
int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        QEMU_LOCK_GUARD(&env->xen_timers_lock);
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        do_set_vcpu_timer_virq(cs,
                               RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
    }
    return 0;
}

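/*
 * Read back the per-vCPU Xen state which only the kernel knows, and mark
 * the vcpu_info page dirty so that it is migrated.
 */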
int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }

        /*
         * This locking is fairly pointless, and is here to appease Coverity.
         * There is an unavoidable race condition if a different vCPU sets a
         * timer for this vCPU after the value has been read out. But that's
         * OK in practice because *all* the vCPUs need to be stopped before
         * we set about migrating their state.
         */
        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}