1 /*
2 * Xen HVM emulation support in KVM
3 *
4 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
9 *
10 */
11
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "qemu/error-report.h"
16 #include "hw/xen/xen.h"
17 #include "sysemu/kvm_int.h"
18 #include "sysemu/kvm_xen.h"
19 #include "kvm/kvm_i386.h"
20 #include "exec/address-spaces.h"
21 #include "xen-emu.h"
22 #include "trace.h"
23 #include "sysemu/runstate.h"
24
25 #include "hw/pci/msi.h"
26 #include "hw/i386/apic-msidef.h"
27 #include "hw/i386/e820_memory_layout.h"
28 #include "hw/i386/kvm/xen_overlay.h"
29 #include "hw/i386/kvm/xen_evtchn.h"
30 #include "hw/i386/kvm/xen_gnttab.h"
31 #include "hw/i386/kvm/xen_primary_console.h"
32 #include "hw/i386/kvm/xen_xenstore.h"
33
34 #include "hw/xen/interface/version.h"
35 #include "hw/xen/interface/sched.h"
36 #include "hw/xen/interface/memory.h"
37 #include "hw/xen/interface/hvm/hvm_op.h"
38 #include "hw/xen/interface/hvm/params.h"
39 #include "hw/xen/interface/vcpu.h"
40 #include "hw/xen/interface/event_channel.h"
41 #include "hw/xen/interface/grant_table.h"
42
43 #include "xen-compat.h"
44
45 static void xen_vcpu_singleshot_timer_event(void *opaque);
46 static void xen_vcpu_periodic_timer_event(void *opaque);
47 static int vcpuop_stop_singleshot_timer(CPUState *cs);
48
49 #ifdef TARGET_X86_64
50 #define hypercall_compat32(longmode) (!(longmode))
51 #else
52 #define hypercall_compat32(longmode) (false)
53 #endif
54
55 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
56 size_t *len, bool is_write)
57 {
58 struct kvm_translation tr = {
59 .linear_address = gva,
60 };
61
62 if (len) {
63 *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
64 }
65
66 if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
67 (is_write && !tr.writeable)) {
68 return false;
69 }
70 *gpa = tr.physical_address;
71 return true;
72 }
73
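/*
 * Copy between a guest virtual address range and a QEMU buffer, one
 * page at a time, since the underlying guest-physical pages need not
 * be contiguous. Each page is translated with kvm_gva_to_gpa() above.
 */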
74 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
75 bool is_write)
76 {
77 uint8_t *buf = (uint8_t *)_buf;
78 uint64_t gpa;
79 size_t len;
80
81 while (sz) {
82 if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
83 return -EFAULT;
84 }
85 if (len > sz) {
86 len = sz;
87 }
88
89 cpu_physical_memory_rw(gpa, buf, len, is_write);
90
91 buf += len;
92 sz -= len;
93 gva += len;
94 }
95
96 return 0;
97 }
98
99 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
100 size_t sz)
101 {
102 return kvm_gva_rw(cs, gva, buf, sz, false);
103 }
104
105 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
106 size_t sz)
107 {
108 return kvm_gva_rw(cs, gva, buf, sz, true);
109 }
110
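/*
 * Enable Xen HVM emulation on the KVM VM: check that the required
 * capabilities are present, register the hypercall MSR with the
 * INTERCEPT_HCALL flag so that hypercalls are forwarded to userspace
 * (see kvm_xen_handle_exit() below), insist on the split irqchip, and
 * reserve the E820 "special" area used for the Xen overlay pages.
 * May be called more than once; only the KVM_XEN_HVM_CONFIG ioctl is
 * repeated on subsequent calls.
 */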
111 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
112 {
113 const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
114 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
115 struct kvm_xen_hvm_config cfg = {
116 .msr = hypercall_msr,
117 .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
118 };
119 int xen_caps, ret;
120
121 xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
122 if (required_caps & ~xen_caps) {
123 error_report("kvm: Xen HVM guest support not present or insufficient");
124 return -ENOSYS;
125 }
126
127 if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
128 struct kvm_xen_hvm_attr ha = {
129 .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
130 .u.xen_version = s->xen_version,
131 };
132 (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
133
134 cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
135 }
136
137 ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
138 if (ret < 0) {
139 error_report("kvm: Failed to enable Xen HVM support: %s",
140 strerror(-ret));
141 return ret;
142 }
143
144 /* If called a second time, don't repeat the rest of the setup. */
145 if (s->xen_caps) {
146 return 0;
147 }
148
149 /*
150 * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
151 * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
152 *
153 * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
154 * such things to be polled at precisely the right time. We *could* do
155 * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
156 * the moment the IRQ is acked, and see if it should be reasserted.
157 *
158 * But the in-kernel irqchip is deprecated, so we're unlikely to add
159 * that support in the kernel. Insist on using the split irqchip mode
160 * instead.
161 *
162 * This leaves us polling for the level going low in QEMU, which lacks
163 * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
164 * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
165 * the device (for which it has to unmap the device and trap access, for
166 * some period after an IRQ!!). In the Xen case, we do it on exit from
167 * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
168 * Which is kind of icky, but less so than the VFIO one. I may fix them
169 * both later...
170 */
171 if (!kvm_kernel_irqchip_split()) {
172 error_report("kvm: Xen support requires kernel-irqchip=split");
173 return -EINVAL;
174 }
175
176 s->xen_caps = xen_caps;
177
178 /* Tell fw_cfg to notify the BIOS to reserve the range. */
179 ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
180 E820_RESERVED);
181 if (ret < 0) {
182 fprintf(stderr, "e820_add_entry() table is full\n");
183 return ret;
184 }
185
186 /* The pages couldn't be overlaid until KVM was initialized */
187 xen_primary_console_reset();
188 xen_xenstore_reset();
189
190 return 0;
191 }
192
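/*
 * Per-vCPU setup: tell the kernel this vCPU's Xen/ACPI vCPU ID (when
 * the kernel can deliver event channels itself) and create the QEMU
 * timers used to emulate the Xen singleshot and periodic timers.
 */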
193 int kvm_xen_init_vcpu(CPUState *cs)
194 {
195 X86CPU *cpu = X86_CPU(cs);
196 CPUX86State *env = &cpu->env;
197 int err;
198
199 /*
200 * The kernel needs to know the Xen/ACPI vCPU ID because that's
201 * what the guest uses in hypercalls such as timers. It doesn't
202 * match the APIC ID which is generally used for talking to the
203 * kernel about vCPUs. And if vCPU threads race with creating
204 * their KVM vCPUs out of order, it doesn't necessarily match
205 * with the kernel's internal vCPU indices either.
206 */
207 if (kvm_xen_has_cap(EVTCHN_SEND)) {
208 struct kvm_xen_vcpu_attr va = {
209 .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
210 .u.vcpu_id = cs->cpu_index,
211 };
212 err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
213 if (err) {
214 error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
215 strerror(-err));
216 return err;
217 }
218 }
219
220 env->xen_vcpu_info_gpa = INVALID_GPA;
221 env->xen_vcpu_info_default_gpa = INVALID_GPA;
222 env->xen_vcpu_time_info_gpa = INVALID_GPA;
223 env->xen_vcpu_runstate_gpa = INVALID_GPA;
224
225 qemu_mutex_init(&env->xen_timers_lock);
226 env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
227 xen_vcpu_singleshot_timer_event,
228 cpu);
229 if (!env->xen_singleshot_timer) {
230 return -ENOMEM;
231 }
232 env->xen_singleshot_timer->opaque = cs;
233
234 env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
235 xen_vcpu_periodic_timer_event,
236 cpu);
237 if (!env->xen_periodic_timer) {
238 return -ENOMEM;
239 }
240 env->xen_periodic_timer->opaque = cs;
241
242 return 0;
243 }
244
245 uint32_t kvm_xen_get_caps(void)
246 {
247 return kvm_state->xen_caps;
248 }
249
250 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
251 int cmd, uint64_t arg)
252 {
253 int err = 0;
254
255 switch (cmd) {
256 case XENVER_get_features: {
257 struct xen_feature_info fi;
258
259 /* No need for 32/64 compat handling */
260 qemu_build_assert(sizeof(fi) == 8);
261
262 err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
263 if (err) {
264 break;
265 }
266
267 fi.submap = 0;
268 if (fi.submap_idx == 0) {
269 fi.submap |= 1 << XENFEAT_writable_page_tables |
270 1 << XENFEAT_writable_descriptor_tables |
271 1 << XENFEAT_auto_translated_physmap |
272 1 << XENFEAT_hvm_callback_vector |
273 1 << XENFEAT_hvm_safe_pvclock |
274 1 << XENFEAT_hvm_pirqs;
275 }
276
277 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
278 break;
279 }
280
281 default:
282 return false;
283 }
284
285 exit->u.hcall.result = err;
286 return true;
287 }
288
289 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
290 {
291 struct kvm_xen_vcpu_attr xhsi;
292
293 xhsi.type = type;
294 xhsi.u.gpa = gpa;
295
296 trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
297
298 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
299 }
300
301 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
302 {
303 uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
304 struct kvm_xen_vcpu_attr xva;
305
306 xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
307 xva.u.vector = vector;
308
309 trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
310
311 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
312 }
313
314 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
315 {
316 X86CPU *cpu = X86_CPU(cs);
317 CPUX86State *env = &cpu->env;
318
319 env->xen_vcpu_callback_vector = data.host_int;
320
321 if (kvm_xen_has_cap(EVTCHN_SEND)) {
322 kvm_xen_set_vcpu_callback_vector(cs);
323 }
324 }
325
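/*
 * Point KVM at the guest's vcpu_info structure, and also map it into
 * QEMU's own address space so that evtchn_upcall_pending can be
 * checked directly (see kvm_xen_maybe_deassert_callback() below).
 */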
326 static int set_vcpu_info(CPUState *cs, uint64_t gpa)
327 {
328 X86CPU *cpu = X86_CPU(cs);
329 CPUX86State *env = &cpu->env;
330 MemoryRegionSection mrs = { .mr = NULL };
331 void *vcpu_info_hva = NULL;
332 int ret;
333
334 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
335 if (ret || gpa == INVALID_GPA) {
336 goto out;
337 }
338
339 mrs = memory_region_find(get_system_memory(), gpa,
340 sizeof(struct vcpu_info));
341 if (mrs.mr && mrs.mr->ram_block &&
342 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
343 vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
344 mrs.offset_within_region);
345 }
346 if (!vcpu_info_hva) {
347 if (mrs.mr) {
348 memory_region_unref(mrs.mr);
349 mrs.mr = NULL;
350 }
351 ret = -EINVAL;
352 }
353
354 out:
355 if (env->xen_vcpu_info_mr) {
356 memory_region_unref(env->xen_vcpu_info_mr);
357 }
358 env->xen_vcpu_info_hva = vcpu_info_hva;
359 env->xen_vcpu_info_mr = mrs.mr;
360 return ret;
361 }
362
363 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
364 {
365 X86CPU *cpu = X86_CPU(cs);
366 CPUX86State *env = &cpu->env;
367
368 env->xen_vcpu_info_default_gpa = data.host_ulong;
369
370 /* Changing the default does nothing if a vcpu_info was explicitly set. */
371 if (env->xen_vcpu_info_gpa == INVALID_GPA) {
372 set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
373 }
374 }
375
376 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
377 {
378 X86CPU *cpu = X86_CPU(cs);
379 CPUX86State *env = &cpu->env;
380
381 env->xen_vcpu_info_gpa = data.host_ulong;
382
383 set_vcpu_info(cs, env->xen_vcpu_info_gpa);
384 }
385
386 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
387 {
388 CPUState *cs = qemu_get_cpu(vcpu_id);
389 if (!cs) {
390 return NULL;
391 }
392
393 return X86_CPU(cs)->env.xen_vcpu_info_hva;
394 }
395
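/*
 * Polled on exit from KVM_RUN while the GSI/PCI_INTX callback is
 * asserted (see the comment in kvm_xen_init() above): once the guest
 * has cleared evtchn_upcall_pending in its vcpu_info, lower the line.
 */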
396 void kvm_xen_maybe_deassert_callback(CPUState *cs)
397 {
398 CPUX86State *env = &X86_CPU(cs)->env;
399 struct vcpu_info *vi = env->xen_vcpu_info_hva;
400 if (!vi) {
401 return;
402 }
403
404 /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
405 if (!vi->evtchn_upcall_pending) {
406 qemu_mutex_lock_iothread();
407 /*
408 * Check again now we have the lock, because it may have been
409 * asserted in the interim. And we don't want to take the lock
410 * every time because this is a fast path.
411 */
412 if (!vi->evtchn_upcall_pending) {
413 X86_CPU(cs)->env.xen_callback_asserted = false;
414 xen_evtchn_set_callback_level(0);
415 }
416 qemu_mutex_unlock_iothread();
417 }
418 }
419
420 void kvm_xen_set_callback_asserted(void)
421 {
422 CPUState *cs = qemu_get_cpu(0);
423
424 if (cs) {
425 X86_CPU(cs)->env.xen_callback_asserted = true;
426 }
427 }
428
429 bool kvm_xen_has_vcpu_callback_vector(void)
430 {
431 CPUState *cs = qemu_get_cpu(0);
432
433 return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
434 }
435
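/*
 * Deliver the event channel upcall using whichever callback mechanism
 * the guest configured: a per-vCPU vector (sent here as an MSI to that
 * vCPU's APIC), the global vector (injected by KVM itself on vCPU
 * entry, so a kick is enough), or the domain-wide GSI/PCI_INTX level,
 * which is only ever raised for vCPU0.
 */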
436 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
437 {
438 CPUState *cs = qemu_get_cpu(vcpu_id);
439 uint8_t vector;
440
441 if (!cs) {
442 return;
443 }
444
445 vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
446 if (vector) {
447 /*
448 * The per-vCPU callback vector is injected via the local APIC.
449 * Just deliver it as an MSI.
450 */
451 MSIMessage msg = {
452 .address = APIC_DEFAULT_ADDRESS |
453 (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
454 .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
455 };
456 kvm_irqchip_send_msi(kvm_state, msg);
457 return;
458 }
459
460 switch (type) {
461 case HVM_PARAM_CALLBACK_TYPE_VECTOR:
462 /*
463 * If the evtchn_upcall_pending field in the vcpu_info is set, then
464 * KVM will automatically deliver the vector on entering the vCPU
465 * so all we have to do is kick it out.
466 */
467 qemu_cpu_kick(cs);
468 break;
469
470 case HVM_PARAM_CALLBACK_TYPE_GSI:
471 case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
472 if (vcpu_id == 0) {
473 xen_evtchn_set_callback_level(1);
474 }
475 break;
476 }
477 }
478
479 /* Must always be called with xen_timers_lock held */
480 static int kvm_xen_set_vcpu_timer(CPUState *cs)
481 {
482 X86CPU *cpu = X86_CPU(cs);
483 CPUX86State *env = &cpu->env;
484
485 struct kvm_xen_vcpu_attr va = {
486 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
487 .u.timer.port = env->xen_virq[VIRQ_TIMER],
488 .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
489 .u.timer.expires_ns = env->xen_singleshot_timer_ns,
490 };
491
492 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
493 }
494
495 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
496 {
497 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
498 kvm_xen_set_vcpu_timer(cs);
499 }
500
501 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
502 {
503 CPUState *cs = qemu_get_cpu(vcpu_id);
504
505 if (!cs) {
506 return -ENOENT;
507 }
508
509 /* cpu.h doesn't include the actual Xen header. */
510 qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
511
512 if (virq >= NR_VIRQS) {
513 return -EINVAL;
514 }
515
516 if (port && X86_CPU(cs)->env.xen_virq[virq]) {
517 return -EEXIST;
518 }
519
520 X86_CPU(cs)->env.xen_virq[virq] = port;
521 if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
522 async_run_on_cpu(cs, do_set_vcpu_timer_virq,
523 RUN_ON_CPU_HOST_INT(port));
524 }
525 return 0;
526 }
527
528 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
529 {
530 X86CPU *cpu = X86_CPU(cs);
531 CPUX86State *env = &cpu->env;
532
533 env->xen_vcpu_time_info_gpa = data.host_ulong;
534
535 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
536 env->xen_vcpu_time_info_gpa);
537 }
538
539 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
540 {
541 X86CPU *cpu = X86_CPU(cs);
542 CPUX86State *env = &cpu->env;
543
544 env->xen_vcpu_runstate_gpa = data.host_ulong;
545
546 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
547 env->xen_vcpu_runstate_gpa);
548 }
549
550 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
551 {
552 X86CPU *cpu = X86_CPU(cs);
553 CPUX86State *env = &cpu->env;
554
555 env->xen_vcpu_info_gpa = INVALID_GPA;
556 env->xen_vcpu_info_default_gpa = INVALID_GPA;
557 env->xen_vcpu_time_info_gpa = INVALID_GPA;
558 env->xen_vcpu_runstate_gpa = INVALID_GPA;
559 env->xen_vcpu_callback_vector = 0;
560 memset(env->xen_virq, 0, sizeof(env->xen_virq));
561
562 set_vcpu_info(cs, INVALID_GPA);
563 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
564 INVALID_GPA);
565 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
566 INVALID_GPA);
567 if (kvm_xen_has_cap(EVTCHN_SEND)) {
568 kvm_xen_set_vcpu_callback_vector(cs);
569
570 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
571 env->xen_singleshot_timer_ns = 0;
572 kvm_xen_set_vcpu_timer(cs);
573 } else {
574 vcpuop_stop_singleshot_timer(cs);
575 }
576
577 }
578
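/*
 * Map the shared_info page at the given guest frame, and point the
 * default vcpu_info of each of the first XEN_LEGACY_MAX_VCPUS vCPUs
 * at the corresponding embedded slot within that page.
 */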
579 static int xen_set_shared_info(uint64_t gfn)
580 {
581 uint64_t gpa = gfn << TARGET_PAGE_BITS;
582 int i, err;
583
584 QEMU_IOTHREAD_LOCK_GUARD();
585
586 /*
587 * The xen_overlay device tells KVM about it too, since it had to
588 * do that on migration load anyway (unless we're going to jump
589 * through lots of hoops to maintain the fiction that this isn't
590 * KVM-specific).
591 */
592 err = xen_overlay_map_shinfo_page(gpa);
593 if (err) {
594 return err;
595 }
596
597 trace_kvm_xen_set_shared_info(gfn);
598
599 for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
600 CPUState *cpu = qemu_get_cpu(i);
601 if (cpu) {
602 async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
603 RUN_ON_CPU_HOST_ULONG(gpa));
604 }
605 gpa += sizeof(vcpu_info_t);
606 }
607
608 return err;
609 }
610
611 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
612 {
613 switch (space) {
614 case XENMAPSPACE_shared_info:
615 if (idx > 0) {
616 return -EINVAL;
617 }
618 return xen_set_shared_info(gfn);
619
620 case XENMAPSPACE_grant_table:
621 return xen_gnttab_map_page(idx, gfn);
622
623 case XENMAPSPACE_gmfn:
624 case XENMAPSPACE_gmfn_range:
625 return -ENOTSUP;
626
627 case XENMAPSPACE_gmfn_foreign:
628 case XENMAPSPACE_dev_mmio:
629 return -EPERM;
630
631 default:
632 return -EINVAL;
633 }
634 }
635
636 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
637 uint64_t arg)
638 {
639 struct xen_add_to_physmap xatp;
640 CPUState *cs = CPU(cpu);
641
642 if (hypercall_compat32(exit->u.hcall.longmode)) {
643 struct compat_xen_add_to_physmap xatp32;
644
645 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
646 if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
647 return -EFAULT;
648 }
649 xatp.domid = xatp32.domid;
650 xatp.size = xatp32.size;
651 xatp.space = xatp32.space;
652 xatp.idx = xatp32.idx;
653 xatp.gpfn = xatp32.gpfn;
654 } else {
655 if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
656 return -EFAULT;
657 }
658 }
659
660 if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
661 return -ESRCH;
662 }
663
664 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
665 }
666
667 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
668 uint64_t arg)
669 {
670 struct xen_add_to_physmap_batch xatpb;
671 unsigned long idxs_gva, gpfns_gva, errs_gva;
672 CPUState *cs = CPU(cpu);
673 size_t op_sz;
674
675 if (hypercall_compat32(exit->u.hcall.longmode)) {
676 struct compat_xen_add_to_physmap_batch xatpb32;
677
678 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
679 if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
680 return -EFAULT;
681 }
682 xatpb.domid = xatpb32.domid;
683 xatpb.space = xatpb32.space;
684 xatpb.size = xatpb32.size;
685
686 idxs_gva = xatpb32.idxs.c;
687 gpfns_gva = xatpb32.gpfns.c;
688 errs_gva = xatpb32.errs.c;
689 op_sz = sizeof(uint32_t);
690 } else {
691 if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
692 return -EFAULT;
693 }
694 op_sz = sizeof(unsigned long);
695 idxs_gva = (unsigned long)xatpb.idxs.p;
696 gpfns_gva = (unsigned long)xatpb.gpfns.p;
697 errs_gva = (unsigned long)xatpb.errs.p;
698 }
699
700 if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
701 return -ESRCH;
702 }
703
704 /* Explicitly invalid for the batch op. Not that we implement it anyway. */
705 if (xatpb.space == XENMAPSPACE_gmfn_range) {
706 return -EINVAL;
707 }
708
709 while (xatpb.size--) {
710 unsigned long idx = 0;
711 unsigned long gpfn = 0;
712 int err;
713
714 /* For 32-bit compat this only copies the low 32 bits of each */
715 if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
716 kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
717 return -EFAULT;
718 }
719 idxs_gva += op_sz;
720 gpfns_gva += op_sz;
721
722 err = add_to_physmap_one(xatpb.space, idx, gpfn);
723
724 if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
725 return -EFAULT;
726 }
727 errs_gva += sizeof(err);
728 }
729 return 0;
730 }
731
732 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
733 int cmd, uint64_t arg)
734 {
735 int err;
736
737 switch (cmd) {
738 case XENMEM_add_to_physmap:
739 err = do_add_to_physmap(exit, cpu, arg);
740 break;
741
742 case XENMEM_add_to_physmap_batch:
743 err = do_add_to_physmap_batch(exit, cpu, arg);
744 break;
745
746 default:
747 return false;
748 }
749
750 exit->u.hcall.result = err;
751 return true;
752 }
753
754 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
755 uint64_t arg)
756 {
757 CPUState *cs = CPU(cpu);
758 struct xen_hvm_param hp;
759 int err = 0;
760
761 /* No need for 32/64 compat handling */
762 qemu_build_assert(sizeof(hp) == 16);
763
764 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
765 err = -EFAULT;
766 goto out;
767 }
768
769 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
770 err = -ESRCH;
771 goto out;
772 }
773
774 switch (hp.index) {
775 case HVM_PARAM_CALLBACK_IRQ:
776 qemu_mutex_lock_iothread();
777 err = xen_evtchn_set_callback_param(hp.value);
778 qemu_mutex_unlock_iothread();
779 xen_set_long_mode(exit->u.hcall.longmode);
780 break;
781 default:
782 return false;
783 }
784
785 out:
786 exit->u.hcall.result = err;
787 return true;
788 }
789
790 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
791 uint64_t arg)
792 {
793 CPUState *cs = CPU(cpu);
794 struct xen_hvm_param hp;
795 int err = 0;
796
797 /* No need for 32/64 compat handling */
798 qemu_build_assert(sizeof(hp) == 16);
799
800 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
801 err = -EFAULT;
802 goto out;
803 }
804
805 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
806 err = -ESRCH;
807 goto out;
808 }
809
810 switch (hp.index) {
811 case HVM_PARAM_STORE_PFN:
812 hp.value = XEN_SPECIAL_PFN(XENSTORE);
813 break;
814 case HVM_PARAM_STORE_EVTCHN:
815 hp.value = xen_xenstore_get_port();
816 break;
817 case HVM_PARAM_CONSOLE_PFN:
818 hp.value = xen_primary_console_get_pfn();
819 if (!hp.value) {
820 err = -EINVAL;
821 }
822 break;
823 case HVM_PARAM_CONSOLE_EVTCHN:
824 hp.value = xen_primary_console_get_port();
825 if (!hp.value) {
826 err = -EINVAL;
827 }
828 break;
829 default:
830 return false;
831 }
832
833 if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
834 err = -EFAULT;
835 }
836 out:
837 exit->u.hcall.result = err;
838 return true;
839 }
840
841 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
842 X86CPU *cpu, uint64_t arg)
843 {
844 struct xen_hvm_evtchn_upcall_vector up;
845 CPUState *target_cs;
846
847 /* No need for 32/64 compat handling */
848 qemu_build_assert(sizeof(up) == 8);
849
850 if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
851 return -EFAULT;
852 }
853
854 if (up.vector < 0x10) {
855 return -EINVAL;
856 }
857
858 target_cs = qemu_get_cpu(up.vcpu);
859 if (!target_cs) {
860 return -EINVAL;
861 }
862
863 async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
864 RUN_ON_CPU_HOST_INT(up.vector));
865 return 0;
866 }
867
868 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
869 int cmd, uint64_t arg)
870 {
871 int ret = -ENOSYS;
872 switch (cmd) {
873 case HVMOP_set_evtchn_upcall_vector:
874 ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
875 break;
876
877 case HVMOP_pagetable_dying:
878 ret = -ENOSYS;
879 break;
880
881 case HVMOP_set_param:
882 return handle_set_param(exit, cpu, arg);
883
884 case HVMOP_get_param:
885 return handle_get_param(exit, cpu, arg);
886
887 default:
888 return false;
889 }
890
891 exit->u.hcall.result = ret;
892 return true;
893 }
894
895 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
896 uint64_t arg)
897 {
898 struct vcpu_register_vcpu_info rvi;
899 uint64_t gpa;
900
901 /* No need for 32/64 compat handling */
902 qemu_build_assert(sizeof(rvi) == 16);
903 qemu_build_assert(sizeof(struct vcpu_info) == 64);
904
905 if (!target) {
906 return -ENOENT;
907 }
908
909 if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
910 return -EFAULT;
911 }
912
913 if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
914 return -EINVAL;
915 }
916
917 gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
918 async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
919 return 0;
920 }
921
922 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
923 uint64_t arg)
924 {
925 struct vcpu_register_time_memory_area tma;
926 uint64_t gpa;
927 size_t len;
928
929 /* No need for 32/64 compat handling */
930 qemu_build_assert(sizeof(tma) == 8);
931 qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
932
933 if (!target) {
934 return -ENOENT;
935 }
936
937 if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
938 return -EFAULT;
939 }
940
941 /*
942 * Xen actually uses the GVA and does the translation through the guest
943 * page tables each time. But Linux/KVM uses the GPA, on the assumption
944 * that guests only ever use *global* addresses (kernel virtual addresses)
945 * for it. If Linux is changed to redo the GVA→GPA translation each time,
946 * it will offer a new vCPU attribute for that, and we'll use it instead.
947 */
948 if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
949 len < sizeof(struct vcpu_time_info)) {
950 return -EFAULT;
951 }
952
953 async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
954 RUN_ON_CPU_HOST_ULONG(gpa));
955 return 0;
956 }
957
958 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
959 uint64_t arg)
960 {
961 struct vcpu_register_runstate_memory_area rma;
962 uint64_t gpa;
963 size_t len;
964
965 /* No need for 32/64 compat handling */
966 qemu_build_assert(sizeof(rma) == 8);
967 /* The runstate area actually does change size, but Linux copes. */
968
969 if (!target) {
970 return -ENOENT;
971 }
972
973 if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
974 return -EFAULT;
975 }
976
977 /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
978 if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
979 return -EFAULT;
980 }
981
982 async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
983 RUN_ON_CPU_HOST_ULONG(gpa));
984 return 0;
985 }
986
987 static uint64_t kvm_get_current_ns(void)
988 {
989 struct kvm_clock_data data;
990 int ret;
991
992 ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
993 if (ret < 0) {
994 fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(-ret));
995 abort();
996 }
997
998 return data.clock;
999 }
1000
1001 static void xen_vcpu_singleshot_timer_event(void *opaque)
1002 {
1003 CPUState *cpu = opaque;
1004 CPUX86State *env = &X86_CPU(cpu)->env;
1005 uint16_t port = env->xen_virq[VIRQ_TIMER];
1006
1007 if (likely(port)) {
1008 xen_evtchn_set_port(port);
1009 }
1010
1011 qemu_mutex_lock(&env->xen_timers_lock);
1012 env->xen_singleshot_timer_ns = 0;
1013 qemu_mutex_unlock(&env->xen_timers_lock);
1014 }
1015
1016 static void xen_vcpu_periodic_timer_event(void *opaque)
1017 {
1018 CPUState *cpu = opaque;
1019 CPUX86State *env = &X86_CPU(cpu)->env;
1020 uint16_t port = env->xen_virq[VIRQ_TIMER];
1021 int64_t qemu_now;
1022
1023 if (likely(port)) {
1024 xen_evtchn_set_port(port);
1025 }
1026
1027 qemu_mutex_lock(&env->xen_timers_lock);
1028
1029 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1030 timer_mod_ns(env->xen_periodic_timer,
1031 qemu_now + env->xen_periodic_timer_period);
1032
1033 qemu_mutex_unlock(&env->xen_timers_lock);
1034 }
1035
1036 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1037 {
1038 CPUX86State *tenv = &X86_CPU(target)->env;
1039 int64_t qemu_now;
1040
1041 timer_del(tenv->xen_periodic_timer);
1042
1043 qemu_mutex_lock(&tenv->xen_timers_lock);
1044
1045 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1046 timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1047 tenv->xen_periodic_timer_period = period_ns;
1048
1049 qemu_mutex_unlock(&tenv->xen_timers_lock);
1050 return 0;
1051 }
1052
1053 #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))
1054 #define MICROSECS(_us) ((int64_t)((_us) * 1000ULL))
1055 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
1056 /* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
1057 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
1058
1059 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1060 uint64_t arg)
1061 {
1062 struct vcpu_set_periodic_timer spt;
1063
1064 qemu_build_assert(sizeof(spt) == 8);
1065 if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1066 return -EFAULT;
1067 }
1068
1069 if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1070 return -EINVAL;
1071 }
1072
1073 return do_set_periodic_timer(target, spt.period_ns);
1074 }
1075
1076 static int vcpuop_stop_periodic_timer(CPUState *target)
1077 {
1078 CPUX86State *tenv = &X86_CPU(target)->env;
1079
1080 qemu_mutex_lock(&tenv->xen_timers_lock);
1081
1082 timer_del(tenv->xen_periodic_timer);
1083 tenv->xen_periodic_timer_period = 0;
1084
1085 qemu_mutex_unlock(&tenv->xen_timers_lock);
1086 return 0;
1087 }
1088
1089 /*
1090 * Userspace handling of timer, for older kernels.
1091 * Must always be called with xen_timers_lock held.
1092 */
1093 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1094 bool linux_wa)
1095 {
1096 CPUX86State *env = &X86_CPU(cs)->env;
1097 int64_t now = kvm_get_current_ns();
1098 int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1099 int64_t delta = timeout_abs - now;
1100
1101 if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1102 (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1103 /*
1104 * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1105 * for negative absolute timeout values (caused by integer
1106 * overflow), and for values about 13 days in the future (2^50ns)
1107 * which would be caused by jiffies overflow. For those cases, it
1108 * sets the timeout 100ms in the future (not *too* soon, since if
1109 * a guest really did set a long timeout on purpose we don't want
1110 * to keep churning CPU time by waking it up).
1111 */
1112 delta = (100 * SCALE_MS);
1113 timeout_abs = now + delta;
1114 }
1115
1116 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1117 env->xen_singleshot_timer_ns = now + delta;
1118 return 0;
1119 }
1120
1121 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1122 {
1123 struct vcpu_set_singleshot_timer sst = { 0 };
1124
1125 /*
1126 * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1127 * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1128 * that get used are identical, and there's four bytes of padding
1129 * unused at the end. For true Xen compatibility we should attempt
1130 * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1131 * if we can't get the padding too. But that's daft. Just copy what
1132 * we need.
1133 */
1134 qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1135 qemu_build_assert(sizeof(sst) >= 12);
1136
1137 if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1138 return -EFAULT;
1139 }
1140
1141 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1142
1143 /*
1144 * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
1145 * The only guest that ever used it got it wrong.
1146 * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
1147 */
1148 return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
1149 }
1150
1151 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1152 {
1153 CPUX86State *env = &X86_CPU(cs)->env;
1154
1155 qemu_mutex_lock(&env->xen_timers_lock);
1156
1157 timer_del(env->xen_singleshot_timer);
1158 env->xen_singleshot_timer_ns = 0;
1159
1160 qemu_mutex_unlock(&env->xen_timers_lock);
1161 return 0;
1162 }
1163
1164 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1165 uint64_t timeout)
1166 {
1167 int err;
1168
1169 if (unlikely(timeout == 0)) {
1170 err = vcpuop_stop_singleshot_timer(CPU(cpu));
1171 } else {
1172 QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
1173 err = do_set_singleshot_timer(CPU(cpu), timeout, true);
1174 }
1175 exit->u.hcall.result = err;
1176 return true;
1177 }
1178
1179 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1180 int cmd, int vcpu_id, uint64_t arg)
1181 {
1182 CPUState *cs = CPU(cpu);
1183 CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1184 int err;
1185
1186 if (!dest) {
1187 err = -ENOENT;
1188 goto out;
1189 }
1190
1191 switch (cmd) {
1192 case VCPUOP_register_runstate_memory_area:
1193 err = vcpuop_register_runstate_info(cs, dest, arg);
1194 break;
1195 case VCPUOP_register_vcpu_time_memory_area:
1196 err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1197 break;
1198 case VCPUOP_register_vcpu_info:
1199 err = vcpuop_register_vcpu_info(cs, dest, arg);
1200 break;
1201 case VCPUOP_set_singleshot_timer: {
1202 if (cs->cpu_index == vcpu_id) {
1203 err = vcpuop_set_singleshot_timer(dest, arg);
1204 } else {
1205 err = -EINVAL;
1206 }
1207 break;
1208 }
1209 case VCPUOP_stop_singleshot_timer:
1210 if (cs->cpu_index == vcpu_id) {
1211 err = vcpuop_stop_singleshot_timer(dest);
1212 } else {
1213 err = -EINVAL;
1214 }
1215 break;
1216 case VCPUOP_set_periodic_timer: {
1217 err = vcpuop_set_periodic_timer(cs, dest, arg);
1218 break;
1219 }
1220 case VCPUOP_stop_periodic_timer:
1221 err = vcpuop_stop_periodic_timer(dest);
1222 break;
1223
1224 default:
1225 return false;
1226 }
1227
1228 out:
1229 exit->u.hcall.result = err;
1230 return true;
1231 }
1232
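/*
 * The event channel operations are thin wrappers: copy the argument
 * structure in from the guest, pass it to the xen_evtchn device, and
 * copy the (possibly updated) structure back out on success. The
 * structure layouts are the same for 32-bit and 64-bit guests, as the
 * qemu_build_assert() sizes show, so no compat handling is needed.
 */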
1233 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1234 int cmd, uint64_t arg)
1235 {
1236 CPUState *cs = CPU(cpu);
1237 int err = -ENOSYS;
1238
1239 switch (cmd) {
1240 case EVTCHNOP_init_control:
1241 case EVTCHNOP_expand_array:
1242 case EVTCHNOP_set_priority:
1243 /* We do not support FIFO channels at this point */
1244 err = -ENOSYS;
1245 break;
1246
1247 case EVTCHNOP_status: {
1248 struct evtchn_status status;
1249
1250 qemu_build_assert(sizeof(status) == 24);
1251 if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1252 err = -EFAULT;
1253 break;
1254 }
1255
1256 err = xen_evtchn_status_op(&status);
1257 if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1258 err = -EFAULT;
1259 }
1260 break;
1261 }
1262 case EVTCHNOP_close: {
1263 struct evtchn_close close;
1264
1265 qemu_build_assert(sizeof(close) == 4);
1266 if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1267 err = -EFAULT;
1268 break;
1269 }
1270
1271 err = xen_evtchn_close_op(&close);
1272 break;
1273 }
1274 case EVTCHNOP_unmask: {
1275 struct evtchn_unmask unmask;
1276
1277 qemu_build_assert(sizeof(unmask) == 4);
1278 if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1279 err = -EFAULT;
1280 break;
1281 }
1282
1283 err = xen_evtchn_unmask_op(&unmask);
1284 break;
1285 }
1286 case EVTCHNOP_bind_virq: {
1287 struct evtchn_bind_virq virq;
1288
1289 qemu_build_assert(sizeof(virq) == 12);
1290 if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1291 err = -EFAULT;
1292 break;
1293 }
1294
1295 err = xen_evtchn_bind_virq_op(&virq);
1296 if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1297 err = -EFAULT;
1298 }
1299 break;
1300 }
1301 case EVTCHNOP_bind_pirq: {
1302 struct evtchn_bind_pirq pirq;
1303
1304 qemu_build_assert(sizeof(pirq) == 12);
1305 if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1306 err = -EFAULT;
1307 break;
1308 }
1309
1310 err = xen_evtchn_bind_pirq_op(&pirq);
1311 if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1312 err = -EFAULT;
1313 }
1314 break;
1315 }
1316 case EVTCHNOP_bind_ipi: {
1317 struct evtchn_bind_ipi ipi;
1318
1319 qemu_build_assert(sizeof(ipi) == 8);
1320 if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1321 err = -EFAULT;
1322 break;
1323 }
1324
1325 err = xen_evtchn_bind_ipi_op(&ipi);
1326 if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1327 err = -EFAULT;
1328 }
1329 break;
1330 }
1331 case EVTCHNOP_send: {
1332 struct evtchn_send send;
1333
1334 qemu_build_assert(sizeof(send) == 4);
1335 if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1336 err = -EFAULT;
1337 break;
1338 }
1339
1340 err = xen_evtchn_send_op(&send);
1341 break;
1342 }
1343 case EVTCHNOP_alloc_unbound: {
1344 struct evtchn_alloc_unbound alloc;
1345
1346 qemu_build_assert(sizeof(alloc) == 8);
1347 if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1348 err = -EFAULT;
1349 break;
1350 }
1351
1352 err = xen_evtchn_alloc_unbound_op(&alloc);
1353 if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1354 err = -EFAULT;
1355 }
1356 break;
1357 }
1358 case EVTCHNOP_bind_interdomain: {
1359 struct evtchn_bind_interdomain interdomain;
1360
1361 qemu_build_assert(sizeof(interdomain) == 12);
1362 if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1363 err = -EFAULT;
1364 break;
1365 }
1366
1367 err = xen_evtchn_bind_interdomain_op(&interdomain);
1368 if (!err &&
1369 kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1370 err = -EFAULT;
1371 }
1372 break;
1373 }
1374 case EVTCHNOP_bind_vcpu: {
1375 struct evtchn_bind_vcpu vcpu;
1376
1377 qemu_build_assert(sizeof(vcpu) == 8);
1378 if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1379 err = -EFAULT;
1380 break;
1381 }
1382
1383 err = xen_evtchn_bind_vcpu_op(&vcpu);
1384 break;
1385 }
1386 case EVTCHNOP_reset: {
1387 struct evtchn_reset reset;
1388
1389 qemu_build_assert(sizeof(reset) == 2);
1390 if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1391 err = -EFAULT;
1392 break;
1393 }
1394
1395 err = xen_evtchn_reset_op(&reset);
1396 break;
1397 }
1398 default:
1399 return false;
1400 }
1401
1402 exit->u.hcall.result = err;
1403 return true;
1404 }
1405
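/*
 * Tear down all Xen state (event channels, callback, grant tables,
 * shared_info, console and xenstore) so that the guest can start from
 * scratch, e.g. for SHUTDOWN_soft_reset when a guest kexecs.
 */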
1406 int kvm_xen_soft_reset(void)
1407 {
1408 CPUState *cpu;
1409 int err;
1410
1411 assert(qemu_mutex_iothread_locked());
1412
1413 trace_kvm_xen_soft_reset();
1414
1415 err = xen_evtchn_soft_reset();
1416 if (err) {
1417 return err;
1418 }
1419
1420 /*
1421 * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1422 * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses
1423 * to deliver to the timer interrupt and treats that as 'disabled'.
1424 */
1425 err = xen_evtchn_set_callback_param(0);
1426 if (err) {
1427 return err;
1428 }
1429
1430 CPU_FOREACH(cpu) {
1431 async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1432 }
1433
1434 err = xen_overlay_map_shinfo_page(INVALID_GFN);
1435 if (err) {
1436 return err;
1437 }
1438
1439 err = xen_gnttab_reset();
1440 if (err) {
1441 return err;
1442 }
1443
1444 err = xen_primary_console_reset();
1445 if (err) {
1446 return err;
1447 }
1448
1449 err = xen_xenstore_reset();
1450 if (err) {
1451 return err;
1452 }
1453
1454 return 0;
1455 }
1456
1457 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1458 {
1459 struct sched_shutdown shutdown;
1460 int ret = 0;
1461
1462 /* No need for 32/64 compat handling */
1463 qemu_build_assert(sizeof(shutdown) == 4);
1464
1465 if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1466 return -EFAULT;
1467 }
1468
1469 switch (shutdown.reason) {
1470 case SHUTDOWN_crash:
1471 cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1472 qemu_system_guest_panicked(NULL);
1473 break;
1474
1475 case SHUTDOWN_reboot:
1476 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1477 break;
1478
1479 case SHUTDOWN_poweroff:
1480 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1481 break;
1482
1483 case SHUTDOWN_soft_reset:
1484 qemu_mutex_lock_iothread();
1485 ret = kvm_xen_soft_reset();
1486 qemu_mutex_unlock_iothread();
1487 break;
1488
1489 default:
1490 ret = -EINVAL;
1491 break;
1492 }
1493
1494 return ret;
1495 }
1496
1497 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1498 int cmd, uint64_t arg)
1499 {
1500 CPUState *cs = CPU(cpu);
1501 int err = -ENOSYS;
1502
1503 switch (cmd) {
1504 case SCHEDOP_shutdown:
1505 err = schedop_shutdown(cs, arg);
1506 break;
1507
1508 case SCHEDOP_poll:
1509 /*
1510 * Linux will panic if this doesn't work. Just yield; it's not
1511 * worth overthinking it because with event channel handling
1512 * in KVM, the kernel will intercept this and it will never
1513 * reach QEMU anyway. The semantics of the hypercall explicitly
1514 * permit spurious wakeups.
1515 */
1516 case SCHEDOP_yield:
1517 sched_yield();
1518 err = 0;
1519 break;
1520
1521 default:
1522 return false;
1523 }
1524
1525 exit->u.hcall.result = err;
1526 return true;
1527 }
1528
1529 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1530 int cmd, uint64_t arg, int count)
1531 {
1532 CPUState *cs = CPU(cpu);
1533 int err;
1534
1535 switch (cmd) {
1536 case GNTTABOP_set_version: {
1537 struct gnttab_set_version set;
1538
1539 qemu_build_assert(sizeof(set) == 4);
1540 if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1541 err = -EFAULT;
1542 break;
1543 }
1544
1545 err = xen_gnttab_set_version_op(&set);
1546 if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1547 err = -EFAULT;
1548 }
1549 break;
1550 }
1551 case GNTTABOP_get_version: {
1552 struct gnttab_get_version get;
1553
1554 qemu_build_assert(sizeof(get) == 8);
1555 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1556 err = -EFAULT;
1557 break;
1558 }
1559
1560 err = xen_gnttab_get_version_op(&get);
1561 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1562 err = -EFAULT;
1563 }
1564 break;
1565 }
1566 case GNTTABOP_query_size: {
1567 struct gnttab_query_size size;
1568
1569 qemu_build_assert(sizeof(size) == 16);
1570 if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1571 err = -EFAULT;
1572 break;
1573 }
1574
1575 err = xen_gnttab_query_size_op(&size);
1576 if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1577 err = -EFAULT;
1578 }
1579 break;
1580 }
1581 case GNTTABOP_setup_table:
1582 case GNTTABOP_copy:
1583 case GNTTABOP_map_grant_ref:
1584 case GNTTABOP_unmap_grant_ref:
1585 case GNTTABOP_swap_grant_ref:
1586 return false;
1587
1588 default:
1589 /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1590 err = -ENOSYS;
1591 break;
1592 }
1593
1594 exit->u.hcall.result = err;
1595 return true;
1596 }
1597
1598 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1599 int cmd, uint64_t arg)
1600 {
1601 CPUState *cs = CPU(cpu);
1602 int err;
1603
1604 switch (cmd) {
1605 case PHYSDEVOP_map_pirq: {
1606 struct physdev_map_pirq map;
1607
1608 if (hypercall_compat32(exit->u.hcall.longmode)) {
1609 struct compat_physdev_map_pirq *map32 = (void *)&map;
1610
1611 if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1612 err = -EFAULT;
break;
1613 }
1614
1615 /*
1616 * The only thing that's different is the alignment of the
1617 * uint64_t table_base at the end, which gets padding to make
1618 * it 64-bit aligned in the 64-bit version.
1619 */
1620 qemu_build_assert(sizeof(*map32) == 36);
1621 qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1622 offsetof(struct compat_physdev_map_pirq, entry_nr));
1623 memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1624 } else {
1625 if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1626 err = -EFAULT;
1627 break;
1628 }
1629 }
1630 err = xen_physdev_map_pirq(&map);
1631 /*
1632 * Since table_base is an IN parameter and won't be changed, just
1633 * copy the size of the compat structure back to the guest.
1634 */
1635 if (!err && kvm_copy_to_gva(cs, arg, &map,
1636 sizeof(struct compat_physdev_map_pirq))) {
1637 err = -EFAULT;
1638 }
1639 break;
1640 }
1641 case PHYSDEVOP_unmap_pirq: {
1642 struct physdev_unmap_pirq unmap;
1643
1644 qemu_build_assert(sizeof(unmap) == 8);
1645 if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1646 err = -EFAULT;
1647 break;
1648 }
1649
1650 err = xen_physdev_unmap_pirq(&unmap);
1651 if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1652 err = -EFAULT;
1653 }
1654 break;
1655 }
1656 case PHYSDEVOP_eoi: {
1657 struct physdev_eoi eoi;
1658
1659 qemu_build_assert(sizeof(eoi) == 4);
1660 if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1661 err = -EFAULT;
1662 break;
1663 }
1664
1665 err = xen_physdev_eoi_pirq(&eoi);
1666 if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1667 err = -EFAULT;
1668 }
1669 break;
1670 }
1671 case PHYSDEVOP_irq_status_query: {
1672 struct physdev_irq_status_query query;
1673
1674 qemu_build_assert(sizeof(query) == 8);
1675 if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1676 err = -EFAULT;
1677 break;
1678 }
1679
1680 err = xen_physdev_query_pirq(&query);
1681 if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1682 err = -EFAULT;
1683 }
1684 break;
1685 }
1686 case PHYSDEVOP_get_free_pirq: {
1687 struct physdev_get_free_pirq get;
1688
1689 qemu_build_assert(sizeof(get) == 8);
1690 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1691 err = -EFAULT;
1692 break;
1693 }
1694
1695 err = xen_physdev_get_free_pirq(&get);
1696 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1697 err = -EFAULT;
1698 }
1699 break;
1700 }
1701 case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1702 err = -ENOSYS;
1703 break;
1704
1705 default:
1706 return false;
1707 }
1708
1709 exit->u.hcall.result = err;
1710 return true;
1711 }
1712
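/*
 * Top-level hypercall dispatch. Hypercalls from guest userspace
 * (CPL > 0) are rejected with -EPERM. A false return means the call
 * was not recognised at all; the caller then completes it with
 * -ENOSYS. Handlers returning true have already stored their result
 * in exit->u.hcall.result.
 */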
1713 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1714 {
1715 uint16_t code = exit->u.hcall.input;
1716
1717 if (exit->u.hcall.cpl > 0) {
1718 exit->u.hcall.result = -EPERM;
1719 return true;
1720 }
1721
1722 switch (code) {
1723 case __HYPERVISOR_set_timer_op:
1724 if (exit->u.hcall.longmode) {
1725 return kvm_xen_hcall_set_timer_op(exit, cpu,
1726 exit->u.hcall.params[0]);
1727 } else {
1728 /* In 32-bit mode, the 64-bit timer value is in two args. */
1729 uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1730 (uint32_t)exit->u.hcall.params[0];
1731 return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1732 }
1733 case __HYPERVISOR_grant_table_op:
1734 return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1735 exit->u.hcall.params[1],
1736 exit->u.hcall.params[2]);
1737 case __HYPERVISOR_sched_op:
1738 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1739 exit->u.hcall.params[1]);
1740 case __HYPERVISOR_event_channel_op:
1741 return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1742 exit->u.hcall.params[1]);
1743 case __HYPERVISOR_vcpu_op:
1744 return kvm_xen_hcall_vcpu_op(exit, cpu,
1745 exit->u.hcall.params[0],
1746 exit->u.hcall.params[1],
1747 exit->u.hcall.params[2]);
1748 case __HYPERVISOR_hvm_op:
1749 return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1750 exit->u.hcall.params[1]);
1751 case __HYPERVISOR_memory_op:
1752 return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1753 exit->u.hcall.params[1]);
1754 case __HYPERVISOR_physdev_op:
1755 return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1756 exit->u.hcall.params[1]);
1757 case __HYPERVISOR_xen_version:
1758 return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1759 exit->u.hcall.params[1]);
1760 default:
1761 return false;
1762 }
1763 }
1764
1765 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1766 {
1767 if (exit->type != KVM_EXIT_XEN_HCALL) {
1768 return -1;
1769 }
1770
1771 /*
1772 * The kernel latches the guest 32/64 mode when the MSR is used to fill
1773 * the hypercall page. So if we see a hypercall in a mode that doesn't
1774 * match our own idea of the guest mode, fetch the kernel's idea of the
1775 * "long mode" to remain in sync.
1776 */
1777 if (exit->u.hcall.longmode != xen_is_long_mode()) {
1778 xen_sync_long_mode();
1779 }
1780
1781 if (!do_kvm_xen_handle_exit(cpu, exit)) {
1782 /*
1783 * Some hypercalls will be deliberately "implemented" by returning
1784 * -ENOSYS. This case is for hypercalls which are unexpected.
1785 */
1786 exit->u.hcall.result = -ENOSYS;
1787 qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1788 PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1789 (uint64_t)exit->u.hcall.input,
1790 (uint64_t)exit->u.hcall.params[0],
1791 (uint64_t)exit->u.hcall.params[1],
1792 (uint64_t)exit->u.hcall.params[2]);
1793 }
1794
1795 trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1796 exit->u.hcall.input, exit->u.hcall.params[0],
1797 exit->u.hcall.params[1], exit->u.hcall.params[2],
1798 exit->u.hcall.result);
1799 return 0;
1800 }
1801
1802 uint16_t kvm_xen_get_gnttab_max_frames(void)
1803 {
1804 KVMState *s = KVM_STATE(current_accel());
1805 return s->xen_gnttab_max_frames;
1806 }
1807
1808 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1809 {
1810 KVMState *s = KVM_STATE(current_accel());
1811 return s->xen_evtchn_max_pirq;
1812 }
1813
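/*
 * Push the per-vCPU Xen state (vcpu_info, time info, runstate area,
 * timers, callback vector) back into KVM, e.g. when vCPU state is
 * restored after migration.
 */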
1814 int kvm_put_xen_state(CPUState *cs)
1815 {
1816 X86CPU *cpu = X86_CPU(cs);
1817 CPUX86State *env = &cpu->env;
1818 uint64_t gpa;
1819 int ret;
1820
1821 gpa = env->xen_vcpu_info_gpa;
1822 if (gpa == INVALID_GPA) {
1823 gpa = env->xen_vcpu_info_default_gpa;
1824 }
1825
1826 if (gpa != INVALID_GPA) {
1827 ret = set_vcpu_info(cs, gpa);
1828 if (ret < 0) {
1829 return ret;
1830 }
1831 }
1832
1833 gpa = env->xen_vcpu_time_info_gpa;
1834 if (gpa != INVALID_GPA) {
1835 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1836 gpa);
1837 if (ret < 0) {
1838 return ret;
1839 }
1840 }
1841
1842 gpa = env->xen_vcpu_runstate_gpa;
1843 if (gpa != INVALID_GPA) {
1844 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1845 gpa);
1846 if (ret < 0) {
1847 return ret;
1848 }
1849 }
1850
1851 if (env->xen_periodic_timer_period) {
1852 ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1853 if (ret < 0) {
1854 return ret;
1855 }
1856 }
1857
1858 if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1859 /*
1860 * If the kernel has EVTCHN_SEND support then it handles timers too,
1861 * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1862 */
1863 QEMU_LOCK_GUARD(&env->xen_timers_lock);
1864 if (env->xen_singleshot_timer_ns) {
1865 ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1866 false);
1867 if (ret < 0) {
1868 return ret;
1869 }
1870 }
1871 return 0;
1872 }
1873
1874 if (env->xen_vcpu_callback_vector) {
1875 ret = kvm_xen_set_vcpu_callback_vector(cs);
1876 if (ret < 0) {
1877 return ret;
1878 }
1879 }
1880
1881 if (env->xen_virq[VIRQ_TIMER]) {
1882 do_set_vcpu_timer_virq(cs,
1883 RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
1884 }
1885 return 0;
1886 }
1887
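/*
 * Sync per-vCPU Xen state back out of KVM before it is saved: mark the
 * vcpu_info page dirty (the kernel writes to it without marking it
 * dirty) and read back the singleshot timer deadline if the kernel is
 * handling the timers.
 */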
1888 int kvm_get_xen_state(CPUState *cs)
1889 {
1890 X86CPU *cpu = X86_CPU(cs);
1891 CPUX86State *env = &cpu->env;
1892 uint64_t gpa;
1893 int ret;
1894
1895 /*
1896 * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1897 * to it. It's up to userspace to *assume* that any page shared thus is
1898 * always considered dirty. The shared_info page is different since it's
1899 * an overlay and migrated separately anyway.
1900 */
1901 gpa = env->xen_vcpu_info_gpa;
1902 if (gpa == INVALID_GPA) {
1903 gpa = env->xen_vcpu_info_default_gpa;
1904 }
1905 if (gpa != INVALID_GPA) {
1906 MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1907 gpa,
1908 sizeof(struct vcpu_info));
1909 if (mrs.mr &&
1910 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1911 memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1912 sizeof(struct vcpu_info));
1913 }
1914 }
1915
1916 if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1917 return 0;
1918 }
1919
1920 /*
1921 * If the kernel is accelerating timers, read out the current value of the
1922 * singleshot timer deadline.
1923 */
1924 if (env->xen_virq[VIRQ_TIMER]) {
1925 struct kvm_xen_vcpu_attr va = {
1926 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1927 };
1928 ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1929 if (ret < 0) {
1930 return ret;
1931 }
1932
1933 /*
1934 * This locking is fairly pointless, and is here to appease Coverity.
1935 * There is an unavoidable race condition if a different vCPU sets a
1936 * timer for this vCPU after the value has been read out. But that's
1937 * OK in practice because *all* the vCPUs need to be stopped before
1938 * we set about migrating their state.
1939 */
1940 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1941 env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1942 }
1943
1944 return 0;
1945 }