accel/kvm/kvm-all.c
1 /*
2 * QEMU KVM support
3 *
4 * Copyright IBM, Corp. 2008
5 * Red Hat, Inc. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 * Glauber Costa <gcosta@redhat.com>
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
13 *
14 */
15
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 #include <poll.h>
19
20 #include <linux/kvm.h>
21
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/s390x/adapter.h"
30 #include "exec/gdbstub.h"
31 #include "sysemu/kvm_int.h"
32 #include "sysemu/runstate.h"
33 #include "sysemu/cpus.h"
34 #include "sysemu/accel-blocker.h"
35 #include "qemu/bswap.h"
36 #include "exec/memory.h"
37 #include "exec/ram_addr.h"
38 #include "qemu/event_notifier.h"
39 #include "qemu/main-loop.h"
40 #include "trace.h"
41 #include "hw/irq.h"
42 #include "qapi/visitor.h"
43 #include "qapi/qapi-types-common.h"
44 #include "qapi/qapi-visit-common.h"
45 #include "sysemu/reset.h"
46 #include "qemu/guest-random.h"
47 #include "sysemu/hw_accel.h"
48 #include "kvm-cpus.h"
49 #include "sysemu/dirtylimit.h"
50 #include "qemu/range.h"
51
52 #include "hw/boards.h"
53 #include "sysemu/stats.h"
54
55 /* This check must be after config-host.h is included */
56 #ifdef CONFIG_EVENTFD
57 #include <sys/eventfd.h>
58 #endif
59
60 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
61 * need to use the real host PAGE_SIZE, as that's what KVM will use.
62 */
63 #ifdef PAGE_SIZE
64 #undef PAGE_SIZE
65 #endif
66 #define PAGE_SIZE qemu_real_host_page_size()
67
68 #ifndef KVM_GUESTDBG_BLOCKIRQ
69 #define KVM_GUESTDBG_BLOCKIRQ 0
70 #endif
71
72 //#define DEBUG_KVM
73
74 #ifdef DEBUG_KVM
75 #define DPRINTF(fmt, ...) \
76 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
77 #else
78 #define DPRINTF(fmt, ...) \
79 do { } while (0)
80 #endif
81
82 struct KVMParkedVcpu {
83 unsigned long vcpu_id;
84 int kvm_fd;
85 QLIST_ENTRY(KVMParkedVcpu) node;
86 };
87
88 KVMState *kvm_state;
89 bool kvm_kernel_irqchip;
90 bool kvm_split_irqchip;
91 bool kvm_async_interrupts_allowed;
92 bool kvm_halt_in_kernel_allowed;
93 bool kvm_eventfds_allowed;
94 bool kvm_irqfds_allowed;
95 bool kvm_resamplefds_allowed;
96 bool kvm_msi_via_irqfd_allowed;
97 bool kvm_gsi_routing_allowed;
98 bool kvm_gsi_direct_mapping;
99 bool kvm_allowed;
100 bool kvm_readonly_mem_allowed;
101 bool kvm_vm_attributes_allowed;
102 bool kvm_direct_msi_allowed;
103 bool kvm_ioeventfd_any_length_allowed;
104 bool kvm_msi_use_devid;
105 bool kvm_has_guest_debug;
106 static int kvm_sstep_flags;
107 static bool kvm_immediate_exit;
108 static hwaddr kvm_max_slot_size = ~0;
109
110 static const KVMCapabilityInfo kvm_required_capabilites[] = {
111 KVM_CAP_INFO(USER_MEMORY),
112 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
113 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
114 KVM_CAP_LAST_INFO
115 };
116
117 static NotifierList kvm_irqchip_change_notifiers =
118 NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
119
120 struct KVMResampleFd {
121 int gsi;
122 EventNotifier *resample_event;
123 QLIST_ENTRY(KVMResampleFd) node;
124 };
125 typedef struct KVMResampleFd KVMResampleFd;
126
127 /*
128 * Only used with the split irqchip, where the resample fd kick for
129 * the kernel has to be done from userspace.
130 */
131 static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
132 QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
133
134 static QemuMutex kml_slots_lock;
135
136 #define kvm_slots_lock() qemu_mutex_lock(&kml_slots_lock)
137 #define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock)
138
139 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
140
141 static inline void kvm_resample_fd_remove(int gsi)
142 {
143 KVMResampleFd *rfd;
144
145 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
146 if (rfd->gsi == gsi) {
147 QLIST_REMOVE(rfd, node);
148 g_free(rfd);
149 break;
150 }
151 }
152 }
153
154 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
155 {
156 KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
157
158 rfd->gsi = gsi;
159 rfd->resample_event = event;
160
161 QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
162 }
163
164 void kvm_resample_fd_notify(int gsi)
165 {
166 KVMResampleFd *rfd;
167
168 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
169 if (rfd->gsi == gsi) {
170 event_notifier_set(rfd->resample_event);
171 trace_kvm_resample_fd_notify(gsi);
172 return;
173 }
174 }
175 }
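/*
 * In outline, the intended flow with the split irqchip is roughly: a
 * level-triggered irqfd registered together with a resample EventNotifier
 * fires, but KVM cannot do the resample itself because the IOAPIC lives in
 * userspace.  When the userspace IOAPIC later handles the guest's EOI for
 * that GSI, it calls kvm_resample_fd_notify(gsi), which sets the matching
 * resample_event above so the device backend can re-assert the line if it
 * is still pending.
 */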
176
177 unsigned int kvm_get_max_memslots(void)
178 {
179 KVMState *s = KVM_STATE(current_accel());
180
181 return s->nr_slots;
182 }
183
184 unsigned int kvm_get_free_memslots(void)
185 {
186 unsigned int used_slots = 0;
187 KVMState *s = kvm_state;
188 int i;
189
190 kvm_slots_lock();
191 for (i = 0; i < s->nr_as; i++) {
192 if (!s->as[i].ml) {
193 continue;
194 }
195 used_slots = MAX(used_slots, s->as[i].ml->nr_used_slots);
196 }
197 kvm_slots_unlock();
198
199 return s->nr_slots - used_slots;
200 }
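/*
 * Worked example with assumed numbers: if two address spaces are registered
 * and their listeners currently use 12 and 7 slots, used_slots above is
 * MAX(12, 7) == 12; with nr_slots == 32 this reports 32 - 12 == 20 free
 * slots.  The maximum across address spaces is what matters because each
 * address space has its own budget of nr_slots memslots.
 */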
201
202 /* Called with KVMMemoryListener.slots_lock held */
203 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
204 {
205 KVMState *s = kvm_state;
206 int i;
207
208 for (i = 0; i < s->nr_slots; i++) {
209 if (kml->slots[i].memory_size == 0) {
210 return &kml->slots[i];
211 }
212 }
213
214 return NULL;
215 }
216
217 /* Called with KVMMemoryListener.slots_lock held */
218 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
219 {
220 KVMSlot *slot = kvm_get_free_slot(kml);
221
222 if (slot) {
223 return slot;
224 }
225
226 fprintf(stderr, "%s: no free slot available\n", __func__);
227 abort();
228 }
229
230 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
231 hwaddr start_addr,
232 hwaddr size)
233 {
234 KVMState *s = kvm_state;
235 int i;
236
237 for (i = 0; i < s->nr_slots; i++) {
238 KVMSlot *mem = &kml->slots[i];
239
240 if (start_addr == mem->start_addr && size == mem->memory_size) {
241 return mem;
242 }
243 }
244
245 return NULL;
246 }
247
248 /*
249 * Calculate and align the start address and the size of the section.
250 * Return the size. If the size is 0, the aligned section is empty.
251 */
252 static hwaddr kvm_align_section(MemoryRegionSection *section,
253 hwaddr *start)
254 {
255 hwaddr size = int128_get64(section->size);
256 hwaddr delta, aligned;
257
258 /* KVM works in page-size chunks, but the function may be called
259 with a sub-page size and an unaligned start address. Round the start up
260 to the next page boundary and truncate the size to the previous one. */
261 aligned = ROUND_UP(section->offset_within_address_space,
262 qemu_real_host_page_size());
263 delta = aligned - section->offset_within_address_space;
264 *start = aligned;
265 if (delta > size) {
266 return 0;
267 }
268
269 return (size - delta) & qemu_real_host_page_mask();
270 }
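/*
 * Worked example with assumed values (4 KiB host pages): a section at
 * offset_within_address_space 0x1234 of size 0x3000 yields
 *
 *   aligned = ROUND_UP(0x1234, 0x1000)     = 0x2000
 *   delta   = 0x2000 - 0x1234              = 0x0dcc
 *   result  = (0x3000 - 0x0dcc) & ~0xfff   = 0x2000
 *
 * so the slot-eligible range is [0x2000, 0x4000) and the unaligned head and
 * tail of the section are left out.
 */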
271
272 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
273 hwaddr *phys_addr)
274 {
275 KVMMemoryListener *kml = &s->memory_listener;
276 int i, ret = 0;
277
278 kvm_slots_lock();
279 for (i = 0; i < s->nr_slots; i++) {
280 KVMSlot *mem = &kml->slots[i];
281
282 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
283 *phys_addr = mem->start_addr + (ram - mem->ram);
284 ret = 1;
285 break;
286 }
287 }
288 kvm_slots_unlock();
289
290 return ret;
291 }
292
293 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
294 {
295 KVMState *s = kvm_state;
296 struct kvm_userspace_memory_region mem;
297 int ret;
298
299 mem.slot = slot->slot | (kml->as_id << 16);
300 mem.guest_phys_addr = slot->start_addr;
301 mem.userspace_addr = (unsigned long)slot->ram;
302 mem.flags = slot->flags;
303
304 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
305 /* Set the slot size to 0 before setting the slot to the desired
306 * value. This is needed based on KVM commit 75d61fbc. */
307 mem.memory_size = 0;
308 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
309 if (ret < 0) {
310 goto err;
311 }
312 }
313 mem.memory_size = slot->memory_size;
314 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
315 slot->old_flags = mem.flags;
316 err:
317 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
318 mem.memory_size, mem.userspace_addr, ret);
319 if (ret < 0) {
320 error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
321 " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
322 __func__, mem.slot, slot->start_addr,
323 (uint64_t)mem.memory_size, strerror(errno));
324 }
325 return ret;
326 }
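/*
 * Encoding example with assumed values: for as_id == 1 and slot->slot == 5,
 * mem.slot above becomes 5 | (1 << 16) == 0x10005, i.e. the upper 16 bits
 * select the KVM address space and the lower 16 bits the slot index within
 * it.
 */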
327
328 static int do_kvm_destroy_vcpu(CPUState *cpu)
329 {
330 KVMState *s = kvm_state;
331 long mmap_size;
332 struct KVMParkedVcpu *vcpu = NULL;
333 int ret = 0;
334
335 DPRINTF("kvm_destroy_vcpu\n");
336
337 ret = kvm_arch_destroy_vcpu(cpu);
338 if (ret < 0) {
339 goto err;
340 }
341
342 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
343 if (mmap_size < 0) {
344 ret = mmap_size;
345 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
346 goto err;
347 }
348
349 ret = munmap(cpu->kvm_run, mmap_size);
350 if (ret < 0) {
351 goto err;
352 }
353
354 if (cpu->kvm_dirty_gfns) {
355 ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
356 if (ret < 0) {
357 goto err;
358 }
359 }
360
361 vcpu = g_malloc0(sizeof(*vcpu));
362 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
363 vcpu->kvm_fd = cpu->kvm_fd;
364 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
365 err:
366 return ret;
367 }
368
369 void kvm_destroy_vcpu(CPUState *cpu)
370 {
371 if (do_kvm_destroy_vcpu(cpu) < 0) {
372 error_report("kvm_destroy_vcpu failed");
373 exit(EXIT_FAILURE);
374 }
375 }
376
377 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
378 {
379 struct KVMParkedVcpu *cpu;
380
381 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
382 if (cpu->vcpu_id == vcpu_id) {
383 int kvm_fd;
384
385 QLIST_REMOVE(cpu, node);
386 kvm_fd = cpu->kvm_fd;
387 g_free(cpu);
388 return kvm_fd;
389 }
390 }
391
392 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
393 }
394
395 int kvm_init_vcpu(CPUState *cpu, Error **errp)
396 {
397 KVMState *s = kvm_state;
398 long mmap_size;
399 int ret;
400
401 trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
402
403 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
404 if (ret < 0) {
405 error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
406 kvm_arch_vcpu_id(cpu));
407 goto err;
408 }
409
410 cpu->kvm_fd = ret;
411 cpu->kvm_state = s;
412 cpu->vcpu_dirty = true;
413 cpu->dirty_pages = 0;
414 cpu->throttle_us_per_full = 0;
415
416 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
417 if (mmap_size < 0) {
418 ret = mmap_size;
419 error_setg_errno(errp, -mmap_size,
420 "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
421 goto err;
422 }
423
424 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
425 cpu->kvm_fd, 0);
426 if (cpu->kvm_run == MAP_FAILED) {
427 ret = -errno;
428 error_setg_errno(errp, ret,
429 "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
430 kvm_arch_vcpu_id(cpu));
431 goto err;
432 }
433
434 if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
435 s->coalesced_mmio_ring =
436 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
437 }
438
439 if (s->kvm_dirty_ring_size) {
440 /* Use MAP_SHARED to share pages with the kernel */
441 cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
442 PROT_READ | PROT_WRITE, MAP_SHARED,
443 cpu->kvm_fd,
444 PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
445 if (cpu->kvm_dirty_gfns == MAP_FAILED) {
446 ret = -errno;
447 DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
448 goto err;
449 }
450 }
451
452 ret = kvm_arch_init_vcpu(cpu);
453 if (ret < 0) {
454 error_setg_errno(errp, -ret,
455 "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
456 kvm_arch_vcpu_id(cpu));
457 }
458 cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
459
460 err:
461 return ret;
462 }
463
464 /*
465 * dirty pages logging control
466 */
467
468 static int kvm_mem_flags(MemoryRegion *mr)
469 {
470 bool readonly = mr->readonly || memory_region_is_romd(mr);
471 int flags = 0;
472
473 if (memory_region_get_dirty_log_mask(mr) != 0) {
474 flags |= KVM_MEM_LOG_DIRTY_PAGES;
475 }
476 if (readonly && kvm_readonly_mem_allowed) {
477 flags |= KVM_MEM_READONLY;
478 }
479 return flags;
480 }
481
482 /* Called with KVMMemoryListener.slots_lock held */
483 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
484 MemoryRegion *mr)
485 {
486 mem->flags = kvm_mem_flags(mr);
487
488 /* If nothing changed effectively, no need to issue ioctl */
489 if (mem->flags == mem->old_flags) {
490 return 0;
491 }
492
493 kvm_slot_init_dirty_bitmap(mem);
494 return kvm_set_user_memory_region(kml, mem, false);
495 }
496
497 static int kvm_section_update_flags(KVMMemoryListener *kml,
498 MemoryRegionSection *section)
499 {
500 hwaddr start_addr, size, slot_size;
501 KVMSlot *mem;
502 int ret = 0;
503
504 size = kvm_align_section(section, &start_addr);
505 if (!size) {
506 return 0;
507 }
508
509 kvm_slots_lock();
510
511 while (size && !ret) {
512 slot_size = MIN(kvm_max_slot_size, size);
513 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
514 if (!mem) {
515 /* We don't have a slot if we want to trap every access. */
516 goto out;
517 }
518
519 ret = kvm_slot_update_flags(kml, mem, section->mr);
520 start_addr += slot_size;
521 size -= slot_size;
522 }
523
524 out:
525 kvm_slots_unlock();
526 return ret;
527 }
528
529 static void kvm_log_start(MemoryListener *listener,
530 MemoryRegionSection *section,
531 int old, int new)
532 {
533 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
534 int r;
535
536 if (old != 0) {
537 return;
538 }
539
540 r = kvm_section_update_flags(kml, section);
541 if (r < 0) {
542 abort();
543 }
544 }
545
546 static void kvm_log_stop(MemoryListener *listener,
547 MemoryRegionSection *section,
548 int old, int new)
549 {
550 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
551 int r;
552
553 if (new != 0) {
554 return;
555 }
556
557 r = kvm_section_update_flags(kml, section);
558 if (r < 0) {
559 abort();
560 }
561 }
562
563 /* get kvm's dirty pages bitmap and update qemu's */
564 static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
565 {
566 ram_addr_t start = slot->ram_start_offset;
567 ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
568
569 cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
570 }
571
572 static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
573 {
574 memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
575 }
576
577 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
578
579 /* Allocate the dirty bitmap for a slot */
580 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
581 {
582 if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
583 return;
584 }
585
586 /*
587 * XXX bad kernel interface alert
588 * For the dirty bitmap, the kernel allocates an array whose size is
589 * aligned to bits-per-long. But when the kernel is 64-bit and
590 * userspace is 32-bit, userspace cannot align to the same
591 * bits-per-long, since sizeof(long) differs between kernel and
592 * user space. Userspace would then provide a buffer that may be
593 * 4 bytes smaller than what the kernel uses, resulting in
594 * userspace memory corruption (which in most cases is not even
595 * detectable by valgrind).
596 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
597 * the hope that sizeof(long) won't become >8 any time soon.
598 *
599 * Note: the granule of kvm dirty log is qemu_real_host_page_size.
600 * And mem->memory_size is aligned to it (otherwise this mem can't
601 * be registered to KVM).
602 */
603 hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
604 /*HOST_LONG_BITS*/ 64) / 8;
605 mem->dirty_bmap = g_malloc0(bitmap_size);
606 mem->dirty_bmap_size = bitmap_size;
607 }
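/*
 * Sizing example with assumed values: a 1 GiB slot with 4 KiB host pages
 * covers 262144 pages, so
 *
 *   bitmap_size = ALIGN(262144, 64) / 8 = 32768 bytes (32 KiB),
 *
 * i.e. one bit per page, padded to a multiple of 64 bits to match the
 * kernel-side allocation described in the comment above.
 */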
608
609 /*
610 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
611 * succeeded, false otherwise
612 */
613 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
614 {
615 struct kvm_dirty_log d = {};
616 int ret;
617
618 d.dirty_bitmap = slot->dirty_bmap;
619 d.slot = slot->slot | (slot->as_id << 16);
620 ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
621
622 if (ret == -ENOENT) {
623 /* kernel does not have dirty bitmap in this slot */
624 ret = 0;
625 }
626 if (ret) {
627 error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
628 __func__, ret);
629 }
630 return ret == 0;
631 }
632
633 /* Should be with all slots_lock held for the address spaces. */
634 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
635 uint32_t slot_id, uint64_t offset)
636 {
637 KVMMemoryListener *kml;
638 KVMSlot *mem;
639
640 if (as_id >= s->nr_as) {
641 return;
642 }
643
644 kml = s->as[as_id].ml;
645 mem = &kml->slots[slot_id];
646
647 if (!mem->memory_size || offset >=
648 (mem->memory_size / qemu_real_host_page_size())) {
649 return;
650 }
651
652 set_bit(offset, mem->dirty_bmap);
653 }
654
655 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
656 {
657 /*
658 * Read the flags before the value. Pairs with barrier in
659 * KVM's kvm_dirty_ring_push() function.
660 */
661 return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
662 }
663
664 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
665 {
666 /*
667 * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
668 * sees the full content of the ring:
669 *
670 * CPU0 CPU1 CPU2
671 * ------------------------------------------------------------------------------
672 * fill gfn0
673 * store-rel flags for gfn0
674 * load-acq flags for gfn0
675 * store-rel RESET for gfn0
676 * ioctl(RESET_RINGS)
677 * load-acq flags for gfn0
678 * check if flags have RESET
679 *
680 * The synchronization goes from CPU2 to CPU0 to CPU1.
681 */
682 qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
683 }
684
685 /*
686 * Should be with all slots_lock held for the address spaces. It returns the
687 * dirty page we've collected on this dirty ring.
688 */
689 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
690 {
691 struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
692 uint32_t ring_size = s->kvm_dirty_ring_size;
693 uint32_t count = 0, fetch = cpu->kvm_fetch_index;
694
695 /*
696 * It's possible that we race with the vcpu creation code, where the vcpu
697 * has been put onto the vcpus list but has not yet initialized its dirty
698 * ring structures. If so, skip it.
699 */
700 if (!cpu->created) {
701 return 0;
702 }
703
704 assert(dirty_gfns && ring_size);
705 trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
706
707 while (true) {
708 cur = &dirty_gfns[fetch % ring_size];
709 if (!dirty_gfn_is_dirtied(cur)) {
710 break;
711 }
712 kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
713 cur->offset);
714 dirty_gfn_set_collected(cur);
715 trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
716 fetch++;
717 count++;
718 }
719 cpu->kvm_fetch_index = fetch;
720 cpu->dirty_pages += count;
721
722 return count;
723 }
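/*
 * Indexing example with assumed values: with kvm_dirty_ring_size == 4096
 * and cpu->kvm_fetch_index == 4100, the loop above starts at entry
 * 4100 % 4096 == 4.  The fetch index grows monotonically and is only
 * reduced modulo the ring size when addressing entries, which is why it is
 * written back unreduced to cpu->kvm_fetch_index.
 */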
724
725 /* Must be with slots_lock held */
726 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
727 {
728 int ret;
729 uint64_t total = 0;
730 int64_t stamp;
731
732 stamp = get_clock();
733
734 if (cpu) {
735 total = kvm_dirty_ring_reap_one(s, cpu);
736 } else {
737 CPU_FOREACH(cpu) {
738 total += kvm_dirty_ring_reap_one(s, cpu);
739 }
740 }
741
742 if (total) {
743 ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
744 assert(ret == total);
745 }
746
747 stamp = get_clock() - stamp;
748
749 if (total) {
750 trace_kvm_dirty_ring_reap(total, stamp / 1000);
751 }
752
753 return total;
754 }
755
756 /*
757 * Currently, for simplicity, we must hold the BQL before calling this. We
758 * can consider dropping the BQL once we are confident about all the race conditions.
759 */
760 static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
761 {
762 uint64_t total;
763
764 /*
765 * We need to lock all kvm slots for all address spaces here,
766 * because:
767 *
768 * (1) We need to mark dirty for dirty bitmaps in multiple slots
769 * and for tons of pages, so it's better to take the lock here
770 * once rather than once per page. And more importantly,
771 *
772 * (2) We must _NOT_ publish dirty bits to the other threads
773 * (e.g., the migration thread) via the kvm memory slot dirty
774 * bitmaps before correctly re-protecting those dirtied pages.
775 * Otherwise we run the risk of data corruption if the page
776 * data is read in the other thread before we do the
777 * reset below.
778 */
779 kvm_slots_lock();
780 total = kvm_dirty_ring_reap_locked(s, cpu);
781 kvm_slots_unlock();
782
783 return total;
784 }
785
786 static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
787 {
788 /* No need to do anything */
789 }
790
791 /*
792 * Kick all vcpus out in a synchronized way. When returned, we
793 * guarantee that every vcpu has been kicked and at least returned to
794 * userspace once.
795 */
796 static void kvm_cpu_synchronize_kick_all(void)
797 {
798 CPUState *cpu;
799
800 CPU_FOREACH(cpu) {
801 run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
802 }
803 }
804
805 /*
806 * Flush all the existing dirty pages to the KVM slot buffers. When
807 * this call returns, we guarantee that all the touched dirty pages
808 * before calling this function have been put into the per-kvmslot
809 * dirty bitmap.
810 *
811 * This function must be called with BQL held.
812 */
813 static void kvm_dirty_ring_flush(void)
814 {
815 trace_kvm_dirty_ring_flush(0);
816 /*
817 * The function needs to be serialized. Since this function
818 * should always be called with the BQL held, serialization is
819 * guaranteed. However, let's be sure of it.
820 */
821 assert(qemu_mutex_iothread_locked());
822 /*
823 * First make sure to flush the hardware buffers by kicking all
824 * vcpus out in a synchronous way.
825 */
826 kvm_cpu_synchronize_kick_all();
827 kvm_dirty_ring_reap(kvm_state, NULL);
828 trace_kvm_dirty_ring_flush(1);
829 }
830
831 /**
832 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
833 *
834 * This function first tries to fetch the dirty bitmap from the kernel,
835 * and then updates QEMU's dirty bitmap.
836 *
837 * NOTE: caller must be with kml->slots_lock held.
838 *
839 * @kml: the KVM memory listener object
840 * @section: the memory section to sync the dirty bitmap with
841 */
842 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
843 MemoryRegionSection *section)
844 {
845 KVMState *s = kvm_state;
846 KVMSlot *mem;
847 hwaddr start_addr, size;
848 hwaddr slot_size;
849
850 size = kvm_align_section(section, &start_addr);
851 while (size) {
852 slot_size = MIN(kvm_max_slot_size, size);
853 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
854 if (!mem) {
855 /* We don't have a slot if we want to trap every access. */
856 return;
857 }
858 if (kvm_slot_get_dirty_log(s, mem)) {
859 kvm_slot_sync_dirty_pages(mem);
860 }
861 start_addr += slot_size;
862 size -= slot_size;
863 }
864 }
865
866 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
867 #define KVM_CLEAR_LOG_SHIFT 6
868 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
869 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN)
870
871 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
872 uint64_t size)
873 {
874 KVMState *s = kvm_state;
875 uint64_t end, bmap_start, start_delta, bmap_npages;
876 struct kvm_clear_dirty_log d;
877 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
878 int ret;
879
880 /*
881 * We need to extend the start, the size, or both to satisfy the
882 * KVM interface requirement. First, align the start down to a
883 * 64-host-page boundary.
884 */
885 bmap_start = start & KVM_CLEAR_LOG_MASK;
886 start_delta = start - bmap_start;
887 bmap_start /= psize;
888
889 /*
890 * The kernel interface has restriction on the size too, that either:
891 *
892 * (1) the size is 64 host pages aligned (just like the start), or
893 * (2) the size fills up until the end of the KVM memslot.
894 */
895 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
896 << KVM_CLEAR_LOG_SHIFT;
897 end = mem->memory_size / psize;
898 if (bmap_npages > end - bmap_start) {
899 bmap_npages = end - bmap_start;
900 }
901 start_delta /= psize;
902
903 /*
904 * Prepare the bitmap to clear dirty bits. Here we must guarantee
905 * that we won't clear any unknown dirty bits otherwise we might
906 * accidentally clear some set bits which are not yet synced from
907 * the kernel into QEMU's bitmap, then we'll lose track of the
908 * guest modifications upon those pages (which can directly lead
909 * to guest data loss or panic after migration).
910 *
911 * Layout of the KVMSlot.dirty_bmap:
912 *
913 * |<-------- bmap_npages -----------..>|
914 * [1]
915 * start_delta size
916 * |----------------|-------------|------------------|------------|
917 * ^ ^ ^ ^
918 * | | | |
919 * start bmap_start (start) end
920 * of memslot of memslot
921 *
922 * [1] bmap_npages can be aligned to either 64 pages or the end of slot
923 */
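/*
 * Worked example with assumed values (4 KiB host pages, so
 * KVM_CLEAR_LOG_ALIGN == 256 KiB): for start == 0x50000 and size == 0x30000
 * inside a sufficiently large slot,
 *
 *   bmap_start  = 0x50000 & ~0x3ffff                   = 0x40000 -> page 64
 *   start_delta = 0x50000 - 0x40000                    = 0x10000 -> 16 pages
 *   bmap_npages = DIV_ROUND_UP(0x40000, 0x40000) << 6  = 64 pages
 *
 * start_delta != 0, so the slow path below builds a 64-bit temporary bitmap,
 * copies 64 bits of dirty_bmap starting at page 64, and clears the first
 * 16 bits that the caller did not ask to clear.
 */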
924
925 assert(bmap_start % BITS_PER_LONG == 0);
926 /* We should never do log_clear before log_sync */
927 assert(mem->dirty_bmap);
928 if (start_delta || bmap_npages - size / psize) {
929 /* Slow path - we need to manipulate a temp bitmap */
930 bmap_clear = bitmap_new(bmap_npages);
931 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
932 bmap_start, start_delta + size / psize);
933 /*
934 * We need to clear the hole at the start because it was not
935 * specified by the caller; we only extended the bitmap for
936 * 64-page alignment.
937 */
938 bitmap_clear(bmap_clear, 0, start_delta);
939 d.dirty_bitmap = bmap_clear;
940 } else {
941 /*
942 * Fast path - both start and size align well with BITS_PER_LONG
943 * (or the end of memory slot)
944 */
945 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
946 }
947
948 d.first_page = bmap_start;
949 /* It should never overflow. If it happens, say something */
950 assert(bmap_npages <= UINT32_MAX);
951 d.num_pages = bmap_npages;
952 d.slot = mem->slot | (as_id << 16);
953
954 ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
955 if (ret < 0 && ret != -ENOENT) {
956 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
957 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
958 __func__, d.slot, (uint64_t)d.first_page,
959 (uint32_t)d.num_pages, ret);
960 } else {
961 ret = 0;
962 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
963 }
964
965 /*
966 * After we have updated the remote dirty bitmap, we update the
967 * cached bitmap as well for the memslot, then if another user
968 * clears the same region we know we shouldn't clear it again on
969 * the remote otherwise it's data loss as well.
970 */
971 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
972 size / psize);
973 /* This handles the NULL case well */
974 g_free(bmap_clear);
975 return ret;
976 }
977
978
979 /**
980 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
981 *
982 * NOTE: this will be a no-op if we haven't enabled manual dirty log
983 * protection in the host kernel because in that case this operation
984 * will be done within log_sync().
985 *
986 * @kml: the kvm memory listener
987 * @section: the memory range to clear dirty bitmap
988 */
989 static int kvm_physical_log_clear(KVMMemoryListener *kml,
990 MemoryRegionSection *section)
991 {
992 KVMState *s = kvm_state;
993 uint64_t start, size, offset, count;
994 KVMSlot *mem;
995 int ret = 0, i;
996
997 if (!s->manual_dirty_log_protect) {
998 /* No need to do explicit clear */
999 return ret;
1000 }
1001
1002 start = section->offset_within_address_space;
1003 size = int128_get64(section->size);
1004
1005 if (!size) {
1006 /* Nothing more we can do... */
1007 return ret;
1008 }
1009
1010 kvm_slots_lock();
1011
1012 for (i = 0; i < s->nr_slots; i++) {
1013 mem = &kml->slots[i];
1014 /* Discard slots that are empty or do not overlap the section */
1015 if (!mem->memory_size ||
1016 mem->start_addr > start + size - 1 ||
1017 start > mem->start_addr + mem->memory_size - 1) {
1018 continue;
1019 }
1020
1021 if (start >= mem->start_addr) {
1022 /* The slot starts before section or is aligned to it. */
1023 offset = start - mem->start_addr;
1024 count = MIN(mem->memory_size - offset, size);
1025 } else {
1026 /* The slot starts after section. */
1027 offset = 0;
1028 count = MIN(mem->memory_size, size - (mem->start_addr - start));
1029 }
1030 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1031 if (ret < 0) {
1032 break;
1033 }
1034 }
1035
1036 kvm_slots_unlock();
1037
1038 return ret;
1039 }
1040
1041 static void kvm_coalesce_mmio_region(MemoryListener *listener,
1042 MemoryRegionSection *secion,
1043 hwaddr start, hwaddr size)
1044 {
1045 KVMState *s = kvm_state;
1046
1047 if (s->coalesced_mmio) {
1048 struct kvm_coalesced_mmio_zone zone;
1049
1050 zone.addr = start;
1051 zone.size = size;
1052 zone.pad = 0;
1053
1054 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1055 }
1056 }
1057
1058 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1059 MemoryRegionSection *secion,
1060 hwaddr start, hwaddr size)
1061 {
1062 KVMState *s = kvm_state;
1063
1064 if (s->coalesced_mmio) {
1065 struct kvm_coalesced_mmio_zone zone;
1066
1067 zone.addr = start;
1068 zone.size = size;
1069 zone.pad = 0;
1070
1071 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1072 }
1073 }
1074
1075 static void kvm_coalesce_pio_add(MemoryListener *listener,
1076 MemoryRegionSection *section,
1077 hwaddr start, hwaddr size)
1078 {
1079 KVMState *s = kvm_state;
1080
1081 if (s->coalesced_pio) {
1082 struct kvm_coalesced_mmio_zone zone;
1083
1084 zone.addr = start;
1085 zone.size = size;
1086 zone.pio = 1;
1087
1088 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1089 }
1090 }
1091
1092 static void kvm_coalesce_pio_del(MemoryListener *listener,
1093 MemoryRegionSection *section,
1094 hwaddr start, hwaddr size)
1095 {
1096 KVMState *s = kvm_state;
1097
1098 if (s->coalesced_pio) {
1099 struct kvm_coalesced_mmio_zone zone;
1100
1101 zone.addr = start;
1102 zone.size = size;
1103 zone.pio = 1;
1104
1105 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1106 }
1107 }
1108
1109 static MemoryListener kvm_coalesced_pio_listener = {
1110 .name = "kvm-coalesced-pio",
1111 .coalesced_io_add = kvm_coalesce_pio_add,
1112 .coalesced_io_del = kvm_coalesce_pio_del,
1113 .priority = MEMORY_LISTENER_PRIORITY_MIN,
1114 };
1115
1116 int kvm_check_extension(KVMState *s, unsigned int extension)
1117 {
1118 int ret;
1119
1120 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1121 if (ret < 0) {
1122 ret = 0;
1123 }
1124
1125 return ret;
1126 }
1127
1128 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1129 {
1130 int ret;
1131
1132 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1133 if (ret < 0) {
1134 /* VM wide version not implemented, use global one instead */
1135 ret = kvm_check_extension(s, extension);
1136 }
1137
1138 return ret;
1139 }
1140
1141 typedef struct HWPoisonPage {
1142 ram_addr_t ram_addr;
1143 QLIST_ENTRY(HWPoisonPage) list;
1144 } HWPoisonPage;
1145
1146 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1147 QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1148
1149 static void kvm_unpoison_all(void *param)
1150 {
1151 HWPoisonPage *page, *next_page;
1152
1153 QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1154 QLIST_REMOVE(page, list);
1155 qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
1156 g_free(page);
1157 }
1158 }
1159
1160 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1161 {
1162 HWPoisonPage *page;
1163
1164 QLIST_FOREACH(page, &hwpoison_page_list, list) {
1165 if (page->ram_addr == ram_addr) {
1166 return;
1167 }
1168 }
1169 page = g_new(HWPoisonPage, 1);
1170 page->ram_addr = ram_addr;
1171 QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1172 }
1173
1174 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1175 {
1176 #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
1177 /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
1178 * endianness, but the memory core hands them in target endianness.
1179 * For example, PPC is always treated as big-endian even when running
1180 * under KVM on a ppc64le host. Correct for that here.
1181 */
1182 switch (size) {
1183 case 2:
1184 val = bswap16(val);
1185 break;
1186 case 4:
1187 val = bswap32(val);
1188 break;
1189 }
1190 #endif
1191 return val;
1192 }
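/*
 * Example with assumed values: on a little-endian host running a big-endian
 * target, a 2-byte datamatch value of 0x1234 handed in by the memory core
 * is byte-swapped to 0x3412 here, so that the kernel sees it in the
 * endianness it expects (see the comment above).
 */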
1193
1194 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1195 bool assign, uint32_t size, bool datamatch)
1196 {
1197 int ret;
1198 struct kvm_ioeventfd iofd = {
1199 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1200 .addr = addr,
1201 .len = size,
1202 .flags = 0,
1203 .fd = fd,
1204 };
1205
1206 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1207 datamatch);
1208 if (!kvm_enabled()) {
1209 return -ENOSYS;
1210 }
1211
1212 if (datamatch) {
1213 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1214 }
1215 if (!assign) {
1216 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1217 }
1218
1219 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1220
1221 if (ret < 0) {
1222 return -errno;
1223 }
1224
1225 return 0;
1226 }
1227
1228 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1229 bool assign, uint32_t size, bool datamatch)
1230 {
1231 struct kvm_ioeventfd kick = {
1232 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1233 .addr = addr,
1234 .flags = KVM_IOEVENTFD_FLAG_PIO,
1235 .len = size,
1236 .fd = fd,
1237 };
1238 int r;
1239 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1240 if (!kvm_enabled()) {
1241 return -ENOSYS;
1242 }
1243 if (datamatch) {
1244 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1245 }
1246 if (!assign) {
1247 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1248 }
1249 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1250 if (r < 0) {
1251 return r;
1252 }
1253 return 0;
1254 }
1255
1256
1257 static int kvm_check_many_ioeventfds(void)
1258 {
1259 /* Userspace can use ioeventfd for io notification. This requires a host
1260 * that supports eventfd(2) and an I/O thread; since eventfd does not
1261 * support SIGIO it cannot interrupt the vcpu.
1262 *
1263 * Older kernels have a 6 device limit on the KVM io bus. Find out so we
1264 * can avoid creating too many ioeventfds.
1265 */
1266 #if defined(CONFIG_EVENTFD)
1267 int ioeventfds[7];
1268 int i, ret = 0;
1269 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
1270 ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
1271 if (ioeventfds[i] < 0) {
1272 break;
1273 }
1274 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
1275 if (ret < 0) {
1276 close(ioeventfds[i]);
1277 break;
1278 }
1279 }
1280
1281 /* Decide whether many devices are supported or not */
1282 ret = i == ARRAY_SIZE(ioeventfds);
1283
1284 while (i-- > 0) {
1285 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
1286 close(ioeventfds[i]);
1287 }
1288 return ret;
1289 #else
1290 return 0;
1291 #endif
1292 }
1293
1294 static const KVMCapabilityInfo *
1295 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1296 {
1297 while (list->name) {
1298 if (!kvm_check_extension(s, list->value)) {
1299 return list;
1300 }
1301 list++;
1302 }
1303 return NULL;
1304 }
1305
1306 void kvm_set_max_memslot_size(hwaddr max_slot_size)
1307 {
1308 g_assert(
1309 ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1310 );
1311 kvm_max_slot_size = max_slot_size;
1312 }
1313
1314 /* Called with KVMMemoryListener.slots_lock held */
1315 static void kvm_set_phys_mem(KVMMemoryListener *kml,
1316 MemoryRegionSection *section, bool add)
1317 {
1318 KVMSlot *mem;
1319 int err;
1320 MemoryRegion *mr = section->mr;
1321 bool writable = !mr->readonly && !mr->rom_device;
1322 hwaddr start_addr, size, slot_size, mr_offset;
1323 ram_addr_t ram_start_offset;
1324 void *ram;
1325
1326 if (!memory_region_is_ram(mr)) {
1327 if (writable || !kvm_readonly_mem_allowed) {
1328 return;
1329 } else if (!mr->romd_mode) {
1330 /* If the memory device is not in romd_mode, then we actually want
1331 * to remove the kvm memory slot so all accesses will trap. */
1332 add = false;
1333 }
1334 }
1335
1336 size = kvm_align_section(section, &start_addr);
1337 if (!size) {
1338 return;
1339 }
1340
1341 /* The offset of the kvmslot within the memory region */
1342 mr_offset = section->offset_within_region + start_addr -
1343 section->offset_within_address_space;
1344
1345 /* use aligned delta to align the ram address and offset */
1346 ram = memory_region_get_ram_ptr(mr) + mr_offset;
1347 ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1348
1349 if (!add) {
1350 do {
1351 slot_size = MIN(kvm_max_slot_size, size);
1352 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1353 if (!mem) {
1354 return;
1355 }
1356 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1357 /*
1358 * NOTE: We should be aware of the fact that here we're only
1359 * doing a best-effort sync of dirty bits. No matter whether
1360 * we're using the dirty log or the dirty ring, we ignore two facts:
1361 *
1362 * (1) dirty bits can reside in hardware buffers (PML)
1363 *
1364 * (2) after we collected dirty bits here, pages can be dirtied
1365 * again before we do the final KVM_SET_USER_MEMORY_REGION to
1366 * remove the slot.
1367 *
1368 * Not easy. Let's cross our fingers until it's fixed.
1369 */
1370 if (kvm_state->kvm_dirty_ring_size) {
1371 kvm_dirty_ring_reap_locked(kvm_state, NULL);
1372 if (kvm_state->kvm_dirty_ring_with_bitmap) {
1373 kvm_slot_sync_dirty_pages(mem);
1374 kvm_slot_get_dirty_log(kvm_state, mem);
1375 }
1376 } else {
1377 kvm_slot_get_dirty_log(kvm_state, mem);
1378 }
1379 kvm_slot_sync_dirty_pages(mem);
1380 }
1381
1382 /* unregister the slot */
1383 g_free(mem->dirty_bmap);
1384 mem->dirty_bmap = NULL;
1385 mem->memory_size = 0;
1386 mem->flags = 0;
1387 err = kvm_set_user_memory_region(kml, mem, false);
1388 if (err) {
1389 fprintf(stderr, "%s: error unregistering slot: %s\n",
1390 __func__, strerror(-err));
1391 abort();
1392 }
1393 start_addr += slot_size;
1394 size -= slot_size;
1395 kml->nr_used_slots--;
1396 } while (size);
1397 return;
1398 }
1399
1400 /* register the new slot */
1401 do {
1402 slot_size = MIN(kvm_max_slot_size, size);
1403 mem = kvm_alloc_slot(kml);
1404 mem->as_id = kml->as_id;
1405 mem->memory_size = slot_size;
1406 mem->start_addr = start_addr;
1407 mem->ram_start_offset = ram_start_offset;
1408 mem->ram = ram;
1409 mem->flags = kvm_mem_flags(mr);
1410 kvm_slot_init_dirty_bitmap(mem);
1411 err = kvm_set_user_memory_region(kml, mem, true);
1412 if (err) {
1413 fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1414 strerror(-err));
1415 abort();
1416 }
1417 start_addr += slot_size;
1418 ram_start_offset += slot_size;
1419 ram += slot_size;
1420 size -= slot_size;
1421 kml->nr_used_slots++;
1422 } while (size);
1423 }
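/*
 * Splitting example with an assumed kvm_max_slot_size of 4 GiB (the default
 * is effectively unlimited): registering a 10 GiB RAM section produces
 * three consecutive memslots of 4 GiB, 4 GiB and 2 GiB, with start_addr,
 * ram_start_offset and the host ram pointer each advanced by the previous
 * slot_size.
 */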
1424
1425 static void *kvm_dirty_ring_reaper_thread(void *data)
1426 {
1427 KVMState *s = data;
1428 struct KVMDirtyRingReaper *r = &s->reaper;
1429
1430 rcu_register_thread();
1431
1432 trace_kvm_dirty_ring_reaper("init");
1433
1434 while (true) {
1435 r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1436 trace_kvm_dirty_ring_reaper("wait");
1437 /*
1438 * TODO: provide a smarter timeout rather than a constant?
1439 */
1440 sleep(1);
1441
1442 /* keep sleeping so that the dirty limit is not disturbed by the reaper */
1443 if (dirtylimit_in_service()) {
1444 continue;
1445 }
1446
1447 trace_kvm_dirty_ring_reaper("wakeup");
1448 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1449
1450 qemu_mutex_lock_iothread();
1451 kvm_dirty_ring_reap(s, NULL);
1452 qemu_mutex_unlock_iothread();
1453
1454 r->reaper_iteration++;
1455 }
1456
1457 trace_kvm_dirty_ring_reaper("exit");
1458
1459 rcu_unregister_thread();
1460
1461 return NULL;
1462 }
1463
1464 static void kvm_dirty_ring_reaper_init(KVMState *s)
1465 {
1466 struct KVMDirtyRingReaper *r = &s->reaper;
1467
1468 qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1469 kvm_dirty_ring_reaper_thread,
1470 s, QEMU_THREAD_JOINABLE);
1471 }
1472
1473 static int kvm_dirty_ring_init(KVMState *s)
1474 {
1475 uint32_t ring_size = s->kvm_dirty_ring_size;
1476 uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
1477 unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
1478 int ret;
1479
1480 s->kvm_dirty_ring_size = 0;
1481 s->kvm_dirty_ring_bytes = 0;
1482
1483 /* Bail if the dirty ring size isn't specified */
1484 if (!ring_size) {
1485 return 0;
1486 }
1487
1488 /*
1489 * Read the max supported pages. Fall back to dirty logging mode
1490 * if the dirty ring isn't supported.
1491 */
1492 ret = kvm_vm_check_extension(s, capability);
1493 if (ret <= 0) {
1494 capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
1495 ret = kvm_vm_check_extension(s, capability);
1496 }
1497
1498 if (ret <= 0) {
1499 warn_report("KVM dirty ring not available, using bitmap method");
1500 return 0;
1501 }
1502
1503 if (ring_bytes > ret) {
1504 error_report("KVM dirty ring size %" PRIu32 " too big "
1505 "(maximum is %ld). Please use a smaller value.",
1506 ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
1507 return -EINVAL;
1508 }
1509
1510 ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
1511 if (ret) {
1512 error_report("Enabling of KVM dirty ring failed: %s. "
1513 "Suggested minimum value is 1024.", strerror(-ret));
1514 return -EIO;
1515 }
1516
1517 /* Enable the backup bitmap if it is supported */
1518 ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
1519 if (ret > 0) {
1520 ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
1521 if (ret) {
1522 error_report("Enabling of KVM dirty ring's backup bitmap failed: "
1523 "%s. ", strerror(-ret));
1524 return -EIO;
1525 }
1526
1527 s->kvm_dirty_ring_with_bitmap = true;
1528 }
1529
1530 s->kvm_dirty_ring_size = ring_size;
1531 s->kvm_dirty_ring_bytes = ring_bytes;
1532
1533 return 0;
1534 }
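/*
 * Sizing example with assumed values: a ring of 4096 entries with
 * sizeof(struct kvm_dirty_gfn) == 16 gives ring_bytes == 64 KiB per vcpu,
 * which must not exceed the maximum byte size reported by the capability
 * check above for the enablement to succeed.
 */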
1535
1536 static void kvm_region_add(MemoryListener *listener,
1537 MemoryRegionSection *section)
1538 {
1539 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1540 KVMMemoryUpdate *update;
1541
1542 update = g_new0(KVMMemoryUpdate, 1);
1543 update->section = *section;
1544
1545 QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
1546 }
1547
1548 static void kvm_region_del(MemoryListener *listener,
1549 MemoryRegionSection *section)
1550 {
1551 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1552 KVMMemoryUpdate *update;
1553
1554 update = g_new0(KVMMemoryUpdate, 1);
1555 update->section = *section;
1556
1557 QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
1558 }
1559
1560 static void kvm_region_commit(MemoryListener *listener)
1561 {
1562 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1563 listener);
1564 KVMMemoryUpdate *u1, *u2;
1565 bool need_inhibit = false;
1566
1567 if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1568 QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1569 return;
1570 }
1571
1572 /*
1573 * We have to be careful when regions to add overlap with ranges to remove.
1574 * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1575 * is currently active.
1576 *
1577 * The lists are ordered by address, so it's easy to find overlaps.
1578 */
1579 u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1580 u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1581 while (u1 && u2) {
1582 Range r1, r2;
1583
1584 range_init_nofail(&r1, u1->section.offset_within_address_space,
1585 int128_get64(u1->section.size));
1586 range_init_nofail(&r2, u2->section.offset_within_address_space,
1587 int128_get64(u2->section.size));
1588
1589 if (range_overlaps_range(&r1, &r2)) {
1590 need_inhibit = true;
1591 break;
1592 }
1593 if (range_lob(&r1) < range_lob(&r2)) {
1594 u1 = QSIMPLEQ_NEXT(u1, next);
1595 } else {
1596 u2 = QSIMPLEQ_NEXT(u2, next);
1597 }
1598 }
1599
1600 kvm_slots_lock();
1601 if (need_inhibit) {
1602 accel_ioctl_inhibit_begin();
1603 }
1604
1605 /* Remove all memslots before adding the new ones. */
1606 while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1607 u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1608 QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1609
1610 kvm_set_phys_mem(kml, &u1->section, false);
1611 memory_region_unref(u1->section.mr);
1612
1613 g_free(u1);
1614 }
1615 while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1616 u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1617 QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1618
1619 memory_region_ref(u1->section.mr);
1620 kvm_set_phys_mem(kml, &u1->section, true);
1621
1622 g_free(u1);
1623 }
1624
1625 if (need_inhibit) {
1626 accel_ioctl_inhibit_end();
1627 }
1628 kvm_slots_unlock();
1629 }
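/*
 * Example with assumed addresses: resizing a RAM region in place queues a
 * delete for [0x100000000, +2 GiB) and an add for [0x100000000, +4 GiB).
 * The two ranges overlap, so need_inhibit is set and vcpu ioctls are kept
 * out of the window in which the old slot is already gone but the new one
 * is not yet installed, approximating an atomic memslot update.
 */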
1630
1631 static void kvm_log_sync(MemoryListener *listener,
1632 MemoryRegionSection *section)
1633 {
1634 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1635
1636 kvm_slots_lock();
1637 kvm_physical_sync_dirty_bitmap(kml, section);
1638 kvm_slots_unlock();
1639 }
1640
1641 static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
1642 {
1643 KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1644 KVMState *s = kvm_state;
1645 KVMSlot *mem;
1646 int i;
1647
1648 /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1649 kvm_dirty_ring_flush();
1650
1651 /*
1652 * TODO: make this faster when nr_slots is big while there are
1653 * only a few used slots (small VMs).
1654 */
1655 kvm_slots_lock();
1656 for (i = 0; i < s->nr_slots; i++) {
1657 mem = &kml->slots[i];
1658 if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1659 kvm_slot_sync_dirty_pages(mem);
1660
1661 if (s->kvm_dirty_ring_with_bitmap && last_stage &&
1662 kvm_slot_get_dirty_log(s, mem)) {
1663 kvm_slot_sync_dirty_pages(mem);
1664 }
1665
1666 /*
1667 * This is not needed by KVM_GET_DIRTY_LOG because the
1668 * ioctl will unconditionally overwrite the whole region.
1669 * However kvm dirty ring has no such side effect.
1670 */
1671 kvm_slot_reset_dirty_pages(mem);
1672 }
1673 }
1674 kvm_slots_unlock();
1675 }
1676
1677 static void kvm_log_clear(MemoryListener *listener,
1678 MemoryRegionSection *section)
1679 {
1680 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1681 int r;
1682
1683 r = kvm_physical_log_clear(kml, section);
1684 if (r < 0) {
1685 error_report_once("%s: kvm log clear failed: mr=%s "
1686 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1687 section->mr->name, section->offset_within_region,
1688 int128_get64(section->size));
1689 abort();
1690 }
1691 }
1692
1693 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1694 MemoryRegionSection *section,
1695 bool match_data, uint64_t data,
1696 EventNotifier *e)
1697 {
1698 int fd = event_notifier_get_fd(e);
1699 int r;
1700
1701 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1702 data, true, int128_get64(section->size),
1703 match_data);
1704 if (r < 0) {
1705 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1706 __func__, strerror(-r), -r);
1707 abort();
1708 }
1709 }
1710
1711 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1712 MemoryRegionSection *section,
1713 bool match_data, uint64_t data,
1714 EventNotifier *e)
1715 {
1716 int fd = event_notifier_get_fd(e);
1717 int r;
1718
1719 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1720 data, false, int128_get64(section->size),
1721 match_data);
1722 if (r < 0) {
1723 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1724 __func__, strerror(-r), -r);
1725 abort();
1726 }
1727 }
1728
1729 static void kvm_io_ioeventfd_add(MemoryListener *listener,
1730 MemoryRegionSection *section,
1731 bool match_data, uint64_t data,
1732 EventNotifier *e)
1733 {
1734 int fd = event_notifier_get_fd(e);
1735 int r;
1736
1737 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1738 data, true, int128_get64(section->size),
1739 match_data);
1740 if (r < 0) {
1741 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1742 __func__, strerror(-r), -r);
1743 abort();
1744 }
1745 }
1746
1747 static void kvm_io_ioeventfd_del(MemoryListener *listener,
1748 MemoryRegionSection *section,
1749 bool match_data, uint64_t data,
1750 EventNotifier *e)
1751
1752 {
1753 int fd = event_notifier_get_fd(e);
1754 int r;
1755
1756 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1757 data, false, int128_get64(section->size),
1758 match_data);
1759 if (r < 0) {
1760 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1761 __func__, strerror(-r), -r);
1762 abort();
1763 }
1764 }
1765
1766 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1767 AddressSpace *as, int as_id, const char *name)
1768 {
1769 int i;
1770
1771 kml->slots = g_new0(KVMSlot, s->nr_slots);
1772 kml->as_id = as_id;
1773
1774 for (i = 0; i < s->nr_slots; i++) {
1775 kml->slots[i].slot = i;
1776 }
1777
1778 QSIMPLEQ_INIT(&kml->transaction_add);
1779 QSIMPLEQ_INIT(&kml->transaction_del);
1780
1781 kml->listener.region_add = kvm_region_add;
1782 kml->listener.region_del = kvm_region_del;
1783 kml->listener.commit = kvm_region_commit;
1784 kml->listener.log_start = kvm_log_start;
1785 kml->listener.log_stop = kvm_log_stop;
1786 kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
1787 kml->listener.name = name;
1788
1789 if (s->kvm_dirty_ring_size) {
1790 kml->listener.log_sync_global = kvm_log_sync_global;
1791 } else {
1792 kml->listener.log_sync = kvm_log_sync;
1793 kml->listener.log_clear = kvm_log_clear;
1794 }
1795
1796 memory_listener_register(&kml->listener, as);
1797
1798 for (i = 0; i < s->nr_as; ++i) {
1799 if (!s->as[i].as) {
1800 s->as[i].as = as;
1801 s->as[i].ml = kml;
1802 break;
1803 }
1804 }
1805 }
1806
1807 static MemoryListener kvm_io_listener = {
1808 .name = "kvm-io",
1809 .eventfd_add = kvm_io_ioeventfd_add,
1810 .eventfd_del = kvm_io_ioeventfd_del,
1811 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
1812 };
1813
1814 int kvm_set_irq(KVMState *s, int irq, int level)
1815 {
1816 struct kvm_irq_level event;
1817 int ret;
1818
1819 assert(kvm_async_interrupts_enabled());
1820
1821 event.level = level;
1822 event.irq = irq;
1823 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1824 if (ret < 0) {
1825 perror("kvm_set_irq");
1826 abort();
1827 }
1828
1829 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1830 }
1831
1832 #ifdef KVM_CAP_IRQ_ROUTING
1833 typedef struct KVMMSIRoute {
1834 struct kvm_irq_routing_entry kroute;
1835 QTAILQ_ENTRY(KVMMSIRoute) entry;
1836 } KVMMSIRoute;
1837
1838 static void set_gsi(KVMState *s, unsigned int gsi)
1839 {
1840 set_bit(gsi, s->used_gsi_bitmap);
1841 }
1842
1843 static void clear_gsi(KVMState *s, unsigned int gsi)
1844 {
1845 clear_bit(gsi, s->used_gsi_bitmap);
1846 }
1847
1848 void kvm_init_irq_routing(KVMState *s)
1849 {
1850 int gsi_count, i;
1851
1852 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1853 if (gsi_count > 0) {
1854 /* Round up so we can search ints using ffs */
1855 s->used_gsi_bitmap = bitmap_new(gsi_count);
1856 s->gsi_count = gsi_count;
1857 }
1858
1859 s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1860 s->nr_allocated_irq_routes = 0;
1861
1862 if (!kvm_direct_msi_allowed) {
1863 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
1864 QTAILQ_INIT(&s->msi_hashtab[i]);
1865 }
1866 }
1867
1868 kvm_arch_init_irq_routing(s);
1869 }
1870
1871 void kvm_irqchip_commit_routes(KVMState *s)
1872 {
1873 int ret;
1874
1875 if (kvm_gsi_direct_mapping()) {
1876 return;
1877 }
1878
1879 if (!kvm_gsi_routing_enabled()) {
1880 return;
1881 }
1882
1883 s->irq_routes->flags = 0;
1884 trace_kvm_irqchip_commit_routes();
1885 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1886 assert(ret == 0);
1887 }
1888
1889 static void kvm_add_routing_entry(KVMState *s,
1890 struct kvm_irq_routing_entry *entry)
1891 {
1892 struct kvm_irq_routing_entry *new;
1893 int n, size;
1894
1895 if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1896 n = s->nr_allocated_irq_routes * 2;
1897 if (n < 64) {
1898 n = 64;
1899 }
1900 size = sizeof(struct kvm_irq_routing);
1901 size += n * sizeof(*new);
1902 s->irq_routes = g_realloc(s->irq_routes, size);
1903 s->nr_allocated_irq_routes = n;
1904 }
1905 n = s->irq_routes->nr++;
1906 new = &s->irq_routes->entries[n];
1907
1908 *new = *entry;
1909
1910 set_gsi(s, entry->gsi);
1911 }
1912
1913 static int kvm_update_routing_entry(KVMState *s,
1914 struct kvm_irq_routing_entry *new_entry)
1915 {
1916 struct kvm_irq_routing_entry *entry;
1917 int n;
1918
1919 for (n = 0; n < s->irq_routes->nr; n++) {
1920 entry = &s->irq_routes->entries[n];
1921 if (entry->gsi != new_entry->gsi) {
1922 continue;
1923 }
1924
1925 if(!memcmp(entry, new_entry, sizeof *entry)) {
1926 return 0;
1927 }
1928
1929 *entry = *new_entry;
1930
1931 return 0;
1932 }
1933
1934 return -ESRCH;
1935 }
1936
1937 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1938 {
1939 struct kvm_irq_routing_entry e = {};
1940
1941 assert(pin < s->gsi_count);
1942
1943 e.gsi = irq;
1944 e.type = KVM_IRQ_ROUTING_IRQCHIP;
1945 e.flags = 0;
1946 e.u.irqchip.irqchip = irqchip;
1947 e.u.irqchip.pin = pin;
1948 kvm_add_routing_entry(s, &e);
1949 }
1950
1951 void kvm_irqchip_release_virq(KVMState *s, int virq)
1952 {
1953 struct kvm_irq_routing_entry *e;
1954 int i;
1955
1956 if (kvm_gsi_direct_mapping()) {
1957 return;
1958 }
1959
1960 for (i = 0; i < s->irq_routes->nr; i++) {
1961 e = &s->irq_routes->entries[i];
1962 if (e->gsi == virq) {
1963 s->irq_routes->nr--;
1964 *e = s->irq_routes->entries[s->irq_routes->nr];
1965 }
1966 }
1967 clear_gsi(s, virq);
1968 kvm_arch_release_virq_post(virq);
1969 trace_kvm_irqchip_release_virq(virq);
1970 }
1971
1972 void kvm_irqchip_add_change_notifier(Notifier *n)
1973 {
1974 notifier_list_add(&kvm_irqchip_change_notifiers, n);
1975 }
1976
1977 void kvm_irqchip_remove_change_notifier(Notifier *n)
1978 {
1979 notifier_remove(n);
1980 }
1981
1982 void kvm_irqchip_change_notify(void)
1983 {
1984 notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
1985 }
1986
1987 static unsigned int kvm_hash_msi(uint32_t data)
1988 {
1989 /* This is optimized for IA32 MSI layout. However, no other arch shall
1990 * repeat the mistake of not providing a direct MSI injection API. */
1991 return data & 0xff;
1992 }
1993
1994 static void kvm_flush_dynamic_msi_routes(KVMState *s)
1995 {
1996 KVMMSIRoute *route, *next;
1997 unsigned int hash;
1998
1999 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
2000 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
2001 kvm_irqchip_release_virq(s, route->kroute.gsi);
2002 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
2003 g_free(route);
2004 }
2005 }
2006 }
2007
2008 static int kvm_irqchip_get_virq(KVMState *s)
2009 {
2010 int next_virq;
2011
2012 /*
2013 * The PIC and IOAPIC share the first 16 GSI numbers, so there are more
2014 * available GSI numbers than IRQ routes. Allocating a GSI
2015 * number can succeed even though a new route entry cannot be added.
2016 * When this happens, flush dynamic MSI entries to free IRQ route entries.
2017 */
2018 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
2019 kvm_flush_dynamic_msi_routes(s);
2020 }
2021
2022 /* Return the lowest unused GSI in the bitmap */
2023 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
2024 if (next_virq >= s->gsi_count) {
2025 return -ENOSPC;
2026 } else {
2027 return next_virq;
2028 }
2029 }
2030
2031 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
2032 {
2033 unsigned int hash = kvm_hash_msi(msg.data);
2034 KVMMSIRoute *route;
2035
2036 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
2037 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
2038 route->kroute.u.msi.address_hi == (msg.address >> 32) &&
2039 route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
2040 return route;
2041 }
2042 }
2043 return NULL;
2044 }
2045
2046 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2047 {
2048 struct kvm_msi msi;
2049 KVMMSIRoute *route;
2050
2051 if (kvm_direct_msi_allowed) {
2052 msi.address_lo = (uint32_t)msg.address;
2053 msi.address_hi = msg.address >> 32;
2054 msi.data = le32_to_cpu(msg.data);
2055 msi.flags = 0;
2056 memset(msi.pad, 0, sizeof(msi.pad));
2057
2058 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
2059 }
2060
2061 route = kvm_lookup_msi_route(s, msg);
2062 if (!route) {
2063 int virq;
2064
2065 virq = kvm_irqchip_get_virq(s);
2066 if (virq < 0) {
2067 return virq;
2068 }
2069
2070 route = g_new0(KVMMSIRoute, 1);
2071 route->kroute.gsi = virq;
2072 route->kroute.type = KVM_IRQ_ROUTING_MSI;
2073 route->kroute.flags = 0;
2074 route->kroute.u.msi.address_lo = (uint32_t)msg.address;
2075 route->kroute.u.msi.address_hi = msg.address >> 32;
2076 route->kroute.u.msi.data = le32_to_cpu(msg.data);
2077
2078 kvm_add_routing_entry(s, &route->kroute);
2079 kvm_irqchip_commit_routes(s);
2080
2081 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
2082 entry);
2083 }
2084
2085 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
2086
2087 return kvm_set_irq(s, route->kroute.gsi, 1);
2088 }
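
/*
 * Illustrative sketch (not part of the original file): callers pass a fully
 * formed MSIMessage and let kvm_irqchip_send_msi() choose between
 * KVM_SIGNAL_MSI and the route-based fallback above.  The address and data
 * values below are placeholders, not a real MSI encoding.
 *
 *     MSIMessage msg = { .address = 0xfee00000ULL, .data = 0x4041 };
 *     int ret = kvm_irqchip_send_msi(kvm_state, msg);
 *
 *     if (ret < 0) {
 *         error_report("MSI injection failed: %s", strerror(-ret));
 *     }
 */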
2089
2090 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2091 {
2092 struct kvm_irq_routing_entry kroute = {};
2093 int virq;
2094 KVMState *s = c->s;
2095 MSIMessage msg = {0, 0};
2096
2097 if (pci_available && dev) {
2098 msg = pci_get_msi_message(dev, vector);
2099 }
2100
2101 if (kvm_gsi_direct_mapping()) {
2102 return kvm_arch_msi_data_to_gsi(msg.data);
2103 }
2104
2105 if (!kvm_gsi_routing_enabled()) {
2106 return -ENOSYS;
2107 }
2108
2109 virq = kvm_irqchip_get_virq(s);
2110 if (virq < 0) {
2111 return virq;
2112 }
2113
2114 kroute.gsi = virq;
2115 kroute.type = KVM_IRQ_ROUTING_MSI;
2116 kroute.flags = 0;
2117 kroute.u.msi.address_lo = (uint32_t)msg.address;
2118 kroute.u.msi.address_hi = msg.address >> 32;
2119 kroute.u.msi.data = le32_to_cpu(msg.data);
2120 if (pci_available && kvm_msi_devid_required()) {
2121 kroute.flags = KVM_MSI_VALID_DEVID;
2122 kroute.u.msi.devid = pci_requester_id(dev);
2123 }
2124 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2125 kvm_irqchip_release_virq(s, virq);
2126 return -EINVAL;
2127 }
2128
2129 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2130 vector, virq);
2131
2132 kvm_add_routing_entry(s, &kroute);
2133 kvm_arch_add_msi_route_post(&kroute, vector, dev);
2134 c->changes++;
2135
2136 return virq;
2137 }
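
/*
 * Illustrative sketch (not part of the original file): device emulation
 * (e.g. the MSI-X fast paths) usually allocates routes inside a
 * KVMRouteChange transaction so that several additions are flushed with a
 * single KVM_SET_GSI_ROUTING.  The begin/commit helpers are assumed to be
 * the ones declared in sysemu/kvm.h; "dev" and "vector" are placeholders
 * supplied by the caller.
 *
 *     KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
 *     int virq = kvm_irqchip_add_msi_route(&c, vector, dev);
 *
 *     kvm_irqchip_commit_route_changes(&c);
 *     if (virq < 0) {
 *         return virq;   // no GSI left or routing unsupported
 *     }
 */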
2138
2139 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2140 PCIDevice *dev)
2141 {
2142 struct kvm_irq_routing_entry kroute = {};
2143
2144 if (kvm_gsi_direct_mapping()) {
2145 return 0;
2146 }
2147
2148 if (!kvm_irqchip_in_kernel()) {
2149 return -ENOSYS;
2150 }
2151
2152 kroute.gsi = virq;
2153 kroute.type = KVM_IRQ_ROUTING_MSI;
2154 kroute.flags = 0;
2155 kroute.u.msi.address_lo = (uint32_t)msg.address;
2156 kroute.u.msi.address_hi = msg.address >> 32;
2157 kroute.u.msi.data = le32_to_cpu(msg.data);
2158 if (pci_available && kvm_msi_devid_required()) {
2159 kroute.flags = KVM_MSI_VALID_DEVID;
2160 kroute.u.msi.devid = pci_requester_id(dev);
2161 }
2162 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2163 return -EINVAL;
2164 }
2165
2166 trace_kvm_irqchip_update_msi_route(virq);
2167
2168 return kvm_update_routing_entry(s, &kroute);
2169 }
2170
2171 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2172 EventNotifier *resample, int virq,
2173 bool assign)
2174 {
2175 int fd = event_notifier_get_fd(event);
2176 int rfd = resample ? event_notifier_get_fd(resample) : -1;
2177
2178 struct kvm_irqfd irqfd = {
2179 .fd = fd,
2180 .gsi = virq,
2181 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2182 };
2183
2184 if (rfd != -1) {
2185 assert(assign);
2186 if (kvm_irqchip_is_split()) {
2187 /*
2188 * When the slow irqchip (e.g. IOAPIC) lives in userspace,
2189 * the KVM kernel resamplefd will not work because the EOI
2190 * of the interrupt is delivered to userspace instead, so
2191 * the kernel resamplefd kick will be skipped. Userspace
2192 * here mimics what the kernel provides with resamplefd:
2193 * remember the resamplefd and kick it when we receive the
2194 * EOI of this IRQ.
2195 *
2196 * This is hackery because the IOAPIC is mostly bypassed
2197 * (except for EOI broadcasts) when irqfd is used. However,
2198 * it brings back much of the performance for split irqchip
2199 * with INTx IRQs (for VFIO, this gives 93% of the full fast
2200 * path's performance, a 46% boost compared to the INTx
2201 * slow path).
2202 */
2203 kvm_resample_fd_insert(virq, resample);
2204 } else {
2205 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2206 irqfd.resamplefd = rfd;
2207 }
2208 } else if (!assign) {
2209 if (kvm_irqchip_is_split()) {
2210 kvm_resample_fd_remove(virq);
2211 }
2212 }
2213
2214 if (!kvm_irqfds_enabled()) {
2215 return -ENOSYS;
2216 }
2217
2218 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2219 }
2220
2221 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2222 {
2223 struct kvm_irq_routing_entry kroute = {};
2224 int virq;
2225
2226 if (!kvm_gsi_routing_enabled()) {
2227 return -ENOSYS;
2228 }
2229
2230 virq = kvm_irqchip_get_virq(s);
2231 if (virq < 0) {
2232 return virq;
2233 }
2234
2235 kroute.gsi = virq;
2236 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
2237 kroute.flags = 0;
2238 kroute.u.adapter.summary_addr = adapter->summary_addr;
2239 kroute.u.adapter.ind_addr = adapter->ind_addr;
2240 kroute.u.adapter.summary_offset = adapter->summary_offset;
2241 kroute.u.adapter.ind_offset = adapter->ind_offset;
2242 kroute.u.adapter.adapter_id = adapter->adapter_id;
2243
2244 kvm_add_routing_entry(s, &kroute);
2245
2246 return virq;
2247 }
2248
2249 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2250 {
2251 struct kvm_irq_routing_entry kroute = {};
2252 int virq;
2253
2254 if (!kvm_gsi_routing_enabled()) {
2255 return -ENOSYS;
2256 }
2257 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
2258 return -ENOSYS;
2259 }
2260 virq = kvm_irqchip_get_virq(s);
2261 if (virq < 0) {
2262 return virq;
2263 }
2264
2265 kroute.gsi = virq;
2266 kroute.type = KVM_IRQ_ROUTING_HV_SINT;
2267 kroute.flags = 0;
2268 kroute.u.hv_sint.vcpu = vcpu;
2269 kroute.u.hv_sint.sint = sint;
2270
2271 kvm_add_routing_entry(s, &kroute);
2272 kvm_irqchip_commit_routes(s);
2273
2274 return virq;
2275 }
2276
2277 #else /* !KVM_CAP_IRQ_ROUTING */
2278
2279 void kvm_init_irq_routing(KVMState *s)
2280 {
2281 }
2282
2283 void kvm_irqchip_release_virq(KVMState *s, int virq)
2284 {
2285 }
2286
2287 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2288 {
2289 abort();
2290 }
2291
2292 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2293 {
2294 return -ENOSYS;
2295 }
2296
2297 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2298 {
2299 return -ENOSYS;
2300 }
2301
2302 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2303 {
2304 return -ENOSYS;
2305 }
2306
2307 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2308 EventNotifier *resample, int virq,
2309 bool assign)
2310 {
2311 abort();
2312 }
2313
2314 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, PCIDevice *dev)
2315 {
2316 return -ENOSYS;
2317 }
2318 #endif /* !KVM_CAP_IRQ_ROUTING */
2319
2320 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2321 EventNotifier *rn, int virq)
2322 {
2323 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2324 }
2325
2326 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2327 int virq)
2328 {
2329 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2330 }
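
/*
 * Illustrative sketch (not part of the original file): once a virq has been
 * allocated (for instance by kvm_irqchip_add_msi_route()), an EventNotifier
 * can be bound to it so that signalling the notifier injects the interrupt
 * without a round trip through QEMU.  "notifier" and "virq" are placeholders
 * owned by the caller.
 *
 *     if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &notifier,
 *                                            NULL, virq) < 0) {
 *         // fall back to injecting the interrupt from userspace
 *     }
 *     ...
 *     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &notifier, virq);
 */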
2331
2332 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2333 EventNotifier *rn, qemu_irq irq)
2334 {
2335 gpointer key, gsi;
2336 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2337
2338 if (!found) {
2339 return -ENXIO;
2340 }
2341 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2342 }
2343
2344 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2345 qemu_irq irq)
2346 {
2347 gpointer key, gsi;
2348 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2349
2350 if (!found) {
2351 return -ENXIO;
2352 }
2353 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2354 }
2355
2356 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2357 {
2358 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2359 }
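
/*
 * Illustrative sketch (not part of the original file): interrupt controller
 * models that expose qemu_irq lines record the line-to-GSI mapping here so
 * that the qemu_irq based irqfd helpers above can resolve it later.  "dev"
 * and the GPIO numbering are placeholders.
 *
 *     for (int i = 0; i < nirq; i++) {
 *         kvm_irqchip_set_qemuirq_gsi(kvm_state, qdev_get_gpio_in(dev, i), i);
 *     }
 */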
2360
2361 static void kvm_irqchip_create(KVMState *s)
2362 {
2363 int ret;
2364
2365 assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2366 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2367 ;
2368 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2369 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2370 if (ret < 0) {
2371 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2372 exit(1);
2373 }
2374 } else {
2375 return;
2376 }
2377
2378 /* First probe and see if there's an arch-specific hook to create the
2379 * in-kernel irqchip for us */
2380 ret = kvm_arch_irqchip_create(s);
2381 if (ret == 0) {
2382 if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2383 error_report("Split IRQ chip mode not supported.");
2384 exit(1);
2385 } else {
2386 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2387 }
2388 }
2389 if (ret < 0) {
2390 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2391 exit(1);
2392 }
2393
2394 kvm_kernel_irqchip = true;
2395 /* If we have an in-kernel IRQ chip then we must have asynchronous
2396 * interrupt delivery (though the reverse is not necessarily true)
2397 */
2398 kvm_async_interrupts_allowed = true;
2399 kvm_halt_in_kernel_allowed = true;
2400
2401 kvm_init_irq_routing(s);
2402
2403 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2404 }
2405
2406 /* Find number of supported CPUs using the recommended
2407 * procedure from the kernel API documentation to cope with
2408 * older kernels that may be missing capabilities.
2409 */
2410 static int kvm_recommended_vcpus(KVMState *s)
2411 {
2412 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2413 return (ret) ? ret : 4;
2414 }
2415
2416 static int kvm_max_vcpus(KVMState *s)
2417 {
2418 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
2419 return (ret) ? ret : kvm_recommended_vcpus(s);
2420 }
2421
2422 static int kvm_max_vcpu_id(KVMState *s)
2423 {
2424 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2425 return (ret) ? ret : kvm_max_vcpus(s);
2426 }
2427
2428 bool kvm_vcpu_id_is_valid(int vcpu_id)
2429 {
2430 KVMState *s = KVM_STATE(current_accel());
2431 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2432 }
2433
2434 bool kvm_dirty_ring_enabled(void)
2435 {
2436 return kvm_state->kvm_dirty_ring_size ? true : false;
2437 }
2438
2439 static void query_stats_cb(StatsResultList **result, StatsTarget target,
2440 strList *names, strList *targets, Error **errp);
2441 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2442
2443 uint32_t kvm_dirty_ring_size(void)
2444 {
2445 return kvm_state->kvm_dirty_ring_size;
2446 }
2447
2448 static int kvm_init(MachineState *ms)
2449 {
2450 MachineClass *mc = MACHINE_GET_CLASS(ms);
2451 static const char upgrade_note[] =
2452 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2453 "(see http://sourceforge.net/projects/kvm).\n";
2454 const struct {
2455 const char *name;
2456 int num;
2457 } num_cpus[] = {
2458 { "SMP", ms->smp.cpus },
2459 { "hotpluggable", ms->smp.max_cpus },
2460 { /* end of list */ }
2461 }, *nc = num_cpus;
2462 int soft_vcpus_limit, hard_vcpus_limit;
2463 KVMState *s;
2464 const KVMCapabilityInfo *missing_cap;
2465 int ret;
2466 int type;
2467 uint64_t dirty_log_manual_caps;
2468
2469 qemu_mutex_init(&kml_slots_lock);
2470
2471 s = KVM_STATE(ms->accelerator);
2472
2473 /*
2474 * On systems where the kernel can support different base page
2475 * sizes, host page size may be different from TARGET_PAGE_SIZE,
2476 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum
2477 * page size for the system though.
2478 */
2479 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2480
2481 s->sigmask_len = 8;
2482 accel_blocker_init();
2483
2484 #ifdef KVM_CAP_SET_GUEST_DEBUG
2485 QTAILQ_INIT(&s->kvm_sw_breakpoints);
2486 #endif
2487 QLIST_INIT(&s->kvm_parked_vcpus);
2488 s->fd = qemu_open_old("/dev/kvm", O_RDWR);
2489 if (s->fd == -1) {
2490 fprintf(stderr, "Could not access KVM kernel module: %m\n");
2491 ret = -errno;
2492 goto err;
2493 }
2494
2495 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2496 if (ret < KVM_API_VERSION) {
2497 if (ret >= 0) {
2498 ret = -EINVAL;
2499 }
2500 fprintf(stderr, "kvm version too old\n");
2501 goto err;
2502 }
2503
2504 if (ret > KVM_API_VERSION) {
2505 ret = -EINVAL;
2506 fprintf(stderr, "kvm version not supported\n");
2507 goto err;
2508 }
2509
2510 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2511 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2512
2513 /* If unspecified, use the default value */
2514 if (!s->nr_slots) {
2515 s->nr_slots = 32;
2516 }
2517
2518 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2519 if (s->nr_as <= 1) {
2520 s->nr_as = 1;
2521 }
2522 s->as = g_new0(struct KVMAs, s->nr_as);
2523
2524 if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2525 g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
2526 "kvm-type",
2527 &error_abort);
2528 type = mc->kvm_type(ms, kvm_type);
2529 } else if (mc->kvm_type) {
2530 type = mc->kvm_type(ms, NULL);
2531 } else {
2532 type = kvm_arch_get_default_type(ms);
2533 }
2534
2535 if (type < 0) {
2536 ret = -EINVAL;
2537 goto err;
2538 }
2539
2540 do {
2541 ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2542 } while (ret == -EINTR);
2543
2544 if (ret < 0) {
2545 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
2546 strerror(-ret));
2547
2548 #ifdef TARGET_S390X
2549 if (ret == -EINVAL) {
2550 fprintf(stderr,
2551 "Host kernel setup problem detected. Please verify:\n");
2552 fprintf(stderr, "- for kernels supporting the switch_amode or"
2553 " user_mode parameters, whether\n");
2554 fprintf(stderr,
2555 " user space is running in primary address space\n");
2556 fprintf(stderr,
2557 "- for kernels supporting the vm.allocate_pgste sysctl, "
2558 "whether it is enabled\n");
2559 }
2560 #elif defined(TARGET_PPC)
2561 if (ret == -EINVAL) {
2562 fprintf(stderr,
2563 "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2564 (type == 2) ? "pr" : "hv");
2565 }
2566 #endif
2567 goto err;
2568 }
2569
2570 s->vmfd = ret;
2571
2572 /* check the vcpu limits */
2573 soft_vcpus_limit = kvm_recommended_vcpus(s);
2574 hard_vcpus_limit = kvm_max_vcpus(s);
2575
2576 while (nc->name) {
2577 if (nc->num > soft_vcpus_limit) {
2578 warn_report("Number of %s cpus requested (%d) exceeds "
2579 "the recommended cpus supported by KVM (%d)",
2580 nc->name, nc->num, soft_vcpus_limit);
2581
2582 if (nc->num > hard_vcpus_limit) {
2583 fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
2584 "the maximum cpus supported by KVM (%d)\n",
2585 nc->name, nc->num, hard_vcpus_limit);
2586 exit(1);
2587 }
2588 }
2589 nc++;
2590 }
2591
2592 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2593 if (!missing_cap) {
2594 missing_cap =
2595 kvm_check_extension_list(s, kvm_arch_required_capabilities);
2596 }
2597 if (missing_cap) {
2598 ret = -EINVAL;
2599 fprintf(stderr, "kvm does not support %s\n%s",
2600 missing_cap->name, upgrade_note);
2601 goto err;
2602 }
2603
2604 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2605 s->coalesced_pio = s->coalesced_mmio &&
2606 kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2607
2608 /*
2609 * Enable KVM dirty ring if supported, otherwise fall back to
2610 * dirty logging mode
2611 */
2612 ret = kvm_dirty_ring_init(s);
2613 if (ret < 0) {
2614 goto err;
2615 }
2616
2617 /*
2618 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2619 * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2620 * page is wr-protected initially, which is against how the kvm dirty ring is
2621 * used - the dirty ring requires that all pages be wr-protected at the very
2622 * beginning. Enabling this feature for the dirty ring causes data corruption.
2623 *
2624 * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2625 * we may expect a higher stall time when starting the migration. In the
2626 * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2627 * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2628 * guest pages.
2629 */
2630 if (!s->kvm_dirty_ring_size) {
2631 dirty_log_manual_caps =
2632 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2633 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2634 KVM_DIRTY_LOG_INITIALLY_SET);
2635 s->manual_dirty_log_protect = dirty_log_manual_caps;
2636 if (dirty_log_manual_caps) {
2637 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2638 dirty_log_manual_caps);
2639 if (ret) {
2640 warn_report("Trying to enable capability %"PRIu64" of "
2641 "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
2642 "Falling back to the legacy mode. ",
2643 dirty_log_manual_caps);
2644 s->manual_dirty_log_protect = 0;
2645 }
2646 }
2647 }
2648
2649 #ifdef KVM_CAP_VCPU_EVENTS
2650 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2651 #endif
2652
2653 s->robust_singlestep =
2654 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
2655
2656 #ifdef KVM_CAP_DEBUGREGS
2657 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
2658 #endif
2659
2660 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2661
2662 #ifdef KVM_CAP_IRQ_ROUTING
2663 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
2664 #endif
2665
2666 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
2667
2668 s->irq_set_ioctl = KVM_IRQ_LINE;
2669 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2670 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2671 }
2672
2673 kvm_readonly_mem_allowed =
2674 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2675
2676 kvm_eventfds_allowed =
2677 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
2678
2679 kvm_irqfds_allowed =
2680 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
2681
2682 kvm_resamplefds_allowed =
2683 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2684
2685 kvm_vm_attributes_allowed =
2686 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2687
2688 kvm_ioeventfd_any_length_allowed =
2689 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
2690
2691 #ifdef KVM_CAP_SET_GUEST_DEBUG
2692 kvm_has_guest_debug =
2693 (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2694 #endif
2695
2696 kvm_sstep_flags = 0;
2697 if (kvm_has_guest_debug) {
2698 kvm_sstep_flags = SSTEP_ENABLE;
2699
2700 #if defined KVM_CAP_SET_GUEST_DEBUG2
2701 int guest_debug_flags =
2702 kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2703
2704 if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2705 kvm_sstep_flags |= SSTEP_NOIRQ;
2706 }
2707 #endif
2708 }
2709
2710 kvm_state = s;
2711
2712 ret = kvm_arch_init(ms, s);
2713 if (ret < 0) {
2714 goto err;
2715 }
2716
2717 if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2718 s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2719 }
2720
2721 qemu_register_reset(kvm_unpoison_all, NULL);
2722
2723 if (s->kernel_irqchip_allowed) {
2724 kvm_irqchip_create(s);
2725 }
2726
2727 if (kvm_eventfds_allowed) {
2728 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2729 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2730 }
2731 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2732 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2733
2734 kvm_memory_listener_register(s, &s->memory_listener,
2735 &address_space_memory, 0, "kvm-memory");
2736 if (kvm_eventfds_allowed) {
2737 memory_listener_register(&kvm_io_listener,
2738 &address_space_io);
2739 }
2740 memory_listener_register(&kvm_coalesced_pio_listener,
2741 &address_space_io);
2742
2743 s->many_ioeventfds = kvm_check_many_ioeventfds();
2744
2745 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2746 if (!s->sync_mmu) {
2747 ret = ram_block_discard_disable(true);
2748 assert(!ret);
2749 }
2750
2751 if (s->kvm_dirty_ring_size) {
2752 kvm_dirty_ring_reaper_init(s);
2753 }
2754
2755 if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2756 add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2757 query_stats_schemas_cb);
2758 }
2759
2760 return 0;
2761
2762 err:
2763 assert(ret < 0);
2764 if (s->vmfd >= 0) {
2765 close(s->vmfd);
2766 }
2767 if (s->fd != -1) {
2768 close(s->fd);
2769 }
2770 g_free(s->as);
2771 g_free(s->memory_listener.slots);
2772
2773 return ret;
2774 }
2775
2776 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2777 {
2778 s->sigmask_len = sigmask_len;
2779 }
2780
2781 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2782 int size, uint32_t count)
2783 {
2784 int i;
2785 uint8_t *ptr = data;
2786
2787 for (i = 0; i < count; i++) {
2788 address_space_rw(&address_space_io, port, attrs,
2789 ptr, size,
2790 direction == KVM_EXIT_IO_OUT);
2791 ptr += size;
2792 }
2793 }
2794
2795 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2796 {
2797 fprintf(stderr, "KVM internal error. Suberror: %d\n",
2798 run->internal.suberror);
2799
2800 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
2801 int i;
2802
2803 for (i = 0; i < run->internal.ndata; ++i) {
2804 fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2805 i, (uint64_t)run->internal.data[i]);
2806 }
2807 }
2808 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2809 fprintf(stderr, "emulation failure\n");
2810 if (!kvm_arch_stop_on_emulation_error(cpu)) {
2811 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2812 return EXCP_INTERRUPT;
2813 }
2814 }
2815 /* FIXME: Should trigger a qmp message to let management know
2816 * something went wrong.
2817 */
2818 return -1;
2819 }
2820
2821 void kvm_flush_coalesced_mmio_buffer(void)
2822 {
2823 KVMState *s = kvm_state;
2824
2825 if (!s || s->coalesced_flush_in_progress) {
2826 return;
2827 }
2828
2829 s->coalesced_flush_in_progress = true;
2830
2831 if (s->coalesced_mmio_ring) {
2832 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2833 while (ring->first != ring->last) {
2834 struct kvm_coalesced_mmio *ent;
2835
2836 ent = &ring->coalesced_mmio[ring->first];
2837
2838 if (ent->pio == 1) {
2839 address_space_write(&address_space_io, ent->phys_addr,
2840 MEMTXATTRS_UNSPECIFIED, ent->data,
2841 ent->len);
2842 } else {
2843 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2844 }
2845 smp_wmb();
2846 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2847 }
2848 }
2849
2850 s->coalesced_flush_in_progress = false;
2851 }
2852
2853 bool kvm_cpu_check_are_resettable(void)
2854 {
2855 return kvm_arch_cpu_check_are_resettable();
2856 }
2857
2858 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2859 {
2860 if (!cpu->vcpu_dirty) {
2861 int ret = kvm_arch_get_registers(cpu);
2862 if (ret) {
2863 error_report("Failed to get registers: %s", strerror(-ret));
2864 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2865 vm_stop(RUN_STATE_INTERNAL_ERROR);
2866 }
2867
2868 cpu->vcpu_dirty = true;
2869 }
2870 }
2871
2872 void kvm_cpu_synchronize_state(CPUState *cpu)
2873 {
2874 if (!cpu->vcpu_dirty) {
2875 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2876 }
2877 }
2878
2879 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2880 {
2881 int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
2882 if (ret) {
2883 error_report("Failed to put registers after reset: %s", strerror(-ret));
2884 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2885 vm_stop(RUN_STATE_INTERNAL_ERROR);
2886 }
2887
2888 cpu->vcpu_dirty = false;
2889 }
2890
2891 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2892 {
2893 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2894 }
2895
2896 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2897 {
2898 int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
2899 if (ret) {
2900 error_report("Failed to put registers after init: %s", strerror(-ret));
2901 exit(1);
2902 }
2903
2904 cpu->vcpu_dirty = false;
2905 }
2906
2907 void kvm_cpu_synchronize_post_init(CPUState *cpu)
2908 {
2909 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2910 }
2911
2912 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2913 {
2914 cpu->vcpu_dirty = true;
2915 }
2916
2917 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2918 {
2919 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2920 }
2921
2922 #ifdef KVM_HAVE_MCE_INJECTION
2923 static __thread void *pending_sigbus_addr;
2924 static __thread int pending_sigbus_code;
2925 static __thread bool have_sigbus_pending;
2926 #endif
2927
2928 static void kvm_cpu_kick(CPUState *cpu)
2929 {
2930 qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2931 }
2932
2933 static void kvm_cpu_kick_self(void)
2934 {
2935 if (kvm_immediate_exit) {
2936 kvm_cpu_kick(current_cpu);
2937 } else {
2938 qemu_cpu_kick_self();
2939 }
2940 }
2941
2942 static void kvm_eat_signals(CPUState *cpu)
2943 {
2944 struct timespec ts = { 0, 0 };
2945 siginfo_t siginfo;
2946 sigset_t waitset;
2947 sigset_t chkset;
2948 int r;
2949
2950 if (kvm_immediate_exit) {
2951 qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2952 /* Write kvm_run->immediate_exit before the cpu->exit_request
2953 * write in kvm_cpu_exec.
2954 */
2955 smp_wmb();
2956 return;
2957 }
2958
2959 sigemptyset(&waitset);
2960 sigaddset(&waitset, SIG_IPI);
2961
2962 do {
2963 r = sigtimedwait(&waitset, &siginfo, &ts);
2964 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2965 perror("sigtimedwait");
2966 exit(1);
2967 }
2968
2969 r = sigpending(&chkset);
2970 if (r == -1) {
2971 perror("sigpending");
2972 exit(1);
2973 }
2974 } while (sigismember(&chkset, SIG_IPI));
2975 }
2976
2977 int kvm_cpu_exec(CPUState *cpu)
2978 {
2979 struct kvm_run *run = cpu->kvm_run;
2980 int ret, run_ret;
2981
2982 DPRINTF("kvm_cpu_exec()\n");
2983
2984 if (kvm_arch_process_async_events(cpu)) {
2985 qatomic_set(&cpu->exit_request, 0);
2986 return EXCP_HLT;
2987 }
2988
2989 qemu_mutex_unlock_iothread();
2990 cpu_exec_start(cpu);
2991
2992 do {
2993 MemTxAttrs attrs;
2994
2995 if (cpu->vcpu_dirty) {
2996 ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
2997 if (ret) {
2998 error_report("Failed to put registers after init: %s",
2999 strerror(-ret));
3000 ret = -1;
3001 break;
3002 }
3003
3004 cpu->vcpu_dirty = false;
3005 }
3006
3007 kvm_arch_pre_run(cpu, run);
3008 if (qatomic_read(&cpu->exit_request)) {
3009 DPRINTF("interrupt exit requested\n");
3010 /*
3011 * KVM requires us to reenter the kernel after IO exits to complete
3012 * instruction emulation. This self-signal will ensure that we
3013 * leave ASAP again.
3014 */
3015 kvm_cpu_kick_self();
3016 }
3017
3018 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
3019 * Matching barrier in kvm_eat_signals.
3020 */
3021 smp_rmb();
3022
3023 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
3024
3025 attrs = kvm_arch_post_run(cpu, run);
3026
3027 #ifdef KVM_HAVE_MCE_INJECTION
3028 if (unlikely(have_sigbus_pending)) {
3029 qemu_mutex_lock_iothread();
3030 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
3031 pending_sigbus_addr);
3032 have_sigbus_pending = false;
3033 qemu_mutex_unlock_iothread();
3034 }
3035 #endif
3036
3037 if (run_ret < 0) {
3038 if (run_ret == -EINTR || run_ret == -EAGAIN) {
3039 DPRINTF("io window exit\n");
3040 kvm_eat_signals(cpu);
3041 ret = EXCP_INTERRUPT;
3042 break;
3043 }
3044 fprintf(stderr, "error: kvm run failed %s\n",
3045 strerror(-run_ret));
3046 #ifdef TARGET_PPC
3047 if (run_ret == -EBUSY) {
3048 fprintf(stderr,
3049 "This is probably because your SMT is enabled.\n"
3050 "VCPU can only run on primary threads with all "
3051 "secondary threads offline.\n");
3052 }
3053 #endif
3054 ret = -1;
3055 break;
3056 }
3057
3058 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
3059 switch (run->exit_reason) {
3060 case KVM_EXIT_IO:
3061 DPRINTF("handle_io\n");
3062 /* Called outside BQL */
3063 kvm_handle_io(run->io.port, attrs,
3064 (uint8_t *)run + run->io.data_offset,
3065 run->io.direction,
3066 run->io.size,
3067 run->io.count);
3068 ret = 0;
3069 break;
3070 case KVM_EXIT_MMIO:
3071 DPRINTF("handle_mmio\n");
3072 /* Called outside BQL */
3073 address_space_rw(&address_space_memory,
3074 run->mmio.phys_addr, attrs,
3075 run->mmio.data,
3076 run->mmio.len,
3077 run->mmio.is_write);
3078 ret = 0;
3079 break;
3080 case KVM_EXIT_IRQ_WINDOW_OPEN:
3081 DPRINTF("irq_window_open\n");
3082 ret = EXCP_INTERRUPT;
3083 break;
3084 case KVM_EXIT_SHUTDOWN:
3085 DPRINTF("shutdown\n");
3086 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3087 ret = EXCP_INTERRUPT;
3088 break;
3089 case KVM_EXIT_UNKNOWN:
3090 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
3091 (uint64_t)run->hw.hardware_exit_reason);
3092 ret = -1;
3093 break;
3094 case KVM_EXIT_INTERNAL_ERROR:
3095 ret = kvm_handle_internal_error(cpu, run);
3096 break;
3097 case KVM_EXIT_DIRTY_RING_FULL:
3098 /*
3099 * We shouldn't continue if the dirty ring of this vcpu is
3100 * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
3101 */
3102 trace_kvm_dirty_ring_full(cpu->cpu_index);
3103 qemu_mutex_lock_iothread();
3104 /*
3105 * We throttle the vCPU by making it sleep once it exits the kernel
3106 * due to a full dirty ring. In the dirtylimit scenario, reaping
3107 * all vCPUs after a single vCPU's dirty ring gets full would skip
3108 * that sleep, so just reap the vCPU whose ring is full.
3109 */
3110 if (dirtylimit_in_service()) {
3111 kvm_dirty_ring_reap(kvm_state, cpu);
3112 } else {
3113 kvm_dirty_ring_reap(kvm_state, NULL);
3114 }
3115 qemu_mutex_unlock_iothread();
3116 dirtylimit_vcpu_execute(cpu);
3117 ret = 0;
3118 break;
3119 case KVM_EXIT_SYSTEM_EVENT:
3120 switch (run->system_event.type) {
3121 case KVM_SYSTEM_EVENT_SHUTDOWN:
3122 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
3123 ret = EXCP_INTERRUPT;
3124 break;
3125 case KVM_SYSTEM_EVENT_RESET:
3126 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3127 ret = EXCP_INTERRUPT;
3128 break;
3129 case KVM_SYSTEM_EVENT_CRASH:
3130 kvm_cpu_synchronize_state(cpu);
3131 qemu_mutex_lock_iothread();
3132 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
3133 qemu_mutex_unlock_iothread();
3134 ret = 0;
3135 break;
3136 default:
3137 DPRINTF("kvm_arch_handle_exit\n");
3138 ret = kvm_arch_handle_exit(cpu, run);
3139 break;
3140 }
3141 break;
3142 default:
3143 DPRINTF("kvm_arch_handle_exit\n");
3144 ret = kvm_arch_handle_exit(cpu, run);
3145 break;
3146 }
3147 } while (ret == 0);
3148
3149 cpu_exec_end(cpu);
3150 qemu_mutex_lock_iothread();
3151
3152 if (ret < 0) {
3153 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
3154 vm_stop(RUN_STATE_INTERNAL_ERROR);
3155 }
3156
3157 qatomic_set(&cpu->exit_request, 0);
3158 return ret;
3159 }
3160
3161 int kvm_ioctl(KVMState *s, int type, ...)
3162 {
3163 int ret;
3164 void *arg;
3165 va_list ap;
3166
3167 va_start(ap, type);
3168 arg = va_arg(ap, void *);
3169 va_end(ap);
3170
3171 trace_kvm_ioctl(type, arg);
3172 ret = ioctl(s->fd, type, arg);
3173 if (ret == -1) {
3174 ret = -errno;
3175 }
3176 return ret;
3177 }
3178
3179 int kvm_vm_ioctl(KVMState *s, int type, ...)
3180 {
3181 int ret;
3182 void *arg;
3183 va_list ap;
3184
3185 va_start(ap, type);
3186 arg = va_arg(ap, void *);
3187 va_end(ap);
3188
3189 trace_kvm_vm_ioctl(type, arg);
3190 accel_ioctl_begin();
3191 ret = ioctl(s->vmfd, type, arg);
3192 accel_ioctl_end();
3193 if (ret == -1) {
3194 ret = -errno;
3195 }
3196 return ret;
3197 }
3198
3199 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
3200 {
3201 int ret;
3202 void *arg;
3203 va_list ap;
3204
3205 va_start(ap, type);
3206 arg = va_arg(ap, void *);
3207 va_end(ap);
3208
3209 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3210 accel_cpu_ioctl_begin(cpu);
3211 ret = ioctl(cpu->kvm_fd, type, arg);
3212 accel_cpu_ioctl_end(cpu);
3213 if (ret == -1) {
3214 ret = -errno;
3215 }
3216 return ret;
3217 }
3218
3219 int kvm_device_ioctl(int fd, int type, ...)
3220 {
3221 int ret;
3222 void *arg;
3223 va_list ap;
3224
3225 va_start(ap, type);
3226 arg = va_arg(ap, void *);
3227 va_end(ap);
3228
3229 trace_kvm_device_ioctl(fd, type, arg);
3230 accel_ioctl_begin();
3231 ret = ioctl(fd, type, arg);
3232 accel_ioctl_end();
3233 if (ret == -1) {
3234 ret = -errno;
3235 }
3236 return ret;
3237 }
3238
3239 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3240 {
3241 int ret;
3242 struct kvm_device_attr attribute = {
3243 .group = group,
3244 .attr = attr,
3245 };
3246
3247 if (!kvm_vm_attributes_allowed) {
3248 return 0;
3249 }
3250
3251 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3252 /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3253 return ret ? 0 : 1;
3254 }
3255
3256 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3257 {
3258 struct kvm_device_attr attribute = {
3259 .group = group,
3260 .attr = attr,
3261 .flags = 0,
3262 };
3263
3264 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3265 }
3266
3267 int kvm_device_access(int fd, int group, uint64_t attr,
3268 void *val, bool write, Error **errp)
3269 {
3270 struct kvm_device_attr kvmattr;
3271 int err;
3272
3273 kvmattr.flags = 0;
3274 kvmattr.group = group;
3275 kvmattr.attr = attr;
3276 kvmattr.addr = (uintptr_t)val;
3277
3278 err = kvm_device_ioctl(fd,
3279 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3280 &kvmattr);
3281 if (err < 0) {
3282 error_setg_errno(errp, -err,
3283 "KVM_%s_DEVICE_ATTR failed: Group %d "
3284 "attr 0x%016" PRIx64,
3285 write ? "SET" : "GET", group, attr);
3286 }
3287 return err;
3288 }
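
/*
 * Illustrative sketch (not part of the original file): callers usually probe
 * with kvm_device_check_attr() first and then read or write the attribute
 * through kvm_device_access(), reporting failures via Error.  The "dev_fd",
 * "group" and "attr" values below are hypothetical.
 *
 *     uint64_t val;
 *     Error *err = NULL;
 *
 *     if (kvm_device_check_attr(dev_fd, group, attr) &&
 *         kvm_device_access(dev_fd, group, attr, &val, false, &err) < 0) {
 *         error_report_err(err);
 *     }
 */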
3289
3290 bool kvm_has_sync_mmu(void)
3291 {
3292 return kvm_state->sync_mmu;
3293 }
3294
3295 int kvm_has_vcpu_events(void)
3296 {
3297 return kvm_state->vcpu_events;
3298 }
3299
3300 int kvm_has_robust_singlestep(void)
3301 {
3302 return kvm_state->robust_singlestep;
3303 }
3304
3305 int kvm_has_debugregs(void)
3306 {
3307 return kvm_state->debugregs;
3308 }
3309
3310 int kvm_max_nested_state_length(void)
3311 {
3312 return kvm_state->max_nested_state_len;
3313 }
3314
3315 int kvm_has_many_ioeventfds(void)
3316 {
3317 if (!kvm_enabled()) {
3318 return 0;
3319 }
3320 return kvm_state->many_ioeventfds;
3321 }
3322
3323 int kvm_has_gsi_routing(void)
3324 {
3325 #ifdef KVM_CAP_IRQ_ROUTING
3326 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3327 #else
3328 return false;
3329 #endif
3330 }
3331
3332 int kvm_has_intx_set_mask(void)
3333 {
3334 return kvm_state->intx_set_mask;
3335 }
3336
3337 bool kvm_arm_supports_user_irq(void)
3338 {
3339 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3340 }
3341
3342 #ifdef KVM_CAP_SET_GUEST_DEBUG
3343 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
3344 {
3345 struct kvm_sw_breakpoint *bp;
3346
3347 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3348 if (bp->pc == pc) {
3349 return bp;
3350 }
3351 }
3352 return NULL;
3353 }
3354
3355 int kvm_sw_breakpoints_active(CPUState *cpu)
3356 {
3357 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3358 }
3359
3360 struct kvm_set_guest_debug_data {
3361 struct kvm_guest_debug dbg;
3362 int err;
3363 };
3364
3365 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3366 {
3367 struct kvm_set_guest_debug_data *dbg_data =
3368 (struct kvm_set_guest_debug_data *) data.host_ptr;
3369
3370 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3371 &dbg_data->dbg);
3372 }
3373
3374 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3375 {
3376 struct kvm_set_guest_debug_data data;
3377
3378 data.dbg.control = reinject_trap;
3379
3380 if (cpu->singlestep_enabled) {
3381 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3382
3383 if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3384 data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3385 }
3386 }
3387 kvm_arch_update_guest_debug(cpu, &data.dbg);
3388
3389 run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3390 RUN_ON_CPU_HOST_PTR(&data));
3391 return data.err;
3392 }
3393
3394 bool kvm_supports_guest_debug(void)
3395 {
3396 /* probed during kvm_init() */
3397 return kvm_has_guest_debug;
3398 }
3399
3400 int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3401 {
3402 struct kvm_sw_breakpoint *bp;
3403 int err;
3404
3405 if (type == GDB_BREAKPOINT_SW) {
3406 bp = kvm_find_sw_breakpoint(cpu, addr);
3407 if (bp) {
3408 bp->use_count++;
3409 return 0;
3410 }
3411
3412 bp = g_new(struct kvm_sw_breakpoint, 1);
3413 bp->pc = addr;
3414 bp->use_count = 1;
3415 err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3416 if (err) {
3417 g_free(bp);
3418 return err;
3419 }
3420
3421 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3422 } else {
3423 err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3424 if (err) {
3425 return err;
3426 }
3427 }
3428
3429 CPU_FOREACH(cpu) {
3430 err = kvm_update_guest_debug(cpu, 0);
3431 if (err) {
3432 return err;
3433 }
3434 }
3435 return 0;
3436 }
3437
3438 int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3439 {
3440 struct kvm_sw_breakpoint *bp;
3441 int err;
3442
3443 if (type == GDB_BREAKPOINT_SW) {
3444 bp = kvm_find_sw_breakpoint(cpu, addr);
3445 if (!bp) {
3446 return -ENOENT;
3447 }
3448
3449 if (bp->use_count > 1) {
3450 bp->use_count--;
3451 return 0;
3452 }
3453
3454 err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3455 if (err) {
3456 return err;
3457 }
3458
3459 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3460 g_free(bp);
3461 } else {
3462 err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3463 if (err) {
3464 return err;
3465 }
3466 }
3467
3468 CPU_FOREACH(cpu) {
3469 err = kvm_update_guest_debug(cpu, 0);
3470 if (err) {
3471 return err;
3472 }
3473 }
3474 return 0;
3475 }
3476
3477 void kvm_remove_all_breakpoints(CPUState *cpu)
3478 {
3479 struct kvm_sw_breakpoint *bp, *next;
3480 KVMState *s = cpu->kvm_state;
3481 CPUState *tmpcpu;
3482
3483 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3484 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3485 /* Try harder to find a CPU that currently sees the breakpoint. */
3486 CPU_FOREACH(tmpcpu) {
3487 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3488 break;
3489 }
3490 }
3491 }
3492 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3493 g_free(bp);
3494 }
3495 kvm_arch_remove_all_hw_breakpoints();
3496
3497 CPU_FOREACH(cpu) {
3498 kvm_update_guest_debug(cpu, 0);
3499 }
3500 }
3501
3502 #endif /* KVM_CAP_SET_GUEST_DEBUG */
3503
3504 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3505 {
3506 KVMState *s = kvm_state;
3507 struct kvm_signal_mask *sigmask;
3508 int r;
3509
3510 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3511
3512 sigmask->len = s->sigmask_len;
3513 memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3514 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3515 g_free(sigmask);
3516
3517 return r;
3518 }
3519
3520 static void kvm_ipi_signal(int sig)
3521 {
3522 if (current_cpu) {
3523 assert(kvm_immediate_exit);
3524 kvm_cpu_kick(current_cpu);
3525 }
3526 }
3527
3528 void kvm_init_cpu_signals(CPUState *cpu)
3529 {
3530 int r;
3531 sigset_t set;
3532 struct sigaction sigact;
3533
3534 memset(&sigact, 0, sizeof(sigact));
3535 sigact.sa_handler = kvm_ipi_signal;
3536 sigaction(SIG_IPI, &sigact, NULL);
3537
3538 pthread_sigmask(SIG_BLOCK, NULL, &set);
3539 #if defined KVM_HAVE_MCE_INJECTION
3540 sigdelset(&set, SIGBUS);
3541 pthread_sigmask(SIG_SETMASK, &set, NULL);
3542 #endif
3543 sigdelset(&set, SIG_IPI);
3544 if (kvm_immediate_exit) {
3545 r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3546 } else {
3547 r = kvm_set_signal_mask(cpu, &set);
3548 }
3549 if (r) {
3550 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3551 exit(1);
3552 }
3553 }
3554
3555 /* Called asynchronously in VCPU thread. */
3556 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3557 {
3558 #ifdef KVM_HAVE_MCE_INJECTION
3559 if (have_sigbus_pending) {
3560 return 1;
3561 }
3562 have_sigbus_pending = true;
3563 pending_sigbus_addr = addr;
3564 pending_sigbus_code = code;
3565 qatomic_set(&cpu->exit_request, 1);
3566 return 0;
3567 #else
3568 return 1;
3569 #endif
3570 }
3571
3572 /* Called synchronously (via signalfd) in main thread. */
3573 int kvm_on_sigbus(int code, void *addr)
3574 {
3575 #ifdef KVM_HAVE_MCE_INJECTION
3576 /* An action-required MCE kills the process if SIGBUS is blocked. Because
3577 * that's what happens in the I/O thread, where we handle MCEs via signalfd,
3578 * we can only get action-optional MCEs here.
3579 */
3580 assert(code != BUS_MCEERR_AR);
3581 kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3582 return 0;
3583 #else
3584 return 1;
3585 #endif
3586 }
3587
3588 int kvm_create_device(KVMState *s, uint64_t type, bool test)
3589 {
3590 int ret;
3591 struct kvm_create_device create_dev;
3592
3593 create_dev.type = type;
3594 create_dev.fd = -1;
3595 create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3596
3597 if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3598 return -ENOTSUP;
3599 }
3600
3601 ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3602 if (ret) {
3603 return ret;
3604 }
3605
3606 return test ? 0 : create_dev.fd;
3607 }
3608
3609 bool kvm_device_supported(int vmfd, uint64_t type)
3610 {
3611 struct kvm_create_device create_dev = {
3612 .type = type,
3613 .fd = -1,
3614 .flags = KVM_CREATE_DEVICE_TEST,
3615 };
3616
3617 if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3618 return false;
3619 }
3620
3621 return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3622 }
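
/*
 * Illustrative sketch (not part of the original file): the usual pattern is
 * to probe with test=true (or kvm_device_supported()) and only then create
 * the device for real.  KVM_DEV_TYPE_VFIO is just one example of a device
 * type from the KVM uapi headers.
 *
 *     int fd = -1;
 *
 *     if (kvm_create_device(kvm_state, KVM_DEV_TYPE_VFIO, true) == 0) {
 *         fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_VFIO, false);
 *     }
 *     if (fd < 0) {
 *         // in-kernel device unavailable; use a userspace fallback
 *     }
 */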
3623
3624 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3625 {
3626 struct kvm_one_reg reg;
3627 int r;
3628
3629 reg.id = id;
3630 reg.addr = (uintptr_t) source;
3631 r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3632 if (r) {
3633 trace_kvm_failed_reg_set(id, strerror(-r));
3634 }
3635 return r;
3636 }
3637
3638 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3639 {
3640 struct kvm_one_reg reg;
3641 int r;
3642
3643 reg.id = id;
3644 reg.addr = (uintptr_t) target;
3645 r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3646 if (r) {
3647 trace_kvm_failed_reg_get(id, strerror(-r));
3648 }
3649 return r;
3650 }
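
/*
 * Illustrative sketch (not part of the original file): the ONE_REG helpers
 * copy a single register between the vCPU and a caller-provided buffer.
 * The register id encodes architecture, size and register number and is
 * target-specific, so "reg_id" below is a placeholder.
 *
 *     uint64_t val;
 *
 *     if (kvm_get_one_reg(cs, reg_id, &val) == 0) {
 *         val |= 1;
 *         kvm_set_one_reg(cs, reg_id, &val);
 *     }
 */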
3651
3652 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3653 hwaddr start_addr, hwaddr size)
3654 {
3655 KVMState *kvm = KVM_STATE(ms->accelerator);
3656 int i;
3657
3658 for (i = 0; i < kvm->nr_as; ++i) {
3659 if (kvm->as[i].as == as && kvm->as[i].ml) {
3660 size = MIN(kvm_max_slot_size, size);
3661 return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3662 start_addr, size);
3663 }
3664 }
3665
3666 return false;
3667 }
3668
3669 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3670 const char *name, void *opaque,
3671 Error **errp)
3672 {
3673 KVMState *s = KVM_STATE(obj);
3674 int64_t value = s->kvm_shadow_mem;
3675
3676 visit_type_int(v, name, &value, errp);
3677 }
3678
3679 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3680 const char *name, void *opaque,
3681 Error **errp)
3682 {
3683 KVMState *s = KVM_STATE(obj);
3684 int64_t value;
3685
3686 if (s->fd != -1) {
3687 error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3688 return;
3689 }
3690
3691 if (!visit_type_int(v, name, &value, errp)) {
3692 return;
3693 }
3694
3695 s->kvm_shadow_mem = value;
3696 }
3697
3698 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3699 const char *name, void *opaque,
3700 Error **errp)
3701 {
3702 KVMState *s = KVM_STATE(obj);
3703 OnOffSplit mode;
3704
3705 if (s->fd != -1) {
3706 error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3707 return;
3708 }
3709
3710 if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3711 return;
3712 }
3713 switch (mode) {
3714 case ON_OFF_SPLIT_ON:
3715 s->kernel_irqchip_allowed = true;
3716 s->kernel_irqchip_required = true;
3717 s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3718 break;
3719 case ON_OFF_SPLIT_OFF:
3720 s->kernel_irqchip_allowed = false;
3721 s->kernel_irqchip_required = false;
3722 s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3723 break;
3724 case ON_OFF_SPLIT_SPLIT:
3725 s->kernel_irqchip_allowed = true;
3726 s->kernel_irqchip_required = true;
3727 s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3728 break;
3729 default:
3730 /* The value was checked in visit_type_OnOffSplit() above. If
3731 * we get here, then something is wrong in QEMU.
3732 */
3733 abort();
3734 }
3735 }
3736
3737 bool kvm_kernel_irqchip_allowed(void)
3738 {
3739 return kvm_state->kernel_irqchip_allowed;
3740 }
3741
3742 bool kvm_kernel_irqchip_required(void)
3743 {
3744 return kvm_state->kernel_irqchip_required;
3745 }
3746
3747 bool kvm_kernel_irqchip_split(void)
3748 {
3749 return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3750 }
3751
3752 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
3753 const char *name, void *opaque,
3754 Error **errp)
3755 {
3756 KVMState *s = KVM_STATE(obj);
3757 uint32_t value = s->kvm_dirty_ring_size;
3758
3759 visit_type_uint32(v, name, &value, errp);
3760 }
3761
3762 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
3763 const char *name, void *opaque,
3764 Error **errp)
3765 {
3766 KVMState *s = KVM_STATE(obj);
3767 uint32_t value;
3768
3769 if (s->fd != -1) {
3770 error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3771 return;
3772 }
3773
3774 if (!visit_type_uint32(v, name, &value, errp)) {
3775 return;
3776 }
3777 if (value & (value - 1)) {
3778 error_setg(errp, "dirty-ring-size must be a power of two.");
3779 return;
3780 }
3781
3782 s->kvm_dirty_ring_size = value;
3783 }
3784
3785 static void kvm_accel_instance_init(Object *obj)
3786 {
3787 KVMState *s = KVM_STATE(obj);
3788
3789 s->fd = -1;
3790 s->vmfd = -1;
3791 s->kvm_shadow_mem = -1;
3792 s->kernel_irqchip_allowed = true;
3793 s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
3794 /* KVM dirty ring is by default off */
3795 s->kvm_dirty_ring_size = 0;
3796 s->kvm_dirty_ring_with_bitmap = false;
3797 s->kvm_eager_split_size = 0;
3798 s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
3799 s->notify_window = 0;
3800 s->xen_version = 0;
3801 s->xen_gnttab_max_frames = 64;
3802 s->xen_evtchn_max_pirq = 256;
3803 }
3804
3805 /**
3806 * kvm_gdbstub_sstep_flags():
3807 *
3808 * Returns: SSTEP_* flags that KVM supports for guest debug. The
3809 * support is probed during kvm_init()
3810 */
3811 static int kvm_gdbstub_sstep_flags(void)
3812 {
3813 return kvm_sstep_flags;
3814 }
3815
3816 static void kvm_accel_class_init(ObjectClass *oc, void *data)
3817 {
3818 AccelClass *ac = ACCEL_CLASS(oc);
3819 ac->name = "KVM";
3820 ac->init_machine = kvm_init;
3821 ac->has_memory = kvm_accel_has_memory;
3822 ac->allowed = &kvm_allowed;
3823 ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;
3824
3825 object_class_property_add(oc, "kernel-irqchip", "on|off|split",
3826 NULL, kvm_set_kernel_irqchip,
3827 NULL, NULL);
3828 object_class_property_set_description(oc, "kernel-irqchip",
3829 "Configure KVM in-kernel irqchip");
3830
3831 object_class_property_add(oc, "kvm-shadow-mem", "int",
3832 kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3833 NULL, NULL);
3834 object_class_property_set_description(oc, "kvm-shadow-mem",
3835 "KVM shadow MMU size");
3836
3837 object_class_property_add(oc, "dirty-ring-size", "uint32",
3838 kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
3839 NULL, NULL);
3840 object_class_property_set_description(oc, "dirty-ring-size",
3841 "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
3842
3843 kvm_arch_accel_class_init(oc);
3844 }
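
/*
 * Illustrative note (not part of the original file): the properties
 * registered above surface on the command line, e.g.
 *
 *     qemu-system-x86_64 -accel kvm,kernel-irqchip=split,dirty-ring-size=4096
 *
 * dirty-ring-size must be a power of two (enforced by the setter above) and
 * kernel-irqchip accepts on, off or split.
 */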
3845
3846 static const TypeInfo kvm_accel_type = {
3847 .name = TYPE_KVM_ACCEL,
3848 .parent = TYPE_ACCEL,
3849 .instance_init = kvm_accel_instance_init,
3850 .class_init = kvm_accel_class_init,
3851 .instance_size = sizeof(KVMState),
3852 };
3853
3854 static void kvm_type_init(void)
3855 {
3856 type_register_static(&kvm_accel_type);
3857 }
3858
3859 type_init(kvm_type_init);
3860
3861 typedef struct StatsArgs {
3862 union StatsResultsType {
3863 StatsResultList **stats;
3864 StatsSchemaList **schema;
3865 } result;
3866 strList *names;
3867 Error **errp;
3868 } StatsArgs;
3869
3870 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
3871 uint64_t *stats_data,
3872 StatsList *stats_list,
3873 Error **errp)
3874 {
3875
3876 Stats *stats;
3877 uint64List *val_list = NULL;
3878
3879 /* Only add stats that we understand. */
3880 switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3881 case KVM_STATS_TYPE_CUMULATIVE:
3882 case KVM_STATS_TYPE_INSTANT:
3883 case KVM_STATS_TYPE_PEAK:
3884 case KVM_STATS_TYPE_LINEAR_HIST:
3885 case KVM_STATS_TYPE_LOG_HIST:
3886 break;
3887 default:
3888 return stats_list;
3889 }
3890
3891 switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3892 case KVM_STATS_UNIT_NONE:
3893 case KVM_STATS_UNIT_BYTES:
3894 case KVM_STATS_UNIT_CYCLES:
3895 case KVM_STATS_UNIT_SECONDS:
3896 case KVM_STATS_UNIT_BOOLEAN:
3897 break;
3898 default:
3899 return stats_list;
3900 }
3901
3902 switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3903 case KVM_STATS_BASE_POW10:
3904 case KVM_STATS_BASE_POW2:
3905 break;
3906 default:
3907 return stats_list;
3908 }
3909
3910 /* Alloc and populate data list */
3911 stats = g_new0(Stats, 1);
3912 stats->name = g_strdup(pdesc->name);
3913 stats->value = g_new0(StatsValue, 1);
3914
3915 if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
3916 stats->value->u.boolean = *stats_data;
3917 stats->value->type = QTYPE_QBOOL;
3918 } else if (pdesc->size == 1) {
3919 stats->value->u.scalar = *stats_data;
3920 stats->value->type = QTYPE_QNUM;
3921 } else {
3922 int i;
3923 for (i = 0; i < pdesc->size; i++) {
3924 QAPI_LIST_PREPEND(val_list, stats_data[i]);
3925 }
3926 stats->value->u.list = val_list;
3927 stats->value->type = QTYPE_QLIST;
3928 }
3929
3930 QAPI_LIST_PREPEND(stats_list, stats);
3931 return stats_list;
3932 }
3933
3934 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
3935 StatsSchemaValueList *list,
3936 Error **errp)
3937 {
3938 StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
3939 schema_entry->value = g_new0(StatsSchemaValue, 1);
3940
3941 switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3942 case KVM_STATS_TYPE_CUMULATIVE:
3943 schema_entry->value->type = STATS_TYPE_CUMULATIVE;
3944 break;
3945 case KVM_STATS_TYPE_INSTANT:
3946 schema_entry->value->type = STATS_TYPE_INSTANT;
3947 break;
3948 case KVM_STATS_TYPE_PEAK:
3949 schema_entry->value->type = STATS_TYPE_PEAK;
3950 break;
3951 case KVM_STATS_TYPE_LINEAR_HIST:
3952 schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
3953 schema_entry->value->bucket_size = pdesc->bucket_size;
3954 schema_entry->value->has_bucket_size = true;
3955 break;
3956 case KVM_STATS_TYPE_LOG_HIST:
3957 schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
3958 break;
3959 default:
3960 goto exit;
3961 }
3962
3963 switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3964 case KVM_STATS_UNIT_NONE:
3965 break;
3966 case KVM_STATS_UNIT_BOOLEAN:
3967 schema_entry->value->has_unit = true;
3968 schema_entry->value->unit = STATS_UNIT_BOOLEAN;
3969 break;
3970 case KVM_STATS_UNIT_BYTES:
3971 schema_entry->value->has_unit = true;
3972 schema_entry->value->unit = STATS_UNIT_BYTES;
3973 break;
3974 case KVM_STATS_UNIT_CYCLES:
3975 schema_entry->value->has_unit = true;
3976 schema_entry->value->unit = STATS_UNIT_CYCLES;
3977 break;
3978 case KVM_STATS_UNIT_SECONDS:
3979 schema_entry->value->has_unit = true;
3980 schema_entry->value->unit = STATS_UNIT_SECONDS;
3981 break;
3982 default:
3983 goto exit;
3984 }
3985
3986 schema_entry->value->exponent = pdesc->exponent;
3987 if (pdesc->exponent) {
3988 switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3989 case KVM_STATS_BASE_POW10:
3990 schema_entry->value->has_base = true;
3991 schema_entry->value->base = 10;
3992 break;
3993 case KVM_STATS_BASE_POW2:
3994 schema_entry->value->has_base = true;
3995 schema_entry->value->base = 2;
3996 break;
3997 default:
3998 goto exit;
3999 }
4000 }
4001
4002 schema_entry->value->name = g_strdup(pdesc->name);
4003 schema_entry->next = list;
4004 return schema_entry;
4005 exit:
4006 g_free(schema_entry->value);
4007 g_free(schema_entry);
4008 return list;
4009 }
4010
4011 /* Cached stats descriptors */
4012 typedef struct StatsDescriptors {
4013 const char *ident; /* cache key, currently the StatsTarget */
4014 struct kvm_stats_desc *kvm_stats_desc;
4015 struct kvm_stats_header kvm_stats_header;
4016 QTAILQ_ENTRY(StatsDescriptors) next;
4017 } StatsDescriptors;
4018
4019 static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
4020 QTAILQ_HEAD_INITIALIZER(stats_descriptors);
4021
4022 /*
4023 * Return the descriptors for 'target', which either have already been read
4024 * or are retrieved from 'stats_fd'.
4025 */
4026 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
4027 Error **errp)
4028 {
4029 StatsDescriptors *descriptors;
4030 const char *ident;
4031 struct kvm_stats_desc *kvm_stats_desc;
4032 struct kvm_stats_header *kvm_stats_header;
4033 size_t size_desc;
4034 ssize_t ret;
4035
4036 ident = StatsTarget_str(target);
4037 QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
4038 if (g_str_equal(descriptors->ident, ident)) {
4039 return descriptors;
4040 }
4041 }
4042
4043 descriptors = g_new0(StatsDescriptors, 1);
4044
4045 /* Read stats header */
4046 kvm_stats_header = &descriptors->kvm_stats_header;
4047 ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
4048 if (ret != sizeof(*kvm_stats_header)) {
4049 error_setg(errp, "KVM stats: failed to read stats header: "
4050 "expected %zu actual %zu",
4051 sizeof(*kvm_stats_header), ret);
4052 g_free(descriptors);
4053 return NULL;
4054 }
4055 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4056
4057 /* Read stats descriptors */
4058 kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
4059 ret = pread(stats_fd, kvm_stats_desc,
4060 size_desc * kvm_stats_header->num_desc,
4061 kvm_stats_header->desc_offset);
4062
4063 if (ret != size_desc * kvm_stats_header->num_desc) {
4064 error_setg(errp, "KVM stats: failed to read stats descriptors: "
4065 "expected %zu actual %zu",
4066 size_desc * kvm_stats_header->num_desc, ret);
4067 g_free(descriptors);
4068 g_free(kvm_stats_desc);
4069 return NULL;
4070 }
4071 descriptors->kvm_stats_desc = kvm_stats_desc;
4072 descriptors->ident = ident;
4073 QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
4074 return descriptors;
4075 }
4076
4077 static void query_stats(StatsResultList **result, StatsTarget target,
4078 strList *names, int stats_fd, CPUState *cpu,
4079 Error **errp)
4080 {
4081 struct kvm_stats_desc *kvm_stats_desc;
4082 struct kvm_stats_header *kvm_stats_header;
4083 StatsDescriptors *descriptors;
4084 g_autofree uint64_t *stats_data = NULL;
4085 struct kvm_stats_desc *pdesc;
4086 StatsList *stats_list = NULL;
4087 size_t size_desc, size_data = 0;
4088 ssize_t ret;
4089 int i;
4090
4091 descriptors = find_stats_descriptors(target, stats_fd, errp);
4092 if (!descriptors) {
4093 return;
4094 }
4095
4096 kvm_stats_header = &descriptors->kvm_stats_header;
4097 kvm_stats_desc = descriptors->kvm_stats_desc;
4098 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4099
4100 /* Tally the total data size */
4101 for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4102 pdesc = (void *)kvm_stats_desc + i * size_desc;
4103 size_data += pdesc->size * sizeof(*stats_data);
4104 }
4105
4106 stats_data = g_malloc0(size_data);
4107 ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
4108
4109 if (ret != size_data) {
4110 error_setg(errp, "KVM stats: failed to read data: "
4111 "expected %zu actual %zu", size_data, ret);
4112 return;
4113 }
4114
4115 for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4116 uint64_t *stats;
4117 pdesc = (void *)kvm_stats_desc + i * size_desc;
4118
4119 /* Add entry to the list */
4120 stats = (void *)stats_data + pdesc->offset;
4121 if (!apply_str_list_filter(pdesc->name, names)) {
4122 continue;
4123 }
4124 stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
4125 }
4126
4127 if (!stats_list) {
4128 return;
4129 }
4130
4131 switch (target) {
4132 case STATS_TARGET_VM:
4133 add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
4134 break;
4135 case STATS_TARGET_VCPU:
4136 add_stats_entry(result, STATS_PROVIDER_KVM,
4137 cpu->parent_obj.canonical_path,
4138 stats_list);
4139 break;
4140 default:
4141 g_assert_not_reached();
4142 }
4143 }
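
/*
 * A minimal sketch of the value extraction used in the loop above, assuming
 * the whole data block has already been read into a buffer as query_stats()
 * does: each descriptor contributes 'size' 64-bit words starting 'offset'
 * bytes into that buffer. The helper name and its scalar-only return value
 * are hypothetical simplifications.
 */
static uint64_t stats_first_value(const struct kvm_stats_desc *pdesc,
                                  const void *stats_data)
{
    const uint64_t *values = stats_data + pdesc->offset;

    /* Scalar stats have size == 1; histogram stats carry one word per bucket */
    return values[0];
}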
4144
4145 static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
4146 int stats_fd, Error **errp)
4147 {
4148 struct kvm_stats_desc *kvm_stats_desc;
4149 struct kvm_stats_header *kvm_stats_header;
4150 StatsDescriptors *descriptors;
4151 struct kvm_stats_desc *pdesc;
4152 StatsSchemaValueList *stats_list = NULL;
4153 size_t size_desc;
4154 int i;
4155
4156 descriptors = find_stats_descriptors(target, stats_fd, errp);
4157 if (!descriptors) {
4158 return;
4159 }
4160
4161 kvm_stats_header = &descriptors->kvm_stats_header;
4162 kvm_stats_desc = descriptors->kvm_stats_desc;
4163 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4164
4165 /* Build a schema entry for each stats descriptor */
4166 for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4167 pdesc = (void *)kvm_stats_desc + i * size_desc;
4168 stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4169 }
4170
4171 add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4172 }
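
/*
 * A minimal sketch of how a descriptor's 'flags' word encodes its schema:
 * the type and unit are bit fields selected with the KVM_STATS_TYPE_MASK
 * and KVM_STATS_UNIT_MASK constants from <linux/kvm.h>, which is the kind
 * of decoding add_kvmschema_entry() presumably performs. The helper name
 * is hypothetical.
 */
static void stats_desc_kind(const struct kvm_stats_desc *pdesc,
                            uint32_t *type, uint32_t *unit)
{
    /* e.g. KVM_STATS_TYPE_CUMULATIVE vs. KVM_STATS_TYPE_INSTANT */
    *type = pdesc->flags & KVM_STATS_TYPE_MASK;
    /* e.g. KVM_STATS_UNIT_BYTES vs. KVM_STATS_UNIT_SECONDS */
    *unit = pdesc->flags & KVM_STATS_UNIT_MASK;
}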
4173
4174 static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4175 {
4176 int stats_fd = cpu->kvm_vcpu_stats_fd;
4177 Error *local_err = NULL;
4178
4179 if (stats_fd == -1) {
4180 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4181 error_propagate(kvm_stats_args->errp, local_err);
4182 return;
4183 }
4184 query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
4185 kvm_stats_args->names, stats_fd, cpu,
4186 kvm_stats_args->errp);
4187 }
4188
4189 static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4190 {
4191 int stats_fd = cpu->kvm_vcpu_stats_fd;
4192 Error *local_err = NULL;
4193
4194 if (stats_fd == -1) {
4195 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4196 error_propagate(kvm_stats_args->errp, local_err);
4197 return;
4198 }
4199 query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
4200 kvm_stats_args->errp);
4201 }
4202
4203 static void query_stats_cb(StatsResultList **result, StatsTarget target,
4204 strList *names, strList *targets, Error **errp)
4205 {
4206 KVMState *s = kvm_state;
4207 CPUState *cpu;
4208 int stats_fd;
4209
4210 switch (target) {
4211 case STATS_TARGET_VM:
4212 {
4213 stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4214 if (stats_fd < 0) {
4215 error_setg_errno(errp, -stats_fd, "KVM stats: ioctl failed");
4216 return;
4217 }
4218 query_stats(result, target, names, stats_fd, NULL, errp);
4219 close(stats_fd);
4220 break;
4221 }
4222 case STATS_TARGET_VCPU:
4223 {
4224 StatsArgs stats_args;
4225 stats_args.result.stats = result;
4226 stats_args.names = names;
4227 stats_args.errp = errp;
4228 CPU_FOREACH(cpu) {
4229 if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4230 continue;
4231 }
4232 query_stats_vcpu(cpu, &stats_args);
4233 }
4234 break;
4235 }
4236 default:
4237 break;
4238 }
4239 }
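
/*
 * A minimal sketch of the raw handshake behind the STATS_TARGET_VM branch
 * above, assuming only this file's includes: KVM_GET_STATS_FD yields a
 * read-only file descriptor whose contents begin with a struct
 * kvm_stats_header. Error handling is trimmed and the helper name is
 * hypothetical.
 */
static int stats_read_vm_header(KVMState *s, struct kvm_stats_header *hdr)
{
    int stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
    int ret = 0;

    if (stats_fd < 0) {
        return stats_fd;
    }
    if (pread(stats_fd, hdr, sizeof(*hdr), 0) != sizeof(*hdr)) {
        ret = -EIO;
    }
    close(stats_fd);
    return ret;
}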
4240
4241 void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4242 {
4243 StatsArgs stats_args;
4244 KVMState *s = kvm_state;
4245 int stats_fd;
4246
4247 stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4248 if (stats_fd < 0) {
4249 error_setg_errno(errp, -stats_fd, "KVM stats: ioctl failed");
4250 return;
4251 }
4252 query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4253 close(stats_fd);
4254
4255 if (first_cpu) {
4256 stats_args.result.schema = result;
4257 stats_args.errp = errp;
4258 query_stats_schema_vcpu(first_cpu, &stats_args);
4259 }
4260 }
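
/*
 * Only first_cpu is consulted above because every vCPU of a VM exposes the
 * same stats schema. A minimal sketch of how a consumer of that schema
 * could turn a raw counter into a scaled value using the descriptor's
 * base/exponent fields (KVM_STATS_BASE_POW10 vs. KVM_STATS_BASE_POW2 from
 * <linux/kvm.h>); the helper name is hypothetical, and the QAPI schema
 * reports base and exponent so clients can do this scaling themselves.
 */
static double stats_scale_value(const struct kvm_stats_desc *pdesc,
                                uint64_t raw)
{
    int base = (pdesc->flags & KVM_STATS_BASE_MASK) == KVM_STATS_BASE_POW2
               ? 2 : 10;
    double val = raw;
    int e;

    /* Negative exponents denote sub-unit scales, e.g. nanoseconds = 10^-9 s */
    for (e = pdesc->exponent; e > 0; e--) {
        val *= base;
    }
    for (e = pdesc->exponent; e < 0; e++) {
        val /= base;
    }
    return val;
}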