hw/vfio/common.c (mirror_qemu.git), at commit "vfio: Check that IOMMU MR translates to system address space"
1 /*
2 * generic functions used by VFIO devices
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21 #include "qemu/osdep.h"
22 #include <sys/ioctl.h>
23 #include <sys/mman.h>
24 #include <linux/vfio.h>
25
26 #include "hw/vfio/vfio-common.h"
27 #include "hw/vfio/vfio.h"
28 #include "exec/address-spaces.h"
29 #include "exec/memory.h"
30 #include "hw/hw.h"
31 #include "qemu/error-report.h"
32 #include "sysemu/kvm.h"
33 #ifdef CONFIG_KVM
34 #include "linux/kvm.h"
35 #endif
36 #include "trace.h"
37
38 struct vfio_group_head vfio_group_list =
39 QLIST_HEAD_INITIALIZER(vfio_group_list);
40 struct vfio_as_head vfio_address_spaces =
41 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
42
43 #ifdef CONFIG_KVM
44 /*
45 * We have a single VFIO pseudo device per KVM VM. Once created it lives
46 * for the life of the VM. Closing the file descriptor only drops our
47 * reference to it and the device's reference to kvm. Therefore once
48 * initialized, this file descriptor is only released on QEMU exit and
49 * we'll re-use it should another vfio device be attached before then.
50 */
51 static int vfio_kvm_device_fd = -1;
52 #endif
53
54 /*
55 * Common VFIO interrupt disable
56 */
57 void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
58 {
59 struct vfio_irq_set irq_set = {
60 .argsz = sizeof(irq_set),
61 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
62 .index = index,
63 .start = 0,
64 .count = 0,
65 };
66
67 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
68 }
69
70 void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
71 {
72 struct vfio_irq_set irq_set = {
73 .argsz = sizeof(irq_set),
74 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
75 .index = index,
76 .start = 0,
77 .count = 1,
78 };
79
80 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
81 }
82
83 void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
84 {
85 struct vfio_irq_set irq_set = {
86 .argsz = sizeof(irq_set),
87 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
88 .index = index,
89 .start = 0,
90 .count = 1,
91 };
92
93 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
94 }
95
96 /*
97 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
98 */
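/*
 * Slow-path accessors: every read/write below is forwarded to the kernel
 * with pread()/pwrite() on the device fd at region->fd_offset + addr and
 * converted between guest format and VFIO's little-endian layout.  Where a
 * region supports it, vfio_region_mmap() later overlays mmap'd subregions
 * so that most guest accesses never reach these handlers.
 */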
99 void vfio_region_write(void *opaque, hwaddr addr,
100 uint64_t data, unsigned size)
101 {
102 VFIORegion *region = opaque;
103 VFIODevice *vbasedev = region->vbasedev;
104 union {
105 uint8_t byte;
106 uint16_t word;
107 uint32_t dword;
108 uint64_t qword;
109 } buf;
110
111 switch (size) {
112 case 1:
113 buf.byte = data;
114 break;
115 case 2:
116 buf.word = cpu_to_le16(data);
117 break;
118 case 4:
119 buf.dword = cpu_to_le32(data);
120 break;
121 default:
122 hw_error("vfio: unsupported write size, %d bytes", size);
123 break;
124 }
125
126 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
127 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
128 ",%d) failed: %m",
129 __func__, vbasedev->name, region->nr,
130 addr, data, size);
131 }
132
133 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
134
135 /*
136 * A read or write to a BAR always signals an INTx EOI. This will
137 * do nothing if not pending (including not in INTx mode). We assume
138 * that a BAR access is in response to an interrupt and that BAR
139 * accesses will service the interrupt. Unfortunately, we don't know
140 * which access will service the interrupt, so we're potentially
141 * getting quite a few host interrupts per guest interrupt.
142 */
143 vbasedev->ops->vfio_eoi(vbasedev);
144 }
145
146 uint64_t vfio_region_read(void *opaque,
147 hwaddr addr, unsigned size)
148 {
149 VFIORegion *region = opaque;
150 VFIODevice *vbasedev = region->vbasedev;
151 union {
152 uint8_t byte;
153 uint16_t word;
154 uint32_t dword;
155 uint64_t qword;
156 } buf;
157 uint64_t data = 0;
158
159 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
160 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
161 __func__, vbasedev->name, region->nr,
162 addr, size);
163 return (uint64_t)-1;
164 }
165 switch (size) {
166 case 1:
167 data = buf.byte;
168 break;
169 case 2:
170 data = le16_to_cpu(buf.word);
171 break;
172 case 4:
173 data = le32_to_cpu(buf.dword);
174 break;
175 default:
176 hw_error("vfio: unsupported read size, %d bytes", size);
177 break;
178 }
179
180 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
181
182 /* Same as write above */
183 vbasedev->ops->vfio_eoi(vbasedev);
184
185 return data;
186 }
187
188 const MemoryRegionOps vfio_region_ops = {
189 .read = vfio_region_read,
190 .write = vfio_region_write,
191 .endianness = DEVICE_LITTLE_ENDIAN,
192 };
193
194 /*
195 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
196 */
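/*
 * These helpers wrap the type1 VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA
 * ioctls on the container fd.  A successful map makes the device IOVA range
 * [iova, iova + size) translate to host virtual memory starting at vaddr;
 * unmap tears down a previously established range.  A typical call from the
 * memory listener below looks roughly like (placeholder variable names):
 *
 *     vfio_dma_map(container, section_iova, section_size,
 *                  host_vaddr_of_section, section_is_readonly);
 */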
197 static int vfio_dma_unmap(VFIOContainer *container,
198 hwaddr iova, ram_addr_t size)
199 {
200 struct vfio_iommu_type1_dma_unmap unmap = {
201 .argsz = sizeof(unmap),
202 .flags = 0,
203 .iova = iova,
204 .size = size,
205 };
206
207 if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
208 error_report("VFIO_UNMAP_DMA: %d", -errno);
209 return -errno;
210 }
211
212 return 0;
213 }
214
215 static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
216 ram_addr_t size, void *vaddr, bool readonly)
217 {
218 struct vfio_iommu_type1_dma_map map = {
219 .argsz = sizeof(map),
220 .flags = VFIO_DMA_MAP_FLAG_READ,
221 .vaddr = (__u64)(uintptr_t)vaddr,
222 .iova = iova,
223 .size = size,
224 };
225
226 if (!readonly) {
227 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
228 }
229
230 /*
231 * Try the mapping; if it fails with EBUSY, unmap the region and try
232 * again. This shouldn't be necessary, but we sometimes see it in
233 * the VGA ROM space.
234 */
235 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
236 (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
237 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
238 return 0;
239 }
240
241 error_report("VFIO_MAP_DMA: %d", -errno);
242 return -errno;
243 }
244
245 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
246 {
247 return (!memory_region_is_ram(section->mr) &&
248 !memory_region_is_iommu(section->mr)) ||
249 /*
250 * Sizing an enabled 64-bit BAR can cause spurious mappings to
251 * addresses in the upper part of the 64-bit address space. These
252 * are never accessed by the CPU and beyond the address width of
253 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
254 */
255 section->offset_within_address_space & (1ULL << 63);
256 }
257
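/*
 * Notifier attached to a guest IOMMU memory region by
 * vfio_listener_region_add().  Each guest IOTLB update is mirrored into the
 * host: the translated address is resolved through the system address space
 * to a RAM block and the result is mapped (or unmapped, for invalidations)
 * in the container at iova + iommu_offset.
 */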
258 static void vfio_iommu_map_notify(Notifier *n, void *data)
259 {
260 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
261 VFIOContainer *container = giommu->container;
262 IOMMUTLBEntry *iotlb = data;
263 hwaddr iova = iotlb->iova + giommu->iommu_offset;
264 MemoryRegion *mr;
265 hwaddr xlat;
266 hwaddr len = iotlb->addr_mask + 1;
267 void *vaddr;
268 int ret;
269
270 trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask);
271
272 if (iotlb->target_as != &address_space_memory) {
273 error_report("Wrong target AS \"%s\", only system memory is allowed",
274 iotlb->target_as->name ? iotlb->target_as->name : "none");
275 return;
276 }
277
278 /*
279 * The IOMMU TLB entry we have just covers translation through
280 * this IOMMU to its immediate target. We need to translate
281 * it the rest of the way through to memory.
282 */
283 rcu_read_lock();
284 mr = address_space_translate(&address_space_memory,
285 iotlb->translated_addr,
286 &xlat, &len, iotlb->perm & IOMMU_WO);
287 if (!memory_region_is_ram(mr)) {
288 error_report("iommu map to non memory area %"HWADDR_PRIx"",
289 xlat);
290 goto out;
291 }
292 /*
293 * Translation truncates length to the IOMMU page size,
294 * check that it did not truncate too much.
295 */
296 if (len & iotlb->addr_mask) {
297 error_report("iommu has granularity incompatible with target AS");
298 goto out;
299 }
300
301 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
302 vaddr = memory_region_get_ram_ptr(mr) + xlat;
303 ret = vfio_dma_map(container, iova,
304 iotlb->addr_mask + 1, vaddr,
305 !(iotlb->perm & IOMMU_WO) || mr->readonly);
306 if (ret) {
307 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
308 "0x%"HWADDR_PRIx", %p) = %d (%m)",
309 container, iova,
310 iotlb->addr_mask + 1, vaddr, ret);
311 }
312 } else {
313 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
314 if (ret) {
315 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
316 "0x%"HWADDR_PRIx") = %d (%m)",
317 container, iova,
318 iotlb->addr_mask + 1, ret);
319 }
320 }
321 out:
322 rcu_read_unlock();
323 }
324
325 static hwaddr vfio_container_granularity(VFIOContainer *container)
326 {
327 return (hwaddr)1 << ctz64(container->iova_pgsizes);
328 }
329
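/*
 * MemoryListener callbacks that keep the container's host IOMMU mappings in
 * sync with the VM address space: RAM sections are mapped directly by host
 * virtual address, while guest IOMMU regions get a VFIOGuestIOMMU notifier
 * registered (and replayed at the container's IOVA page granularity) so that
 * individual IOTLB entries flow through vfio_iommu_map_notify() above.
 */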
330 static void vfio_listener_region_add(MemoryListener *listener,
331 MemoryRegionSection *section)
332 {
333 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
334 hwaddr iova, end;
335 Int128 llend, llsize;
336 void *vaddr;
337 int ret;
338
339 if (vfio_listener_skipped_section(section)) {
340 trace_vfio_listener_region_add_skip(
341 section->offset_within_address_space,
342 section->offset_within_address_space +
343 int128_get64(int128_sub(section->size, int128_one())));
344 return;
345 }
346
347 if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
348 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
349 error_report("%s received unaligned region", __func__);
350 return;
351 }
352
353 iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
354 llend = int128_make64(section->offset_within_address_space);
355 llend = int128_add(llend, section->size);
356 llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
357
358 if (int128_ge(int128_make64(iova), llend)) {
359 return;
360 }
361 end = int128_get64(int128_sub(llend, int128_one()));
362
363 if ((iova < container->min_iova) || (end > container->max_iova)) {
364 error_report("vfio: IOMMU container %p can't map guest IOVA region"
365 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx,
366 container, iova, end);
367 ret = -EFAULT;
368 goto fail;
369 }
370
371 memory_region_ref(section->mr);
372
373 if (memory_region_is_iommu(section->mr)) {
374 VFIOGuestIOMMU *giommu;
375
376 trace_vfio_listener_region_add_iommu(iova, end);
377 /*
378 * FIXME: We should do some checking to see if the
379 * capabilities of the host VFIO IOMMU are adequate to model
380 * the guest IOMMU
381 *
382 * FIXME: For VFIO iommu types which have KVM acceleration to
383 * avoid bouncing all map/unmaps through qemu this way, this
384 * would be the right place to wire that up (tell the KVM
385 * device emulation the VFIO iommu handles to use).
386 */
387 giommu = g_malloc0(sizeof(*giommu));
388 giommu->iommu = section->mr;
389 giommu->iommu_offset = section->offset_within_address_space -
390 section->offset_within_region;
391 giommu->container = container;
392 giommu->n.notify = vfio_iommu_map_notify;
393 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
394
395 memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
396 memory_region_iommu_replay(giommu->iommu, &giommu->n,
397 vfio_container_granularity(container),
398 false);
399
400 return;
401 }
402
403 /* Here we assume that memory_region_is_ram(section->mr)==true */
404
405 vaddr = memory_region_get_ram_ptr(section->mr) +
406 section->offset_within_region +
407 (iova - section->offset_within_address_space);
408
409 trace_vfio_listener_region_add_ram(iova, end, vaddr);
410
411 llsize = int128_sub(llend, int128_make64(iova));
412
413 ret = vfio_dma_map(container, iova, int128_get64(llsize),
414 vaddr, section->readonly);
415 if (ret) {
416 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
417 "0x%"HWADDR_PRIx", %p) = %d (%m)",
418 container, iova, int128_get64(llsize), vaddr, ret);
419 goto fail;
420 }
421
422 return;
423
424 fail:
425 /*
426 * On the initfn path, store the first error in the container so we
427 * can gracefully fail. At runtime, there's not much we can do other
428 * than throw a hardware error.
429 */
430 if (!container->initialized) {
431 if (!container->error) {
432 container->error = ret;
433 }
434 } else {
435 hw_error("vfio: DMA mapping failed, unable to continue");
436 }
437 }
438
439 static void vfio_listener_region_del(MemoryListener *listener,
440 MemoryRegionSection *section)
441 {
442 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
443 hwaddr iova, end;
444 Int128 llend, llsize;
445 int ret;
446
447 if (vfio_listener_skipped_section(section)) {
448 trace_vfio_listener_region_del_skip(
449 section->offset_within_address_space,
450 section->offset_within_address_space +
451 int128_get64(int128_sub(section->size, int128_one())));
452 return;
453 }
454
455 if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
456 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
457 error_report("%s received unaligned region", __func__);
458 return;
459 }
460
461 if (memory_region_is_iommu(section->mr)) {
462 VFIOGuestIOMMU *giommu;
463
464 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
465 if (giommu->iommu == section->mr) {
466 memory_region_unregister_iommu_notifier(&giommu->n);
467 QLIST_REMOVE(giommu, giommu_next);
468 g_free(giommu);
469 break;
470 }
471 }
472
473 /*
474 * FIXME: We assume the one big unmap below is adequate to
475 * remove any individual page mappings in the IOMMU which
476 * might have been copied into VFIO. This works for a page table
477 * based IOMMU where a big unmap flattens a large range of IO-PTEs.
478 * That may not be true for all IOMMU types.
479 */
480 }
481
482 iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
483 llend = int128_make64(section->offset_within_address_space);
484 llend = int128_add(llend, section->size);
485 llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
486
487 if (int128_ge(int128_make64(iova), llend)) {
488 return;
489 }
490 end = int128_get64(int128_sub(llend, int128_one()));
491
492 llsize = int128_sub(llend, int128_make64(iova));
493
494 trace_vfio_listener_region_del(iova, end);
495
496 ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
497 memory_region_unref(section->mr);
498 if (ret) {
499 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
500 "0x%"HWADDR_PRIx") = %d (%m)",
501 container, iova, int128_get64(llsize), ret);
502 }
503 }
504
505 static const MemoryListener vfio_memory_listener = {
506 .region_add = vfio_listener_region_add,
507 .region_del = vfio_listener_region_del,
508 };
509
510 static void vfio_listener_release(VFIOContainer *container)
511 {
512 memory_listener_unregister(&container->listener);
513 }
514
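/*
 * Walk the capability chain of a vfio_region_info.  Capability headers are
 * linked by byte offsets relative to the start of the info structure; a
 * next offset of zero (hdr == ptr) terminates the chain.
 */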
515 static struct vfio_info_cap_header *
516 vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
517 {
518 struct vfio_info_cap_header *hdr;
519 void *ptr = info;
520
521 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
522 return NULL;
523 }
524
525 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
526 if (hdr->id == id) {
527 return hdr;
528 }
529 }
530
531 return NULL;
532 }
533
534 static void vfio_setup_region_sparse_mmaps(VFIORegion *region,
535 struct vfio_region_info *info)
536 {
537 struct vfio_info_cap_header *hdr;
538 struct vfio_region_info_cap_sparse_mmap *sparse;
539 int i;
540
541 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
542 if (!hdr) {
543 return;
544 }
545
546 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
547
548 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
549 region->nr, sparse->nr_areas);
550
551 region->nr_mmaps = sparse->nr_areas;
552 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
553
554 for (i = 0; i < region->nr_mmaps; i++) {
555 region->mmaps[i].offset = sparse->areas[i].offset;
556 region->mmaps[i].size = sparse->areas[i].size;
557 trace_vfio_region_sparse_mmap_entry(i, region->mmaps[i].offset,
558 region->mmaps[i].offset +
559 region->mmaps[i].size);
560 }
561 }
562
563 int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
564 int index, const char *name)
565 {
566 struct vfio_region_info *info;
567 int ret;
568
569 ret = vfio_get_region_info(vbasedev, index, &info);
570 if (ret) {
571 return ret;
572 }
573
574 region->vbasedev = vbasedev;
575 region->flags = info->flags;
576 region->size = info->size;
577 region->fd_offset = info->offset;
578 region->nr = index;
579
580 if (region->size) {
581 region->mem = g_new0(MemoryRegion, 1);
582 memory_region_init_io(region->mem, obj, &vfio_region_ops,
583 region, name, region->size);
584
585 if (!vbasedev->no_mmap &&
586 region->flags & VFIO_REGION_INFO_FLAG_MMAP &&
587 !(region->size & ~qemu_real_host_page_mask)) {
588
589 vfio_setup_region_sparse_mmaps(region, info);
590
591 if (!region->nr_mmaps) {
592 region->nr_mmaps = 1;
593 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
594 region->mmaps[0].offset = 0;
595 region->mmaps[0].size = region->size;
596 }
597 }
598 }
599
600 g_free(info);
601
602 trace_vfio_region_setup(vbasedev->name, index, name,
603 region->flags, region->fd_offset, region->size);
604 return 0;
605 }
606
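/*
 * Map the mmap-able areas of a region and overlay each one as a RAM-backed
 * subregion of region->mem, so guest accesses bypass the pread()/pwrite()
 * slow path.  Callers typically pair this with vfio_region_setup(), roughly
 * (illustrative, placeholder arguments, error handling elided):
 *
 *     vfio_region_setup(OBJECT(dev), vbasedev, region, index, name);
 *     vfio_region_mmap(region);
 *
 * On an mmap() failure the subregions created so far are torn down and the
 * error is returned.
 */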
607 int vfio_region_mmap(VFIORegion *region)
608 {
609 int i, prot = 0;
610 char *name;
611
612 if (!region->mem) {
613 return 0;
614 }
615
616 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
617 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
618
619 for (i = 0; i < region->nr_mmaps; i++) {
620 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
621 MAP_SHARED, region->vbasedev->fd,
622 region->fd_offset +
623 region->mmaps[i].offset);
624 if (region->mmaps[i].mmap == MAP_FAILED) {
625 int ret = -errno;
626
627 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
628 region->fd_offset +
629 region->mmaps[i].offset,
630 region->fd_offset +
631 region->mmaps[i].offset +
632 region->mmaps[i].size - 1, ret);
633
634 region->mmaps[i].mmap = NULL;
635
636 for (i--; i >= 0; i--) {
637 memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
638 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
639 object_unparent(OBJECT(&region->mmaps[i].mem));
640 region->mmaps[i].mmap = NULL;
641 }
642
643 return ret;
644 }
645
646 name = g_strdup_printf("%s mmaps[%d]",
647 memory_region_name(region->mem), i);
648 memory_region_init_ram_ptr(&region->mmaps[i].mem,
649 memory_region_owner(region->mem),
650 name, region->mmaps[i].size,
651 region->mmaps[i].mmap);
652 g_free(name);
653 memory_region_set_skip_dump(&region->mmaps[i].mem);
654 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
655 &region->mmaps[i].mem);
656
657 trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
658 region->mmaps[i].offset,
659 region->mmaps[i].offset +
660 region->mmaps[i].size - 1);
661 }
662
663 return 0;
664 }
665
666 void vfio_region_exit(VFIORegion *region)
667 {
668 int i;
669
670 if (!region->mem) {
671 return;
672 }
673
674 for (i = 0; i < region->nr_mmaps; i++) {
675 if (region->mmaps[i].mmap) {
676 memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
677 }
678 }
679
680 trace_vfio_region_exit(region->vbasedev->name, region->nr);
681 }
682
683 void vfio_region_finalize(VFIORegion *region)
684 {
685 int i;
686
687 if (!region->mem) {
688 return;
689 }
690
691 for (i = 0; i < region->nr_mmaps; i++) {
692 if (region->mmaps[i].mmap) {
693 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
694 object_unparent(OBJECT(&region->mmaps[i].mem));
695 }
696 }
697
698 object_unparent(OBJECT(region->mem));
699
700 g_free(region->mem);
701 g_free(region->mmaps);
702
703 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
704 }
705
706 void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
707 {
708 int i;
709
710 if (!region->mem) {
711 return;
712 }
713
714 for (i = 0; i < region->nr_mmaps; i++) {
715 if (region->mmaps[i].mmap) {
716 memory_region_set_enabled(&region->mmaps[i].mem, enabled);
717 }
718 }
719
720 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
721 enabled);
722 }
723
724 void vfio_reset_handler(void *opaque)
725 {
726 VFIOGroup *group;
727 VFIODevice *vbasedev;
728
729 QLIST_FOREACH(group, &vfio_group_list, next) {
730 QLIST_FOREACH(vbasedev, &group->device_list, next) {
731 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
732 }
733 }
734
735 QLIST_FOREACH(group, &vfio_group_list, next) {
736 QLIST_FOREACH(vbasedev, &group->device_list, next) {
737 if (vbasedev->needs_reset) {
738 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
739 }
740 }
741 }
742 }
743
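/*
 * Report group membership to the per-VM KVM VFIO pseudo device
 * (KVM_DEV_VFIO_GROUP_ADD/DEL) when KVM is in use, so that KVM can take
 * assigned devices into account, e.g. for non-coherent DMA handling.
 */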
744 static void vfio_kvm_device_add_group(VFIOGroup *group)
745 {
746 #ifdef CONFIG_KVM
747 struct kvm_device_attr attr = {
748 .group = KVM_DEV_VFIO_GROUP,
749 .attr = KVM_DEV_VFIO_GROUP_ADD,
750 .addr = (uint64_t)(unsigned long)&group->fd,
751 };
752
753 if (!kvm_enabled()) {
754 return;
755 }
756
757 if (vfio_kvm_device_fd < 0) {
758 struct kvm_create_device cd = {
759 .type = KVM_DEV_TYPE_VFIO,
760 };
761
762 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
763 error_report("Failed to create KVM VFIO device: %m");
764 return;
765 }
766
767 vfio_kvm_device_fd = cd.fd;
768 }
769
770 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
771 error_report("Failed to add group %d to KVM VFIO device: %m",
772 group->groupid);
773 }
774 #endif
775 }
776
777 static void vfio_kvm_device_del_group(VFIOGroup *group)
778 {
779 #ifdef CONFIG_KVM
780 struct kvm_device_attr attr = {
781 .group = KVM_DEV_VFIO_GROUP,
782 .attr = KVM_DEV_VFIO_GROUP_DEL,
783 .addr = (uint64_t)(unsigned long)&group->fd,
784 };
785
786 if (vfio_kvm_device_fd < 0) {
787 return;
788 }
789
790 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
791 error_report("Failed to remove group %d from KVM VFIO device: %m",
792 group->groupid);
793 }
794 #endif
795 }
796
797 static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
798 {
799 VFIOAddressSpace *space;
800
801 QLIST_FOREACH(space, &vfio_address_spaces, list) {
802 if (space->as == as) {
803 return space;
804 }
805 }
806
807 /* No suitable VFIOAddressSpace, create a new one */
808 space = g_malloc0(sizeof(*space));
809 space->as = as;
810 QLIST_INIT(&space->containers);
811
812 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
813
814 return space;
815 }
816
817 static void vfio_put_address_space(VFIOAddressSpace *space)
818 {
819 if (QLIST_EMPTY(&space->containers)) {
820 QLIST_REMOVE(space, list);
821 g_free(space);
822 }
823 }
824
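/*
 * Attach a group to a container for the given address space.  An existing
 * container in the same VFIOAddressSpace is reused if the kernel accepts
 * the group; otherwise a new container is opened on /dev/vfio/vfio, an
 * IOMMU model is negotiated (type1/type1v2 or sPAPR TCE), the usable IOVA
 * range and page sizes are recorded, and the memory listener is registered
 * to populate the initial mappings.
 */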
825 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as)
826 {
827 VFIOContainer *container;
828 int ret, fd;
829 VFIOAddressSpace *space;
830
831 space = vfio_get_address_space(as);
832
833 QLIST_FOREACH(container, &space->containers, next) {
834 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
835 group->container = container;
836 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
837 return 0;
838 }
839 }
840
841 fd = qemu_open("/dev/vfio/vfio", O_RDWR);
842 if (fd < 0) {
843 error_report("vfio: failed to open /dev/vfio/vfio: %m");
844 ret = -errno;
845 goto put_space_exit;
846 }
847
848 ret = ioctl(fd, VFIO_GET_API_VERSION);
849 if (ret != VFIO_API_VERSION) {
850 error_report("vfio: supported vfio version: %d, "
851 "reported version: %d", VFIO_API_VERSION, ret);
852 ret = -EINVAL;
853 goto close_fd_exit;
854 }
855
856 container = g_malloc0(sizeof(*container));
857 container->space = space;
858 container->fd = fd;
859 if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) ||
860 ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
861 bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU);
862 struct vfio_iommu_type1_info info;
863
864 ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
865 if (ret) {
866 error_report("vfio: failed to set group container: %m");
867 ret = -errno;
868 goto free_container_exit;
869 }
870
871 ret = ioctl(fd, VFIO_SET_IOMMU,
872 v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU);
873 if (ret) {
874 error_report("vfio: failed to set iommu for container: %m");
875 ret = -errno;
876 goto free_container_exit;
877 }
878
879 /*
880 * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
881 * IOVA whatsoever. That's not actually true, but the current
882 * kernel interface doesn't tell us what it can map, and the
883 * existing Type1 IOMMUs generally support any IOVA we're
884 * going to actually try in practice.
885 */
886 container->min_iova = 0;
887 container->max_iova = (hwaddr)-1;
888
889 /* Assume just 4K IOVA page size */
890 container->iova_pgsizes = 0x1000;
891 info.argsz = sizeof(info);
892 ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info);
893 /* Ignore errors */
894 if ((ret == 0) && (info.flags & VFIO_IOMMU_INFO_PGSIZES)) {
895 container->iova_pgsizes = info.iova_pgsizes;
896 }
897 } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) {
898 struct vfio_iommu_spapr_tce_info info;
899
900 ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
901 if (ret) {
902 error_report("vfio: failed to set group container: %m");
903 ret = -errno;
904 goto free_container_exit;
905 }
906 ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
907 if (ret) {
908 error_report("vfio: failed to set iommu for container: %m");
909 ret = -errno;
910 goto free_container_exit;
911 }
912
913 /*
914 * The host kernel code implementing VFIO_IOMMU_DISABLE is called
915 * when container fd is closed so we do not call it explicitly
916 * in this file.
917 */
918 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
919 if (ret) {
920 error_report("vfio: failed to enable container: %m");
921 ret = -errno;
922 goto free_container_exit;
923 }
924
925 /*
926 * This only considers the host IOMMU's 32-bit window. At
927 * some point we need to add support for the optional 64-bit
928 * window and dynamic windows
929 */
930 info.argsz = sizeof(info);
931 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
932 if (ret) {
933 error_report("vfio: VFIO_IOMMU_SPAPR_TCE_GET_INFO failed: %m");
934 ret = -errno;
935 goto free_container_exit;
936 }
937 container->min_iova = info.dma32_window_start;
938 container->max_iova = container->min_iova + info.dma32_window_size - 1;
939
940 /* Assume just 4K IOVA pages for now */
941 container->iova_pgsizes = 0x1000;
942 } else {
943 error_report("vfio: No available IOMMU models");
944 ret = -EINVAL;
945 goto free_container_exit;
946 }
947
948 container->listener = vfio_memory_listener;
949
950 memory_listener_register(&container->listener, container->space->as);
951
952 if (container->error) {
953 ret = container->error;
954 error_report("vfio: memory listener initialization failed for container");
955 goto listener_release_exit;
956 }
957
958 container->initialized = true;
959
960 QLIST_INIT(&container->group_list);
961 QLIST_INSERT_HEAD(&space->containers, container, next);
962
963 group->container = container;
964 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
965
966 return 0;
967 listener_release_exit:
968 vfio_listener_release(container);
969
970 free_container_exit:
971 g_free(container);
972
973 close_fd_exit:
974 close(fd);
975
976 put_space_exit:
977 vfio_put_address_space(space);
978
979 return ret;
980 }
981
982 static void vfio_disconnect_container(VFIOGroup *group)
983 {
984 VFIOContainer *container = group->container;
985
986 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
987 error_report("vfio: error disconnecting group %d from container",
988 group->groupid);
989 }
990
991 QLIST_REMOVE(group, container_next);
992 group->container = NULL;
993
994 if (QLIST_EMPTY(&container->group_list)) {
995 VFIOAddressSpace *space = container->space;
996 VFIOGuestIOMMU *giommu, *tmp;
997
998 vfio_listener_release(container);
999 QLIST_REMOVE(container, next);
1000
1001 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
1002 memory_region_unregister_iommu_notifier(&giommu->n);
1003 QLIST_REMOVE(giommu, giommu_next);
1004 g_free(giommu);
1005 }
1006
1007 trace_vfio_disconnect_container(container->fd);
1008 close(container->fd);
1009 g_free(container);
1010
1011 vfio_put_address_space(space);
1012 }
1013 }
1014
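/*
 * Open (or reuse) the VFIO group for an iommu_group id and bind it to a
 * container in the requested address space.  A device front end would
 * roughly do (illustrative sequence, placeholder names):
 *
 *     group = vfio_get_group(groupid, as);
 *     vfio_get_device(group, name, vbasedev);
 *     ... vfio_region_setup() / vfio_region_mmap() per region ...
 *
 * with vfio_put_base_device() and vfio_put_group() on the teardown path.
 */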
1015 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as)
1016 {
1017 VFIOGroup *group;
1018 char path[32];
1019 struct vfio_group_status status = { .argsz = sizeof(status) };
1020
1021 QLIST_FOREACH(group, &vfio_group_list, next) {
1022 if (group->groupid == groupid) {
1023 /* Found it. Now is it already in the right context? */
1024 if (group->container->space->as == as) {
1025 return group;
1026 } else {
1027 error_report("vfio: group %d used in multiple address spaces",
1028 group->groupid);
1029 return NULL;
1030 }
1031 }
1032 }
1033
1034 group = g_malloc0(sizeof(*group));
1035
1036 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
1037 group->fd = qemu_open(path, O_RDWR);
1038 if (group->fd < 0) {
1039 error_report("vfio: error opening %s: %m", path);
1040 goto free_group_exit;
1041 }
1042
1043 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
1044 error_report("vfio: error getting group status: %m");
1045 goto close_fd_exit;
1046 }
1047
1048 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
1049 error_report("vfio: error, group %d is not viable, please ensure "
1050 "all devices within the iommu_group are bound to their "
1051 "vfio bus driver.", groupid);
1052 goto close_fd_exit;
1053 }
1054
1055 group->groupid = groupid;
1056 QLIST_INIT(&group->device_list);
1057
1058 if (vfio_connect_container(group, as)) {
1059 error_report("vfio: failed to setup container for group %d", groupid);
1060 goto close_fd_exit;
1061 }
1062
1063 if (QLIST_EMPTY(&vfio_group_list)) {
1064 qemu_register_reset(vfio_reset_handler, NULL);
1065 }
1066
1067 QLIST_INSERT_HEAD(&vfio_group_list, group, next);
1068
1069 vfio_kvm_device_add_group(group);
1070
1071 return group;
1072
1073 close_fd_exit:
1074 close(group->fd);
1075
1076 free_group_exit:
1077 g_free(group);
1078
1079 return NULL;
1080 }
1081
1082 void vfio_put_group(VFIOGroup *group)
1083 {
1084 if (!group || !QLIST_EMPTY(&group->device_list)) {
1085 return;
1086 }
1087
1088 vfio_kvm_device_del_group(group);
1089 vfio_disconnect_container(group);
1090 QLIST_REMOVE(group, next);
1091 trace_vfio_put_group(group->fd);
1092 close(group->fd);
1093 g_free(group);
1094
1095 if (QLIST_EMPTY(&vfio_group_list)) {
1096 qemu_unregister_reset(vfio_reset_handler, NULL);
1097 }
1098 }
1099
1100 int vfio_get_device(VFIOGroup *group, const char *name,
1101 VFIODevice *vbasedev)
1102 {
1103 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
1104 int ret, fd;
1105
1106 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
1107 if (fd < 0) {
1108 error_report("vfio: error getting device %s from group %d: %m",
1109 name, group->groupid);
1110 error_printf("Verify all devices in group %d are bound to vfio-<bus> "
1111 "or pci-stub and not already in use\n", group->groupid);
1112 return fd;
1113 }
1114
1115 ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
1116 if (ret) {
1117 error_report("vfio: error getting device info: %m");
1118 close(fd);
1119 return ret;
1120 }
1121
1122 vbasedev->fd = fd;
1123 vbasedev->group = group;
1124 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
1125
1126 vbasedev->num_irqs = dev_info.num_irqs;
1127 vbasedev->num_regions = dev_info.num_regions;
1128 vbasedev->flags = dev_info.flags;
1129
1130 trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
1131 dev_info.num_irqs);
1132
1133 vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
1134 return 0;
1135 }
1136
1137 void vfio_put_base_device(VFIODevice *vbasedev)
1138 {
1139 if (!vbasedev->group) {
1140 return;
1141 }
1142 QLIST_REMOVE(vbasedev, next);
1143 vbasedev->group = NULL;
1144 trace_vfio_put_base_device(vbasedev->fd);
1145 close(vbasedev->fd);
1146 }
1147
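/*
 * Query region info using the argsz handshake: the kernel reports the size
 * it needs in info->argsz and, if that exceeds what was supplied, the
 * buffer is grown and the ioctl retried.  On success the caller owns *info
 * and must g_free() it, e.g. (illustrative):
 *
 *     struct vfio_region_info *info;
 *     if (!vfio_get_region_info(vbasedev, index, &info)) {
 *         ... use info->size, info->offset, info->flags ...
 *         g_free(info);
 *     }
 */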
1148 int vfio_get_region_info(VFIODevice *vbasedev, int index,
1149 struct vfio_region_info **info)
1150 {
1151 size_t argsz = sizeof(struct vfio_region_info);
1152
1153 *info = g_malloc0(argsz);
1154
1155 (*info)->index = index;
1156 retry:
1157 (*info)->argsz = argsz;
1158
1159 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
1160 g_free(*info);
1161 *info = NULL;
1162 return -errno;
1163 }
1164
1165 if ((*info)->argsz > argsz) {
1166 argsz = (*info)->argsz;
1167 *info = g_realloc(*info, argsz);
1168
1169 goto retry;
1170 }
1171
1172 return 0;
1173 }
1174
1175 int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
1176 uint32_t subtype, struct vfio_region_info **info)
1177 {
1178 int i;
1179
1180 for (i = 0; i < vbasedev->num_regions; i++) {
1181 struct vfio_info_cap_header *hdr;
1182 struct vfio_region_info_cap_type *cap_type;
1183
1184 if (vfio_get_region_info(vbasedev, i, info)) {
1185 continue;
1186 }
1187
1188 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
1189 if (!hdr) {
1190 g_free(*info);
1191 continue;
1192 }
1193
1194 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
1195
1196 trace_vfio_get_dev_region(vbasedev->name, i,
1197 cap_type->type, cap_type->subtype);
1198
1199 if (cap_type->type == type && cap_type->subtype == subtype) {
1200 return 0;
1201 }
1202
1203 g_free(*info);
1204 }
1205
1206 *info = NULL;
1207 return -ENODEV;
1208 }
1209
1210 /*
1211 * Interfaces for IBM EEH (Enhanced Error Handling)
1212 */
1213 static bool vfio_eeh_container_ok(VFIOContainer *container)
1214 {
1215 /*
1216 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
1217 * implementation is broken if there are multiple groups in a
1218 * container. The hardware works in units of Partitionable
1219 * Endpoints (== IOMMU groups) and the EEH operations naively
1220 * iterate across all groups in the container, without any logic
1221 * to make sure the groups have their state synchronized. For
1222 * certain operations (ENABLE) that might be ok, until an error
1223 * occurs, but for others (GET_STATE) it's clearly broken.
1224 */
1225
1226 /*
1227 * XXX Once fixed kernels exist, test for them here
1228 */
1229
1230 if (QLIST_EMPTY(&container->group_list)) {
1231 return false;
1232 }
1233
1234 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
1235 return false;
1236 }
1237
1238 return true;
1239 }
1240
1241 static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
1242 {
1243 struct vfio_eeh_pe_op pe_op = {
1244 .argsz = sizeof(pe_op),
1245 .op = op,
1246 };
1247 int ret;
1248
1249 if (!vfio_eeh_container_ok(container)) {
1250 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
1251 "kernel requires a container with exactly one group", op);
1252 return -EPERM;
1253 }
1254
1255 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
1256 if (ret < 0) {
1257 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
1258 return -errno;
1259 }
1260
1261 return 0;
1262 }
1263
1264 static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
1265 {
1266 VFIOAddressSpace *space = vfio_get_address_space(as);
1267 VFIOContainer *container = NULL;
1268
1269 if (QLIST_EMPTY(&space->containers)) {
1270 /* No containers to act on */
1271 goto out;
1272 }
1273
1274 container = QLIST_FIRST(&space->containers);
1275
1276 if (QLIST_NEXT(container, next)) {
1277 /* We don't yet have logic to synchronize EEH state across
1278 * multiple containers */
1279 container = NULL;
1280 goto out;
1281 }
1282
1283 out:
1284 vfio_put_address_space(space);
1285 return container;
1286 }
1287
1288 bool vfio_eeh_as_ok(AddressSpace *as)
1289 {
1290 VFIOContainer *container = vfio_eeh_as_container(as);
1291
1292 return (container != NULL) && vfio_eeh_container_ok(container);
1293 }
1294
1295 int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
1296 {
1297 VFIOContainer *container = vfio_eeh_as_container(as);
1298
1299 if (!container) {
1300 return -ENODEV;
1301 }
1302 return vfio_eeh_container_op(container, op);
1303 }