1/*
2 * generic functions used by VFIO devices
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21#include "qemu/osdep.h"
22#include <sys/ioctl.h>
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
32#include "exec/ram_addr.h"
33#include "hw/hw.h"
34#include "qemu/error-report.h"
35#include "qemu/main-loop.h"
36#include "qemu/range.h"
37#include "sysemu/kvm.h"
38#include "sysemu/reset.h"
39#include "sysemu/runstate.h"
40#include "trace.h"
41#include "qapi/error.h"
42#include "migration/migration.h"
43#include "migration/misc.h"
44#include "migration/blocker.h"
45#include "migration/qemu-file.h"
46#include "sysemu/tpm.h"
47
48VFIOGroupList vfio_group_list =
49 QLIST_HEAD_INITIALIZER(vfio_group_list);
50static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
51 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
52
53#ifdef CONFIG_KVM
54/*
55 * We have a single VFIO pseudo device per KVM VM. Once created it lives
56 * for the life of the VM. Closing the file descriptor only drops our
57 * reference to it and the device's reference to kvm. Therefore once
58 * initialized, this file descriptor is only released on QEMU exit and
59 * we'll re-use it should another vfio device be attached before then.
60 */
61static int vfio_kvm_device_fd = -1;
62#endif
63
64/*
65 * Common VFIO interrupt disable
66 */
67void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
68{
69 struct vfio_irq_set irq_set = {
70 .argsz = sizeof(irq_set),
71 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
72 .index = index,
73 .start = 0,
74 .count = 0,
75 };
76
77 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
78}
79
80void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
81{
82 struct vfio_irq_set irq_set = {
83 .argsz = sizeof(irq_set),
84 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
85 .index = index,
86 .start = 0,
87 .count = 1,
88 };
89
90 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
91}
92
93void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
94{
95 struct vfio_irq_set irq_set = {
96 .argsz = sizeof(irq_set),
97 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
98 .index = index,
99 .start = 0,
100 .count = 1,
101 };
102
103 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
104}
105
106static inline const char *action_to_str(int action)
107{
108 switch (action) {
109 case VFIO_IRQ_SET_ACTION_MASK:
110 return "MASK";
111 case VFIO_IRQ_SET_ACTION_UNMASK:
112 return "UNMASK";
113 case VFIO_IRQ_SET_ACTION_TRIGGER:
114 return "TRIGGER";
115 default:
116 return "UNKNOWN ACTION";
117 }
118}
119
120static const char *index_to_str(VFIODevice *vbasedev, int index)
121{
122 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
123 return NULL;
124 }
125
126 switch (index) {
127 case VFIO_PCI_INTX_IRQ_INDEX:
128 return "INTX";
129 case VFIO_PCI_MSI_IRQ_INDEX:
130 return "MSI";
131 case VFIO_PCI_MSIX_IRQ_INDEX:
132 return "MSIX";
133 case VFIO_PCI_ERR_IRQ_INDEX:
134 return "ERR";
135 case VFIO_PCI_REQ_IRQ_INDEX:
136 return "REQ";
137 default:
138 return NULL;
139 }
140}
141
142static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
143{
144 switch (container->iommu_type) {
145 case VFIO_TYPE1v2_IOMMU:
146 case VFIO_TYPE1_IOMMU:
147 /*
148 * We support coordinated discarding of RAM via the RamDiscardManager.
149 */
150 return ram_block_uncoordinated_discard_disable(state);
151 default:
152 /*
153 * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
154 * RamDiscardManager, however, it is completely untested.
155 *
156 * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
157 * completely the opposite of managing mapping/pinning dynamically as
158 * required by RamDiscardManager. We would have to special-case sections
159 * with a RamDiscardManager.
160 */
161 return ram_block_discard_disable(state);
162 }
163}
164
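/*
 * Route one interrupt (index/subindex) to an eventfd via
 * VFIO_DEVICE_SET_IRQS; passing fd = -1 tears the routing down again.
 * On failure an error naming the index and action is built for the caller.
 */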
165int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
166 int action, int fd, Error **errp)
167{
168 struct vfio_irq_set *irq_set;
169 int argsz, ret = 0;
170 const char *name;
171 int32_t *pfd;
172
173 argsz = sizeof(*irq_set) + sizeof(*pfd);
174
175 irq_set = g_malloc0(argsz);
176 irq_set->argsz = argsz;
177 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
178 irq_set->index = index;
179 irq_set->start = subindex;
180 irq_set->count = 1;
181 pfd = (int32_t *)&irq_set->data;
182 *pfd = fd;
183
184 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
185 ret = -errno;
186 }
187 g_free(irq_set);
188
189 if (!ret) {
190 return 0;
191 }
192
193 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
194
195 name = index_to_str(vbasedev, index);
196 if (name) {
197 error_prepend(errp, "%s-%d: ", name, subindex);
198 } else {
199 error_prepend(errp, "index %d-%d: ", index, subindex);
200 }
201 error_prepend(errp,
202 "Failed to %s %s eventfd signaling for interrupt ",
203 fd < 0 ? "tear down" : "set up", action_to_str(action));
204 return ret;
205}
206
207/*
208 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
209 */
210void vfio_region_write(void *opaque, hwaddr addr,
211 uint64_t data, unsigned size)
212{
213 VFIORegion *region = opaque;
214 VFIODevice *vbasedev = region->vbasedev;
215 union {
216 uint8_t byte;
217 uint16_t word;
218 uint32_t dword;
219 uint64_t qword;
220 } buf;
221
222 switch (size) {
223 case 1:
224 buf.byte = data;
225 break;
226 case 2:
227 buf.word = cpu_to_le16(data);
228 break;
229 case 4:
230 buf.dword = cpu_to_le32(data);
231 break;
232 case 8:
233 buf.qword = cpu_to_le64(data);
234 break;
235 default:
236 hw_error("vfio: unsupported write size, %u bytes", size);
237 break;
238 }
239
240 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
241 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
242 ",%d) failed: %m",
243 __func__, vbasedev->name, region->nr,
244 addr, data, size);
245 }
246
247 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
248
249 /*
250 * A read or write to a BAR always signals an INTx EOI. This will
251 * do nothing if not pending (including not in INTx mode). We assume
252 * that a BAR access is in response to an interrupt and that BAR
253 * accesses will service the interrupt. Unfortunately, we don't know
254 * which access will service the interrupt, so we're potentially
255 * getting quite a few host interrupts per guest interrupt.
256 */
257 vbasedev->ops->vfio_eoi(vbasedev);
258}
259
260uint64_t vfio_region_read(void *opaque,
261 hwaddr addr, unsigned size)
262{
263 VFIORegion *region = opaque;
264 VFIODevice *vbasedev = region->vbasedev;
265 union {
266 uint8_t byte;
267 uint16_t word;
268 uint32_t dword;
269 uint64_t qword;
270 } buf;
271 uint64_t data = 0;
272
273 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
274 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
275 __func__, vbasedev->name, region->nr,
276 addr, size);
277 return (uint64_t)-1;
278 }
279 switch (size) {
280 case 1:
281 data = buf.byte;
282 break;
283 case 2:
284 data = le16_to_cpu(buf.word);
285 break;
286 case 4:
287 data = le32_to_cpu(buf.dword);
288 break;
289 case 8:
290 data = le64_to_cpu(buf.qword);
291 break;
292 default:
293 hw_error("vfio: unsupported read size, %u bytes", size);
294 break;
295 }
296
297 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
298
299 /* Same as write above */
300 vbasedev->ops->vfio_eoi(vbasedev);
301
302 return data;
303}
304
305const MemoryRegionOps vfio_region_ops = {
306 .read = vfio_region_read,
307 .write = vfio_region_write,
308 .endianness = DEVICE_LITTLE_ENDIAN,
309 .valid = {
310 .min_access_size = 1,
311 .max_access_size = 8,
312 },
313 .impl = {
314 .min_access_size = 1,
315 .max_access_size = 8,
316 },
317};
318
319/*
320 * Device state interfaces
321 */
322
323typedef struct {
324 unsigned long *bitmap;
325 hwaddr size;
326 hwaddr pages;
327} VFIOBitmap;
328
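/*
 * Size a dirty bitmap for @size bytes of guest memory: one bit per host
 * page, rounded up so the buffer is a whole number of 64-bit words, as the
 * kernel bitmap ABI expects.  For example, assuming a 4 KiB host page size,
 * a 1 GiB range needs 262144 bits, i.e. a 32 KiB bitmap.
 */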
329static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
330{
331 vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
332 vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
333 BITS_PER_BYTE;
334 vbmap->bitmap = g_try_malloc0(vbmap->size);
335 if (!vbmap->bitmap) {
336 return -ENOMEM;
337 }
338
339 return 0;
340}
341
342bool vfio_mig_active(void)
343{
344 VFIOGroup *group;
345 VFIODevice *vbasedev;
346
347 if (QLIST_EMPTY(&vfio_group_list)) {
348 return false;
349 }
350
351 QLIST_FOREACH(group, &vfio_group_list, next) {
352 QLIST_FOREACH(vbasedev, &group->device_list, next) {
353 if (vbasedev->migration_blocker) {
354 return false;
355 }
356 }
357 }
358 return true;
359}
360
361static Error *multiple_devices_migration_blocker;
362
363static unsigned int vfio_migratable_device_num(void)
364{
365 VFIOGroup *group;
366 VFIODevice *vbasedev;
367 unsigned int device_num = 0;
368
369 QLIST_FOREACH(group, &vfio_group_list, next) {
370 QLIST_FOREACH(vbasedev, &group->device_list, next) {
371 if (vbasedev->migration) {
372 device_num++;
373 }
374 }
375 }
376
377 return device_num;
378}
379
380int vfio_block_multiple_devices_migration(Error **errp)
381{
382 int ret;
383
384 if (multiple_devices_migration_blocker ||
385 vfio_migratable_device_num() <= 1) {
386 return 0;
387 }
388
389 error_setg(&multiple_devices_migration_blocker,
390 "Migration is currently not supported with multiple "
391 "VFIO devices");
392 ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
393 if (ret < 0) {
394 error_free(multiple_devices_migration_blocker);
395 multiple_devices_migration_blocker = NULL;
396 }
397
398 return ret;
399}
400
401void vfio_unblock_multiple_devices_migration(void)
402{
403 if (!multiple_devices_migration_blocker ||
404 vfio_migratable_device_num() > 1) {
405 return;
406 }
407
408 migrate_del_blocker(multiple_devices_migration_blocker);
409 error_free(multiple_devices_migration_blocker);
410 multiple_devices_migration_blocker = NULL;
411}
412
413static void vfio_set_migration_error(int err)
414{
415 MigrationState *ms = migrate_get_current();
416
417 if (migration_is_setup_or_active(ms->state)) {
418 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
419 if (ms->to_dst_file) {
420 qemu_file_set_error(ms->to_dst_file, err);
421 }
422 }
423 }
424}
425
426static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
427{
428 VFIOGroup *group;
429 VFIODevice *vbasedev;
430 MigrationState *ms = migrate_get_current();
431
432 if (!migration_is_setup_or_active(ms->state)) {
433 return false;
434 }
435
436 QLIST_FOREACH(group, &container->group_list, container_next) {
437 QLIST_FOREACH(vbasedev, &group->device_list, next) {
438 VFIOMigration *migration = vbasedev->migration;
439
440 if (!migration) {
441 return false;
442 }
443
444 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
445 migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
446 return false;
447 }
448 }
449 }
450 return true;
451}
452
453/*
454 * Check if all VFIO devices are running and migration is active, which is
455 * essentially equivalent to the migration being in pre-copy phase.
456 */
457static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
458{
459 VFIOGroup *group;
460 VFIODevice *vbasedev;
461
462 if (!migration_is_active(migrate_get_current())) {
463 return false;
464 }
465
466 QLIST_FOREACH(group, &container->group_list, container_next) {
467 QLIST_FOREACH(vbasedev, &group->device_list, next) {
468 VFIOMigration *migration = vbasedev->migration;
469
470 if (!migration) {
471 return false;
472 }
473
474 if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
475 continue;
476 } else {
477 return false;
478 }
479 }
480 }
481 return true;
482}
483
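/*
 * Unmap a range and, in the same VFIO_IOMMU_UNMAP_DMA call, fetch the dirty
 * bitmap the kernel accumulated for it, so writes that landed right up to
 * the unmap are still folded into QEMU's dirty memory tracking.
 */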
484static int vfio_dma_unmap_bitmap(VFIOContainer *container,
485 hwaddr iova, ram_addr_t size,
486 IOMMUTLBEntry *iotlb)
487{
488 struct vfio_iommu_type1_dma_unmap *unmap;
489 struct vfio_bitmap *bitmap;
490 VFIOBitmap vbmap;
491 int ret;
492
493 ret = vfio_bitmap_alloc(&vbmap, size);
494 if (ret) {
495 return ret;
496 }
497
498 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
499
500 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
501 unmap->iova = iova;
502 unmap->size = size;
503 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
504 bitmap = (struct vfio_bitmap *)&unmap->data;
505
506 /*
507 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
508 * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
509 * to qemu_real_host_page_size.
510 */
511 bitmap->pgsize = qemu_real_host_page_size();
512 bitmap->size = vbmap.size;
513 bitmap->data = (__u64 *)vbmap.bitmap;
514
515 if (vbmap.size > container->max_dirty_bitmap_size) {
516 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
517 ret = -E2BIG;
518 goto unmap_exit;
519 }
520
521 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
522 if (!ret) {
523 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
524 iotlb->translated_addr, vbmap.pages);
525 } else {
526 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
527 }
528
529unmap_exit:
530 g_free(unmap);
531 g_free(vbmap.bitmap);
532
533 return ret;
534}
535
536/*
537 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
538 */
539static int vfio_dma_unmap(VFIOContainer *container,
540 hwaddr iova, ram_addr_t size,
541 IOMMUTLBEntry *iotlb)
542{
543 struct vfio_iommu_type1_dma_unmap unmap = {
544 .argsz = sizeof(unmap),
545 .flags = 0,
546 .iova = iova,
547 .size = size,
548 };
549
550 if (iotlb && container->dirty_pages_supported &&
551 vfio_devices_all_running_and_mig_active(container)) {
552 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
553 }
554
555 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
556 /*
557 * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
558 * v4.15) where an overflow in its wrap-around check prevents us from
559 * unmapping the last page of the address space. Test for the error
560 * condition and re-try the unmap excluding the last page. The
561 * expectation is that we've never mapped the last page anyway and this
562 * unmap request comes via vIOMMU support which also makes it unlikely
563 * that this page is used. This bug was introduced well after type1 v2
564 * support was introduced, so we shouldn't need to test for v1. A fix
565 * is queued for kernel v5.0 so this workaround can be removed once
566 * affected kernels are sufficiently deprecated.
567 */
568 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
569 container->iommu_type == VFIO_TYPE1v2_IOMMU) {
570 trace_vfio_dma_unmap_overflow_workaround();
571 unmap.size -= 1ULL << ctz64(container->pgsizes);
572 continue;
573 }
574 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
575 return -errno;
576 }
577
578 if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
579 cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
580 tcg_enabled() ? DIRTY_CLIENTS_ALL :
581 DIRTY_CLIENTS_NOCODE);
582 }
583
584 return 0;
585}
586
587static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
588 ram_addr_t size, void *vaddr, bool readonly)
589{
590 struct vfio_iommu_type1_dma_map map = {
591 .argsz = sizeof(map),
592 .flags = VFIO_DMA_MAP_FLAG_READ,
593 .vaddr = (__u64)(uintptr_t)vaddr,
594 .iova = iova,
595 .size = size,
596 };
597
598 if (!readonly) {
599 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
600 }
601
602 /*
603 * Try the mapping, if it fails with EBUSY, unmap the region and try
604 * again. This shouldn't be necessary, but we sometimes see it in
605 * the VGA ROM space.
606 */
607 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
608 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
609 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
610 return 0;
611 }
612
613 error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
614 return -errno;
615}
616
617static void vfio_host_win_add(VFIOContainer *container,
618 hwaddr min_iova, hwaddr max_iova,
619 uint64_t iova_pgsizes)
620{
621 VFIOHostDMAWindow *hostwin;
622
623 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
624 if (ranges_overlap(hostwin->min_iova,
625 hostwin->max_iova - hostwin->min_iova + 1,
626 min_iova,
627 max_iova - min_iova + 1)) {
628 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
629 }
630 }
631
632 hostwin = g_malloc0(sizeof(*hostwin));
633
634 hostwin->min_iova = min_iova;
635 hostwin->max_iova = max_iova;
636 hostwin->iova_pgsizes = iova_pgsizes;
637 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
638}
639
640static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
641 hwaddr max_iova)
642{
643 VFIOHostDMAWindow *hostwin;
644
645 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
646 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
647 QLIST_REMOVE(hostwin, hostwin_next);
648 g_free(hostwin);
649 return 0;
650 }
651 }
652
653 return -1;
654}
655
656static bool vfio_listener_skipped_section(MemoryRegionSection *section)
657{
658 return (!memory_region_is_ram(section->mr) &&
659 !memory_region_is_iommu(section->mr)) ||
660 memory_region_is_protected(section->mr) ||
661 /*
662 * Sizing an enabled 64-bit BAR can cause spurious mappings to
663 * addresses in the upper part of the 64-bit address space. These
664 * are never accessed by the CPU and beyond the address width of
665 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
666 */
667 section->offset_within_address_space & (1ULL << 63);
668}
669
670/* Called with rcu_read_lock held. */
671static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
672 ram_addr_t *ram_addr, bool *read_only)
673{
674 bool ret, mr_has_discard_manager;
675
676 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
677 &mr_has_discard_manager);
678 if (ret && mr_has_discard_manager) {
679 /*
680 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
681 * pages will remain pinned inside vfio until unmapped, resulting in a
682 * higher memory consumption than expected. If memory would get
683 * populated again later, there would be an inconsistency between pages
684 * pinned by vfio and pages seen by QEMU. This is the case until
685 * unmapped from the IOMMU (e.g., during device reset).
686 *
687 * With malicious guests, we really only care about pinning more memory
688 * than expected. RLIMIT_MEMLOCK set for the user/process can never be
689 * exceeded and can be used to mitigate this problem.
690 */
691 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
692 " RAM (e.g., virtio-mem) works, however, malicious"
693 " guests can trigger pinning of more memory than"
694 " intended via an IOMMU. It's possible to mitigate "
695 " by setting/adjusting RLIMIT_MEMLOCK.");
696 }
697 return ret;
698}
699
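/*
 * vIOMMU notifier: mirror guest IOMMU updates into the host container.  MAP
 * events become vfio_dma_map() of the translated RAM address, UNMAP events
 * become vfio_dma_unmap(); unmap failures are fed to the migration code via
 * vfio_set_migration_error() since a missed update would break dirty
 * tracking.
 */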
700static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
701{
702 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
703 VFIOContainer *container = giommu->container;
704 hwaddr iova = iotlb->iova + giommu->iommu_offset;
705 void *vaddr;
706 int ret;
707
708 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
709 iova, iova + iotlb->addr_mask);
710
711 if (iotlb->target_as != &address_space_memory) {
712 error_report("Wrong target AS \"%s\", only system memory is allowed",
713 iotlb->target_as->name ? iotlb->target_as->name : "none");
714 vfio_set_migration_error(-EINVAL);
715 return;
716 }
717
718 rcu_read_lock();
719
720 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
721 bool read_only;
722
723 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
724 goto out;
725 }
726 /*
727 * vaddr is only valid until rcu_read_unlock(). But after
728 * vfio_dma_map has set up the mapping the pages will be
729 * pinned by the kernel. This makes sure that the RAM backend
730 * of vaddr will always be there, even if the memory object is
731 * destroyed and its backing memory munmap-ed.
732 */
733 ret = vfio_dma_map(container, iova,
734 iotlb->addr_mask + 1, vaddr,
735 read_only);
736 if (ret) {
737 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
738 "0x%"HWADDR_PRIx", %p) = %d (%s)",
739 container, iova,
740 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
741 }
742 } else {
743 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
744 if (ret) {
745 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
746 "0x%"HWADDR_PRIx") = %d (%s)",
747 container, iova,
748 iotlb->addr_mask + 1, ret, strerror(-ret));
749 vfio_set_migration_error(ret);
750 }
751 }
752out:
753 rcu_read_unlock();
754}
755
756static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
757 MemoryRegionSection *section)
758{
759 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
760 listener);
761 const hwaddr size = int128_get64(section->size);
762 const hwaddr iova = section->offset_within_address_space;
763 int ret;
764
765 /* Unmap with a single call. */
766 ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
767 if (ret) {
768 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
769 strerror(-ret));
770 }
771}
772
773static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
774 MemoryRegionSection *section)
775{
776 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
777 listener);
778 const hwaddr end = section->offset_within_region +
779 int128_get64(section->size);
780 hwaddr start, next, iova;
781 void *vaddr;
782 int ret;
783
784 /*
785 * Map in (aligned within memory region) minimum granularity, so we can
786 * unmap in minimum granularity later.
787 */
788 for (start = section->offset_within_region; start < end; start = next) {
789 next = ROUND_UP(start + 1, vrdl->granularity);
790 next = MIN(next, end);
791
792 iova = start - section->offset_within_region +
793 section->offset_within_address_space;
794 vaddr = memory_region_get_ram_ptr(section->mr) + start;
795
796 ret = vfio_dma_map(vrdl->container, iova, next - start,
797 vaddr, section->readonly);
798 if (ret) {
799 /* Rollback */
800 vfio_ram_discard_notify_discard(rdl, section);
801 return ret;
802 }
803 }
804 return 0;
805}
806
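/*
 * RamDiscardManager (e.g. virtio-mem) integration: only populated parts of
 * the section are DMA-mapped, at the provider's plug/unplug granularity, so
 * later discards can be unmapped without disturbing neighbouring blocks.
 */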
807static void vfio_register_ram_discard_listener(VFIOContainer *container,
808 MemoryRegionSection *section)
809{
810 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
811 VFIORamDiscardListener *vrdl;
812
813 /* Ignore some corner cases not relevant in practice. */
814 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
815 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
816 TARGET_PAGE_SIZE));
817 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
818
819 vrdl = g_new0(VFIORamDiscardListener, 1);
820 vrdl->container = container;
821 vrdl->mr = section->mr;
822 vrdl->offset_within_address_space = section->offset_within_address_space;
823 vrdl->size = int128_get64(section->size);
824 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
825 section->mr);
826
827 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
828 g_assert(container->pgsizes &&
829 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
830
831 ram_discard_listener_init(&vrdl->listener,
832 vfio_ram_discard_notify_populate,
833 vfio_ram_discard_notify_discard, true);
834 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
835 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
836
837 /*
838 * Sanity-check if we have a theoretically problematic setup where we could
839 * exceed the maximum number of possible DMA mappings over time. We assume
840 * that each mapped section in the same address space as a RamDiscardManager
841 * section consumes exactly one DMA mapping, with the exception of
842 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
843 * in the same address space as RamDiscardManager sections.
844 *
845 * We assume that each section in the address space consumes one memslot.
846 * We take the number of KVM memory slots as a best guess for the maximum
847 * number of sections in the address space we could have over time,
848 * also consuming DMA mappings.
849 */
850 if (container->dma_max_mappings) {
851 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
852
853#ifdef CONFIG_KVM
854 if (kvm_enabled()) {
855 max_memslots = kvm_get_max_memslots();
856 }
857#endif
858
859 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
860 hwaddr start, end;
861
862 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
863 vrdl->granularity);
864 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
865 vrdl->granularity);
866 vrdl_mappings += (end - start) / vrdl->granularity;
867 vrdl_count++;
868 }
869
870 if (vrdl_mappings + max_memslots - vrdl_count >
871 container->dma_max_mappings) {
872 warn_report("%s: possibly running out of DMA mappings. E.g., try"
873 " increasing the 'block-size' of virtio-mem devies."
874 " Maximum possible DMA mappings: %d, Maximum possible"
875 " memslots: %d", __func__, container->dma_max_mappings,
876 max_memslots);
877 }
878 }
879}
880
881static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
882 MemoryRegionSection *section)
883{
884 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
885 VFIORamDiscardListener *vrdl = NULL;
886
887 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
888 if (vrdl->mr == section->mr &&
889 vrdl->offset_within_address_space ==
890 section->offset_within_address_space) {
891 break;
892 }
893 }
894
895 if (!vrdl) {
896 hw_error("vfio: Trying to unregister missing RAM discard listener");
897 }
898
899 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
900 QLIST_REMOVE(vrdl, next);
901 g_free(vrdl);
902}
903
904static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
905 hwaddr iova, hwaddr end)
906{
907 VFIOHostDMAWindow *hostwin;
908 bool hostwin_found = false;
909
910 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
911 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
912 hostwin_found = true;
913 break;
914 }
915 }
916
917 return hostwin_found ? hostwin : NULL;
918}
919
920static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
921{
922 MemoryRegion *mr = section->mr;
923
924 if (!TPM_IS_CRB(mr->owner)) {
925 return false;
926 }
927
928 /* this is a known safe misaligned region, just trace for debug purpose */
929 trace_vfio_known_safe_misalignment(memory_region_name(mr),
930 section->offset_within_address_space,
931 section->offset_within_region,
932 qemu_real_host_page_size());
933 return true;
934}
935
936static bool vfio_listener_valid_section(MemoryRegionSection *section,
937 const char *name)
938{
939 if (vfio_listener_skipped_section(section)) {
940 trace_vfio_listener_region_skip(name,
941 section->offset_within_address_space,
942 section->offset_within_address_space +
943 int128_get64(int128_sub(section->size, int128_one())));
944 return false;
945 }
946
947 if (unlikely((section->offset_within_address_space &
948 ~qemu_real_host_page_mask()) !=
949 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
950 if (!vfio_known_safe_misalignment(section)) {
951 error_report("%s received unaligned region %s iova=0x%"PRIx64
952 " offset_within_region=0x%"PRIx64
953 " qemu_real_host_page_size=0x%"PRIxPTR,
954 __func__, memory_region_name(section->mr),
955 section->offset_within_address_space,
956 section->offset_within_region,
957 qemu_real_host_page_size());
958 }
959 return false;
960 }
961
962 return true;
963}
964
965static bool vfio_get_section_iova_range(VFIOContainer *container,
966 MemoryRegionSection *section,
967 hwaddr *out_iova, hwaddr *out_end,
968 Int128 *out_llend)
969{
970 Int128 llend;
971 hwaddr iova;
972
973 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
974 llend = int128_make64(section->offset_within_address_space);
975 llend = int128_add(llend, section->size);
976 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
977
978 if (int128_ge(int128_make64(iova), llend)) {
979 return false;
980 }
981
982 *out_iova = iova;
983 *out_end = int128_get64(int128_sub(llend, int128_one()));
984 if (out_llend) {
985 *out_llend = llend;
986 }
987 return true;
988}
989
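/*
 * MemoryListener add hook: validate the section, check it fits a host IOMMU
 * window (creating a SPAPR window on demand), then either wire up a vIOMMU
 * notifier, register a RAM discard listener, or DMA-map the RAM range
 * directly.
 */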
990static void vfio_listener_region_add(MemoryListener *listener,
991 MemoryRegionSection *section)
992{
993 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
994 hwaddr iova, end;
995 Int128 llend, llsize;
996 void *vaddr;
997 int ret;
998 VFIOHostDMAWindow *hostwin;
999 Error *err = NULL;
1000
1001 if (!vfio_listener_valid_section(section, "region_add")) {
1002 return;
1003 }
1004
1005 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
1006 if (memory_region_is_ram_device(section->mr)) {
1007 trace_vfio_listener_region_add_no_dma_map(
1008 memory_region_name(section->mr),
1009 section->offset_within_address_space,
1010 int128_getlo(section->size),
1011 qemu_real_host_page_size());
1012 }
1013 return;
1014 }
1015
1016 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1017 hwaddr pgsize = 0;
1018
1019 /* For now intersections are not allowed, we may relax this later */
1020 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1021 if (ranges_overlap(hostwin->min_iova,
1022 hostwin->max_iova - hostwin->min_iova + 1,
1023 section->offset_within_address_space,
1024 int128_get64(section->size))) {
1025 error_setg(&err,
1026 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
1027 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
1028 section->offset_within_address_space,
1029 section->offset_within_address_space +
1030 int128_get64(section->size) - 1,
1031 hostwin->min_iova, hostwin->max_iova);
1032 goto fail;
1033 }
1034 }
1035
1036 ret = vfio_spapr_create_window(container, section, &pgsize);
1037 if (ret) {
1038 error_setg_errno(&err, -ret, "Failed to create SPAPR window");
1039 goto fail;
1040 }
1041
1042 vfio_host_win_add(container, section->offset_within_address_space,
1043 section->offset_within_address_space +
1044 int128_get64(section->size) - 1, pgsize);
1045#ifdef CONFIG_KVM
1046 if (kvm_enabled()) {
1047 VFIOGroup *group;
1048 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1049 struct kvm_vfio_spapr_tce param;
1050 struct kvm_device_attr attr = {
1051 .group = KVM_DEV_VFIO_GROUP,
1052 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
1053 .addr = (uint64_t)(unsigned long)&param,
1054 };
1055
1056 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
1057 &param.tablefd)) {
1058 QLIST_FOREACH(group, &container->group_list, container_next) {
1059 param.groupfd = group->fd;
1060 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1061 error_report("vfio: failed to setup fd %d "
1062 "for a group with fd %d: %s",
1063 param.tablefd, param.groupfd,
1064 strerror(errno));
1065 return;
1066 }
1067 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
1068 }
1069 }
1070 }
1071#endif
1072 }
1073
1074 hostwin = vfio_find_hostwin(container, iova, end);
1075 if (!hostwin) {
1076 error_setg(&err, "Container %p can't map guest IOVA region"
1077 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
1078 goto fail;
1079 }
1080
1081 memory_region_ref(section->mr);
1082
1083 if (memory_region_is_iommu(section->mr)) {
1084 VFIOGuestIOMMU *giommu;
1085 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1086 int iommu_idx;
1087
1088 trace_vfio_listener_region_add_iommu(iova, end);
1089 /*
1090 * FIXME: For VFIO iommu types which have KVM acceleration to
1091 * avoid bouncing all map/unmaps through qemu this way, this
1092 * would be the right place to wire that up (tell the KVM
1093 * device emulation the VFIO iommu handles to use).
1094 */
1095 giommu = g_malloc0(sizeof(*giommu));
1096 giommu->iommu_mr = iommu_mr;
1097 giommu->iommu_offset = section->offset_within_address_space -
1098 section->offset_within_region;
1099 giommu->container = container;
1100 llend = int128_add(int128_make64(section->offset_within_region),
1101 section->size);
1102 llend = int128_sub(llend, int128_one());
1103 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
1104 MEMTXATTRS_UNSPECIFIED);
1105 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
1106 IOMMU_NOTIFIER_IOTLB_EVENTS,
1107 section->offset_within_region,
1108 int128_get64(llend),
1109 iommu_idx);
1110
1111 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
1112 container->pgsizes,
1113 &err);
1114 if (ret) {
1115 g_free(giommu);
1116 goto fail;
1117 }
1118
1119 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
1120 &err);
1121 if (ret) {
1122 g_free(giommu);
1123 goto fail;
1124 }
1125 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
1126 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
1127
1128 return;
1129 }
1130
1131 /* Here we assume that memory_region_is_ram(section->mr)==true */
1132
1133 /*
1134 * For RAM memory regions with a RamDiscardManager, we only want to map the
1135 * actually populated parts - and update the mapping whenever we're notified
1136 * about changes.
1137 */
1138 if (memory_region_has_ram_discard_manager(section->mr)) {
1139 vfio_register_ram_discard_listener(container, section);
1140 return;
1141 }
1142
1143 vaddr = memory_region_get_ram_ptr(section->mr) +
1144 section->offset_within_region +
1145 (iova - section->offset_within_address_space);
1146
1147 trace_vfio_listener_region_add_ram(iova, end, vaddr);
1148
1149 llsize = int128_sub(llend, int128_make64(iova));
1150
1151 if (memory_region_is_ram_device(section->mr)) {
1152 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1153
1154 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
1155 trace_vfio_listener_region_add_no_dma_map(
1156 memory_region_name(section->mr),
1157 section->offset_within_address_space,
1158 int128_getlo(section->size),
1159 pgmask + 1);
1160 return;
1161 }
1162 }
1163
1164 ret = vfio_dma_map(container, iova, int128_get64(llsize),
1165 vaddr, section->readonly);
1166 if (ret) {
1167 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
1168 "0x%"HWADDR_PRIx", %p) = %d (%s)",
1169 container, iova, int128_get64(llsize), vaddr, ret,
1170 strerror(-ret));
1171 if (memory_region_is_ram_device(section->mr)) {
1172 /* Allow unexpected mappings not to be fatal for RAM devices */
1173 error_report_err(err);
1174 return;
1175 }
1176 goto fail;
1177 }
1178
1179 return;
1180
1181fail:
1182 if (memory_region_is_ram_device(section->mr)) {
1183 error_report("failed to vfio_dma_map. pci p2p may not work");
1184 return;
1185 }
1186 /*
1187 * On the initfn path, store the first error in the container so we
1188 * can gracefully fail. Runtime, there's not much we can do other
1189 * than throw a hardware error.
1190 */
1191 if (!container->initialized) {
1192 if (!container->error) {
1193 error_propagate_prepend(&container->error, err,
1194 "Region %s: ",
1195 memory_region_name(section->mr));
1196 } else {
1197 error_free(err);
1198 }
1199 } else {
1200 error_report_err(err);
1201 hw_error("vfio: DMA mapping failed, unable to continue");
1202 }
1203}
1204
1205static void vfio_listener_region_del(MemoryListener *listener,
1206 MemoryRegionSection *section)
1207{
1208 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1209 hwaddr iova, end;
1210 Int128 llend, llsize;
1211 int ret;
1212 bool try_unmap = true;
1213
1214 if (!vfio_listener_valid_section(section, "region_del")) {
1215 return;
1216 }
1217
1218 if (memory_region_is_iommu(section->mr)) {
1219 VFIOGuestIOMMU *giommu;
1220
1221 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1222 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1223 giommu->n.start == section->offset_within_region) {
1224 memory_region_unregister_iommu_notifier(section->mr,
1225 &giommu->n);
1226 QLIST_REMOVE(giommu, giommu_next);
1227 g_free(giommu);
1228 break;
1229 }
1230 }
1231
1232 /*
1233 * FIXME: We assume the one big unmap below is adequate to
1234 * remove any individual page mappings in the IOMMU which
1235 * might have been copied into VFIO. This works for a page table
1236 * based IOMMU where a big unmap flattens a large range of IO-PTEs.
1237 * That may not be true for all IOMMU types.
1238 */
1239 }
1240
1241 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
1242 return;
1243 }
1244
1245 llsize = int128_sub(llend, int128_make64(iova));
1246
1247 trace_vfio_listener_region_del(iova, end);
1248
1249 if (memory_region_is_ram_device(section->mr)) {
1250 hwaddr pgmask;
1251 VFIOHostDMAWindow *hostwin;
1252
1253 hostwin = vfio_find_hostwin(container, iova, end);
1254 assert(hostwin); /* or region_add() would have failed */
1255
1256 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1257 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
1258 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1259 vfio_unregister_ram_discard_listener(container, section);
1260 /* Unregistering will trigger an unmap. */
1261 try_unmap = false;
1262 }
1263
1264 if (try_unmap) {
1265 if (int128_eq(llsize, int128_2_64())) {
1266 /* The unmap ioctl doesn't accept a full 64-bit span. */
1267 llsize = int128_rshift(llsize, 1);
1268 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1269 if (ret) {
1270 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1271 "0x%"HWADDR_PRIx") = %d (%s)",
1272 container, iova, int128_get64(llsize), ret,
1273 strerror(-ret));
1274 }
1275 iova += int128_get64(llsize);
1276 }
1277 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1278 if (ret) {
1279 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1280 "0x%"HWADDR_PRIx") = %d (%s)",
1281 container, iova, int128_get64(llsize), ret,
1282 strerror(-ret));
1283 }
1284 }
1285
1286 memory_region_unref(section->mr);
1287
1288 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1289 vfio_spapr_remove_window(container,
1290 section->offset_within_address_space);
1291 if (vfio_host_win_del(container,
1292 section->offset_within_address_space,
1293 section->offset_within_address_space +
1294 int128_get64(section->size) - 1) < 0) {
1295 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1296 __func__, section->offset_within_address_space);
1297 }
1298 }
1299}
1300
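/*
 * Toggle kernel-side dirty page tracking for the whole container; a no-op
 * when the IOMMU backend did not advertise dirty page support.
 */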
1301static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1302{
1303 int ret;
1304 struct vfio_iommu_type1_dirty_bitmap dirty = {
1305 .argsz = sizeof(dirty),
1306 };
1307
1308 if (!container->dirty_pages_supported) {
1309 return 0;
1310 }
1311
1312 if (start) {
1313 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1314 } else {
1315 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1316 }
1317
1318 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1319 if (ret) {
1320 ret = -errno;
1321 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1322 dirty.flags, errno);
1323 }
1324
1325 return ret;
1326}
1327
1328static void vfio_listener_log_global_start(MemoryListener *listener)
1329{
1330 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1331 int ret;
1332
1333 ret = vfio_set_dirty_page_tracking(container, true);
1334 if (ret) {
1335 vfio_set_migration_error(ret);
1336 }
1337}
1338
1339static void vfio_listener_log_global_stop(MemoryListener *listener)
1340{
1341 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1342 int ret;
1343
1344 ret = vfio_set_dirty_page_tracking(container, false);
1345 if (ret) {
1346 vfio_set_migration_error(ret);
1347 }
1348}
1349
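/*
 * Query the kernel's dirty bitmap for [iova, iova + size) and feed it into
 * QEMU's dirty memory tracking at @ram_addr.  Containers without dirty page
 * support conservatively mark the whole range dirty instead.
 */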
1350static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1351 uint64_t size, ram_addr_t ram_addr)
1352{
1353 struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1354 struct vfio_iommu_type1_dirty_bitmap_get *range;
1355 VFIOBitmap vbmap;
1356 int ret;
1357
1358 if (!container->dirty_pages_supported) {
1359 cpu_physical_memory_set_dirty_range(ram_addr, size,
1360 tcg_enabled() ? DIRTY_CLIENTS_ALL :
1361 DIRTY_CLIENTS_NOCODE);
1362 return 0;
1363 }
1364
1365 ret = vfio_bitmap_alloc(&vbmap, size);
1366 if (ret) {
1367 return ret;
1368 }
1369
1370 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1371
1372 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1373 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1374 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1375 range->iova = iova;
1376 range->size = size;
1377
1378 /*
1379 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
1380 * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
1381 * to qemu_real_host_page_size.
1382 */
1383 range->bitmap.pgsize = qemu_real_host_page_size();
1384 range->bitmap.size = vbmap.size;
1385 range->bitmap.data = (__u64 *)vbmap.bitmap;
1386
1387 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1388 if (ret) {
1389 ret = -errno;
1390 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1391 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1392 (uint64_t)range->size, errno);
1393 goto err_out;
1394 }
1395
1396 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
1397 vbmap.pages);
1398
1399 trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
1400 range->bitmap.size, ram_addr);
1401err_out:
1402 g_free(dbitmap);
1403 g_free(vbmap.bitmap);
1404
1405 return ret;
1406}
1407
1408typedef struct {
1409 IOMMUNotifier n;
1410 VFIOGuestIOMMU *giommu;
1411} vfio_giommu_dirty_notifier;
1412
1413static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1414{
1415 vfio_giommu_dirty_notifier *gdn = container_of(n,
1416 vfio_giommu_dirty_notifier, n);
1417 VFIOGuestIOMMU *giommu = gdn->giommu;
1418 VFIOContainer *container = giommu->container;
1419 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1420 ram_addr_t translated_addr;
1421 int ret = -EINVAL;
1422
1423 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1424
1425 if (iotlb->target_as != &address_space_memory) {
1426 error_report("Wrong target AS \"%s\", only system memory is allowed",
1427 iotlb->target_as->name ? iotlb->target_as->name : "none");
1428 goto out;
1429 }
1430
1431 rcu_read_lock();
1432 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
1433 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1434 translated_addr);
1435 if (ret) {
1436 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1437 "0x%"HWADDR_PRIx") = %d (%s)",
1438 container, iova, iotlb->addr_mask + 1, ret,
1439 strerror(-ret));
1440 }
1441 }
1442 rcu_read_unlock();
1443
1444out:
1445 if (ret) {
1446 vfio_set_migration_error(ret);
1447 }
1448}
1449
1450static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1451 void *opaque)
1452{
1453 const hwaddr size = int128_get64(section->size);
1454 const hwaddr iova = section->offset_within_address_space;
1455 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1456 section->offset_within_region;
1457 VFIORamDiscardListener *vrdl = opaque;
1458
1459 /*
1460 * Sync the whole mapped region (spanning multiple individual mappings)
1461 * in one go.
1462 */
1463 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1464}
1465
1466static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1467 MemoryRegionSection *section)
1468{
1469 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1470 VFIORamDiscardListener *vrdl = NULL;
1471
1472 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1473 if (vrdl->mr == section->mr &&
1474 vrdl->offset_within_address_space ==
1475 section->offset_within_address_space) {
1476 break;
1477 }
1478 }
1479
1480 if (!vrdl) {
1481 hw_error("vfio: Trying to sync missing RAM discard listener");
1482 }
1483
1484 /*
1485 * We only want/can synchronize the bitmap for actually mapped parts -
1486 * which correspond to populated parts. Replay all populated parts.
1487 */
1488 return ram_discard_manager_replay_populated(rdm, section,
1489 vfio_ram_discard_get_dirty_bitmap,
1490 &vrdl);
1491}
1492
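/*
 * log_sync helper: vIOMMU sections are replayed through a MAP-only notifier,
 * RamDiscardManager sections sync only their populated parts, and plain RAM
 * sections query the bitmap for the whole section.
 */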
1493static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1494 MemoryRegionSection *section)
1495{
1496 ram_addr_t ram_addr;
1497
1498 if (memory_region_is_iommu(section->mr)) {
1499 VFIOGuestIOMMU *giommu;
1500
1501 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1502 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1503 giommu->n.start == section->offset_within_region) {
1504 Int128 llend;
1505 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
1506 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
1507 MEMTXATTRS_UNSPECIFIED);
1508
1509 llend = int128_add(int128_make64(section->offset_within_region),
1510 section->size);
1511 llend = int128_sub(llend, int128_one());
1512
1513 iommu_notifier_init(&gdn.n,
1514 vfio_iommu_map_dirty_notify,
1515 IOMMU_NOTIFIER_MAP,
1516 section->offset_within_region,
1517 int128_get64(llend),
1518 idx);
1519 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1520 break;
1521 }
1522 }
1523 return 0;
1524 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1525 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
1526 }
1527
1528 ram_addr = memory_region_get_ram_addr(section->mr) +
1529 section->offset_within_region;
1530
1531 return vfio_get_dirty_bitmap(container,
1532 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1533 int128_get64(section->size), ram_addr);
1534}
1535
1536static void vfio_listener_log_sync(MemoryListener *listener,
1537 MemoryRegionSection *section)
1538{
1539 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1540 int ret;
1541
1542 if (vfio_listener_skipped_section(section)) {
1543 return;
1544 }
1545
1546 if (vfio_devices_all_dirty_tracking(container)) {
1547 ret = vfio_sync_dirty_bitmap(container, section);
1548 if (ret) {
1549 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
1550 strerror(-ret));
1551 vfio_set_migration_error(ret);
1552 }
1553 }
1554}
1555
1556static const MemoryListener vfio_memory_listener = {
1557 .name = "vfio",
1558 .region_add = vfio_listener_region_add,
1559 .region_del = vfio_listener_region_del,
1560 .log_global_start = vfio_listener_log_global_start,
1561 .log_global_stop = vfio_listener_log_global_stop,
1562 .log_sync = vfio_listener_log_sync,
1563};
1564
1565static void vfio_listener_release(VFIOContainer *container)
1566{
1567 memory_listener_unregister(&container->listener);
1568 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1569 memory_listener_unregister(&container->prereg_listener);
1570 }
1571}
1572
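/*
 * Walk a vfio_info_cap_header chain (offsets are relative to the start of
 * the enclosing info structure) until the requested capability id is found.
 */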
1573static struct vfio_info_cap_header *
1574vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
1575{
1576 struct vfio_info_cap_header *hdr;
1577
1578 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1579 if (hdr->id == id) {
1580 return hdr;
1581 }
1582 }
1583
1584 return NULL;
1585}
1586
1587struct vfio_info_cap_header *
1588vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1589{
1590 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1591 return NULL;
1592 }
1593
1594 return vfio_get_cap((void *)info, info->cap_offset, id);
1595}
1596
1597static struct vfio_info_cap_header *
1598vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1599{
1600 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1601 return NULL;
1602 }
1603
1604 return vfio_get_cap((void *)info, info->cap_offset, id);
1605}
1606
1607struct vfio_info_cap_header *
1608vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1609{
1610 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1611 return NULL;
1612 }
1613
1614 return vfio_get_cap((void *)info, info->cap_offset, id);
1615}
1616
1617bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1618 unsigned int *avail)
1619{
1620 struct vfio_info_cap_header *hdr;
1621 struct vfio_iommu_type1_info_dma_avail *cap;
1622
1623 /* If the capability cannot be found, assume no DMA limiting */
1624 hdr = vfio_get_iommu_type1_info_cap(info,
1625 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1626 if (hdr == NULL) {
1627 return false;
1628 }
1629
1630 if (avail != NULL) {
1631 cap = (void *) hdr;
1632 *avail = cap->avail;
1633 }
1634
1635 return true;
1636}
1637
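/*
 * Parse the sparse-mmap capability, if the kernel provided one, and record
 * each non-empty mmap-able area of the region; -ENODEV tells the caller to
 * fall back to a single mmap covering the whole region.
 */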
1638static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1639 struct vfio_region_info *info)
1640{
1641 struct vfio_info_cap_header *hdr;
1642 struct vfio_region_info_cap_sparse_mmap *sparse;
1643 int i, j;
1644
1645 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1646 if (!hdr) {
1647 return -ENODEV;
1648 }
1649
1650 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1651
1652 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1653 region->nr, sparse->nr_areas);
1654
1655 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1656
1657 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
1658 if (sparse->areas[i].size) {
1659 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1660 sparse->areas[i].offset +
1661 sparse->areas[i].size - 1);
1662 region->mmaps[j].offset = sparse->areas[i].offset;
1663 region->mmaps[j].size = sparse->areas[i].size;
1664 j++;
1665 }
1666 }
1667
1668 region->nr_mmaps = j;
1669 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1670
1671 return 0;
1672}
1673
1674int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1675 int index, const char *name)
1676{
1677 struct vfio_region_info *info;
1678 int ret;
1679
1680 ret = vfio_get_region_info(vbasedev, index, &info);
1681 if (ret) {
1682 return ret;
1683 }
1684
1685 region->vbasedev = vbasedev;
1686 region->flags = info->flags;
1687 region->size = info->size;
1688 region->fd_offset = info->offset;
1689 region->nr = index;
1690
1691 if (region->size) {
1692 region->mem = g_new0(MemoryRegion, 1);
1693 memory_region_init_io(region->mem, obj, &vfio_region_ops,
1694 region, name, region->size);
1695
1696 if (!vbasedev->no_mmap &&
1697 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
1698
1699 ret = vfio_setup_region_sparse_mmaps(region, info);
1700
1701 if (ret) {
1702 region->nr_mmaps = 1;
1703 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1704 region->mmaps[0].offset = 0;
1705 region->mmaps[0].size = region->size;
1706 }
1707 }
1708 }
1709
1710 g_free(info);
1711
1712 trace_vfio_region_setup(vbasedev->name, index, name,
1713 region->flags, region->fd_offset, region->size);
1714 return 0;
1715}
1716
1717static void vfio_subregion_unmap(VFIORegion *region, int index)
1718{
1719 trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
1720 region->mmaps[index].offset,
1721 region->mmaps[index].offset +
1722 region->mmaps[index].size - 1);
1723 memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
1724 munmap(region->mmaps[index].mmap, region->mmaps[index].size);
1725 object_unparent(OBJECT(&region->mmaps[index].mem));
1726 region->mmaps[index].mmap = NULL;
1727}
1728
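/*
 * mmap() every mmap-able chunk of the region with the access flags the
 * kernel advertised and expose each one as a RAM-device subregion; on
 * failure the chunks mapped so far are unwound and the errno is returned.
 */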
1729int vfio_region_mmap(VFIORegion *region)
1730{
1731 int i, prot = 0;
1732 char *name;
1733
1734 if (!region->mem) {
1735 return 0;
1736 }
1737
1738 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1739 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1740
1741 for (i = 0; i < region->nr_mmaps; i++) {
1742 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1743 MAP_SHARED, region->vbasedev->fd,
1744 region->fd_offset +
1745 region->mmaps[i].offset);
1746 if (region->mmaps[i].mmap == MAP_FAILED) {
1747 int ret = -errno;
1748
1749 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1750 region->fd_offset +
1751 region->mmaps[i].offset,
1752 region->fd_offset +
1753 region->mmaps[i].offset +
1754 region->mmaps[i].size - 1, ret);
1755
1756 region->mmaps[i].mmap = NULL;
1757
1758 for (i--; i >= 0; i--) {
0f7a903b 1759 vfio_subregion_unmap(region, i);
1760 }
1761
1762 return ret;
1763 }
1764
1765 name = g_strdup_printf("%s mmaps[%d]",
1766 memory_region_name(region->mem), i);
1767 memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
1768 memory_region_owner(region->mem),
1769 name, region->mmaps[i].size,
1770 region->mmaps[i].mmap);
db0da029 1771 g_free(name);
1772 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
1773 &region->mmaps[i].mem);
1774
1775 trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
1776 region->mmaps[i].offset,
1777 region->mmaps[i].offset +
1778 region->mmaps[i].size - 1);
1779 }
1780
1781 return 0;
1782}
1783
1784void vfio_region_unmap(VFIORegion *region)
1785{
1786 int i;
1787
1788 if (!region->mem) {
1789 return;
1790 }
1791
1792 for (i = 0; i < region->nr_mmaps; i++) {
1793 if (region->mmaps[i].mmap) {
1794 vfio_subregion_unmap(region, i);
1795 }
1796 }
1797}
1798
1799void vfio_region_exit(VFIORegion *region)
1800{
1801 int i;
1802
1803 if (!region->mem) {
1804 return;
1805 }
1806
1807 for (i = 0; i < region->nr_mmaps; i++) {
1808 if (region->mmaps[i].mmap) {
1809 memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
e2c7d025 1810 }
db0da029 1811 }
e2c7d025 1812
1813 trace_vfio_region_exit(region->vbasedev->name, region->nr);
1814}
1815
1816void vfio_region_finalize(VFIORegion *region)
1817{
1818 int i;
1819
1820 if (!region->mem) {
1821 return;
1822 }
1823
1824 for (i = 0; i < region->nr_mmaps; i++) {
1825 if (region->mmaps[i].mmap) {
1826 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
1827 object_unparent(OBJECT(&region->mmaps[i].mem));
1828 }
1829 }
1830
1831 object_unparent(OBJECT(region->mem));
1832
1833 g_free(region->mem);
1834 g_free(region->mmaps);
1835
1836 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
1837
1838 region->mem = NULL;
1839 region->mmaps = NULL;
1840 region->nr_mmaps = 0;
1841 region->size = 0;
1842 region->flags = 0;
1843 region->nr = 0;
1844}
1845
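/*
 * Toggle the mmap'd subregions of @region.  While disabled, guest accesses
 * trap into the slow vfio_region_ops read/write path instead of hitting the
 * mapping directly; callers can use this, for example, while a device is
 * being reset.
 */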
1846void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
1847{
1848 int i;
1849
1850 if (!region->mem) {
1851 return;
1852 }
1853
1854 for (i = 0; i < region->nr_mmaps; i++) {
1855 if (region->mmaps[i].mmap) {
1856 memory_region_set_enabled(&region->mmaps[i].mem, enabled);
1857 }
1858 }
e2c7d025 1859
1860 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
1861 enabled);
1862}
1863
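/*
 * System reset handler: a first pass lets every realized VFIO device decide
 * whether it needs a reset, and a second pass performs the (possibly
 * multi-device) hot reset for the devices that flagged themselves.
 */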
1864void vfio_reset_handler(void *opaque)
1865{
1866 VFIOGroup *group;
1867 VFIODevice *vbasedev;
1868
1869 QLIST_FOREACH(group, &vfio_group_list, next) {
1870 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1871 if (vbasedev->dev->realized) {
1872 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1873 }
1874 }
1875 }
1876
1877 QLIST_FOREACH(group, &vfio_group_list, next) {
1878 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2 1879 if (vbasedev->dev->realized && vbasedev->needs_reset) {
1880 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1881 }
1882 }
1883 }
1884}
1885
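/*
 * Lazily create the global KVM-VFIO pseudo device on first use and register
 * the group's fd with it, so that KVM knows which VFIO groups are in use
 * (KVM uses this, for instance, to account for non-coherent DMA).
 */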
1886static void vfio_kvm_device_add_group(VFIOGroup *group)
1887{
1888#ifdef CONFIG_KVM
1889 struct kvm_device_attr attr = {
1890 .group = KVM_DEV_VFIO_GROUP,
1891 .attr = KVM_DEV_VFIO_GROUP_ADD,
1892 .addr = (uint64_t)(unsigned long)&group->fd,
1893 };
1894
1895 if (!kvm_enabled()) {
1896 return;
1897 }
1898
1899 if (vfio_kvm_device_fd < 0) {
1900 struct kvm_create_device cd = {
1901 .type = KVM_DEV_TYPE_VFIO,
1902 };
1903
1904 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
78e5b17f 1905 error_report("Failed to create KVM VFIO device: %m");
1906 return;
1907 }
1908
1909 vfio_kvm_device_fd = cd.fd;
1910 }
1911
1912 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1913 error_report("Failed to add group %d to KVM VFIO device: %m",
1914 group->groupid);
1915 }
1916#endif
1917}
1918
1919static void vfio_kvm_device_del_group(VFIOGroup *group)
1920{
1921#ifdef CONFIG_KVM
1922 struct kvm_device_attr attr = {
1923 .group = KVM_DEV_VFIO_GROUP,
1924 .attr = KVM_DEV_VFIO_GROUP_DEL,
1925 .addr = (uint64_t)(unsigned long)&group->fd,
1926 };
1927
1928 if (vfio_kvm_device_fd < 0) {
1929 return;
1930 }
1931
1932 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1933 error_report("Failed to remove group %d from KVM VFIO device: %m",
1934 group->groupid);
1935 }
1936#endif
1937}
1938
1939static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1940{
1941 VFIOAddressSpace *space;
1942
1943 QLIST_FOREACH(space, &vfio_address_spaces, list) {
1944 if (space->as == as) {
1945 return space;
1946 }
1947 }
1948
1949 /* No suitable VFIOAddressSpace, create a new one */
1950 space = g_malloc0(sizeof(*space));
1951 space->as = as;
1952 QLIST_INIT(&space->containers);
1953
1954 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1955
1956 return space;
1957}
1958
1959static void vfio_put_address_space(VFIOAddressSpace *space)
1960{
1961 if (QLIST_EMPTY(&space->containers)) {
1962 QLIST_REMOVE(space, list);
1963 g_free(space);
1964 }
1965}
1966
1967/*
1968 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
1969 */
1970static int vfio_get_iommu_type(VFIOContainer *container,
1971 Error **errp)
1972{
1973 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1974 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1975 int i;
1976
1977 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1978 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1979 return iommu_types[i];
1980 }
1981 }
1982 error_setg(errp, "No available IOMMU models");
1983 return -EINVAL;
1984}
1985
1986static int vfio_init_container(VFIOContainer *container, int group_fd,
1987 Error **errp)
1988{
1989 int iommu_type, ret;
1990
1991 iommu_type = vfio_get_iommu_type(container, errp);
1992 if (iommu_type < 0) {
1993 return iommu_type;
1994 }
1995
1996 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
1997 if (ret) {
1998 error_setg_errno(errp, errno, "Failed to set group container");
1999 return -errno;
2000 }
2001
2002 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
2003 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
2004             /*
2005              * On sPAPR, even though the IOMMU subdriver always advertises v1 and
2006              * v2, the running platform may not support v2, and there is no way to
2007              * tell until an IOMMU group gets added to the container.
2008              * So if setting the IOMMU to v2 fails, try v1 as a fallback.
2009              */
2010 iommu_type = VFIO_SPAPR_TCE_IOMMU;
2011 continue;
2012 }
2013 error_setg_errno(errp, errno, "Failed to set iommu for container");
2014 return -errno;
2015 }
2016
2017 container->iommu_type = iommu_type;
2018 return 0;
2019}
2020
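/*
 * VFIO_IOMMU_GET_INFO follows the usual vfio argsz handshake: the structure
 * is submitted with the size we allocated, and if the kernel needs more room
 * (e.g. to append capability chains) it reports the required size back in
 * argsz, in which case the buffer is grown and the ioctl retried.
 */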
2021static int vfio_get_iommu_info(VFIOContainer *container,
2022 struct vfio_iommu_type1_info **info)
2023{
2024
2025 size_t argsz = sizeof(struct vfio_iommu_type1_info);
2026
2027 *info = g_new0(struct vfio_iommu_type1_info, 1);
2028again:
2029 (*info)->argsz = argsz;
2030
2031 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
2032 g_free(*info);
2033 *info = NULL;
2034 return -errno;
2035 }
2036
2037     if ((*info)->argsz > argsz) {
2038 argsz = (*info)->argsz;
2039 *info = g_realloc(*info, argsz);
2040 goto again;
2041 }
2042
2043 return 0;
2044}
2045
2046static struct vfio_info_cap_header *
2047vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
2048{
2049 struct vfio_info_cap_header *hdr;
2050 void *ptr = info;
2051
2052 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
2053 return NULL;
2054 }
2055
2056 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
2057 if (hdr->id == id) {
2058 return hdr;
2059 }
2060 }
2061
2062 return NULL;
2063}
2064
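/*
 * Parse the optional VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION capability to learn
 * whether the IOMMU driver can report dirty pages and, if so, record the
 * supported page sizes and maximum dirty bitmap size on the container.
 */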
2065static void vfio_get_iommu_info_migration(VFIOContainer *container,
2066 struct vfio_iommu_type1_info *info)
2067{
2068 struct vfio_info_cap_header *hdr;
2069 struct vfio_iommu_type1_info_cap_migration *cap_mig;
2070
2071 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
2072 if (!hdr) {
2073 return;
2074 }
2075
2076 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
2077 header);
2078
2079     /*
2080      * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmaps at
2081      * qemu_real_host_page_size() granularity, so require that page size here.
2082      */
8e3b0cbb 2083 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
2084 container->dirty_pages_supported = true;
2085 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
2086 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
2087 }
2088}
2089
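/*
 * Connect @group to a container in address space @as: reuse an existing
 * container in that space if the kernel accepts the group for it, otherwise
 * open a new /dev/vfio/vfio instance, negotiate an IOMMU type, set up the
 * type-specific IOVA window(s) and register the memory listener that will
 * establish the DMA mappings.
 */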
2090static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
2091 Error **errp)
2092{
2093 VFIOContainer *container;
2094 int ret, fd;
2095 VFIOAddressSpace *space;
2096
2097 space = vfio_get_address_space(as);
2098
2099     /*
2100      * VFIO is currently incompatible with discarding of RAM insofar as the
2101      * madvise to purge (zap) the page from QEMU's address space does not
2102      * interact with the memory API and therefore leaves stale virtual to
2103      * physical mappings in the IOMMU if the page was previously pinned. We
2104      * therefore set discarding broken for each group added to a container,
2105      * whether the container is used individually or shared. This provides
2106      * us with options to allow devices within a group to opt-in and allow
2107      * discarding, so long as it is done consistently for a group (for instance
2108      * if the device is an mdev device where it is known that the host vendor
2109      * driver will never pin pages outside of the working set of the guest
2110      * driver, which would thus not be discarding candidates).
2111      *
2112      * The first opportunity to induce pinning occurs here where we attempt to
2113      * attach the group to existing containers within the AddressSpace. If any
2114      * pages are already zapped from the virtual address space, such as from
2115      * previous discards, new pinning will cause valid mappings to be
2116      * re-established. Likewise, when the overall MemoryListener for a new
2117      * container is registered, a replay of mappings within the AddressSpace
2118      * will occur, re-establishing any previously zapped pages as well.
2119      *
2120      * Note that virtio-balloon in particular is currently only prevented from
2121      * discarding new memory; it does not yet set ram_block_discard_set_required()
2122      * and therefore neither stops us here nor deals with the sudden memory
2123      * consumption of inflated memory.
2124      *
2125      * We do support discarding of memory coordinated via the RamDiscardManager
2126      * with some IOMMU types. vfio_ram_block_discard_disable() handles the
2127      * details once we know which type of IOMMU we are using.
2128      */
c65ee433 2129
2130 QLIST_FOREACH(container, &space->containers, next) {
2131 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
2132 ret = vfio_ram_block_discard_disable(container, true);
2133 if (ret) {
2134 error_setg_errno(errp, -ret,
2135 "Cannot set discarding of RAM broken");
2136 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2137 &container->fd)) {
2138 error_report("vfio: error disconnecting group %d from"
2139 " container", group->groupid);
2140 }
2141 return ret;
2142 }
2143 group->container = container;
2144 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2016986a 2145 vfio_kvm_device_add_group(group);
2146 return 0;
2147 }
2148 }
2149
448058aa 2150 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
e2c7d025 2151 if (fd < 0) {
01905f58 2152 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
2153 ret = -errno;
2154 goto put_space_exit;
2155 }
2156
2157 ret = ioctl(fd, VFIO_GET_API_VERSION);
2158 if (ret != VFIO_API_VERSION) {
2159 error_setg(errp, "supported vfio version: %d, "
2160 "reported version: %d", VFIO_API_VERSION, ret);
2161 ret = -EINVAL;
2162 goto close_fd_exit;
2163 }
2164
2165 container = g_malloc0(sizeof(*container));
2166 container->space = space;
2167 container->fd = fd;
d7d87836 2168 container->error = NULL;
87ea529c 2169 container->dirty_pages_supported = false;
3eed155c 2170 container->dma_max_mappings = 0;
2171 QLIST_INIT(&container->giommu_list);
2172 QLIST_INIT(&container->hostwin_list);
5e3b981c 2173 QLIST_INIT(&container->vrdl_list);
2e6e697e 2174
2175 ret = vfio_init_container(container, group->fd, errp);
2176 if (ret) {
2177 goto free_container_exit;
2178 }
e2c7d025 2179
2180 ret = vfio_ram_block_discard_disable(container, true);
2181 if (ret) {
2182 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2183 goto free_container_exit;
2184 }
2185
2186 switch (container->iommu_type) {
2187 case VFIO_TYPE1v2_IOMMU:
2188 case VFIO_TYPE1_IOMMU:
2189 {
87ea529c 2190 struct vfio_iommu_type1_info *info;
3898aad3 2191
87ea529c 2192 ret = vfio_get_iommu_info(container, &info);
2193 if (ret) {
2194 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2195 goto enable_discards_exit;
2196 }
87ea529c 2197
2198 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2199 container->pgsizes = info->iova_pgsizes;
2200 } else {
2201 container->pgsizes = qemu_real_host_page_size();
87ea529c 2202 }
2203
2204 if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2205 container->dma_max_mappings = 65535;
7a140a57 2206 }
85b6d2b5 2207 vfio_get_iommu_info_migration(container, info);
87ea529c 2208 g_free(info);
2209
2210 /*
2211 * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
2212 * information to get the actual window extent rather than assume
2213 * a 64-bit IOVA address space.
2214 */
2215 vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2216
2217 break;
2218 }
2219 case VFIO_SPAPR_TCE_v2_IOMMU:
2220 case VFIO_SPAPR_TCE_IOMMU:
2221 {
3898aad3 2222 struct vfio_iommu_spapr_tce_info info;
2b6326c0 2223 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
2224
2225 /*
2226          * The host kernel code implementing VFIO_IOMMU_DISABLE is called
2227          * when the container fd is closed, so we do not call it explicitly
2228          * in this file.
2229 */
2230 if (!v2) {
2231 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2232 if (ret) {
01905f58 2233 error_setg_errno(errp, errno, "failed to enable container");
318f67ce 2234 ret = -errno;
53d1b5fc 2235 goto enable_discards_exit;
2236 }
2237 } else {
2238 container->prereg_listener = vfio_prereg_listener;
2239
2240 memory_listener_register(&container->prereg_listener,
2241 &address_space_memory);
2242 if (container->error) {
2243 memory_listener_unregister(&container->prereg_listener);
2244 ret = -1;
2245 error_propagate_prepend(errp, container->error,
2246 "RAM memory listener initialization failed: ");
53d1b5fc 2247 goto enable_discards_exit;
318f67ce 2248 }
e2c7d025 2249 }
3898aad3 2250
2251 info.argsz = sizeof(info);
2252 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2253 if (ret) {
2254 error_setg_errno(errp, errno,
2255 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
3898aad3 2256 ret = -errno;
2257 if (v2) {
2258 memory_listener_unregister(&container->prereg_listener);
2259 }
53d1b5fc 2260 goto enable_discards_exit;
3898aad3 2261 }
7a140a57 2262
2e4109de 2263 if (v2) {
c26bc185 2264 container->pgsizes = info.ddw.pgsizes;
2265             /*
2266              * A newly created container comes with a default DMA window.
2267              * To keep region_add/del simple, remove that window now and
2268              * let the iommu_listener callbacks create and remove windows
2269              * as needed.
2270              */
2271 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2272 if (ret) {
2273 error_setg_errno(errp, -ret,
2274 "failed to remove existing window");
53d1b5fc 2275 goto enable_discards_exit;
2276 }
2277 } else {
2278 /* The default table uses 4K pages */
c26bc185 2279 container->pgsizes = 0x1000;
2280 vfio_host_win_add(container, info.dma32_window_start,
2281 info.dma32_window_start +
2282 info.dma32_window_size - 1,
2283 0x1000);
2284 }
2b6326c0 2285 }
2286 }
2287
2288 vfio_kvm_device_add_group(group);
2289
2290 QLIST_INIT(&container->group_list);
2291 QLIST_INSERT_HEAD(&space->containers, container, next);
2292
2293 group->container = container;
2294 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2295
2296 container->listener = vfio_memory_listener;
2297
2298 memory_listener_register(&container->listener, container->space->as);
2299
2300 if (container->error) {
2301 ret = -1;
2302 error_propagate_prepend(errp, container->error,
2303 "memory listener initialization failed: ");
2304 goto listener_release_exit;
2305 }
2306
2307 container->initialized = true;
2308
2309 return 0;
2310listener_release_exit:
2311 QLIST_REMOVE(group, container_next);
2312 QLIST_REMOVE(container, next);
2313 vfio_kvm_device_del_group(group);
2314 vfio_listener_release(container);
2315
2316enable_discards_exit:
2317 vfio_ram_block_discard_disable(container, false);
2318
2319free_container_exit:
2320 g_free(container);
2321
2322close_fd_exit:
2323 close(fd);
2324
2325put_space_exit:
2326 vfio_put_address_space(space);
2327
2328 return ret;
2329}
2330
2331static void vfio_disconnect_container(VFIOGroup *group)
2332{
2333 VFIOContainer *container = group->container;
2334
2335 QLIST_REMOVE(group, container_next);
2336 group->container = NULL;
2337
2338 /*
2339      * Explicitly release the listener before unsetting the container,
2340      * since unsetting it may destroy the backend container if this is
2341      * the last group.
2342 */
2343 if (QLIST_EMPTY(&container->group_list)) {
2344 vfio_listener_release(container);
2345 }
2346
2347 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2348 error_report("vfio: error disconnecting group %d from container",
2349 group->groupid);
2350 }
2351
2352 if (QLIST_EMPTY(&container->group_list)) {
2353 VFIOAddressSpace *space = container->space;
f8d8a944 2354 VFIOGuestIOMMU *giommu, *tmp;
f3bc3a73 2355 VFIOHostDMAWindow *hostwin, *next;
e2c7d025 2356
e2c7d025 2357 QLIST_REMOVE(container, next);
2358
2359 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
3df9d748 2360 memory_region_unregister_iommu_notifier(
44ee6aaa 2361 MEMORY_REGION(giommu->iommu_mr), &giommu->n);
2362 QLIST_REMOVE(giommu, giommu_next);
2363 g_free(giommu);
2364 }
2365
2366 QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
2367 next) {
2368 QLIST_REMOVE(hostwin, hostwin_next);
2369 g_free(hostwin);
2370 }
2371
2372 trace_vfio_disconnect_container(container->fd);
2373 close(container->fd);
2374 g_free(container);
2375
2376 vfio_put_address_space(space);
2377 }
2378}
2379
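/*
 * Look up or create the VFIOGroup for /dev/vfio/<groupid> and bind it to a
 * container in @as.  A group may only be used in a single address space.
 *
 * Illustrative sketch of the usual open sequence (hypothetical group id and
 * device name, no error handling; not copied from a real caller):
 *
 *     VFIOGroup *group = vfio_get_group(groupid, &address_space_memory, errp);
 *
 *     if (group && !vfio_get_device(group, "0000:06:0d.0", vbasedev, errp)) {
 *         ... program the device through vbasedev->fd ...
 *         vfio_put_base_device(vbasedev);
 *     }
 *     vfio_put_group(group);
 */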
1b808d5b 2380VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
2381{
2382 VFIOGroup *group;
2383 char path[32];
2384 struct vfio_group_status status = { .argsz = sizeof(status) };
2385
2386 QLIST_FOREACH(group, &vfio_group_list, next) {
2387 if (group->groupid == groupid) {
2388 /* Found it. Now is it already in the right context? */
2389 if (group->container->space->as == as) {
2390 return group;
2391 } else {
2392 error_setg(errp, "group %d used in multiple address spaces",
2393 group->groupid);
2394 return NULL;
2395 }
2396 }
2397 }
2398
2399 group = g_malloc0(sizeof(*group));
2400
2401 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
448058aa 2402 group->fd = qemu_open_old(path, O_RDWR);
e2c7d025 2403 if (group->fd < 0) {
1b808d5b 2404 error_setg_errno(errp, errno, "failed to open %s", path);
2405 goto free_group_exit;
2406 }
2407
2408 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
1b808d5b 2409 error_setg_errno(errp, errno, "failed to get group %d status", groupid);
2410 goto close_fd_exit;
2411 }
2412
2413 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
2414 error_setg(errp, "group %d is not viable", groupid);
2415 error_append_hint(errp,
2416 "Please ensure all devices within the iommu_group "
2417 "are bound to their vfio bus driver.\n");
2418 goto close_fd_exit;
2419 }
2420
2421 group->groupid = groupid;
2422 QLIST_INIT(&group->device_list);
2423
2424 if (vfio_connect_container(group, as, errp)) {
2425 error_prepend(errp, "failed to setup container for group %d: ",
2426 groupid);
2427 goto close_fd_exit;
2428 }
2429
2430 if (QLIST_EMPTY(&vfio_group_list)) {
2431 qemu_register_reset(vfio_reset_handler, NULL);
2432 }
2433
2434 QLIST_INSERT_HEAD(&vfio_group_list, group, next);
2435
2436 return group;
2437
2438close_fd_exit:
2439 close(group->fd);
2440
2441free_group_exit:
2442 g_free(group);
2443
2444 return NULL;
2445}
2446
2447void vfio_put_group(VFIOGroup *group)
2448{
77a10d04 2449 if (!group || !QLIST_EMPTY(&group->device_list)) {
2450 return;
2451 }
2452
aff92b82 2453 if (!group->ram_block_discard_allowed) {
53d1b5fc 2454 vfio_ram_block_discard_disable(group->container, false);
238e9172 2455 }
2456 vfio_kvm_device_del_group(group);
2457 vfio_disconnect_container(group);
2458 QLIST_REMOVE(group, next);
2459 trace_vfio_put_group(group->fd);
2460 close(group->fd);
2461 g_free(group);
2462
2463 if (QLIST_EMPTY(&vfio_group_list)) {
2464 qemu_unregister_reset(vfio_reset_handler, NULL);
2465 }
2466}
2467
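/*
 * Fetch the device fd for @name from @group and fill in the common
 * VFIODevice fields (fd, number of regions and irqs, reset capability).
 * Also enforces that all devices within a group agree on whether RAM
 * discarding (e.g. ballooning) is allowed.
 */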
2468int vfio_get_device(VFIOGroup *group, const char *name,
59f7d674 2469 VFIODevice *vbasedev, Error **errp)
2470{
2471 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
217e9fdc 2472 int ret, fd;
e2c7d025 2473
2474 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2475 if (fd < 0) {
2476 error_setg_errno(errp, errno, "error getting device from group %d",
2477 group->groupid);
2478 error_append_hint(errp,
2479 "Verify all devices in group %d are bound to vfio-<bus> "
2480 "or pci-stub and not already in use\n", group->groupid);
217e9fdc 2481 return fd;
2482 }
2483
217e9fdc 2484 ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
e2c7d025 2485 if (ret) {
59f7d674 2486 error_setg_errno(errp, errno, "error getting device info");
2487 close(fd);
2488 return ret;
2489 }
2490
238e9172 2491 /*
2492 * Set discarding of RAM as not broken for this group if the driver knows
2493 * the device operates compatibly with discarding. Setting must be
2494 * consistent per group, but since compatibility is really only possible
2495 * with mdev currently, we expect singleton groups.
238e9172 2496 */
2497 if (vbasedev->ram_block_discard_allowed !=
2498 group->ram_block_discard_allowed) {
238e9172 2499 if (!QLIST_EMPTY(&group->device_list)) {
2500 error_setg(errp, "Inconsistent setting of support for discarding "
2501 "RAM (e.g., balloon) within group");
8709b395 2502 close(fd);
2503 return -1;
2504 }
2505
2506 if (!group->ram_block_discard_allowed) {
2507 group->ram_block_discard_allowed = true;
53d1b5fc 2508 vfio_ram_block_discard_disable(group->container, false);
2509 }
2510 }
2511
2512 vbasedev->fd = fd;
2513 vbasedev->group = group;
2514 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
2515
2516 vbasedev->num_irqs = dev_info.num_irqs;
2517 vbasedev->num_regions = dev_info.num_regions;
2518 vbasedev->flags = dev_info.flags;
2519
2520 trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
2521 dev_info.num_irqs);
2522
2523 vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
217e9fdc 2524 return 0;
2525}
2526
2527void vfio_put_base_device(VFIODevice *vbasedev)
2528{
2529 if (!vbasedev->group) {
2530 return;
2531 }
2532 QLIST_REMOVE(vbasedev, next);
2533 vbasedev->group = NULL;
2534 trace_vfio_put_base_device(vbasedev->fd);
2535 close(vbasedev->fd);
2536}
2537
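/*
 * Query VFIO_DEVICE_GET_REGION_INFO for region @index, growing the buffer as
 * dictated by the returned argsz (capability chains may follow the base
 * structure).  On success the caller owns *info and must g_free() it.
 */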
2538int vfio_get_region_info(VFIODevice *vbasedev, int index,
2539 struct vfio_region_info **info)
2540{
2541 size_t argsz = sizeof(struct vfio_region_info);
2542
2543 *info = g_malloc0(argsz);
2544
2545 (*info)->index = index;
b53b0f69 2546retry:
2547 (*info)->argsz = argsz;
2548
2549 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2550 g_free(*info);
e61a424f 2551 *info = NULL;
2552 return -errno;
2553 }
2554
2555 if ((*info)->argsz > argsz) {
2556 argsz = (*info)->argsz;
2557 *info = g_realloc(*info, argsz);
2558
2559 goto retry;
2560 }
2561
2562 return 0;
2563}
2564
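/*
 * Scan the regions of @vbasedev for one whose VFIO_REGION_INFO_CAP_TYPE
 * capability matches @type/@subtype (used for device- or vendor-specific
 * regions).  Returns 0 with *info set on a match, -ENODEV otherwise.
 */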
2565int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2566 uint32_t subtype, struct vfio_region_info **info)
2567{
2568 int i;
2569
2570 for (i = 0; i < vbasedev->num_regions; i++) {
2571 struct vfio_info_cap_header *hdr;
2572 struct vfio_region_info_cap_type *cap_type;
2573
2574 if (vfio_get_region_info(vbasedev, i, info)) {
2575 continue;
2576 }
2577
2578 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2579 if (!hdr) {
2580 g_free(*info);
2581 continue;
2582 }
2583
2584 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2585
2586 trace_vfio_get_dev_region(vbasedev->name, i,
2587 cap_type->type, cap_type->subtype);
2588
2589 if (cap_type->type == type && cap_type->subtype == subtype) {
2590 return 0;
2591 }
2592
2593 g_free(*info);
2594 }
2595
2596 *info = NULL;
2597 return -ENODEV;
2598}
2599
2600bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2601{
2602 struct vfio_region_info *info = NULL;
2603 bool ret = false;
2604
2605 if (!vfio_get_region_info(vbasedev, region, &info)) {
2606 if (vfio_get_region_info_cap(info, cap_type)) {
2607 ret = true;
2608 }
2609 g_free(info);
2610 }
2611
2612 return ret;
2613}
2614
2615/*
2616 * Interfaces for IBM EEH (Enhanced Error Handling)
2617 */
2618static bool vfio_eeh_container_ok(VFIOContainer *container)
2619{
2620 /*
2621 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
2622 * implementation is broken if there are multiple groups in a
2623 * container. The hardware works in units of Partitionable
2624 * Endpoints (== IOMMU groups) and the EEH operations naively
2625 * iterate across all groups in the container, without any logic
2626 * to make sure the groups have their state synchronized. For
2627 * certain operations (ENABLE) that might be ok, until an error
2628 * occurs, but for others (GET_STATE) it's clearly broken.
2629 */
2630
2631 /*
2632 * XXX Once fixed kernels exist, test for them here
2633 */
2634
2635 if (QLIST_EMPTY(&container->group_list)) {
2636 return false;
2637 }
2638
2639 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
2640 return false;
2641 }
2642
2643 return true;
2644}
2645
2646static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
2647{
2648 struct vfio_eeh_pe_op pe_op = {
2649 .argsz = sizeof(pe_op),
2650 .op = op,
2651 };
2652 int ret;
2653
2654 if (!vfio_eeh_container_ok(container)) {
2655 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
2656 "kernel requires a container with exactly one group", op);
2657 return -EPERM;
2658 }
2659
2660 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
2661 if (ret < 0) {
2662 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
2663 return -errno;
2664 }
2665
d917e88d 2666 return ret;
2667}
2668
2669static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
2670{
2671 VFIOAddressSpace *space = vfio_get_address_space(as);
2672 VFIOContainer *container = NULL;
2673
2674 if (QLIST_EMPTY(&space->containers)) {
2675 /* No containers to act on */
2676 goto out;
2677 }
2678
2679 container = QLIST_FIRST(&space->containers);
2680
2681 if (QLIST_NEXT(container, next)) {
2682 /* We don't yet have logic to synchronize EEH state across
2683 * multiple containers */
2684 container = NULL;
2685 goto out;
2686 }
2687
2688out:
2689 vfio_put_address_space(space);
2690 return container;
2691}
2692
2693bool vfio_eeh_as_ok(AddressSpace *as)
2694{
2695 VFIOContainer *container = vfio_eeh_as_container(as);
2696
2697 return (container != NULL) && vfio_eeh_container_ok(container);
2698}
2699
2700int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
2701{
2702 VFIOContainer *container = vfio_eeh_as_container(as);
2703
2704 if (!container) {
2705 return -ENODEV;
2706 }
2707 return vfio_eeh_container_op(container, op);
2708}