1/*
2 * generic functions used by VFIO devices
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21#include "qemu/osdep.h"
22#include <sys/ioctl.h>
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
32#include "exec/ram_addr.h"
33#include "hw/hw.h"
34#include "qemu/error-report.h"
35#include "qemu/main-loop.h"
36#include "qemu/range.h"
37#include "sysemu/kvm.h"
38#include "sysemu/reset.h"
39#include "sysemu/runstate.h"
40#include "trace.h"
41#include "qapi/error.h"
42#include "migration/migration.h"
43#include "migration/misc.h"
44#include "migration/blocker.h"
45#include "migration/qemu-file.h"
46#include "sysemu/tpm.h"
47
48VFIOGroupList vfio_group_list =
49 QLIST_HEAD_INITIALIZER(vfio_group_list);
50static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
51 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
52
53#ifdef CONFIG_KVM
54/*
55 * We have a single VFIO pseudo device per KVM VM. Once created it lives
56 * for the life of the VM. Closing the file descriptor only drops our
57 * reference to it and the device's reference to kvm. Therefore once
58 * initialized, this file descriptor is only released on QEMU exit and
59 * we'll re-use it should another vfio device be attached before then.
60 */
61static int vfio_kvm_device_fd = -1;
62#endif
63
64/*
65 * Common VFIO interrupt disable
66 */
67void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
68{
69 struct vfio_irq_set irq_set = {
70 .argsz = sizeof(irq_set),
71 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
72 .index = index,
73 .start = 0,
74 .count = 0,
75 };
76
77 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
78}
79
80void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
81{
82 struct vfio_irq_set irq_set = {
83 .argsz = sizeof(irq_set),
84 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
85 .index = index,
86 .start = 0,
87 .count = 1,
88 };
89
90 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
91}
92
93void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
94{
95 struct vfio_irq_set irq_set = {
96 .argsz = sizeof(irq_set),
97 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
98 .index = index,
99 .start = 0,
100 .count = 1,
101 };
102
103 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
104}
105
106static inline const char *action_to_str(int action)
107{
108 switch (action) {
109 case VFIO_IRQ_SET_ACTION_MASK:
110 return "MASK";
111 case VFIO_IRQ_SET_ACTION_UNMASK:
112 return "UNMASK";
113 case VFIO_IRQ_SET_ACTION_TRIGGER:
114 return "TRIGGER";
115 default:
116 return "UNKNOWN ACTION";
117 }
118}
119
120static const char *index_to_str(VFIODevice *vbasedev, int index)
121{
122 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
123 return NULL;
124 }
125
126 switch (index) {
127 case VFIO_PCI_INTX_IRQ_INDEX:
128 return "INTX";
129 case VFIO_PCI_MSI_IRQ_INDEX:
130 return "MSI";
131 case VFIO_PCI_MSIX_IRQ_INDEX:
132 return "MSIX";
133 case VFIO_PCI_ERR_IRQ_INDEX:
134 return "ERR";
135 case VFIO_PCI_REQ_IRQ_INDEX:
136 return "REQ";
137 default:
138 return NULL;
139 }
140}
141
142static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
143{
144 switch (container->iommu_type) {
145 case VFIO_TYPE1v2_IOMMU:
146 case VFIO_TYPE1_IOMMU:
147 /*
148 * We support coordinated discarding of RAM via the RamDiscardManager.
149 */
150 return ram_block_uncoordinated_discard_disable(state);
151 default:
152 /*
153 * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
154 * RamDiscardManager, however, it is completely untested.
155 *
156 * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
157 * completely the opposite of managing mapping/pinning dynamically as
158 * required by RamDiscardManager. We would have to special-case sections
159 * with a RamDiscardManager.
160 */
161 return ram_block_discard_disable(state);
162 }
163}
164
165int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
166 int action, int fd, Error **errp)
167{
168 struct vfio_irq_set *irq_set;
169 int argsz, ret = 0;
170 const char *name;
171 int32_t *pfd;
172
173 argsz = sizeof(*irq_set) + sizeof(*pfd);
174
175 irq_set = g_malloc0(argsz);
176 irq_set->argsz = argsz;
177 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
178 irq_set->index = index;
179 irq_set->start = subindex;
180 irq_set->count = 1;
181 pfd = (int32_t *)&irq_set->data;
182 *pfd = fd;
183
184 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
185 ret = -errno;
186 }
187 g_free(irq_set);
188
189 if (!ret) {
190 return 0;
191 }
192
193 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
194
195 name = index_to_str(vbasedev, index);
196 if (name) {
197 error_prepend(errp, "%s-%d: ", name, subindex);
198 } else {
199 error_prepend(errp, "index %d-%d: ", index, subindex);
200 }
201 error_prepend(errp,
202 "Failed to %s %s eventfd signaling for interrupt ",
203 fd < 0 ? "tear down" : "set up", action_to_str(action));
204 return ret;
205}
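/*
 * Illustrative caller sketch (not part of this file; "notifier" is an
 * assumed, already-initialized EventNotifier): wiring an eventfd to
 * trigger MSI vector 0 of a device.
 *
 *     Error *err = NULL;
 *
 *     if (vfio_set_irq_signaling(vbasedev, VFIO_PCI_MSI_IRQ_INDEX, 0,
 *                                VFIO_IRQ_SET_ACTION_TRIGGER,
 *                                event_notifier_get_fd(&notifier), &err)) {
 *         error_reportf_err(err, VFIO_MSG_PREFIX, vbasedev->name);
 *     }
 *
 * Calling it again with fd = -1 and the same index/action tears the
 * signaling down, as reflected in the error message built above.
 */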
206
207/*
208 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
209 */
210void vfio_region_write(void *opaque, hwaddr addr,
211 uint64_t data, unsigned size)
212{
213 VFIORegion *region = opaque;
214 VFIODevice *vbasedev = region->vbasedev;
215 union {
216 uint8_t byte;
217 uint16_t word;
218 uint32_t dword;
219 uint64_t qword;
220 } buf;
221
222 switch (size) {
223 case 1:
224 buf.byte = data;
225 break;
226 case 2:
227 buf.word = cpu_to_le16(data);
228 break;
229 case 4:
230 buf.dword = cpu_to_le32(data);
231 break;
232 case 8:
233 buf.qword = cpu_to_le64(data);
234 break;
235 default:
236 hw_error("vfio: unsupported write size, %u bytes", size);
237 break;
238 }
239
240 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
241 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
242 ",%d) failed: %m",
243 __func__, vbasedev->name, region->nr,
244 addr, data, size);
245 }
246
247 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
248
249 /*
250 * A read or write to a BAR always signals an INTx EOI. This will
251 * do nothing if not pending (including not in INTx mode). We assume
252 * that a BAR access is in response to an interrupt and that BAR
253 * accesses will service the interrupt. Unfortunately, we don't know
254 * which access will service the interrupt, so we're potentially
255 * getting quite a few host interrupts per guest interrupt.
256 */
257 vbasedev->ops->vfio_eoi(vbasedev);
258}
259
260uint64_t vfio_region_read(void *opaque,
261 hwaddr addr, unsigned size)
262{
263 VFIORegion *region = opaque;
264 VFIODevice *vbasedev = region->vbasedev;
265 union {
266 uint8_t byte;
267 uint16_t word;
268 uint32_t dword;
269 uint64_t qword;
270 } buf;
271 uint64_t data = 0;
272
273 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
274 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
275 __func__, vbasedev->name, region->nr,
276 addr, size);
277 return (uint64_t)-1;
278 }
279 switch (size) {
280 case 1:
281 data = buf.byte;
282 break;
283 case 2:
284 data = le16_to_cpu(buf.word);
285 break;
286 case 4:
287 data = le32_to_cpu(buf.dword);
288 break;
289 case 8:
290 data = le64_to_cpu(buf.qword);
291 break;
292 default:
293 hw_error("vfio: unsupported read size, %u bytes", size);
294 break;
295 }
296
297 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
298
299 /* Same as write above */
300 vbasedev->ops->vfio_eoi(vbasedev);
301
302 return data;
303}
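/*
 * Example of the endianness handling above: a 4-byte guest read at BAR
 * offset 0x10 becomes pread(fd, &buf, 4, region->fd_offset + 0x10); the
 * device data is little endian per the VFIO ABI, so le32_to_cpu() yields
 * the value in host byte order on both little- and big-endian hosts.
 */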
304
305const MemoryRegionOps vfio_region_ops = {
306 .read = vfio_region_read,
307 .write = vfio_region_write,
308 .endianness = DEVICE_LITTLE_ENDIAN,
309 .valid = {
310 .min_access_size = 1,
311 .max_access_size = 8,
312 },
313 .impl = {
314 .min_access_size = 1,
315 .max_access_size = 8,
316 },
317};
318
319/*
320 * Device state interfaces
321 */
322
323typedef struct {
324 unsigned long *bitmap;
325 hwaddr size;
326 hwaddr pages;
327} VFIOBitmap;
328
329static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
330{
331 vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
332 vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
333 BITS_PER_BYTE;
334 vbmap->bitmap = g_try_malloc0(vbmap->size);
335 if (!vbmap->bitmap) {
336 return -ENOMEM;
337 }
338
339 return 0;
340}
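/*
 * Worked example for the sizing above, assuming a 4 KiB host page size:
 * for size = 1 GiB, vbmap->pages = 1 GiB / 4 KiB = 262144 bits to track,
 * and vbmap->size is rounded up to a multiple of 64 bits, giving
 * 262144 / 8 = 32768 bytes of bitmap.
 */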
341
342bool vfio_mig_active(void)
343{
344 VFIOGroup *group;
345 VFIODevice *vbasedev;
346
347 if (QLIST_EMPTY(&vfio_group_list)) {
348 return false;
349 }
350
351 QLIST_FOREACH(group, &vfio_group_list, next) {
352 QLIST_FOREACH(vbasedev, &group->device_list, next) {
353 if (vbasedev->migration_blocker) {
354 return false;
355 }
356 }
357 }
358 return true;
359}
360
361static Error *multiple_devices_migration_blocker;
362
363static unsigned int vfio_migratable_device_num(void)
364{
365 VFIOGroup *group;
366 VFIODevice *vbasedev;
367 unsigned int device_num = 0;
368
369 QLIST_FOREACH(group, &vfio_group_list, next) {
370 QLIST_FOREACH(vbasedev, &group->device_list, next) {
371 if (vbasedev->migration) {
372 device_num++;
373 }
374 }
375 }
376
377 return device_num;
378}
379
380int vfio_block_multiple_devices_migration(Error **errp)
381{
382 int ret;
383
384 if (multiple_devices_migration_blocker ||
385 vfio_migratable_device_num() <= 1) {
386 return 0;
387 }
388
389 error_setg(&multiple_devices_migration_blocker,
390 "Migration is currently not supported with multiple "
391 "VFIO devices");
392 ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
393 if (ret < 0) {
394 error_free(multiple_devices_migration_blocker);
395 multiple_devices_migration_blocker = NULL;
396 }
397
398 return ret;
399}
400
401void vfio_unblock_multiple_devices_migration(void)
402{
403 if (!multiple_devices_migration_blocker ||
404 vfio_migratable_device_num() > 1) {
405 return;
406 }
407
408 migrate_del_blocker(multiple_devices_migration_blocker);
409 error_free(multiple_devices_migration_blocker);
410 multiple_devices_migration_blocker = NULL;
411}
412
413static void vfio_set_migration_error(int err)
414{
415 MigrationState *ms = migrate_get_current();
416
417 if (migration_is_setup_or_active(ms->state)) {
418 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
419 if (ms->to_dst_file) {
420 qemu_file_set_error(ms->to_dst_file, err);
421 }
422 }
423 }
424}
425
426static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
427{
428 VFIOGroup *group;
429 VFIODevice *vbasedev;
430 MigrationState *ms = migrate_get_current();
431
432 if (!migration_is_setup_or_active(ms->state)) {
433 return false;
434 }
435
436 QLIST_FOREACH(group, &container->group_list, container_next) {
437 QLIST_FOREACH(vbasedev, &group->device_list, next) {
438 VFIOMigration *migration = vbasedev->migration;
439
440 if (!migration) {
441 return false;
442 }
443
444 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
445 migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
446 return false;
447 }
448 }
449 }
450 return true;
451}
452
453/*
454 * Check if all VFIO devices are running and migration is active, which is
455 * essentially equivalent to the migration being in pre-copy phase.
456 */
457static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
458{
459 VFIOGroup *group;
460 VFIODevice *vbasedev;
461
462 if (!migration_is_active(migrate_get_current())) {
463 return false;
464 }
465
466 QLIST_FOREACH(group, &container->group_list, container_next) {
467 QLIST_FOREACH(vbasedev, &group->device_list, next) {
468 VFIOMigration *migration = vbasedev->migration;
469
470 if (!migration) {
471 return false;
472 }
473
474 if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
475 continue;
476 } else {
477 return false;
478 }
479 }
480 }
481 return true;
482}
483
484static int vfio_dma_unmap_bitmap(VFIOContainer *container,
485 hwaddr iova, ram_addr_t size,
486 IOMMUTLBEntry *iotlb)
487{
488 struct vfio_iommu_type1_dma_unmap *unmap;
489 struct vfio_bitmap *bitmap;
490 VFIOBitmap vbmap;
491 int ret;
492
493 ret = vfio_bitmap_alloc(&vbmap, size);
494 if (ret) {
495 return ret;
496 }
497
498 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
499
500 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
501 unmap->iova = iova;
502 unmap->size = size;
503 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
504 bitmap = (struct vfio_bitmap *)&unmap->data;
505
506 /*
507 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
508 * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
509 * to qemu_real_host_page_size.
510 */
511 bitmap->pgsize = qemu_real_host_page_size();
512 bitmap->size = vbmap.size;
513 bitmap->data = (__u64 *)vbmap.bitmap;
514
515 if (vbmap.size > container->max_dirty_bitmap_size) {
516 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
517 ret = -E2BIG;
518 goto unmap_exit;
519 }
520
521 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
522 if (!ret) {
523 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
524 iotlb->translated_addr, vbmap.pages);
525 } else {
526 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
527 }
528
529unmap_exit:
530 g_free(unmap);
531 g_free(vbmap.bitmap);
532
533 return ret;
534}
535
536/*
537 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
538 */
539static int vfio_dma_unmap(VFIOContainer *container,
540 hwaddr iova, ram_addr_t size,
541 IOMMUTLBEntry *iotlb)
542{
543 struct vfio_iommu_type1_dma_unmap unmap = {
544 .argsz = sizeof(unmap),
545 .flags = 0,
546 .iova = iova,
547 .size = size,
548 };
549
550 if (iotlb && container->dirty_pages_supported &&
551 vfio_devices_all_running_and_mig_active(container)) {
552 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
553 }
554
555 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
556 /*
557 * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
558 * v4.15) where an overflow in its wrap-around check prevents us from
559 * unmapping the last page of the address space. Test for the error
560 * condition and re-try the unmap excluding the last page. The
561 * expectation is that we've never mapped the last page anyway and this
562 * unmap request comes via vIOMMU support which also makes it unlikely
563 * that this page is used. This bug was introduced well after type1 v2
564 * support was introduced, so we shouldn't need to test for v1. A fix
565 * is queued for kernel v5.0 so this workaround can be removed once
566 * affected kernels are sufficiently deprecated.
567 */
568 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
569 container->iommu_type == VFIO_TYPE1v2_IOMMU) {
570 trace_vfio_dma_unmap_overflow_workaround();
571 unmap.size -= 1ULL << ctz64(container->pgsizes);
572 continue;
573 }
574 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
575 return -errno;
576 }
577
578 if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
579 cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
580 tcg_enabled() ? DIRTY_CLIENTS_ALL :
581 DIRTY_CLIENTS_NOCODE);
582 }
583
584 return 0;
585}
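/*
 * Example of the workaround above (assumed values, 4 KiB minimum IOMMU page
 * size): an unmap request with iova = 0xfffffffff0000000 and
 * size = 0x10000000 has iova + size wrap around to 0, so affected kernels
 * reject it with EINVAL. The retry shrinks size by
 * 1ULL << ctz64(pgsizes) = 0x1000, unmapping everything except the very last
 * host page of the address space.
 */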
586
587static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
588 ram_addr_t size, void *vaddr, bool readonly)
589{
590 struct vfio_iommu_type1_dma_map map = {
591 .argsz = sizeof(map),
592 .flags = VFIO_DMA_MAP_FLAG_READ,
593 .vaddr = (__u64)(uintptr_t)vaddr,
594 .iova = iova,
595 .size = size,
596 };
597
598 if (!readonly) {
599 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
600 }
601
602 /*
603 * Try the mapping, if it fails with EBUSY, unmap the region and try
604 * again. This shouldn't be necessary, but we sometimes see it in
605 * the VGA ROM space.
606 */
607 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
608 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
609 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
610 return 0;
611 }
612
613 error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
614 return -errno;
615}
616
617static void vfio_host_win_add(VFIOContainer *container,
618 hwaddr min_iova, hwaddr max_iova,
619 uint64_t iova_pgsizes)
620{
621 VFIOHostDMAWindow *hostwin;
622
623 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
624 if (ranges_overlap(hostwin->min_iova,
625 hostwin->max_iova - hostwin->min_iova + 1,
626 min_iova,
627 max_iova - min_iova + 1)) {
628 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
629 }
630 }
631
632 hostwin = g_malloc0(sizeof(*hostwin));
633
634 hostwin->min_iova = min_iova;
635 hostwin->max_iova = max_iova;
636 hostwin->iova_pgsizes = iova_pgsizes;
637 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
638}
639
640static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
641 hwaddr max_iova)
642{
643 VFIOHostDMAWindow *hostwin;
644
645 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
646 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
647 QLIST_REMOVE(hostwin, hostwin_next);
648 g_free(hostwin);
649 return 0;
650 }
651 }
652
653 return -1;
654}
655
656static bool vfio_listener_skipped_section(MemoryRegionSection *section)
657{
658 return (!memory_region_is_ram(section->mr) &&
659 !memory_region_is_iommu(section->mr)) ||
660 memory_region_is_protected(section->mr) ||
661 /*
662 * Sizing an enabled 64-bit BAR can cause spurious mappings to
663 * addresses in the upper part of the 64-bit address space. These
664 * are never accessed by the CPU and beyond the address width of
665 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
666 */
667 section->offset_within_address_space & (1ULL << 63);
668}
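/*
 * Example of the BAR-sizing case above (hypothetical address): while the
 * guest sizes an enabled 64-bit BAR it temporarily writes all 1s to it,
 * which can momentarily surface a section at e.g. 0xffffffffff000000;
 * bit 63 is set, so the section is skipped rather than pushed to the IOMMU.
 */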
669
670/* Called with rcu_read_lock held. */
671static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
672 ram_addr_t *ram_addr, bool *read_only)
673{
674 bool ret, mr_has_discard_manager;
675
676 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
677 &mr_has_discard_manager);
678 if (ret && mr_has_discard_manager) {
679 /*
680 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
681 * pages will remain pinned inside vfio until unmapped, resulting in a
682 * higher memory consumption than expected. If memory would get
683 * populated again later, there would be an inconsistency between pages
684 * pinned by vfio and pages seen by QEMU. This is the case until
685 * unmapped from the IOMMU (e.g., during device reset).
686 *
687 * With malicious guests, we really only care about pinning more memory
688 * than expected. RLIMIT_MEMLOCK set for the user/process can never be
689 * exceeded and can be used to mitigate this problem.
690 */
691 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
692 " RAM (e.g., virtio-mem) works, however, malicious"
693 " guests can trigger pinning of more memory than"
694 " intended via an IOMMU. It's possible to mitigate "
695 " by setting/adjusting RLIMIT_MEMLOCK.");
696 }
697 return ret;
698}
699
700static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
701{
702 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
703 VFIOContainer *container = giommu->container;
704 hwaddr iova = iotlb->iova + giommu->iommu_offset;
705 void *vaddr;
706 int ret;
707
708 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
709 iova, iova + iotlb->addr_mask);
710
711 if (iotlb->target_as != &address_space_memory) {
712 error_report("Wrong target AS \"%s\", only system memory is allowed",
713 iotlb->target_as->name ? iotlb->target_as->name : "none");
714 vfio_set_migration_error(-EINVAL);
715 return;
716 }
717
718 rcu_read_lock();
719
720 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
721 bool read_only;
722
723 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
724 goto out;
725 }
726 /*
727 * vaddr is only valid until rcu_read_unlock(). But after
728 * vfio_dma_map has set up the mapping the pages will be
729 * pinned by the kernel. This makes sure that the RAM backend
730 * of vaddr will always be there, even if the memory object is
731 * destroyed and its backing memory munmap-ed.
732 */
733 ret = vfio_dma_map(container, iova,
734 iotlb->addr_mask + 1, vaddr,
735 read_only);
736 if (ret) {
737 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
738 "0x%"HWADDR_PRIx", %p) = %d (%s)",
739 container, iova,
740 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
741 }
742 } else {
743 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
744 if (ret) {
745 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
746 "0x%"HWADDR_PRIx") = %d (%s)",
747 container, iova,
748 iotlb->addr_mask + 1, ret, strerror(-ret));
749 vfio_set_migration_error(ret);
750 }
751 }
752out:
753 rcu_read_unlock();
754}
755
756static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
757 MemoryRegionSection *section)
758{
759 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
760 listener);
761 const hwaddr size = int128_get64(section->size);
762 const hwaddr iova = section->offset_within_address_space;
763 int ret;
764
765 /* Unmap with a single call. */
766 ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
767 if (ret) {
768 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
769 strerror(-ret));
770 }
771}
772
773static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
774 MemoryRegionSection *section)
775{
776 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
777 listener);
778 const hwaddr end = section->offset_within_region +
779 int128_get64(section->size);
780 hwaddr start, next, iova;
781 void *vaddr;
782 int ret;
783
784 /*
785 * Map in (aligned within memory region) minimum granularity, so we can
786 * unmap in minimum granularity later.
787 */
788 for (start = section->offset_within_region; start < end; start = next) {
789 next = ROUND_UP(start + 1, vrdl->granularity);
790 next = MIN(next, end);
791
792 iova = start - section->offset_within_region +
793 section->offset_within_address_space;
794 vaddr = memory_region_get_ram_ptr(section->mr) + start;
795
796 ret = vfio_dma_map(vrdl->container, iova, next - start,
797 vaddr, section->readonly);
798 if (ret) {
799 /* Rollback */
800 vfio_ram_discard_notify_discard(rdl, section);
801 return ret;
802 }
803 }
804 return 0;
805}
806
807static void vfio_register_ram_discard_listener(VFIOContainer *container,
808 MemoryRegionSection *section)
809{
810 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
811 VFIORamDiscardListener *vrdl;
812
813 /* Ignore some corner cases not relevant in practice. */
814 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
815 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
816 TARGET_PAGE_SIZE));
817 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
818
819 vrdl = g_new0(VFIORamDiscardListener, 1);
820 vrdl->container = container;
821 vrdl->mr = section->mr;
822 vrdl->offset_within_address_space = section->offset_within_address_space;
823 vrdl->size = int128_get64(section->size);
824 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
825 section->mr);
826
827 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
828 g_assert(container->pgsizes &&
829 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
830
831 ram_discard_listener_init(&vrdl->listener,
832 vfio_ram_discard_notify_populate,
833 vfio_ram_discard_notify_discard, true);
834 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
835 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
836
837 /*
838 * Sanity-check if we have a theoretically problematic setup where we could
839 * exceed the maximum number of possible DMA mappings over time. We assume
840 * that each mapped section in the same address space as a RamDiscardManager
841 * section consumes exactly one DMA mapping, with the exception of
842 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
843 * in the same address space as RamDiscardManager sections.
844 *
845 * We assume that each section in the address space consumes one memslot.
846 * We take the number of KVM memory slots as a best guess for the maximum
847 * number of sections in the address space we could have over time,
848 * also consuming DMA mappings.
849 */
850 if (container->dma_max_mappings) {
851 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
852
853#ifdef CONFIG_KVM
854 if (kvm_enabled()) {
855 max_memslots = kvm_get_max_memslots();
856 }
857#endif
858
859 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
860 hwaddr start, end;
861
862 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
863 vrdl->granularity);
864 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
865 vrdl->granularity);
866 vrdl_mappings += (end - start) / vrdl->granularity;
867 vrdl_count++;
868 }
869
870 if (vrdl_mappings + max_memslots - vrdl_count >
871 container->dma_max_mappings) {
872 warn_report("%s: possibly running out of DMA mappings. E.g., try"
873 " increasing the 'block-size' of virtio-mem devices."
874 " Maximum possible DMA mappings: %d, Maximum possible"
875 " memslots: %d", __func__, container->dma_max_mappings,
876 max_memslots);
877 }
878 }
879}
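/*
 * Illustrative numbers for the sanity check above (assumed, not taken from a
 * real setup): with the typical type1 default of dma_max_mappings = 65535,
 * max_memslots = 512 and a single 256 GiB virtio-mem region using a 2 MiB
 * block size, vrdl_mappings = 256 GiB / 2 MiB = 131072, so
 * 131072 + 512 - 1 > 65535 and the warning is emitted.
 */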
880
881static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
882 MemoryRegionSection *section)
883{
884 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
885 VFIORamDiscardListener *vrdl = NULL;
886
887 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
888 if (vrdl->mr == section->mr &&
889 vrdl->offset_within_address_space ==
890 section->offset_within_address_space) {
891 break;
892 }
893 }
894
895 if (!vrdl) {
896 hw_error("vfio: Trying to unregister missing RAM discard listener");
897 }
898
899 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
900 QLIST_REMOVE(vrdl, next);
901 g_free(vrdl);
902}
903
904static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
905 hwaddr iova, hwaddr end)
906{
907 VFIOHostDMAWindow *hostwin;
908 bool hostwin_found = false;
909
910 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
911 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
912 hostwin_found = true;
913 break;
914 }
915 }
916
917 return hostwin_found ? hostwin : NULL;
918}
919
920static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
921{
922 MemoryRegion *mr = section->mr;
923
924 if (!TPM_IS_CRB(mr->owner)) {
925 return false;
926 }
927
928 /* this is a known safe misaligned region, just trace for debug purpose */
929 trace_vfio_known_safe_misalignment(memory_region_name(mr),
930 section->offset_within_address_space,
931 section->offset_within_region,
932 qemu_real_host_page_size());
933 return true;
934}
935
936static bool vfio_listener_valid_section(MemoryRegionSection *section,
937 const char *name)
938{
939 if (vfio_listener_skipped_section(section)) {
940 trace_vfio_listener_region_skip(name,
941 section->offset_within_address_space,
942 section->offset_within_address_space +
943 int128_get64(int128_sub(section->size, int128_one())));
944 return false;
945 }
946
947 if (unlikely((section->offset_within_address_space &
948 ~qemu_real_host_page_mask()) !=
949 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
950 if (!vfio_known_safe_misalignment(section)) {
951 error_report("%s received unaligned region %s iova=0x%"PRIx64
952 " offset_within_region=0x%"PRIx64
953 " qemu_real_host_page_size=0x%"PRIxPTR,
954 __func__, memory_region_name(section->mr),
955 section->offset_within_address_space,
956 section->offset_within_region,
957 qemu_real_host_page_size());
958 }
959 return false;
960 }
961
962 return true;
963}
964
965static void vfio_listener_region_add(MemoryListener *listener,
966 MemoryRegionSection *section)
967{
968 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
969 hwaddr iova, end;
970 Int128 llend, llsize;
971 void *vaddr;
972 int ret;
973 VFIOHostDMAWindow *hostwin;
974 Error *err = NULL;
975
976 if (!vfio_listener_valid_section(section, "region_add")) {
977 return;
978 }
979
980 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
981 llend = int128_make64(section->offset_within_address_space);
982 llend = int128_add(llend, section->size);
983 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
984
985 if (int128_ge(int128_make64(iova), llend)) {
986 if (memory_region_is_ram_device(section->mr)) {
987 trace_vfio_listener_region_add_no_dma_map(
988 memory_region_name(section->mr),
989 section->offset_within_address_space,
990 int128_getlo(section->size),
991 qemu_real_host_page_size());
992 }
993 return;
994 }
995 end = int128_get64(int128_sub(llend, int128_one()));
996
997 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
998 hwaddr pgsize = 0;
999
1000 /* For now intersections are not allowed, we may relax this later */
1001 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1002 if (ranges_overlap(hostwin->min_iova,
1003 hostwin->max_iova - hostwin->min_iova + 1,
1004 section->offset_within_address_space,
1005 int128_get64(section->size))) {
1006 error_setg(&err,
1007 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
1008 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
1009 section->offset_within_address_space,
1010 section->offset_within_address_space +
1011 int128_get64(section->size) - 1,
1012 hostwin->min_iova, hostwin->max_iova);
1013 goto fail;
1014 }
1015 }
1016
1017 ret = vfio_spapr_create_window(container, section, &pgsize);
1018 if (ret) {
1019 error_setg_errno(&err, -ret, "Failed to create SPAPR window");
1020 goto fail;
1021 }
1022
1023 vfio_host_win_add(container, section->offset_within_address_space,
1024 section->offset_within_address_space +
1025 int128_get64(section->size) - 1, pgsize);
1026#ifdef CONFIG_KVM
1027 if (kvm_enabled()) {
1028 VFIOGroup *group;
1029 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1030 struct kvm_vfio_spapr_tce param;
1031 struct kvm_device_attr attr = {
1032 .group = KVM_DEV_VFIO_GROUP,
1033 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
1034 .addr = (uint64_t)(unsigned long)&param,
1035 };
1036
1037 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
1038 &param.tablefd)) {
1039 QLIST_FOREACH(group, &container->group_list, container_next) {
1040 param.groupfd = group->fd;
1041 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1042 error_report("vfio: failed to setup fd %d "
1043 "for a group with fd %d: %s",
1044 param.tablefd, param.groupfd,
1045 strerror(errno));
1046 return;
1047 }
1048 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
1049 }
1050 }
1051 }
1052#endif
1053 }
1054
1055 hostwin = vfio_find_hostwin(container, iova, end);
1056 if (!hostwin) {
1057 error_setg(&err, "Container %p can't map guest IOVA region"
1058 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
1059 goto fail;
1060 }
1061
1062 memory_region_ref(section->mr);
1063
1064 if (memory_region_is_iommu(section->mr)) {
1065 VFIOGuestIOMMU *giommu;
1066 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1067 int iommu_idx;
1068
1069 trace_vfio_listener_region_add_iommu(iova, end);
1070 /*
1071 * FIXME: For VFIO iommu types which have KVM acceleration to
1072 * avoid bouncing all map/unmaps through qemu this way, this
1073 * would be the right place to wire that up (tell the KVM
1074 * device emulation the VFIO iommu handles to use).
1075 */
1076 giommu = g_malloc0(sizeof(*giommu));
1077 giommu->iommu_mr = iommu_mr;
1078 giommu->iommu_offset = section->offset_within_address_space -
1079 section->offset_within_region;
1080 giommu->container = container;
1081 llend = int128_add(int128_make64(section->offset_within_region),
1082 section->size);
1083 llend = int128_sub(llend, int128_one());
1084 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
1085 MEMTXATTRS_UNSPECIFIED);
1086 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
1087 IOMMU_NOTIFIER_IOTLB_EVENTS,
1088 section->offset_within_region,
1089 int128_get64(llend),
1090 iommu_idx);
1091
1092 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
1093 container->pgsizes,
1094 &err);
1095 if (ret) {
1096 g_free(giommu);
1097 goto fail;
1098 }
1099
1100 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
1101 &err);
1102 if (ret) {
1103 g_free(giommu);
1104 goto fail;
1105 }
1106 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
1107 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
1108
1109 return;
1110 }
1111
1112 /* Here we assume that memory_region_is_ram(section->mr)==true */
1113
1114 /*
1115 * For RAM memory regions with a RamDiscardManager, we only want to map the
1116 * actually populated parts - and update the mapping whenever we're notified
1117 * about changes.
1118 */
1119 if (memory_region_has_ram_discard_manager(section->mr)) {
1120 vfio_register_ram_discard_listener(container, section);
1121 return;
1122 }
1123
1124 vaddr = memory_region_get_ram_ptr(section->mr) +
1125 section->offset_within_region +
1126 (iova - section->offset_within_address_space);
1127
1128 trace_vfio_listener_region_add_ram(iova, end, vaddr);
1129
1130 llsize = int128_sub(llend, int128_make64(iova));
1131
1132 if (memory_region_is_ram_device(section->mr)) {
1133 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1134
1135 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
1136 trace_vfio_listener_region_add_no_dma_map(
1137 memory_region_name(section->mr),
1138 section->offset_within_address_space,
1139 int128_getlo(section->size),
1140 pgmask + 1);
1141 return;
1142 }
1143 }
1144
1145 ret = vfio_dma_map(container, iova, int128_get64(llsize),
1146 vaddr, section->readonly);
1147 if (ret) {
1148 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
1149 "0x%"HWADDR_PRIx", %p) = %d (%s)",
1150 container, iova, int128_get64(llsize), vaddr, ret,
1151 strerror(-ret));
1152 if (memory_region_is_ram_device(section->mr)) {
1153 /* Allow unexpected mappings not to be fatal for RAM devices */
1154 error_report_err(err);
1155 return;
1156 }
1157 goto fail;
1158 }
1159
1160 return;
1161
1162fail:
1163 if (memory_region_is_ram_device(section->mr)) {
1164 error_report("failed to vfio_dma_map. pci p2p may not work");
1165 return;
1166 }
1167 /*
1168 * On the initfn path, store the first error in the container so we
1169 * can gracefully fail. Runtime, there's not much we can do other
1170 * than throw a hardware error.
1171 */
1172 if (!container->initialized) {
1173 if (!container->error) {
1174 error_propagate_prepend(&container->error, err,
1175 "Region %s: ",
1176 memory_region_name(section->mr));
1177 } else {
1178 error_free(err);
1179 }
1180 } else {
1181 error_report_err(err);
1182 hw_error("vfio: DMA mapping failed, unable to continue");
1183 }
1184}
1185
1186static void vfio_listener_region_del(MemoryListener *listener,
1187 MemoryRegionSection *section)
1188{
1189 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1190 hwaddr iova, end;
1191 Int128 llend, llsize;
1192 int ret;
1193 bool try_unmap = true;
1194
1195 if (!vfio_listener_valid_section(section, "region_del")) {
1196 return;
1197 }
1198
1199 if (memory_region_is_iommu(section->mr)) {
1200 VFIOGuestIOMMU *giommu;
1201
1202 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1203 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1204 giommu->n.start == section->offset_within_region) {
1205 memory_region_unregister_iommu_notifier(section->mr,
1206 &giommu->n);
1207 QLIST_REMOVE(giommu, giommu_next);
1208 g_free(giommu);
1209 break;
1210 }
1211 }
1212
1213 /*
1214 * FIXME: We assume the one big unmap below is adequate to
1215 * remove any individual page mappings in the IOMMU which
1216 * might have been copied into VFIO. This works for a page table
1217 * based IOMMU where a big unmap flattens a large range of IO-PTEs.
1218 * That may not be true for all IOMMU types.
1219 */
1220 }
1221
1222 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
1223 llend = int128_make64(section->offset_within_address_space);
1224 llend = int128_add(llend, section->size);
1225 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
1226
1227 if (int128_ge(int128_make64(iova), llend)) {
1228 return;
1229 }
1230 end = int128_get64(int128_sub(llend, int128_one()));
1231
1232 llsize = int128_sub(llend, int128_make64(iova));
1233
1234 trace_vfio_listener_region_del(iova, end);
1235
1236 if (memory_region_is_ram_device(section->mr)) {
1237 hwaddr pgmask;
1238 VFIOHostDMAWindow *hostwin;
1239
1240 hostwin = vfio_find_hostwin(container, iova, end);
1241 assert(hostwin); /* or region_add() would have failed */
1242
1243 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1244 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
1245 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1246 vfio_unregister_ram_discard_listener(container, section);
1247 /* Unregistering will trigger an unmap. */
1248 try_unmap = false;
1249 }
1250
1251 if (try_unmap) {
1252 if (int128_eq(llsize, int128_2_64())) {
1253 /* The unmap ioctl doesn't accept a full 64-bit span. */
1254 llsize = int128_rshift(llsize, 1);
1255 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1256 if (ret) {
1257 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1258 "0x%"HWADDR_PRIx") = %d (%s)",
1259 container, iova, int128_get64(llsize), ret,
1260 strerror(-ret));
1261 }
1262 iova += int128_get64(llsize);
1263 }
1264 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1265 if (ret) {
1266 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1267 "0x%"HWADDR_PRIx") = %d (%s)",
1268 container, iova, int128_get64(llsize), ret,
1269 strerror(-ret));
1270 }
1271 }
1272
1273 memory_region_unref(section->mr);
1274
1275 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1276 vfio_spapr_remove_window(container,
1277 section->offset_within_address_space);
1278 if (vfio_host_win_del(container,
1279 section->offset_within_address_space,
1280 section->offset_within_address_space +
1281 int128_get64(section->size) - 1) < 0) {
1282 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1283 __func__, section->offset_within_address_space);
1284 }
1285 }
1286}
1287
1288static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1289{
1290 int ret;
1291 struct vfio_iommu_type1_dirty_bitmap dirty = {
1292 .argsz = sizeof(dirty),
1293 };
1294
1295 if (!container->dirty_pages_supported) {
1296 return 0;
1297 }
1298
1299 if (start) {
1300 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1301 } else {
1302 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1303 }
1304
1305 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1306 if (ret) {
1307 ret = -errno;
1308 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1309 dirty.flags, errno);
1310 }
1311
1312 return ret;
1313}
1314
1315static void vfio_listener_log_global_start(MemoryListener *listener)
1316{
1317 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1318 int ret;
1319
1320 ret = vfio_set_dirty_page_tracking(container, true);
1321 if (ret) {
1322 vfio_set_migration_error(ret);
1323 }
1324}
1325
1326static void vfio_listener_log_global_stop(MemoryListener *listener)
1327{
1328 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1329 int ret;
1330
1331 ret = vfio_set_dirty_page_tracking(container, false);
1332 if (ret) {
1333 vfio_set_migration_error(ret);
1334 }
1335}
1336
1337static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1338 uint64_t size, ram_addr_t ram_addr)
1339{
1340 struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1341 struct vfio_iommu_type1_dirty_bitmap_get *range;
1342 VFIOBitmap vbmap;
1343 int ret;
1344
1345 if (!container->dirty_pages_supported) {
1346 cpu_physical_memory_set_dirty_range(ram_addr, size,
1347 tcg_enabled() ? DIRTY_CLIENTS_ALL :
1348 DIRTY_CLIENTS_NOCODE);
1349 return 0;
1350 }
1351
1352 ret = vfio_bitmap_alloc(&vbmap, size);
1353 if (ret) {
1354 return ret;
1355 }
1356
1357 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1358
1359 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1360 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1361 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1362 range->iova = iova;
1363 range->size = size;
1364
1365 /*
1366 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
1367 * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
1368 * to qemu_real_host_page_size.
1369 */
1370 range->bitmap.pgsize = qemu_real_host_page_size();
1371 range->bitmap.size = vbmap.size;
1372 range->bitmap.data = (__u64 *)vbmap.bitmap;
1373
1374 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1375 if (ret) {
1376 ret = -errno;
1377 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1378 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1379 (uint64_t)range->size, errno);
1380 goto err_out;
1381 }
1382
1383 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
1384 vbmap.pages);
1385
1386 trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
1387 range->bitmap.size, ram_addr);
1388err_out:
1389 g_free(dbitmap);
1390 g_free(vbmap.bitmap);
1391
1392 return ret;
1393}
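/*
 * Scale example (assuming 4 KiB host pages): syncing a 2 MiB range needs a
 * 512-bit bitmap, i.e. vbmap.size = 64 bytes. Bit n of the returned bitmap
 * covers the host page at iova + n * 4 KiB and, when set, marks
 * ram_addr + n * 4 KiB dirty in QEMU's dirty memory tracking via
 * cpu_physical_memory_set_dirty_lebitmap().
 */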
1394
1395typedef struct {
1396 IOMMUNotifier n;
1397 VFIOGuestIOMMU *giommu;
1398} vfio_giommu_dirty_notifier;
1399
1400static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1401{
1402 vfio_giommu_dirty_notifier *gdn = container_of(n,
1403 vfio_giommu_dirty_notifier, n);
1404 VFIOGuestIOMMU *giommu = gdn->giommu;
1405 VFIOContainer *container = giommu->container;
1406 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1407 ram_addr_t translated_addr;
1408 int ret = -EINVAL;
1409
1410 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1411
1412 if (iotlb->target_as != &address_space_memory) {
1413 error_report("Wrong target AS \"%s\", only system memory is allowed",
1414 iotlb->target_as->name ? iotlb->target_as->name : "none");
1415 goto out;
1416 }
1417
1418 rcu_read_lock();
1419 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
1420 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1421 translated_addr);
1422 if (ret) {
1423 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1424 "0x%"HWADDR_PRIx") = %d (%s)",
1425 container, iova, iotlb->addr_mask + 1, ret,
1426 strerror(-ret));
1427 }
1428 }
1429 rcu_read_unlock();
1430
1431out:
1432 if (ret) {
1433 vfio_set_migration_error(ret);
1434 }
1435}
1436
1437static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1438 void *opaque)
1439{
1440 const hwaddr size = int128_get64(section->size);
1441 const hwaddr iova = section->offset_within_address_space;
1442 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1443 section->offset_within_region;
1444 VFIORamDiscardListener *vrdl = opaque;
1445
1446 /*
1447 * Sync the whole mapped region (spanning multiple individual mappings)
1448 * in one go.
1449 */
1450 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1451}
1452
1453static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1454 MemoryRegionSection *section)
1455{
1456 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1457 VFIORamDiscardListener *vrdl = NULL;
1458
1459 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1460 if (vrdl->mr == section->mr &&
1461 vrdl->offset_within_address_space ==
1462 section->offset_within_address_space) {
1463 break;
1464 }
1465 }
1466
1467 if (!vrdl) {
1468 hw_error("vfio: Trying to sync missing RAM discard listener");
1469 }
1470
1471 /*
1472 * We only want/can synchronize the bitmap for actually mapped parts -
1473 * which correspond to populated parts. Replay all populated parts.
1474 */
1475 return ram_discard_manager_replay_populated(rdm, section,
1476 vfio_ram_discard_get_dirty_bitmap,
1477 &vrdl);
1478}
1479
1480static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1481 MemoryRegionSection *section)
1482{
1483 ram_addr_t ram_addr;
1484
1485 if (memory_region_is_iommu(section->mr)) {
1486 VFIOGuestIOMMU *giommu;
1487
1488 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1489 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1490 giommu->n.start == section->offset_within_region) {
1491 Int128 llend;
1492 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
1493 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
1494 MEMTXATTRS_UNSPECIFIED);
1495
1496 llend = int128_add(int128_make64(section->offset_within_region),
1497 section->size);
1498 llend = int128_sub(llend, int128_one());
1499
1500 iommu_notifier_init(&gdn.n,
1501 vfio_iommu_map_dirty_notify,
1502 IOMMU_NOTIFIER_MAP,
1503 section->offset_within_region,
1504 int128_get64(llend),
1505 idx);
1506 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1507 break;
1508 }
1509 }
1510 return 0;
1511 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1512 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
1513 }
1514
1515 ram_addr = memory_region_get_ram_addr(section->mr) +
1516 section->offset_within_region;
1517
1518 return vfio_get_dirty_bitmap(container,
1519 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1520 int128_get64(section->size), ram_addr);
1521}
1522
1523static void vfio_listener_log_sync(MemoryListener *listener,
1524 MemoryRegionSection *section)
1525{
1526 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1527 int ret;
1528
1529 if (vfio_listener_skipped_section(section)) {
1530 return;
1531 }
1532
1533 if (vfio_devices_all_dirty_tracking(container)) {
1534 ret = vfio_sync_dirty_bitmap(container, section);
1535 if (ret) {
1536 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
1537 strerror(-ret));
1538 vfio_set_migration_error(ret);
1539 }
1540 }
1541}
1542
1543static const MemoryListener vfio_memory_listener = {
1544 .name = "vfio",
1545 .region_add = vfio_listener_region_add,
1546 .region_del = vfio_listener_region_del,
1547 .log_global_start = vfio_listener_log_global_start,
1548 .log_global_stop = vfio_listener_log_global_stop,
1549 .log_sync = vfio_listener_log_sync,
1550};
1551
1552static void vfio_listener_release(VFIOContainer *container)
1553{
1554 memory_listener_unregister(&container->listener);
1555 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1556 memory_listener_unregister(&container->prereg_listener);
1557 }
1558}
1559
1560static struct vfio_info_cap_header *
1561vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
b53b0f69
AW
1562{
1563 struct vfio_info_cap_header *hdr;
b53b0f69 1564
3ab7a0b4 1565 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
b53b0f69
AW
1566 if (hdr->id == id) {
1567 return hdr;
1568 }
1569 }
1570
1571 return NULL;
1572}
1573
1574struct vfio_info_cap_header *
1575vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1576{
1577 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1578 return NULL;
1579 }
1580
1581 return vfio_get_cap((void *)info, info->cap_offset, id);
1582}
1583
1584static struct vfio_info_cap_header *
1585vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1586{
1587 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1588 return NULL;
1589 }
1590
1591 return vfio_get_cap((void *)info, info->cap_offset, id);
1592}
1593
1594struct vfio_info_cap_header *
1595vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1596{
1597 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1598 return NULL;
1599 }
1600
1601 return vfio_get_cap((void *)info, info->cap_offset, id);
1602}
1603
1604bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1605 unsigned int *avail)
1606{
1607 struct vfio_info_cap_header *hdr;
1608 struct vfio_iommu_type1_info_dma_avail *cap;
1609
1610 /* If the capability cannot be found, assume no DMA limiting */
1611 hdr = vfio_get_iommu_type1_info_cap(info,
1612 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1613 if (hdr == NULL) {
1614 return false;
1615 }
1616
1617 if (avail != NULL) {
1618 cap = (void *) hdr;
1619 *avail = cap->avail;
1620 }
1621
1622 return true;
1623}
1624
1625static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1626 struct vfio_region_info *info)
1627{
1628 struct vfio_info_cap_header *hdr;
1629 struct vfio_region_info_cap_sparse_mmap *sparse;
1630 int i, j;
1631
1632 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1633 if (!hdr) {
1634 return -ENODEV;
1635 }
1636
1637 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1638
1639 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1640 region->nr, sparse->nr_areas);
1641
1642 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1643
1644 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
1645 if (sparse->areas[i].size) {
1646 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1647 sparse->areas[i].offset +
1648 sparse->areas[i].size - 1);
1649 region->mmaps[j].offset = sparse->areas[i].offset;
1650 region->mmaps[j].size = sparse->areas[i].size;
1651 j++;
1652 }
1653 }
1654
1655 region->nr_mmaps = j;
1656 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1657
1658 return 0;
1659}
1660
1661int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1662 int index, const char *name)
1663{
1664 struct vfio_region_info *info;
1665 int ret;
1666
1667 ret = vfio_get_region_info(vbasedev, index, &info);
1668 if (ret) {
1669 return ret;
1670 }
1671
1672 region->vbasedev = vbasedev;
1673 region->flags = info->flags;
1674 region->size = info->size;
1675 region->fd_offset = info->offset;
1676 region->nr = index;
1677
1678 if (region->size) {
1679 region->mem = g_new0(MemoryRegion, 1);
1680 memory_region_init_io(region->mem, obj, &vfio_region_ops,
1681 region, name, region->size);
1682
1683 if (!vbasedev->no_mmap &&
1684 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
1685
1686 ret = vfio_setup_region_sparse_mmaps(region, info);
1687
1688 if (ret) {
1689 region->nr_mmaps = 1;
1690 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1691 region->mmaps[0].offset = 0;
1692 region->mmaps[0].size = region->size;
1693 }
1694 }
1695 }
1696
1697 g_free(info);
1698
1699 trace_vfio_region_setup(vbasedev->name, index, name,
1700 region->flags, region->fd_offset, region->size);
1701 return 0;
1702}
1703
1704static void vfio_subregion_unmap(VFIORegion *region, int index)
1705{
1706 trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
1707 region->mmaps[index].offset,
1708 region->mmaps[index].offset +
1709 region->mmaps[index].size - 1);
1710 memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
1711 munmap(region->mmaps[index].mmap, region->mmaps[index].size);
1712 object_unparent(OBJECT(&region->mmaps[index].mem));
1713 region->mmaps[index].mmap = NULL;
1714}
1715
db0da029
AW
1716int vfio_region_mmap(VFIORegion *region)
1717{
1718 int i, prot = 0;
1719 char *name;
1720
1721 if (!region->mem) {
1722 return 0;
1723 }
1724
1725 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1726 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1727
1728 for (i = 0; i < region->nr_mmaps; i++) {
1729 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1730 MAP_SHARED, region->vbasedev->fd,
1731 region->fd_offset +
1732 region->mmaps[i].offset);
1733 if (region->mmaps[i].mmap == MAP_FAILED) {
1734 int ret = -errno;
1735
1736 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1737 region->fd_offset +
1738 region->mmaps[i].offset,
1739 region->fd_offset +
1740 region->mmaps[i].offset +
1741 region->mmaps[i].size - 1, ret);
1742
1743 region->mmaps[i].mmap = NULL;
1744
1745 for (i--; i >= 0; i--) {
0f7a903b 1746 vfio_subregion_unmap(region, i);
db0da029
AW
1747 }
1748
1749 return ret;
e2c7d025
EA
1750 }
1751
db0da029
AW
1752 name = g_strdup_printf("%s mmaps[%d]",
1753 memory_region_name(region->mem), i);
21e00fa5
AW
1754 memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
1755 memory_region_owner(region->mem),
1756 name, region->mmaps[i].size,
1757 region->mmaps[i].mmap);
db0da029 1758 g_free(name);
db0da029
AW
1759 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
1760 &region->mmaps[i].mem);
1761
1762 trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
1763 region->mmaps[i].offset,
1764 region->mmaps[i].offset +
1765 region->mmaps[i].size - 1);
1766 }
1767
1768 return 0;
1769}
1770
0f7a903b
KW
1771void vfio_region_unmap(VFIORegion *region)
1772{
1773 int i;
1774
1775 if (!region->mem) {
1776 return;
1777 }
1778
1779 for (i = 0; i < region->nr_mmaps; i++) {
1780 if (region->mmaps[i].mmap) {
1781 vfio_subregion_unmap(region, i);
1782 }
1783 }
1784}
1785
db0da029
AW
1786void vfio_region_exit(VFIORegion *region)
1787{
1788 int i;
1789
1790 if (!region->mem) {
1791 return;
1792 }
1793
1794 for (i = 0; i < region->nr_mmaps; i++) {
1795 if (region->mmaps[i].mmap) {
1796 memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
e2c7d025 1797 }
db0da029 1798 }
e2c7d025 1799
db0da029
AW
1800 trace_vfio_region_exit(region->vbasedev->name, region->nr);
1801}
1802
1803void vfio_region_finalize(VFIORegion *region)
1804{
1805 int i;
1806
1807 if (!region->mem) {
1808 return;
e2c7d025
EA
1809 }
1810
db0da029
AW
1811 for (i = 0; i < region->nr_mmaps; i++) {
1812 if (region->mmaps[i].mmap) {
1813 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
1814 object_unparent(OBJECT(&region->mmaps[i].mem));
1815 }
1816 }
1817
1818 object_unparent(OBJECT(region->mem));
1819
1820 g_free(region->mem);
1821 g_free(region->mmaps);
1822
1823 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
92f86bff
GH
1824
1825 region->mem = NULL;
1826 region->mmaps = NULL;
1827 region->nr_mmaps = 0;
1828 region->size = 0;
1829 region->flags = 0;
1830 region->nr = 0;
db0da029
AW
1831}
1832
1833void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
1834{
1835 int i;
1836
1837 if (!region->mem) {
1838 return;
1839 }
1840
1841 for (i = 0; i < region->nr_mmaps; i++) {
1842 if (region->mmaps[i].mmap) {
1843 memory_region_set_enabled(&region->mmaps[i].mem, enabled);
1844 }
1845 }
e2c7d025 1846
db0da029
AW
1847 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
1848 enabled);
e2c7d025
EA
1849}
1850
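/*
 * A minimal usage sketch for the VFIORegion helpers above, assuming a
 * hypothetical caller; "obj", "vdev" and the index/name arguments below
 * are illustrative and not taken from this file:
 *
 *     VFIORegion *region = g_new0(VFIORegion, 1);
 *
 *     if (vfio_region_setup(OBJECT(obj), &vdev->vbasedev, region, 0,
 *                           "example-region")) {
 *         // error; note that region->mem stays NULL for zero-sized regions
 *     }
 *
 *     if (vfio_region_mmap(region)) {
 *         // fall back to trapped access through vfio_region_ops
 *     }
 *
 *     vfio_region_mmaps_set_enabled(region, false);   // e.g. around a reset
 *     vfio_region_mmaps_set_enabled(region, true);
 *
 *     vfio_region_exit(region);      // drop mmap subregions from region->mem
 *     vfio_region_finalize(region);  // munmap and free the backing objects
 *     g_free(region);
 */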
1851void vfio_reset_handler(void *opaque)
1852{
1853 VFIOGroup *group;
1854 VFIODevice *vbasedev;
1855
1856 QLIST_FOREACH(group, &vfio_group_list, next) {
1857 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2
AW
1858 if (vbasedev->dev->realized) {
1859 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1860 }
e2c7d025
EA
1861 }
1862 }
1863
1864 QLIST_FOREACH(group, &vfio_group_list, next) {
1865 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2 1866 if (vbasedev->dev->realized && vbasedev->needs_reset) {
e2c7d025
EA
1867 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1868 }
1869 }
1870 }
1871}
1872
1873static void vfio_kvm_device_add_group(VFIOGroup *group)
1874{
1875#ifdef CONFIG_KVM
1876 struct kvm_device_attr attr = {
1877 .group = KVM_DEV_VFIO_GROUP,
1878 .attr = KVM_DEV_VFIO_GROUP_ADD,
1879 .addr = (uint64_t)(unsigned long)&group->fd,
1880 };
1881
1882 if (!kvm_enabled()) {
1883 return;
1884 }
1885
1886 if (vfio_kvm_device_fd < 0) {
1887 struct kvm_create_device cd = {
1888 .type = KVM_DEV_TYPE_VFIO,
1889 };
1890
1891 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
78e5b17f 1892 error_report("Failed to create KVM VFIO device: %m");
e2c7d025
EA
1893 return;
1894 }
1895
1896 vfio_kvm_device_fd = cd.fd;
1897 }
1898
1899 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1900 error_report("Failed to add group %d to KVM VFIO device: %m",
1901 group->groupid);
1902 }
1903#endif
1904}
1905
1906static void vfio_kvm_device_del_group(VFIOGroup *group)
1907{
1908#ifdef CONFIG_KVM
1909 struct kvm_device_attr attr = {
1910 .group = KVM_DEV_VFIO_GROUP,
1911 .attr = KVM_DEV_VFIO_GROUP_DEL,
1912 .addr = (uint64_t)(unsigned long)&group->fd,
1913 };
1914
1915 if (vfio_kvm_device_fd < 0) {
1916 return;
1917 }
1918
1919 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1920 error_report("Failed to remove group %d from KVM VFIO device: %m",
1921 group->groupid);
1922 }
1923#endif
1924}
1925
1926static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1927{
1928 VFIOAddressSpace *space;
1929
1930 QLIST_FOREACH(space, &vfio_address_spaces, list) {
1931 if (space->as == as) {
1932 return space;
1933 }
1934 }
1935
1936 /* No suitable VFIOAddressSpace, create a new one */
1937 space = g_malloc0(sizeof(*space));
1938 space->as = as;
1939 QLIST_INIT(&space->containers);
1940
1941 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1942
1943 return space;
1944}
1945
1946static void vfio_put_address_space(VFIOAddressSpace *space)
1947{
1948 if (QLIST_EMPTY(&space->containers)) {
1949 QLIST_REMOVE(space, list);
1950 g_free(space);
1951 }
1952}
1953
2b6326c0
EA
1954/*
1955 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
1956 */
1957static int vfio_get_iommu_type(VFIOContainer *container,
1958 Error **errp)
1959{
1960 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1961 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1962 int i;
1963
1964 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1965 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1966 return iommu_types[i];
1967 }
1968 }
1969 error_setg(errp, "No available IOMMU models");
1970 return -EINVAL;
1971}
1972
1973static int vfio_init_container(VFIOContainer *container, int group_fd,
1974 Error **errp)
1975{
1976 int iommu_type, ret;
1977
1978 iommu_type = vfio_get_iommu_type(container, errp);
1979 if (iommu_type < 0) {
1980 return iommu_type;
1981 }
1982
1983 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
1984 if (ret) {
1985 error_setg_errno(errp, errno, "Failed to set group container");
1986 return -errno;
1987 }
1988
1989 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
1990 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1991 /*
1992                  * On sPAPR, although the IOMMU subdriver always advertises v1 and
1993                  * v2, the running platform may not support v2, and there is no way
1994                  * to tell until an IOMMU group is added to the container.
1995                  * So if setting v2 fails, fall back to v1.
1996 */
1997 iommu_type = VFIO_SPAPR_TCE_IOMMU;
1998 continue;
1999 }
2000 error_setg_errno(errp, errno, "Failed to set iommu for container");
2001 return -errno;
2002 }
2003
2004 container->iommu_type = iommu_type;
2005 return 0;
2006}
2007
87ea529c
KW
2008static int vfio_get_iommu_info(VFIOContainer *container,
2009 struct vfio_iommu_type1_info **info)
2010{
2011
2012 size_t argsz = sizeof(struct vfio_iommu_type1_info);
2013
2014 *info = g_new0(struct vfio_iommu_type1_info, 1);
2015again:
2016 (*info)->argsz = argsz;
2017
2018 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
2019 g_free(*info);
2020 *info = NULL;
2021 return -errno;
2022 }
2023
2024     if ((*info)->argsz > argsz) {
2025 argsz = (*info)->argsz;
2026 *info = g_realloc(*info, argsz);
2027 goto again;
2028 }
2029
2030 return 0;
2031}
2032
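/*
 * VFIO_IOMMU_GET_INFO follows the usual VFIO argsz protocol: userspace
 * passes a buffer with argsz set to its size and the kernel writes back
 * the size it actually needs, which grows when capability chains are
 * present.  vfio_get_iommu_info() therefore reallocates and retries until
 * the buffer is large enough; on success the caller owns *info and must
 * g_free() it.
 */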
2033static struct vfio_info_cap_header *
2034vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
2035{
2036 struct vfio_info_cap_header *hdr;
2037 void *ptr = info;
2038
2039 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
2040 return NULL;
2041 }
2042
2043 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
2044 if (hdr->id == id) {
2045 return hdr;
2046 }
2047 }
2048
2049 return NULL;
2050}
2051
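/*
 * The capability chain walked above is a list of vfio_info_cap_header
 * entries linked by byte offsets relative to the start of the info
 * structure.  The header is declared in linux/vfio.h along these lines
 * (sketch, not a verbatim copy):
 *
 *     struct vfio_info_cap_header {
 *         __u16 id;       // capability ID, e.g. VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION
 *         __u16 version;
 *         __u32 next;     // byte offset of the next capability, 0 ends the chain
 *     };
 *
 * A next offset of zero makes hdr wrap back to ptr, which is what
 * terminates the loop in vfio_get_iommu_info_cap().
 */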
2052static void vfio_get_iommu_info_migration(VFIOContainer *container,
2053 struct vfio_iommu_type1_info *info)
2054{
2055 struct vfio_info_cap_header *hdr;
2056 struct vfio_iommu_type1_info_cap_migration *cap_mig;
2057
2058 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
2059 if (!hdr) {
2060 return;
2061 }
2062
2063 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
2064 header);
2065
2066 /*
1eb7f642
KJ
2067      * cpu_physical_memory_set_dirty_lebitmap() expects pages in the dirty
2068      * bitmap to be tracked at qemu_real_host_page_size() granularity.
87ea529c 2069 */
8e3b0cbb 2070 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
87ea529c
KW
2071 container->dirty_pages_supported = true;
2072 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
2073 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
2074 }
2075}
2076
01905f58
EA
2077static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
2078 Error **errp)
e2c7d025
EA
2079{
2080 VFIOContainer *container;
2081 int ret, fd;
2082 VFIOAddressSpace *space;
2083
2084 space = vfio_get_address_space(as);
2085
c65ee433 2086 /*
aff92b82 2087 * VFIO is currently incompatible with discarding of RAM insofar as the
c65ee433
AW
2088 * madvise to purge (zap) the page from QEMU's address space does not
2089 * interact with the memory API and therefore leaves stale virtual to
2090 * physical mappings in the IOMMU if the page was previously pinned. We
aff92b82 2091 * therefore set discarding broken for each group added to a container,
c65ee433
AW
2092 * whether the container is used individually or shared. This provides
2093 * us with options to allow devices within a group to opt-in and allow
aff92b82 2094 * discarding, so long as it is done consistently for a group (for instance
c65ee433
AW
2095 * if the device is an mdev device where it is known that the host vendor
2096 * driver will never pin pages outside of the working set of the guest
aff92b82 2097 * driver, which would thus not be discarding candidates).
c65ee433
AW
2098 *
2099 * The first opportunity to induce pinning occurs here where we attempt to
2100 * attach the group to existing containers within the AddressSpace. If any
aff92b82
DH
2101 * pages are already zapped from the virtual address space, such as from
2102 * previous discards, new pinning will cause valid mappings to be
c65ee433
AW
2103 * re-established. Likewise, when the overall MemoryListener for a new
2104 * container is registered, a replay of mappings within the AddressSpace
2105 * will occur, re-establishing any previously zapped pages as well.
2106 *
aff92b82
DH
2107      * In particular, virtio-balloon is currently only prevented from
2108      * discarding new memory; it does not yet set
2109      * ram_block_discard_set_required() and therefore neither stops us here
2110      * nor deals with the sudden memory consumption of inflated memory.
53d1b5fc
DH
2111 *
2112 * We do support discarding of memory coordinated via the RamDiscardManager
2113 * with some IOMMU types. vfio_ram_block_discard_disable() handles the
2114 * details once we know which type of IOMMU we are using.
c65ee433 2115 */
c65ee433 2116
e2c7d025
EA
2117 QLIST_FOREACH(container, &space->containers, next) {
2118 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
53d1b5fc
DH
2119 ret = vfio_ram_block_discard_disable(container, true);
2120 if (ret) {
2121 error_setg_errno(errp, -ret,
2122 "Cannot set discarding of RAM broken");
2123 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2124 &container->fd)) {
2125 error_report("vfio: error disconnecting group %d from"
2126 " container", group->groupid);
2127 }
2128 return ret;
2129 }
e2c7d025
EA
2130 group->container = container;
2131 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2016986a 2132 vfio_kvm_device_add_group(group);
e2c7d025
EA
2133 return 0;
2134 }
2135 }
2136
448058aa 2137 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
e2c7d025 2138 if (fd < 0) {
01905f58 2139 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
e2c7d025
EA
2140 ret = -errno;
2141 goto put_space_exit;
2142 }
2143
2144 ret = ioctl(fd, VFIO_GET_API_VERSION);
2145 if (ret != VFIO_API_VERSION) {
01905f58
EA
2146 error_setg(errp, "supported vfio version: %d, "
2147 "reported version: %d", VFIO_API_VERSION, ret);
e2c7d025
EA
2148 ret = -EINVAL;
2149 goto close_fd_exit;
2150 }
2151
2152 container = g_malloc0(sizeof(*container));
2153 container->space = space;
2154 container->fd = fd;
d7d87836 2155 container->error = NULL;
87ea529c 2156 container->dirty_pages_supported = false;
3eed155c 2157 container->dma_max_mappings = 0;
f7f9c7b2
LY
2158 QLIST_INIT(&container->giommu_list);
2159 QLIST_INIT(&container->hostwin_list);
5e3b981c 2160 QLIST_INIT(&container->vrdl_list);
2e6e697e 2161
2b6326c0
EA
2162 ret = vfio_init_container(container, group->fd, errp);
2163 if (ret) {
2164 goto free_container_exit;
2165 }
e2c7d025 2166
53d1b5fc
DH
2167 ret = vfio_ram_block_discard_disable(container, true);
2168 if (ret) {
2169 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2170 goto free_container_exit;
2171 }
2172
2b6326c0
EA
2173 switch (container->iommu_type) {
2174 case VFIO_TYPE1v2_IOMMU:
2175 case VFIO_TYPE1_IOMMU:
2176 {
87ea529c 2177 struct vfio_iommu_type1_info *info;
3898aad3 2178
87ea529c 2179 ret = vfio_get_iommu_info(container, &info);
85b6d2b5
AW
2180 if (ret) {
2181 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2182 goto enable_discards_exit;
2183 }
87ea529c 2184
85b6d2b5
AW
2185 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2186 container->pgsizes = info->iova_pgsizes;
2187 } else {
2188 container->pgsizes = qemu_real_host_page_size();
87ea529c 2189 }
85b6d2b5
AW
2190
2191 if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2192 container->dma_max_mappings = 65535;
7a140a57 2193 }
85b6d2b5 2194 vfio_get_iommu_info_migration(container, info);
87ea529c 2195 g_free(info);
85b6d2b5
AW
2196
2197 /*
2198 * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
2199 * information to get the actual window extent rather than assume
2200 * a 64-bit IOVA address space.
2201 */
2202 vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2203
2b6326c0
EA
2204 break;
2205 }
2206 case VFIO_SPAPR_TCE_v2_IOMMU:
2207 case VFIO_SPAPR_TCE_IOMMU:
2208 {
3898aad3 2209 struct vfio_iommu_spapr_tce_info info;
2b6326c0 2210 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
e2c7d025
EA
2211
2212 /*
2213          * The host kernel code implementing VFIO_IOMMU_DISABLE is called
2214          * when the container fd is closed, so we do not call it explicitly
2215          * in this file.
2216 */
318f67ce
AK
2217 if (!v2) {
2218 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2219 if (ret) {
01905f58 2220 error_setg_errno(errp, errno, "failed to enable container");
318f67ce 2221 ret = -errno;
53d1b5fc 2222 goto enable_discards_exit;
318f67ce
AK
2223 }
2224 } else {
2225 container->prereg_listener = vfio_prereg_listener;
2226
2227 memory_listener_register(&container->prereg_listener,
2228 &address_space_memory);
2229 if (container->error) {
2230 memory_listener_unregister(&container->prereg_listener);
d7d87836
EA
2231 ret = -1;
2232 error_propagate_prepend(errp, container->error,
2233 "RAM memory listener initialization failed: ");
53d1b5fc 2234 goto enable_discards_exit;
318f67ce 2235 }
e2c7d025 2236 }
3898aad3 2237
3898aad3
DG
2238 info.argsz = sizeof(info);
2239 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2240 if (ret) {
01905f58
EA
2241 error_setg_errno(errp, errno,
2242 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
3898aad3 2243 ret = -errno;
318f67ce
AK
2244 if (v2) {
2245 memory_listener_unregister(&container->prereg_listener);
2246 }
53d1b5fc 2247 goto enable_discards_exit;
3898aad3 2248 }
7a140a57 2249
2e4109de 2250 if (v2) {
c26bc185 2251 container->pgsizes = info.ddw.pgsizes;
2e4109de
AK
2252 /*
2253              * A freshly created container comes with a default DMA
2254              * window.  To keep region_add/del simple, remove that window
2255              * now and let the iommu_listener callbacks create and remove
2256              * windows when they are actually needed.
2257 */
2258 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2259 if (ret) {
01905f58
EA
2260 error_setg_errno(errp, -ret,
2261 "failed to remove existing window");
53d1b5fc 2262 goto enable_discards_exit;
2e4109de
AK
2263 }
2264 } else {
2265 /* The default table uses 4K pages */
c26bc185 2266 container->pgsizes = 0x1000;
2e4109de
AK
2267 vfio_host_win_add(container, info.dma32_window_start,
2268 info.dma32_window_start +
2269 info.dma32_window_size - 1,
2270 0x1000);
2271 }
2b6326c0 2272 }
e2c7d025
EA
2273 }
2274
8c37faa4
AK
2275 vfio_kvm_device_add_group(group);
2276
2277 QLIST_INIT(&container->group_list);
2278 QLIST_INSERT_HEAD(&space->containers, container, next);
2279
2280 group->container = container;
2281 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2282
ee0bf0e5
DG
2283 container->listener = vfio_memory_listener;
2284
2285 memory_listener_register(&container->listener, container->space->as);
2286
2287 if (container->error) {
d7d87836
EA
2288 ret = -1;
2289 error_propagate_prepend(errp, container->error,
2290 "memory listener initialization failed: ");
ee0bf0e5
DG
2291 goto listener_release_exit;
2292 }
2293
2294 container->initialized = true;
2295
e2c7d025
EA
2296 return 0;
2297listener_release_exit:
8c37faa4
AK
2298 QLIST_REMOVE(group, container_next);
2299 QLIST_REMOVE(container, next);
2300 vfio_kvm_device_del_group(group);
e2c7d025
EA
2301 vfio_listener_release(container);
2302
53d1b5fc
DH
2303enable_discards_exit:
2304 vfio_ram_block_discard_disable(container, false);
2305
e2c7d025
EA
2306free_container_exit:
2307 g_free(container);
2308
2309close_fd_exit:
2310 close(fd);
2311
2312put_space_exit:
2313 vfio_put_address_space(space);
2314
2315 return ret;
2316}
2317
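/*
 * Stripped of error handling and the sPAPR specifics, the container
 * bring-up above boils down to the following ioctl sequence; this is an
 * illustrative sketch of the type1 path only, with "group_fd" standing in
 * for the group file descriptor:
 *
 *     int fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
 *
 *     if (ioctl(fd, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
 *         // unsupported kernel
 *     }
 *     if (!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
 *         // fall back to another IOMMU type
 *     }
 *
 *     ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &fd);   // attach the group
 *     ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);    // select the model
 *
 *     struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *     ioctl(fd, VFIO_IOMMU_GET_INFO, &info);            // page sizes, caps
 *
 * Only after VFIO_SET_IOMMU succeeds is the memory listener registered,
 * which replays the AddressSpace and establishes the DMA mappings.
 */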
2318static void vfio_disconnect_container(VFIOGroup *group)
2319{
2320 VFIOContainer *container = group->container;
2321
36968626
PX
2322 QLIST_REMOVE(group, container_next);
2323 group->container = NULL;
2324
2325 /*
2326      * Explicitly release the listener before unsetting the container,
2327      * since the unset may destroy the backend container if this is the
2328      * last group.
2329 */
2330 if (QLIST_EMPTY(&container->group_list)) {
2331 vfio_listener_release(container);
2332 }
2333
e2c7d025
EA
2334 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2335 error_report("vfio: error disconnecting group %d from container",
2336 group->groupid);
2337 }
2338
e2c7d025
EA
2339 if (QLIST_EMPTY(&container->group_list)) {
2340 VFIOAddressSpace *space = container->space;
f8d8a944 2341 VFIOGuestIOMMU *giommu, *tmp;
f3bc3a73 2342 VFIOHostDMAWindow *hostwin, *next;
e2c7d025 2343
e2c7d025 2344 QLIST_REMOVE(container, next);
f8d8a944
AK
2345
2346 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
3df9d748 2347 memory_region_unregister_iommu_notifier(
44ee6aaa 2348 MEMORY_REGION(giommu->iommu_mr), &giommu->n);
f8d8a944
AK
2349 QLIST_REMOVE(giommu, giommu_next);
2350 g_free(giommu);
2351 }
2352
f3bc3a73
PL
2353 QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
2354 next) {
2355 QLIST_REMOVE(hostwin, hostwin_next);
2356 g_free(hostwin);
2357 }
2358
e2c7d025
EA
2359 trace_vfio_disconnect_container(container->fd);
2360 close(container->fd);
2361 g_free(container);
2362
2363 vfio_put_address_space(space);
2364 }
2365}
2366
1b808d5b 2367VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
e2c7d025
EA
2368{
2369 VFIOGroup *group;
2370 char path[32];
2371 struct vfio_group_status status = { .argsz = sizeof(status) };
2372
2373 QLIST_FOREACH(group, &vfio_group_list, next) {
2374 if (group->groupid == groupid) {
2375 /* Found it. Now is it already in the right context? */
2376 if (group->container->space->as == as) {
2377 return group;
2378 } else {
1b808d5b
EA
2379 error_setg(errp, "group %d used in multiple address spaces",
2380 group->groupid);
e2c7d025
EA
2381 return NULL;
2382 }
2383 }
2384 }
2385
2386 group = g_malloc0(sizeof(*group));
2387
2388 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
448058aa 2389 group->fd = qemu_open_old(path, O_RDWR);
e2c7d025 2390 if (group->fd < 0) {
1b808d5b 2391 error_setg_errno(errp, errno, "failed to open %s", path);
e2c7d025
EA
2392 goto free_group_exit;
2393 }
2394
2395 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
1b808d5b 2396 error_setg_errno(errp, errno, "failed to get group %d status", groupid);
e2c7d025
EA
2397 goto close_fd_exit;
2398 }
2399
2400 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
1b808d5b
EA
2401 error_setg(errp, "group %d is not viable", groupid);
2402 error_append_hint(errp,
2403 "Please ensure all devices within the iommu_group "
2404 "are bound to their vfio bus driver.\n");
e2c7d025
EA
2405 goto close_fd_exit;
2406 }
2407
2408 group->groupid = groupid;
2409 QLIST_INIT(&group->device_list);
2410
1b808d5b
EA
2411 if (vfio_connect_container(group, as, errp)) {
2412 error_prepend(errp, "failed to setup container for group %d: ",
2413 groupid);
e2c7d025
EA
2414 goto close_fd_exit;
2415 }
2416
2417 if (QLIST_EMPTY(&vfio_group_list)) {
2418 qemu_register_reset(vfio_reset_handler, NULL);
2419 }
2420
2421 QLIST_INSERT_HEAD(&vfio_group_list, group, next);
2422
e2c7d025
EA
2423 return group;
2424
2425close_fd_exit:
2426 close(group->fd);
2427
2428free_group_exit:
2429 g_free(group);
2430
2431 return NULL;
2432}
2433
2434void vfio_put_group(VFIOGroup *group)
2435{
77a10d04 2436 if (!group || !QLIST_EMPTY(&group->device_list)) {
e2c7d025
EA
2437 return;
2438 }
2439
aff92b82 2440 if (!group->ram_block_discard_allowed) {
53d1b5fc 2441 vfio_ram_block_discard_disable(group->container, false);
238e9172 2442 }
e2c7d025
EA
2443 vfio_kvm_device_del_group(group);
2444 vfio_disconnect_container(group);
2445 QLIST_REMOVE(group, next);
2446 trace_vfio_put_group(group->fd);
2447 close(group->fd);
2448 g_free(group);
2449
2450 if (QLIST_EMPTY(&vfio_group_list)) {
2451 qemu_unregister_reset(vfio_reset_handler, NULL);
2452 }
2453}
2454
2455int vfio_get_device(VFIOGroup *group, const char *name,
59f7d674 2456 VFIODevice *vbasedev, Error **errp)
e2c7d025
EA
2457{
2458 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
217e9fdc 2459 int ret, fd;
e2c7d025 2460
217e9fdc
PB
2461 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2462 if (fd < 0) {
59f7d674
EA
2463 error_setg_errno(errp, errno, "error getting device from group %d",
2464 group->groupid);
2465 error_append_hint(errp,
2466 "Verify all devices in group %d are bound to vfio-<bus> "
2467 "or pci-stub and not already in use\n", group->groupid);
217e9fdc 2468 return fd;
e2c7d025
EA
2469 }
2470
217e9fdc 2471 ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
e2c7d025 2472 if (ret) {
59f7d674 2473 error_setg_errno(errp, errno, "error getting device info");
217e9fdc
PB
2474 close(fd);
2475 return ret;
e2c7d025
EA
2476 }
2477
238e9172 2478 /*
aff92b82
DH
2479 * Set discarding of RAM as not broken for this group if the driver knows
2480 * the device operates compatibly with discarding. Setting must be
2481 * consistent per group, but since compatibility is really only possible
2482 * with mdev currently, we expect singleton groups.
238e9172 2483 */
aff92b82
DH
2484 if (vbasedev->ram_block_discard_allowed !=
2485 group->ram_block_discard_allowed) {
238e9172 2486 if (!QLIST_EMPTY(&group->device_list)) {
aff92b82
DH
2487 error_setg(errp, "Inconsistent setting of support for discarding "
2488 "RAM (e.g., balloon) within group");
8709b395 2489 close(fd);
238e9172
AW
2490 return -1;
2491 }
2492
aff92b82
DH
2493 if (!group->ram_block_discard_allowed) {
2494 group->ram_block_discard_allowed = true;
53d1b5fc 2495 vfio_ram_block_discard_disable(group->container, false);
238e9172
AW
2496 }
2497 }
2498
217e9fdc
PB
2499 vbasedev->fd = fd;
2500 vbasedev->group = group;
2501 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
2502
e2c7d025
EA
2503 vbasedev->num_irqs = dev_info.num_irqs;
2504 vbasedev->num_regions = dev_info.num_regions;
2505 vbasedev->flags = dev_info.flags;
2506
2507 trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
2508 dev_info.num_irqs);
2509
2510 vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
217e9fdc 2511 return 0;
e2c7d025
EA
2512}
2513
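/*
 * A hypothetical device backend would typically combine the helpers above
 * as follows; the group id, device name and "vdev" embedding are
 * illustrative (real callers derive the group from the device's sysfs
 * iommu_group link):
 *
 *     VFIOGroup *group = vfio_get_group(groupid, &address_space_memory, errp);
 *     if (!group) {
 *         // no viable group, or container setup failed
 *     }
 *
 *     if (vfio_get_device(group, "0000:01:00.0", &vdev->vbasedev, errp)) {
 *         vfio_put_group(group);
 *         // error
 *     }
 *
 *     // ... use vbasedev->num_regions / num_irqs to populate the device ...
 *
 *     vfio_put_base_device(&vdev->vbasedev);
 *     vfio_put_group(group);
 */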
2514void vfio_put_base_device(VFIODevice *vbasedev)
2515{
77a10d04
PB
2516 if (!vbasedev->group) {
2517 return;
2518 }
e2c7d025
EA
2519 QLIST_REMOVE(vbasedev, next);
2520 vbasedev->group = NULL;
2521 trace_vfio_put_base_device(vbasedev->fd);
2522 close(vbasedev->fd);
2523}
2524
46900226
AW
2525int vfio_get_region_info(VFIODevice *vbasedev, int index,
2526 struct vfio_region_info **info)
2527{
2528 size_t argsz = sizeof(struct vfio_region_info);
2529
2530 *info = g_malloc0(argsz);
2531
2532 (*info)->index = index;
b53b0f69 2533retry:
46900226
AW
2534 (*info)->argsz = argsz;
2535
2536 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2537 g_free(*info);
e61a424f 2538 *info = NULL;
46900226
AW
2539 return -errno;
2540 }
2541
b53b0f69
AW
2542 if ((*info)->argsz > argsz) {
2543 argsz = (*info)->argsz;
2544 *info = g_realloc(*info, argsz);
2545
2546 goto retry;
2547 }
2548
46900226
AW
2549 return 0;
2550}
2551
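/*
 * On success the caller owns *info and must g_free() it; the buffer is
 * grown and the ioctl retried whenever the kernel reports a larger argsz,
 * which is how region capability chains are retrieved.
 * vfio_get_dev_region_info() and vfio_has_region_cap() below are typical
 * users of this helper.
 */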
e61a424f
AW
2552int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2553 uint32_t subtype, struct vfio_region_info **info)
2554{
2555 int i;
2556
2557 for (i = 0; i < vbasedev->num_regions; i++) {
2558 struct vfio_info_cap_header *hdr;
2559 struct vfio_region_info_cap_type *cap_type;
2560
2561 if (vfio_get_region_info(vbasedev, i, info)) {
2562 continue;
2563 }
2564
2565 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2566 if (!hdr) {
2567 g_free(*info);
2568 continue;
2569 }
2570
2571 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2572
2573 trace_vfio_get_dev_region(vbasedev->name, i,
2574 cap_type->type, cap_type->subtype);
2575
2576 if (cap_type->type == type && cap_type->subtype == subtype) {
2577 return 0;
2578 }
2579
2580 g_free(*info);
2581 }
2582
2583 *info = NULL;
2584 return -ENODEV;
2585}
2586
ae0215b2
AK
2587bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2588{
2589 struct vfio_region_info *info = NULL;
2590 bool ret = false;
2591
2592 if (!vfio_get_region_info(vbasedev, region, &info)) {
2593 if (vfio_get_region_info_cap(info, cap_type)) {
2594 ret = true;
2595 }
2596 g_free(info);
2597 }
2598
2599 return ret;
2600}
2601
3153119e
DG
2602/*
2603 * Interfaces for IBM EEH (Enhanced Error Handling)
2604 */
2605static bool vfio_eeh_container_ok(VFIOContainer *container)
2606{
2607 /*
2608 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
2609 * implementation is broken if there are multiple groups in a
2610 * container. The hardware works in units of Partitionable
2611 * Endpoints (== IOMMU groups) and the EEH operations naively
2612 * iterate across all groups in the container, without any logic
2613 * to make sure the groups have their state synchronized. For
2614 * certain operations (ENABLE) that might be ok, until an error
2615 * occurs, but for others (GET_STATE) it's clearly broken.
2616 */
2617
2618 /*
2619 * XXX Once fixed kernels exist, test for them here
2620 */
2621
2622 if (QLIST_EMPTY(&container->group_list)) {
2623 return false;
2624 }
2625
2626 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
2627 return false;
2628 }
2629
2630 return true;
2631}
2632
2633static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
2634{
2635 struct vfio_eeh_pe_op pe_op = {
2636 .argsz = sizeof(pe_op),
2637 .op = op,
2638 };
2639 int ret;
2640
2641 if (!vfio_eeh_container_ok(container)) {
2642 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
2643 "kernel requires a container with exactly one group", op);
2644 return -EPERM;
2645 }
2646
2647 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
2648 if (ret < 0) {
2649 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
2650 return -errno;
2651 }
2652
d917e88d 2653 return ret;
3153119e
DG
2654}
2655
2656static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
2657{
2658 VFIOAddressSpace *space = vfio_get_address_space(as);
2659 VFIOContainer *container = NULL;
2660
2661 if (QLIST_EMPTY(&space->containers)) {
2662 /* No containers to act on */
2663 goto out;
2664 }
2665
2666 container = QLIST_FIRST(&space->containers);
2667
2668 if (QLIST_NEXT(container, next)) {
2669 /* We don't yet have logic to synchronize EEH state across
2670 * multiple containers */
2671 container = NULL;
2672 goto out;
2673 }
2674
2675out:
2676 vfio_put_address_space(space);
2677 return container;
2678}
2679
2680bool vfio_eeh_as_ok(AddressSpace *as)
2681{
2682 VFIOContainer *container = vfio_eeh_as_container(as);
2683
2684 return (container != NULL) && vfio_eeh_container_ok(container);
2685}
2686
2687int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
2688{
2689 VFIOContainer *container = vfio_eeh_as_container(as);
2690
2691 if (!container) {
2692 return -ENODEV;
2693 }
2694 return vfio_eeh_container_op(container, op);
2695}
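/*
 * Callers on sPAPR (e.g. the spapr PCI host bridge code) are expected to
 * drive EEH through vfio_eeh_as_ok() / vfio_eeh_as_op() with one of the
 * VFIO_EEH_PE_* opcodes from linux/vfio.h, along these lines (illustrative
 * sketch):
 *
 *     if (vfio_eeh_as_ok(&address_space_memory)) {
 *         if (vfio_eeh_as_op(&address_space_memory, VFIO_EEH_PE_ENABLE) < 0) {
 *             // EEH could not be enabled for this address space
 *         }
 *     }
 */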