1/*
2 * generic functions used by VFIO devices
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
21#include "qemu/osdep.h"
22#include <sys/ioctl.h>
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
32#include "exec/ram_addr.h"
33#include "hw/hw.h"
34#include "qemu/error-report.h"
35#include "qemu/main-loop.h"
36#include "qemu/range.h"
37#include "sysemu/kvm.h"
38#include "sysemu/reset.h"
39#include "sysemu/runstate.h"
40#include "trace.h"
41#include "qapi/error.h"
42#include "migration/migration.h"
43#include "migration/misc.h"
44#include "migration/blocker.h"
45#include "migration/qemu-file.h"
46#include "sysemu/tpm.h"
47
48VFIOGroupList vfio_group_list =
49 QLIST_HEAD_INITIALIZER(vfio_group_list);
50static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
51 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
52
53#ifdef CONFIG_KVM
54/*
55 * We have a single VFIO pseudo device per KVM VM. Once created it lives
56 * for the life of the VM. Closing the file descriptor only drops our
57 * reference to it and the device's reference to kvm. Therefore once
58 * initialized, this file descriptor is only released on QEMU exit and
59 * we'll re-use it should another vfio device be attached before then.
60 */
61static int vfio_kvm_device_fd = -1;
62#endif
63
64/*
65 * Common VFIO interrupt disable
66 */
67void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
68{
69 struct vfio_irq_set irq_set = {
70 .argsz = sizeof(irq_set),
71 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
72 .index = index,
73 .start = 0,
74 .count = 0,
75 };
76
77 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
78}
79
80void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
81{
82 struct vfio_irq_set irq_set = {
83 .argsz = sizeof(irq_set),
84 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
85 .index = index,
86 .start = 0,
87 .count = 1,
88 };
89
90 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
91}
92
93void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
94{
95 struct vfio_irq_set irq_set = {
96 .argsz = sizeof(irq_set),
97 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
98 .index = index,
99 .start = 0,
100 .count = 1,
101 };
102
103 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
104}
105
106static inline const char *action_to_str(int action)
107{
108 switch (action) {
109 case VFIO_IRQ_SET_ACTION_MASK:
110 return "MASK";
111 case VFIO_IRQ_SET_ACTION_UNMASK:
112 return "UNMASK";
113 case VFIO_IRQ_SET_ACTION_TRIGGER:
114 return "TRIGGER";
115 default:
116 return "UNKNOWN ACTION";
117 }
118}
119
120static const char *index_to_str(VFIODevice *vbasedev, int index)
121{
122 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
123 return NULL;
124 }
125
126 switch (index) {
127 case VFIO_PCI_INTX_IRQ_INDEX:
128 return "INTX";
129 case VFIO_PCI_MSI_IRQ_INDEX:
130 return "MSI";
131 case VFIO_PCI_MSIX_IRQ_INDEX:
132 return "MSIX";
133 case VFIO_PCI_ERR_IRQ_INDEX:
134 return "ERR";
135 case VFIO_PCI_REQ_IRQ_INDEX:
136 return "REQ";
137 default:
138 return NULL;
139 }
140}
141
142static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
143{
144 switch (container->iommu_type) {
145 case VFIO_TYPE1v2_IOMMU:
146 case VFIO_TYPE1_IOMMU:
147 /*
148 * We support coordinated discarding of RAM via the RamDiscardManager.
149 */
150 return ram_block_uncoordinated_discard_disable(state);
151 default:
152 /*
153 * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
154 * RamDiscardManager, however, it is completely untested.
155 *
156 * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
157 * completely the opposite of managing mapping/pinning dynamically as
158 * required by RamDiscardManager. We would have to special-case sections
159 * with a RamDiscardManager.
160 */
161 return ram_block_discard_disable(state);
162 }
163}
164
165int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
166 int action, int fd, Error **errp)
167{
168 struct vfio_irq_set *irq_set;
169 int argsz, ret = 0;
170 const char *name;
171 int32_t *pfd;
172
173 argsz = sizeof(*irq_set) + sizeof(*pfd);
174
175 irq_set = g_malloc0(argsz);
176 irq_set->argsz = argsz;
177 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
178 irq_set->index = index;
179 irq_set->start = subindex;
180 irq_set->count = 1;
181 pfd = (int32_t *)&irq_set->data;
182 *pfd = fd;
183
184 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
185 ret = -errno;
186 }
187 g_free(irq_set);
188
189 if (!ret) {
190 return 0;
191 }
192
193 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
194
195 name = index_to_str(vbasedev, index);
196 if (name) {
197 error_prepend(errp, "%s-%d: ", name, subindex);
198 } else {
199 error_prepend(errp, "index %d-%d: ", index, subindex);
200 }
201 error_prepend(errp,
202 "Failed to %s %s eventfd signaling for interrupt ",
203 fd < 0 ? "tear down" : "set up", action_to_str(action));
204 return ret;
205}
206
207/*
208 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
209 */
210void vfio_region_write(void *opaque, hwaddr addr,
211 uint64_t data, unsigned size)
212{
213 VFIORegion *region = opaque;
214 VFIODevice *vbasedev = region->vbasedev;
215 union {
216 uint8_t byte;
217 uint16_t word;
218 uint32_t dword;
219 uint64_t qword;
220 } buf;
221
222 switch (size) {
223 case 1:
224 buf.byte = data;
225 break;
226 case 2:
227 buf.word = cpu_to_le16(data);
228 break;
229 case 4:
230 buf.dword = cpu_to_le32(data);
231 break;
232 case 8:
233 buf.qword = cpu_to_le64(data);
234 break;
235 default:
236 hw_error("vfio: unsupported write size, %u bytes", size);
237 break;
238 }
239
240 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
241 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
242 ",%d) failed: %m",
243 __func__, vbasedev->name, region->nr,
244 addr, data, size);
245 }
246
247 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
248
249 /*
250 * A read or write to a BAR always signals an INTx EOI. This will
251 * do nothing if not pending (including not in INTx mode). We assume
252 * that a BAR access is in response to an interrupt and that BAR
253 * accesses will service the interrupt. Unfortunately, we don't know
254 * which access will service the interrupt, so we're potentially
255 * getting quite a few host interrupts per guest interrupt.
256 */
257 vbasedev->ops->vfio_eoi(vbasedev);
258}
259
260uint64_t vfio_region_read(void *opaque,
261 hwaddr addr, unsigned size)
262{
263 VFIORegion *region = opaque;
264 VFIODevice *vbasedev = region->vbasedev;
265 union {
266 uint8_t byte;
267 uint16_t word;
268 uint32_t dword;
269 uint64_t qword;
270 } buf;
271 uint64_t data = 0;
272
273 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
274 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
275 __func__, vbasedev->name, region->nr,
276 addr, size);
277 return (uint64_t)-1;
278 }
279 switch (size) {
280 case 1:
281 data = buf.byte;
282 break;
283 case 2:
284 data = le16_to_cpu(buf.word);
285 break;
286 case 4:
287 data = le32_to_cpu(buf.dword);
288 break;
289 case 8:
290 data = le64_to_cpu(buf.qword);
291 break;
292 default:
293 hw_error("vfio: unsupported read size, %u bytes", size);
294 break;
295 }
296
297 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
298
299 /* Same as write above */
300 vbasedev->ops->vfio_eoi(vbasedev);
301
302 return data;
303}
304
305const MemoryRegionOps vfio_region_ops = {
306 .read = vfio_region_read,
307 .write = vfio_region_write,
308 .endianness = DEVICE_LITTLE_ENDIAN,
309 .valid = {
310 .min_access_size = 1,
311 .max_access_size = 8,
312 },
313 .impl = {
314 .min_access_size = 1,
315 .max_access_size = 8,
316 },
317};
318
319/*
320 * Device state interfaces
321 */
322
323typedef struct {
324 unsigned long *bitmap;
325 hwaddr size;
326 hwaddr pages;
327} VFIOBitmap;
328
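/*
 * Allocate a zeroed dirty bitmap covering @size bytes: one bit per host page,
 * with the byte size rounded up to a multiple of sizeof(__u64) bits to match
 * the layout the VFIO bitmap ioctls consume. For example, with 4 KiB host
 * pages a 1 MiB range needs 256 bits, i.e. a 32-byte bitmap.
 * Returns -ENOMEM if the allocation fails.
 */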
329static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
330{
331 vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
332 vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
333 BITS_PER_BYTE;
334 vbmap->bitmap = g_try_malloc0(vbmap->size);
335 if (!vbmap->bitmap) {
336 return -ENOMEM;
337 }
338
339 return 0;
340}
341
342bool vfio_mig_active(void)
343{
344 VFIOGroup *group;
345 VFIODevice *vbasedev;
346
347 if (QLIST_EMPTY(&vfio_group_list)) {
348 return false;
349 }
350
351 QLIST_FOREACH(group, &vfio_group_list, next) {
352 QLIST_FOREACH(vbasedev, &group->device_list, next) {
353 if (vbasedev->migration_blocker) {
354 return false;
355 }
356 }
357 }
358 return true;
359}
360
361static Error *multiple_devices_migration_blocker;
362
363static unsigned int vfio_migratable_device_num(void)
364{
365 VFIOGroup *group;
366 VFIODevice *vbasedev;
367 unsigned int device_num = 0;
368
369 QLIST_FOREACH(group, &vfio_group_list, next) {
370 QLIST_FOREACH(vbasedev, &group->device_list, next) {
371 if (vbasedev->migration) {
372 device_num++;
373 }
374 }
375 }
376
377 return device_num;
378}
379
380int vfio_block_multiple_devices_migration(Error **errp)
381{
382 int ret;
383
384 if (multiple_devices_migration_blocker ||
385 vfio_migratable_device_num() <= 1) {
386 return 0;
387 }
388
389 error_setg(&multiple_devices_migration_blocker,
390 "Migration is currently not supported with multiple "
391 "VFIO devices");
392 ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
393 if (ret < 0) {
394 error_free(multiple_devices_migration_blocker);
395 multiple_devices_migration_blocker = NULL;
396 }
397
398 return ret;
399}
400
401void vfio_unblock_multiple_devices_migration(void)
402{
403 if (!multiple_devices_migration_blocker ||
404 vfio_migratable_device_num() > 1) {
405 return;
406 }
407
408 migrate_del_blocker(multiple_devices_migration_blocker);
409 error_free(multiple_devices_migration_blocker);
410 multiple_devices_migration_blocker = NULL;
411}
412
413static void vfio_set_migration_error(int err)
414{
415 MigrationState *ms = migrate_get_current();
416
417 if (migration_is_setup_or_active(ms->state)) {
418 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
419 if (ms->to_dst_file) {
420 qemu_file_set_error(ms->to_dst_file, err);
421 }
422 }
423 }
424}
425
426static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
427{
428 VFIOGroup *group;
429 VFIODevice *vbasedev;
430 MigrationState *ms = migrate_get_current();
431
432 if (!migration_is_setup_or_active(ms->state)) {
433 return false;
434 }
435
436 QLIST_FOREACH(group, &container->group_list, container_next) {
437 QLIST_FOREACH(vbasedev, &group->device_list, next) {
438 VFIOMigration *migration = vbasedev->migration;
439
440 if (!migration) {
441 return false;
442 }
443
444 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
445 migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
446 return false;
447 }
448 }
449 }
450 return true;
451}
452
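/*
 * Return true only if every device in the container supports device-level
 * dirty page tracking; callers fall back to container-wide IOMMU tracking
 * otherwise.
 */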
453static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
454{
455 VFIOGroup *group;
456 VFIODevice *vbasedev;
457
458 QLIST_FOREACH(group, &container->group_list, container_next) {
459 QLIST_FOREACH(vbasedev, &group->device_list, next) {
460 if (!vbasedev->dirty_pages_supported) {
461 return false;
462 }
463 }
464 }
465
466 return true;
467}
468
469/*
470 * Check if all VFIO devices are running and migration is active, which is
471 * essentially equivalent to the migration being in pre-copy phase.
472 */
473static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
474{
475 VFIOGroup *group;
476 VFIODevice *vbasedev;
477
478 if (!migration_is_active(migrate_get_current())) {
479 return false;
480 }
481
482 QLIST_FOREACH(group, &container->group_list, container_next) {
483 QLIST_FOREACH(vbasedev, &group->device_list, next) {
484 VFIOMigration *migration = vbasedev->migration;
485
486 if (!migration) {
487 return false;
488 }
489
490 if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
491 continue;
492 } else {
493 return false;
494 }
495 }
496 }
497 return true;
498}
499
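/*
 * Unmap an IOVA range and, in the same ioctl, ask the kernel for the dirty
 * bitmap of the pages that were mapped there, so dirty state is not lost
 * between the unmap and a later query. The reported bits are folded into
 * QEMU's RAM dirty bitmap at iotlb->translated_addr.
 */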
500static int vfio_dma_unmap_bitmap(VFIOContainer *container,
501 hwaddr iova, ram_addr_t size,
502 IOMMUTLBEntry *iotlb)
503{
504 struct vfio_iommu_type1_dma_unmap *unmap;
505 struct vfio_bitmap *bitmap;
506 VFIOBitmap vbmap;
507 int ret;
508
509 ret = vfio_bitmap_alloc(&vbmap, size);
510 if (ret) {
511 return ret;
512 }
513
514 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
515
516 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
517 unmap->iova = iova;
518 unmap->size = size;
519 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
520 bitmap = (struct vfio_bitmap *)&unmap->data;
521
522 /*
523 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
524 * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
525 * to qemu_real_host_page_size.
526 */
527 bitmap->pgsize = qemu_real_host_page_size();
528 bitmap->size = vbmap.size;
529 bitmap->data = (__u64 *)vbmap.bitmap;
530
531 if (vbmap.size > container->max_dirty_bitmap_size) {
532 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
533 ret = -E2BIG;
534 goto unmap_exit;
535 }
536
537 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
538 if (!ret) {
539 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
540 iotlb->translated_addr, vbmap.pages);
541 } else {
542 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
543 }
544
545unmap_exit:
546 g_free(unmap);
547 g_free(vbmap.bitmap);
548
549 return ret;
550}
551
552/*
553 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
554 */
555static int vfio_dma_unmap(VFIOContainer *container,
556 hwaddr iova, ram_addr_t size,
557 IOMMUTLBEntry *iotlb)
558{
559 struct vfio_iommu_type1_dma_unmap unmap = {
560 .argsz = sizeof(unmap),
561 .flags = 0,
562 .iova = iova,
563 .size = size,
564 };
565
566 if (iotlb && container->dirty_pages_supported &&
567 vfio_devices_all_running_and_mig_active(container)) {
568 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
569 }
570
571 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
572 /*
573 * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
574 * v4.15) where an overflow in its wrap-around check prevents us from
575 * unmapping the last page of the address space. Test for the error
576 * condition and re-try the unmap excluding the last page. The
577 * expectation is that we've never mapped the last page anyway and this
578 * unmap request comes via vIOMMU support which also makes it unlikely
579 * that this page is used. This bug was introduced well after type1 v2
580 * support was introduced, so we shouldn't need to test for v1. A fix
581 * is queued for kernel v5.0 so this workaround can be removed once
582 * affected kernels are sufficiently deprecated.
583 */
584 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
585 container->iommu_type == VFIO_TYPE1v2_IOMMU) {
586 trace_vfio_dma_unmap_overflow_workaround();
587 unmap.size -= 1ULL << ctz64(container->pgsizes);
588 continue;
589 }
590 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
591 return -errno;
592 }
593
594 if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
595 cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
596 tcg_enabled() ? DIRTY_CLIENTS_ALL :
597 DIRTY_CLIENTS_NOCODE);
598 }
599
600 return 0;
601}
602
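/* Map [vaddr, vaddr + size) at guest IOVA @iova through the container's IOMMU. */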
603static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
604 ram_addr_t size, void *vaddr, bool readonly)
605{
606 struct vfio_iommu_type1_dma_map map = {
607 .argsz = sizeof(map),
608 .flags = VFIO_DMA_MAP_FLAG_READ,
609 .vaddr = (__u64)(uintptr_t)vaddr,
610 .iova = iova,
611 .size = size,
612 };
613
614 if (!readonly) {
615 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
616 }
617
618 /*
619 * Try the mapping, if it fails with EBUSY, unmap the region and try
620 * again. This shouldn't be necessary, but we sometimes see it in
621 * the VGA ROM space.
622 */
623 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
624 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
625 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
626 return 0;
627 }
628
629 error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
630 return -errno;
631}
632
633static void vfio_host_win_add(VFIOContainer *container,
634 hwaddr min_iova, hwaddr max_iova,
635 uint64_t iova_pgsizes)
636{
637 VFIOHostDMAWindow *hostwin;
638
639 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
640 if (ranges_overlap(hostwin->min_iova,
641 hostwin->max_iova - hostwin->min_iova + 1,
642 min_iova,
643 max_iova - min_iova + 1)) {
644 hw_error("%s: overlapping host DMA windows are not supported", __func__);
645 }
646 }
647
648 hostwin = g_malloc0(sizeof(*hostwin));
649
650 hostwin->min_iova = min_iova;
651 hostwin->max_iova = max_iova;
652 hostwin->iova_pgsizes = iova_pgsizes;
653 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
654}
655
656static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
657 hwaddr max_iova)
658{
659 VFIOHostDMAWindow *hostwin;
660
661 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
662 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
663 QLIST_REMOVE(hostwin, hostwin_next);
664 g_free(hostwin);
665 return 0;
666 }
667 }
668
669 return -1;
670}
671
672static bool vfio_listener_skipped_section(MemoryRegionSection *section)
673{
674 return (!memory_region_is_ram(section->mr) &&
675 !memory_region_is_iommu(section->mr)) ||
676 memory_region_is_protected(section->mr) ||
677 /*
678 * Sizing an enabled 64-bit BAR can cause spurious mappings to
679 * addresses in the upper part of the 64-bit address space. These
680 * are never accessed by the CPU and beyond the address width of
681 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
682 */
683 section->offset_within_address_space & (1ULL << 63);
684}
685
686/* Called with rcu_read_lock held. */
687static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
688 ram_addr_t *ram_addr, bool *read_only)
689{
690 bool ret, mr_has_discard_manager;
691
692 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
693 &mr_has_discard_manager);
694 if (ret && mr_has_discard_manager) {
695 /*
696 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
697 * pages will remain pinned inside vfio until unmapped, resulting in a
698 * higher memory consumption than expected. If memory would get
699 * populated again later, there would be an inconsistency between pages
700 * pinned by vfio and pages seen by QEMU. This is the case until
701 * unmapped from the IOMMU (e.g., during device reset).
702 *
703 * With malicious guests, we really only care about pinning more memory
704 * than expected. RLIMIT_MEMLOCK set for the user/process can never be
705 * exceeded and can be used to mitigate this problem.
706 */
707 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
708 " RAM (e.g., virtio-mem) works, however, malicious"
709 " guests can trigger pinning of more memory than"
710 " intended via an IOMMU. It's possible to mitigate "
711 " by setting/adjusting RLIMIT_MEMLOCK.");
712 }
713 return ret;
714}
715
716static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
717{
718 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
719 VFIOContainer *container = giommu->container;
720 hwaddr iova = iotlb->iova + giommu->iommu_offset;
721 void *vaddr;
722 int ret;
723
724 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
725 iova, iova + iotlb->addr_mask);
726
727 if (iotlb->target_as != &address_space_memory) {
728 error_report("Wrong target AS \"%s\", only system memory is allowed",
729 iotlb->target_as->name ? iotlb->target_as->name : "none");
730 vfio_set_migration_error(-EINVAL);
731 return;
732 }
733
734 rcu_read_lock();
735
736 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
737 bool read_only;
738
739 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
740 goto out;
741 }
742 /*
743 * vaddr is only valid until rcu_read_unlock(). But after
744 * vfio_dma_map has set up the mapping the pages will be
745 * pinned by the kernel. This makes sure that the RAM backend
746 * of vaddr will always be there, even if the memory object is
747 * destroyed and its backing memory munmap-ed.
748 */
749 ret = vfio_dma_map(container, iova,
750 iotlb->addr_mask + 1, vaddr,
751 read_only);
752 if (ret) {
753 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
754 "0x%"HWADDR_PRIx", %p) = %d (%s)",
755 container, iova,
756 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
757 }
758 } else {
759 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
760 if (ret) {
761 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
762 "0x%"HWADDR_PRIx") = %d (%s)",
763 container, iova,
764 iotlb->addr_mask + 1, ret, strerror(-ret));
765 vfio_set_migration_error(ret);
766 }
767 }
768out:
769 rcu_read_unlock();
770}
771
772static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
773 MemoryRegionSection *section)
774{
775 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
776 listener);
777 const hwaddr size = int128_get64(section->size);
778 const hwaddr iova = section->offset_within_address_space;
779 int ret;
780
781 /* Unmap with a single call. */
782 ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
783 if (ret) {
784 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
785 strerror(-ret));
786 }
787}
788
789static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
790 MemoryRegionSection *section)
791{
792 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
793 listener);
794 const hwaddr end = section->offset_within_region +
795 int128_get64(section->size);
796 hwaddr start, next, iova;
797 void *vaddr;
798 int ret;
799
800 /*
801 * Map in (aligned within memory region) minimum granularity, so we can
802 * unmap in minimum granularity later.
803 */
804 for (start = section->offset_within_region; start < end; start = next) {
805 next = ROUND_UP(start + 1, vrdl->granularity);
806 next = MIN(next, end);
807
808 iova = start - section->offset_within_region +
809 section->offset_within_address_space;
810 vaddr = memory_region_get_ram_ptr(section->mr) + start;
811
812 ret = vfio_dma_map(vrdl->container, iova, next - start,
813 vaddr, section->readonly);
814 if (ret) {
815 /* Rollback */
816 vfio_ram_discard_notify_discard(rdl, section);
817 return ret;
818 }
819 }
820 return 0;
821}
822
823static void vfio_register_ram_discard_listener(VFIOContainer *container,
824 MemoryRegionSection *section)
825{
826 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
827 VFIORamDiscardListener *vrdl;
828
829 /* Ignore some corner cases not relevant in practice. */
830 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
831 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
832 TARGET_PAGE_SIZE));
833 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
834
835 vrdl = g_new0(VFIORamDiscardListener, 1);
836 vrdl->container = container;
837 vrdl->mr = section->mr;
838 vrdl->offset_within_address_space = section->offset_within_address_space;
839 vrdl->size = int128_get64(section->size);
840 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
841 section->mr);
842
843 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
844 g_assert(container->pgsizes &&
845 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
846
847 ram_discard_listener_init(&vrdl->listener,
848 vfio_ram_discard_notify_populate,
849 vfio_ram_discard_notify_discard, true);
850 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
851 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
852
853 /*
854 * Sanity-check if we have a theoretically problematic setup where we could
855 * exceed the maximum number of possible DMA mappings over time. We assume
856 * that each mapped section in the same address space as a RamDiscardManager
857 * section consumes exactly one DMA mapping, with the exception of
858 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
859 * in the same address space as RamDiscardManager sections.
860 *
861 * We assume that each section in the address space consumes one memslot.
862 * We take the number of KVM memory slots as a best guess for the maximum
863 * number of sections in the address space we could have over time,
864 * also consuming DMA mappings.
865 */
866 if (container->dma_max_mappings) {
867 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
868
869#ifdef CONFIG_KVM
870 if (kvm_enabled()) {
871 max_memslots = kvm_get_max_memslots();
872 }
873#endif
874
875 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
876 hwaddr start, end;
877
878 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
879 vrdl->granularity);
880 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
881 vrdl->granularity);
882 vrdl_mappings += (end - start) / vrdl->granularity;
883 vrdl_count++;
884 }
885
886 if (vrdl_mappings + max_memslots - vrdl_count >
887 container->dma_max_mappings) {
888 warn_report("%s: possibly running out of DMA mappings. E.g., try"
889 " increasing the 'block-size' of virtio-mem devies."
890 " Maximum possible DMA mappings: %d, Maximum possible"
891 " memslots: %d", __func__, container->dma_max_mappings,
892 max_memslots);
893 }
894 }
895}
896
897static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
898 MemoryRegionSection *section)
899{
900 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
901 VFIORamDiscardListener *vrdl = NULL;
902
903 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
904 if (vrdl->mr == section->mr &&
905 vrdl->offset_within_address_space ==
906 section->offset_within_address_space) {
907 break;
908 }
909 }
910
911 if (!vrdl) {
912 hw_error("vfio: Trying to unregister missing RAM discard listener");
913 }
914
915 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
916 QLIST_REMOVE(vrdl, next);
917 g_free(vrdl);
918}
919
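/* Return the registered host DMA window containing [iova, end], or NULL. */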
920static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
921 hwaddr iova, hwaddr end)
922{
923 VFIOHostDMAWindow *hostwin;
924 bool hostwin_found = false;
925
926 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
927 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
928 hostwin_found = true;
929 break;
930 }
931 }
932
933 return hostwin_found ? hostwin : NULL;
934}
935
936static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
937{
938 MemoryRegion *mr = section->mr;
939
940 if (!TPM_IS_CRB(mr->owner)) {
941 return false;
942 }
943
944 /* this is a known safe misaligned region, just trace for debug purpose */
945 trace_vfio_known_safe_misalignment(memory_region_name(mr),
946 section->offset_within_address_space,
947 section->offset_within_region,
948 qemu_real_host_page_size());
949 return true;
950}
951
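/*
 * Filter out sections the listeners should ignore: non-RAM/non-IOMMU regions
 * and regions whose address-space and region offsets are not equally aligned
 * to the host page size (except known-safe cases such as the TPM CRB MMIO).
 */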
952static bool vfio_listener_valid_section(MemoryRegionSection *section,
953 const char *name)
e2c7d025 954{
955 if (vfio_listener_skipped_section(section)) {
956 trace_vfio_listener_region_skip(name,
957 section->offset_within_address_space,
958 section->offset_within_address_space +
959 int128_get64(int128_sub(section->size, int128_one())));
960 return false;
961 }
962
963 if (unlikely((section->offset_within_address_space &
964 ~qemu_real_host_page_mask()) !=
965 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
966 if (!vfio_known_safe_misalignment(section)) {
967 error_report("%s received unaligned region %s iova=0x%"PRIx64
968 " offset_within_region=0x%"PRIx64
969 " qemu_real_host_page_size=0x%"PRIxPTR,
970 __func__, memory_region_name(section->mr),
971 section->offset_within_address_space,
972 section->offset_within_region,
973 qemu_real_host_page_size());
974 }
975 return false;
976 }
977
978 return true;
979}
980
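/*
 * Compute the host-page-aligned IOVA range covered by @section. Returns false
 * if the section is empty once aligned, i.e. there is nothing to (un)map.
 */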
981static bool vfio_get_section_iova_range(VFIOContainer *container,
982 MemoryRegionSection *section,
983 hwaddr *out_iova, hwaddr *out_end,
984 Int128 *out_llend)
985{
986 Int128 llend;
987 hwaddr iova;
988
989 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
990 llend = int128_make64(section->offset_within_address_space);
991 llend = int128_add(llend, section->size);
992 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
993
994 if (int128_ge(int128_make64(iova), llend)) {
995 return false;
996 }
997
998 *out_iova = iova;
999 *out_end = int128_get64(int128_sub(llend, int128_one()));
1000 if (out_llend) {
1001 *out_llend = llend;
1002 }
1003 return true;
1004}
1005
1006static void vfio_listener_region_add(MemoryListener *listener,
1007 MemoryRegionSection *section)
1008{
1009 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1010 hwaddr iova, end;
1011 Int128 llend, llsize;
1012 void *vaddr;
1013 int ret;
1014 VFIOHostDMAWindow *hostwin;
1015 Error *err = NULL;
1016
1017 if (!vfio_listener_valid_section(section, "region_add")) {
1018 return;
1019 }
1020
1021 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
1022 if (memory_region_is_ram_device(section->mr)) {
1023 trace_vfio_listener_region_add_no_dma_map(
1024 memory_region_name(section->mr),
1025 section->offset_within_address_space,
1026 int128_getlo(section->size),
1027 qemu_real_host_page_size());
1028 }
1029 return;
1030 }
1031
1032 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1033 hwaddr pgsize = 0;
1034
1035 /* For now intersections are not allowed, we may relax this later */
1036 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1037 if (ranges_overlap(hostwin->min_iova,
1038 hostwin->max_iova - hostwin->min_iova + 1,
1039 section->offset_within_address_space,
1040 int128_get64(section->size))) {
1041 error_setg(&err,
1042 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
1043 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
1044 section->offset_within_address_space,
1045 section->offset_within_address_space +
1046 int128_get64(section->size) - 1,
1047 hostwin->min_iova, hostwin->max_iova);
1048 goto fail;
1049 }
1050 }
1051
1052 ret = vfio_spapr_create_window(container, section, &pgsize);
1053 if (ret) {
1054 error_setg_errno(&err, -ret, "Failed to create SPAPR window");
1055 goto fail;
1056 }
1057
1058 vfio_host_win_add(container, section->offset_within_address_space,
1059 section->offset_within_address_space +
1060 int128_get64(section->size) - 1, pgsize);
1061#ifdef CONFIG_KVM
1062 if (kvm_enabled()) {
1063 VFIOGroup *group;
1064 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1065 struct kvm_vfio_spapr_tce param;
1066 struct kvm_device_attr attr = {
1067 .group = KVM_DEV_VFIO_GROUP,
1068 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
1069 .addr = (uint64_t)(unsigned long)&param,
1070 };
1071
1072 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
1073 &param.tablefd)) {
1074 QLIST_FOREACH(group, &container->group_list, container_next) {
1075 param.groupfd = group->fd;
1076 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1077 error_report("vfio: failed to setup fd %d "
1078 "for a group with fd %d: %s",
1079 param.tablefd, param.groupfd,
1080 strerror(errno));
1081 return;
1082 }
1083 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
1084 }
1085 }
1086 }
1087#endif
1088 }
1089
1090 hostwin = vfio_find_hostwin(container, iova, end);
1091 if (!hostwin) {
1092 error_setg(&err, "Container %p can't map guest IOVA region"
1093 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
1094 goto fail;
1095 }
1096
1097 memory_region_ref(section->mr);
1098
1099 if (memory_region_is_iommu(section->mr)) {
1100 VFIOGuestIOMMU *giommu;
1101 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1102 int iommu_idx;
1103
1104 trace_vfio_listener_region_add_iommu(iova, end);
1105 /*
1106 * FIXME: For VFIO iommu types which have KVM acceleration to
1107 * avoid bouncing all map/unmaps through qemu this way, this
1108 * would be the right place to wire that up (tell the KVM
1109 * device emulation the VFIO iommu handles to use).
1110 */
1111 giommu = g_malloc0(sizeof(*giommu));
1112 giommu->iommu_mr = iommu_mr;
1113 giommu->iommu_offset = section->offset_within_address_space -
1114 section->offset_within_region;
1115 giommu->container = container;
1116 llend = int128_add(int128_make64(section->offset_within_region),
1117 section->size);
1118 llend = int128_sub(llend, int128_one());
1119 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
1120 MEMTXATTRS_UNSPECIFIED);
1121 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
1122 IOMMU_NOTIFIER_IOTLB_EVENTS,
1123 section->offset_within_region,
1124 int128_get64(llend),
1125 iommu_idx);
1126
1127 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
1128 container->pgsizes,
1129 &err);
1130 if (ret) {
1131 g_free(giommu);
1132 goto fail;
1133 }
1134
1135 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
1136 &err);
1137 if (ret) {
1138 g_free(giommu);
1139 goto fail;
1140 }
1141 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
1142 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
1143
1144 return;
1145 }
1146
1147 /* Here we assume that memory_region_is_ram(section->mr)==true */
1148
1149 /*
1150 * For RAM memory regions with a RamDiscardManager, we only want to map the
1151 * actually populated parts - and update the mapping whenever we're notified
1152 * about changes.
1153 */
1154 if (memory_region_has_ram_discard_manager(section->mr)) {
1155 vfio_register_ram_discard_listener(container, section);
1156 return;
1157 }
1158
1159 vaddr = memory_region_get_ram_ptr(section->mr) +
1160 section->offset_within_region +
1161 (iova - section->offset_within_address_space);
1162
1163 trace_vfio_listener_region_add_ram(iova, end, vaddr);
1164
1165 llsize = int128_sub(llend, int128_make64(iova));
1166
1167 if (memory_region_is_ram_device(section->mr)) {
1168 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1169
1170 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
1171 trace_vfio_listener_region_add_no_dma_map(
1172 memory_region_name(section->mr),
1173 section->offset_within_address_space,
1174 int128_getlo(section->size),
1175 pgmask + 1);
1176 return;
1177 }
1178 }
1179
1180 ret = vfio_dma_map(container, iova, int128_get64(llsize),
1181 vaddr, section->readonly);
1182 if (ret) {
1183 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
1184 "0x%"HWADDR_PRIx", %p) = %d (%s)",
1185 container, iova, int128_get64(llsize), vaddr, ret,
1186 strerror(-ret));
1187 if (memory_region_is_ram_device(section->mr)) {
1188 /* Allow unexpected mappings not to be fatal for RAM devices */
1189 error_report_err(err);
1190 return;
1191 }
1192 goto fail;
1193 }
1194
1195 return;
1196
1197fail:
1198 if (memory_region_is_ram_device(section->mr)) {
1199 error_report("failed to vfio_dma_map. pci p2p may not work");
1200 return;
1201 }
1202 /*
1203 * On the initfn path, store the first error in the container so we
1204 * can gracefully fail. Runtime, there's not much we can do other
1205 * than throw a hardware error.
1206 */
1207 if (!container->initialized) {
1208 if (!container->error) {
1209 error_propagate_prepend(&container->error, err,
1210 "Region %s: ",
1211 memory_region_name(section->mr));
1212 } else {
1213 error_free(err);
1214 }
1215 } else {
1216 error_report_err(err);
1217 hw_error("vfio: DMA mapping failed, unable to continue");
1218 }
1219}
1220
1221static void vfio_listener_region_del(MemoryListener *listener,
1222 MemoryRegionSection *section)
1223{
1224 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1225 hwaddr iova, end;
1226 Int128 llend, llsize;
1227 int ret;
1228 bool try_unmap = true;
1229
1230 if (!vfio_listener_valid_section(section, "region_del")) {
1231 return;
1232 }
1233
1234 if (memory_region_is_iommu(section->mr)) {
1235 VFIOGuestIOMMU *giommu;
1236
1237 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1238 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1239 giommu->n.start == section->offset_within_region) {
1240 memory_region_unregister_iommu_notifier(section->mr,
1241 &giommu->n);
1242 QLIST_REMOVE(giommu, giommu_next);
1243 g_free(giommu);
1244 break;
1245 }
1246 }
1247
1248 /*
1249 * FIXME: We assume the one big unmap below is adequate to
1250 * remove any individual page mappings in the IOMMU which
1251 * might have been copied into VFIO. This works for a page table
1252 * based IOMMU where a big unmap flattens a large range of IO-PTEs.
1253 * That may not be true for all IOMMU types.
1254 */
1255 }
1256
1257 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
1258 return;
1259 }
1260
1261 llsize = int128_sub(llend, int128_make64(iova));
1262
1263 trace_vfio_listener_region_del(iova, end);
1264
1265 if (memory_region_is_ram_device(section->mr)) {
1266 hwaddr pgmask;
1267 VFIOHostDMAWindow *hostwin;
1268
1269 hostwin = vfio_find_hostwin(container, iova, end);
1270 assert(hostwin); /* or region_add() would have failed */
1271
1272 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1273 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
1274 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1275 vfio_unregister_ram_discard_listener(container, section);
1276 /* Unregistering will trigger an unmap. */
1277 try_unmap = false;
1278 }
1279
1280 if (try_unmap) {
1281 if (int128_eq(llsize, int128_2_64())) {
1282 /* The unmap ioctl doesn't accept a full 64-bit span. */
1283 llsize = int128_rshift(llsize, 1);
1284 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1285 if (ret) {
1286 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1287 "0x%"HWADDR_PRIx") = %d (%s)",
1288 container, iova, int128_get64(llsize), ret,
1289 strerror(-ret));
1290 }
1291 iova += int128_get64(llsize);
1292 }
1293 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1294 if (ret) {
1295 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1296 "0x%"HWADDR_PRIx") = %d (%s)",
1297 container, iova, int128_get64(llsize), ret,
1298 strerror(-ret));
1299 }
1300 }
1301
1302 memory_region_unref(section->mr);
1303
1304 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1305 vfio_spapr_remove_window(container,
1306 section->offset_within_address_space);
1307 if (vfio_host_win_del(container,
1308 section->offset_within_address_space,
1309 section->offset_within_address_space +
1310 int128_get64(section->size) - 1) < 0) {
1311 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1312 __func__, section->offset_within_address_space);
1313 }
1314 }
1315}
1316
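/*
 * Start or stop container-wide (IOMMU driver based) dirty page tracking via
 * VFIO_IOMMU_DIRTY_PAGES. A no-op if the container lacks dirty page support.
 */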
1317static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1318{
1319 int ret;
1320 struct vfio_iommu_type1_dirty_bitmap dirty = {
1321 .argsz = sizeof(dirty),
1322 };
1323
1324 if (!container->dirty_pages_supported) {
1325 return 0;
1326 }
1327
1328 if (start) {
1329 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1330 } else {
1331 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1332 }
1333
1334 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1335 if (ret) {
1336 ret = -errno;
1337 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1338 dirty.flags, errno);
1339 }
1340
1341 return ret;
1342}
1343
1344typedef struct VFIODirtyRanges {
1345 hwaddr min32;
1346 hwaddr max32;
1347 hwaddr min64;
1348 hwaddr max64;
1349} VFIODirtyRanges;
1350
1351typedef struct VFIODirtyRangesListener {
1352 VFIOContainer *container;
1353 VFIODirtyRanges ranges;
1354 MemoryListener listener;
1355} VFIODirtyRangesListener;
1356
1357static void vfio_dirty_tracking_update(MemoryListener *listener,
1358 MemoryRegionSection *section)
1359{
1360 VFIODirtyRangesListener *dirty = container_of(listener,
1361 VFIODirtyRangesListener,
1362 listener);
1363 VFIODirtyRanges *range = &dirty->ranges;
1364 hwaddr iova, end, *min, *max;
1365
1366 if (!vfio_listener_valid_section(section, "tracking_update") ||
1367 !vfio_get_section_iova_range(dirty->container, section,
1368 &iova, &end, NULL)) {
1369 return;
1370 }
1371
1372 /*
1373 * The address space passed to the dirty tracker is reduced to two ranges:
1374 * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
1375 * The underlying reports of dirty will query a sub-interval of each of
1376 * these ranges.
1377 *
1378 * The purpose of the dual range handling is to handle known cases of big
1379 * holes in the address space, like the x86 AMD 1T hole. The alternative
1380 * would be an IOVATree but that has a much bigger runtime overhead and
1381 * unnecessary complexity.
1382 */
1383 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
1384 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
1385
1386 if (*min > iova) {
1387 *min = iova;
1388 }
1389 if (*max < end) {
1390 *max = end;
1391 }
1392
1393 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
1394 return;
1395}
1396
1397static const MemoryListener vfio_dirty_tracking_listener = {
1398 .name = "vfio-tracking",
1399 .region_add = vfio_dirty_tracking_update,
1400};
1401
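/*
 * Register a temporary, synchronous memory listener whose only job is to
 * compute the 32-bit and 64-bit IOVA ranges that device dirty tracking must
 * cover; the listener is unregistered again before returning.
 */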
1402static void vfio_dirty_tracking_init(VFIOContainer *container,
1403 VFIODirtyRanges *ranges)
1404{
1405 VFIODirtyRangesListener dirty;
1406
1407 memset(&dirty, 0, sizeof(dirty));
1408 dirty.ranges.min32 = UINT32_MAX;
1409 dirty.ranges.min64 = UINT64_MAX;
1410 dirty.listener = vfio_dirty_tracking_listener;
1411 dirty.container = container;
1412
1413 memory_listener_register(&dirty.listener,
1414 container->space->as);
1415
1416 *ranges = dirty.ranges;
1417
1418 /*
1419 * The memory listener is synchronous, and used to calculate the range
1420 * to dirty tracking. Unregister it after we are done as we are not
1421 * interested in any follow-up updates.
1422 */
1423 memory_listener_unregister(&dirty.listener);
1424}
1425
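/*
 * Ask every device that currently has DMA logging enabled to stop it
 * (VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP); failures are only warned about.
 */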
1426static void vfio_devices_dma_logging_stop(VFIOContainer *container)
1427{
1428 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
1429 sizeof(uint64_t))] = {};
1430 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
1431 VFIODevice *vbasedev;
1432 VFIOGroup *group;
1433
1434 feature->argsz = sizeof(buf);
1435 feature->flags = VFIO_DEVICE_FEATURE_SET |
1436 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
1437
1438 QLIST_FOREACH(group, &container->group_list, container_next) {
1439 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1440 if (!vbasedev->dirty_tracking) {
1441 continue;
1442 }
1443
1444 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
1445 warn_report("%s: Failed to stop DMA logging, err %d (%s)",
1446 vbasedev->name, -errno, strerror(errno));
1447 }
1448 vbasedev->dirty_tracking = false;
1449 }
1450 }
1451}
1452
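/*
 * Build the VFIO_DEVICE_FEATURE_DMA_LOGGING_START payload describing the
 * 32-bit and/or 64-bit ranges computed by vfio_dirty_tracking_init(). The
 * result must be freed with vfio_device_feature_dma_logging_start_destroy().
 */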
1453static struct vfio_device_feature *
1454vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
1455 VFIODirtyRanges *tracking)
1456{
1457 struct vfio_device_feature *feature;
1458 size_t feature_size;
1459 struct vfio_device_feature_dma_logging_control *control;
1460 struct vfio_device_feature_dma_logging_range *ranges;
1461
1462 feature_size = sizeof(struct vfio_device_feature) +
1463 sizeof(struct vfio_device_feature_dma_logging_control);
1464 feature = g_try_malloc0(feature_size);
1465 if (!feature) {
1466 errno = ENOMEM;
1467 return NULL;
1468 }
1469 feature->argsz = feature_size;
1470 feature->flags = VFIO_DEVICE_FEATURE_SET |
1471 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
1472
1473 control = (struct vfio_device_feature_dma_logging_control *)feature->data;
1474 control->page_size = qemu_real_host_page_size();
1475
1476 /*
1477 * DMA logging uAPI guarantees to support at least a number of ranges that
1478 * fits into a single host kernel base page.
1479 */
1480 control->num_ranges = !!tracking->max32 + !!tracking->max64;
1481 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
1482 control->num_ranges);
1483 if (!ranges) {
1484 g_free(feature);
1485 errno = ENOMEM;
1486
1487 return NULL;
1488 }
1489
1490 control->ranges = (__u64)(uintptr_t)ranges;
1491 if (tracking->max32) {
1492 ranges->iova = tracking->min32;
1493 ranges->length = (tracking->max32 - tracking->min32) + 1;
1494 ranges++;
1495 }
1496 if (tracking->max64) {
1497 ranges->iova = tracking->min64;
1498 ranges->length = (tracking->max64 - tracking->min64) + 1;
1499 }
1500
1501 trace_vfio_device_dirty_tracking_start(control->num_ranges,
1502 tracking->min32, tracking->max32,
1503 tracking->min64, tracking->max64);
1504
1505 return feature;
1506}
1507
1508static void vfio_device_feature_dma_logging_start_destroy(
1509 struct vfio_device_feature *feature)
1510{
1511 struct vfio_device_feature_dma_logging_control *control =
1512 (struct vfio_device_feature_dma_logging_control *)feature->data;
1513 struct vfio_device_feature_dma_logging_range *ranges =
1514 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
1515
1516 g_free(ranges);
1517 g_free(feature);
1518}
1519
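/*
 * Enable per-device DMA logging on all devices in the container. On any
 * failure, logging is stopped again on the devices already enabled.
 */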
1520static int vfio_devices_dma_logging_start(VFIOContainer *container)
1521{
1522 struct vfio_device_feature *feature;
1523 VFIODirtyRanges ranges;
1524 VFIODevice *vbasedev;
1525 VFIOGroup *group;
1526 int ret = 0;
1527
1528 vfio_dirty_tracking_init(container, &ranges);
1529 feature = vfio_device_feature_dma_logging_start_create(container,
1530 &ranges);
1531 if (!feature) {
1532 return -errno;
1533 }
1534
1535 QLIST_FOREACH(group, &container->group_list, container_next) {
1536 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1537 if (vbasedev->dirty_tracking) {
1538 continue;
1539 }
1540
1541 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
1542 if (ret) {
1543 ret = -errno;
1544 error_report("%s: Failed to start DMA logging, err %d (%s)",
1545 vbasedev->name, ret, strerror(errno));
1546 goto out;
1547 }
1548 vbasedev->dirty_tracking = true;
1549 }
1550 }
1551
1552out:
1553 if (ret) {
1554 vfio_devices_dma_logging_stop(container);
1555 }
1556
1557 vfio_device_feature_dma_logging_start_destroy(feature);
1558
1559 return ret;
1560}
1561
1562static void vfio_listener_log_global_start(MemoryListener *listener)
1563{
1564 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1565 int ret;
1566
1567 if (vfio_devices_all_device_dirty_tracking(container)) {
1568 ret = vfio_devices_dma_logging_start(container);
1569 } else {
1570 ret = vfio_set_dirty_page_tracking(container, true);
1571 }
1572
1573 if (ret) {
1574 error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
1575 ret, strerror(-ret));
1576 vfio_set_migration_error(ret);
1577 }
1578}
1579
1580static void vfio_listener_log_global_stop(MemoryListener *listener)
1581{
1582 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1583 int ret = 0;
1584
1585 if (vfio_devices_all_device_dirty_tracking(container)) {
1586 vfio_devices_dma_logging_stop(container);
1587 } else {
1588 ret = vfio_set_dirty_page_tracking(container, false);
1589 }
1590
1591 if (ret) {
1592 error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
1593 ret, strerror(-ret));
1594 vfio_set_migration_error(ret);
1595 }
1596}
1597
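/*
 * Query the container's dirty bitmap for [iova, iova + size) via
 * VFIO_IOMMU_DIRTY_PAGES(GET_BITMAP) and fold it into QEMU's RAM dirty
 * tracking at @ram_addr. Without container dirty page support the whole
 * range is conservatively marked dirty instead.
 */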
1598static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1599 uint64_t size, ram_addr_t ram_addr)
1600{
1601 struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1602 struct vfio_iommu_type1_dirty_bitmap_get *range;
1603 VFIOBitmap vbmap;
1604 int ret;
1605
1606 if (!container->dirty_pages_supported) {
1607 cpu_physical_memory_set_dirty_range(ram_addr, size,
1608 tcg_enabled() ? DIRTY_CLIENTS_ALL :
1609 DIRTY_CLIENTS_NOCODE);
1610 return 0;
1611 }
1612
725ccd7e
AH
1613 ret = vfio_bitmap_alloc(&vbmap, size);
1614 if (ret) {
1615 return ret;
1616 }
1617
1618 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1619
1620 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1621 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1622 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1623 range->iova = iova;
1624 range->size = size;
1625
1626 /*
1eb7f642
KJ
1627 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
1628 * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
1629 * to qemu_real_host_page_size.
1630 */
1631 range->bitmap.pgsize = qemu_real_host_page_size();
1632 range->bitmap.size = vbmap.size;
1633 range->bitmap.data = (__u64 *)vbmap.bitmap;
1634
1635 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1636 if (ret) {
1637 ret = -errno;
1638 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1639 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1640 (uint64_t)range->size, errno);
1641 goto err_out;
1642 }
1643
1644 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
1645 vbmap.pages);
1646
1647 trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
1648 range->bitmap.size, ram_addr);
1649err_out:
1650 g_free(dbitmap);
1651 g_free(vbmap.bitmap);
1652
1653 return ret;
1654}
1655
1656typedef struct {
1657 IOMMUNotifier n;
1658 VFIOGuestIOMMU *giommu;
1659} vfio_giommu_dirty_notifier;
1660
1661static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1662{
1663 vfio_giommu_dirty_notifier *gdn = container_of(n,
1664 vfio_giommu_dirty_notifier, n);
1665 VFIOGuestIOMMU *giommu = gdn->giommu;
1666 VFIOContainer *container = giommu->container;
1667 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1668 ram_addr_t translated_addr;
1669 int ret = -EINVAL;
1670
1671 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1672
1673 if (iotlb->target_as != &address_space_memory) {
1674 error_report("Wrong target AS \"%s\", only system memory is allowed",
1675 iotlb->target_as->name ? iotlb->target_as->name : "none");
1676 goto out;
1677 }
1678
1679 rcu_read_lock();
1680 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
1681 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1682 translated_addr);
1683 if (ret) {
1684 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1685 "0x%"HWADDR_PRIx") = %d (%s)",
1686 container, iova, iotlb->addr_mask + 1, ret,
1687 strerror(-ret));
1688 }
1689 }
1690 rcu_read_unlock();
1691
1692out:
1693 if (ret) {
1694 vfio_set_migration_error(ret);
1695 }
1696}
1697
1698static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1699 void *opaque)
1700{
1701 const hwaddr size = int128_get64(section->size);
1702 const hwaddr iova = section->offset_within_address_space;
1703 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1704 section->offset_within_region;
1705 VFIORamDiscardListener *vrdl = opaque;
1706
1707 /*
1708 * Sync the whole mapped region (spanning multiple individual mappings)
1709 * in one go.
1710 */
1711 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1712}
1713
1714static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1715 MemoryRegionSection *section)
1716{
1717 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1718 VFIORamDiscardListener *vrdl = NULL;
1719
1720 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1721 if (vrdl->mr == section->mr &&
1722 vrdl->offset_within_address_space ==
1723 section->offset_within_address_space) {
1724 break;
1725 }
1726 }
1727
1728 if (!vrdl) {
1729 hw_error("vfio: Trying to sync missing RAM discard listener");
1730 }
1731
1732 /*
1733 * We only want/can synchronize the bitmap for actually mapped parts -
1734 * which correspond to populated parts. Replay all populated parts.
1735 */
1736 return ram_discard_manager_replay_populated(rdm, section,
1737 vfio_ram_discard_get_dirty_bitmap,
1738 &vrdl);
1739}
1740
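/*
 * Sync dirty state for one section: vIOMMU sections replay their mappings
 * through a MAP notifier, RamDiscardManager-backed sections sync only their
 * populated parts, and plain RAM sections query the container directly.
 */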
1741static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1742 MemoryRegionSection *section)
1743{
1744 ram_addr_t ram_addr;
1745
1746 if (memory_region_is_iommu(section->mr)) {
1747 VFIOGuestIOMMU *giommu;
1748
1749 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1750 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1751 giommu->n.start == section->offset_within_region) {
1752 Int128 llend;
1753 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
1754 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
1755 MEMTXATTRS_UNSPECIFIED);
1756
1757 llend = int128_add(int128_make64(section->offset_within_region),
1758 section->size);
1759 llend = int128_sub(llend, int128_one());
1760
1761 iommu_notifier_init(&gdn.n,
1762 vfio_iommu_map_dirty_notify,
1763 IOMMU_NOTIFIER_MAP,
1764 section->offset_within_region,
1765 int128_get64(llend),
1766 idx);
44ee6aaa 1767 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
9a04fe09
KW
1768 break;
1769 }
1770 }
1771 return 0;
5e3b981c
DH
1772 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1773 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
9a04fe09
KW
1774 }
1775
b6dd6504
KW
1776 ram_addr = memory_region_get_ram_addr(section->mr) +
1777 section->offset_within_region;
1778
1779 return vfio_get_dirty_bitmap(container,
1eb7f642
KJ
1780 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1781 int128_get64(section->size), ram_addr);
b6dd6504
KW
1782}
1783
4292d501 1784static void vfio_listener_log_sync(MemoryListener *listener,
b6dd6504
KW
1785 MemoryRegionSection *section)
1786{
1787 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
236e0a45 1788 int ret;
b6dd6504 1789
b051a3f6 1790 if (vfio_listener_skipped_section(section)) {
b6dd6504
KW
1791 return;
1792 }
1793
758b96b6 1794 if (vfio_devices_all_dirty_tracking(container)) {
236e0a45
AH
1795 ret = vfio_sync_dirty_bitmap(container, section);
1796 if (ret) {
1797 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
1798 strerror(-ret));
1799 vfio_set_migration_error(ret);
1800 }
b6dd6504
KW
1801 }
1802}
1803
51b833f4 1804static const MemoryListener vfio_memory_listener = {
142518bd 1805 .name = "vfio",
e2c7d025
EA
1806 .region_add = vfio_listener_region_add,
1807 .region_del = vfio_listener_region_del,
758b96b6
KZ
1808 .log_global_start = vfio_listener_log_global_start,
1809 .log_global_stop = vfio_listener_log_global_stop,
4292d501 1810 .log_sync = vfio_listener_log_sync,
e2c7d025
EA
1811};
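/*
 * Descriptive note: region_add/region_del keep the container's DMA mappings
 * in sync with the guest memory map, while the log_global_* and log_sync
 * callbacks start, stop and synchronize dirty page tracking for migration.
 */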
1812
51b833f4 1813static void vfio_listener_release(VFIOContainer *container)
e2c7d025 1814{
ee0bf0e5 1815 memory_listener_unregister(&container->listener);
318f67ce
AK
1816 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1817 memory_listener_unregister(&container->prereg_listener);
1818 }
e2c7d025
EA
1819}
1820
3ab7a0b4
MR
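/*
 * Descriptive note: capabilities are chained inside the info buffer. The walk
 * starts at cap_offset and follows each header's 'next' offset (relative to
 * the start of the buffer); a zero offset brings us back to 'ptr' and ends
 * the chain.
 */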
1821static struct vfio_info_cap_header *
1822vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
b53b0f69
AW
1823{
1824 struct vfio_info_cap_header *hdr;
b53b0f69 1825
3ab7a0b4 1826 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
b53b0f69
AW
1827 if (hdr->id == id) {
1828 return hdr;
1829 }
1830 }
1831
1832 return NULL;
1833}
1834
3ab7a0b4
MR
1835struct vfio_info_cap_header *
1836vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1837{
1838 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1839 return NULL;
1840 }
1841
1842 return vfio_get_cap((void *)info, info->cap_offset, id);
1843}
1844
7486a628
MR
1845static struct vfio_info_cap_header *
1846vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1847{
1848 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1849 return NULL;
1850 }
1851
1852 return vfio_get_cap((void *)info, info->cap_offset, id);
1853}
1854
92fe289a
MR
1855struct vfio_info_cap_header *
1856vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1857{
1858 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1859 return NULL;
1860 }
1861
1862 return vfio_get_cap((void *)info, info->cap_offset, id);
1863}
1864
7486a628
MR
1865bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1866 unsigned int *avail)
1867{
1868 struct vfio_info_cap_header *hdr;
1869 struct vfio_iommu_type1_info_dma_avail *cap;
1870
1871 /* If the capability cannot be found, assume no DMA limiting */
1872 hdr = vfio_get_iommu_type1_info_cap(info,
1873 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1874 if (hdr == NULL) {
1875 return false;
1876 }
1877
1878 if (avail != NULL) {
1879 cap = (void *) hdr;
1880 *avail = cap->avail;
1881 }
1882
1883 return true;
1884}
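/*
 * Usage note: vfio_connect_container() below consults this helper and falls
 * back to a default of 65535 DMA mappings when the kernel does not report
 * the DMA_AVAIL capability.
 */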
1885
24acf72b
AW
1886static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1887 struct vfio_region_info *info)
b53b0f69
AW
1888{
1889 struct vfio_info_cap_header *hdr;
1890 struct vfio_region_info_cap_sparse_mmap *sparse;
24acf72b 1891 int i, j;
b53b0f69
AW
1892
1893 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1894 if (!hdr) {
24acf72b 1895 return -ENODEV;
b53b0f69
AW
1896 }
1897
1898 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1899
1900 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1901 region->nr, sparse->nr_areas);
1902
24acf72b
AW
1903 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1904
1905 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
24acf72b 1906 if (sparse->areas[i].size) {
99510d27
XC
1907 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1908 sparse->areas[i].offset +
1909 sparse->areas[i].size - 1);
24acf72b
AW
1910 region->mmaps[j].offset = sparse->areas[i].offset;
1911 region->mmaps[j].size = sparse->areas[i].size;
1912 j++;
1913 }
b53b0f69 1914 }
24acf72b
AW
1915
1916 region->nr_mmaps = j;
1917 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1918
1919 return 0;
b53b0f69
AW
1920}
1921
db0da029
AW
1922int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1923 int index, const char *name)
e2c7d025 1924{
db0da029
AW
1925 struct vfio_region_info *info;
1926 int ret;
1927
1928 ret = vfio_get_region_info(vbasedev, index, &info);
1929 if (ret) {
1930 return ret;
1931 }
1932
1933 region->vbasedev = vbasedev;
1934 region->flags = info->flags;
1935 region->size = info->size;
1936 region->fd_offset = info->offset;
1937 region->nr = index;
1938
1939 if (region->size) {
1940 region->mem = g_new0(MemoryRegion, 1);
1941 memory_region_init_io(region->mem, obj, &vfio_region_ops,
1942 region, name, region->size);
e2c7d025 1943
db0da029 1944 if (!vbasedev->no_mmap &&
95251725 1945 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
e2c7d025 1946
24acf72b 1947 ret = vfio_setup_region_sparse_mmaps(region, info);
db0da029 1948
24acf72b 1949 if (ret) {
b53b0f69
AW
1950 region->nr_mmaps = 1;
1951 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1952 region->mmaps[0].offset = 0;
1953 region->mmaps[0].size = region->size;
1954 }
e2c7d025 1955 }
db0da029
AW
1956 }
1957
1958 g_free(info);
1959
1960 trace_vfio_region_setup(vbasedev->name, index, name,
1961 region->flags, region->fd_offset, region->size);
1962 return 0;
1963}
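/*
 * Illustrative sketch (not part of this file; "vdev" and the region index are
 * assumptions): device backends typically pair these helpers as follows.
 *
 *     VFIORegion region;
 *
 *     if (!vfio_region_setup(OBJECT(vdev), &vdev->vbasedev, &region,
 *                            index, "example-region")) {
 *         vfio_region_mmap(&region);
 *         ...
 *         vfio_region_exit(&region);
 *         vfio_region_finalize(&region);
 *     }
 */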
e2c7d025 1964
0f7a903b
KW
1965static void vfio_subregion_unmap(VFIORegion *region, int index)
1966{
1967 trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
1968 region->mmaps[index].offset,
1969 region->mmaps[index].offset +
1970 region->mmaps[index].size - 1);
1971 memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
1972 munmap(region->mmaps[index].mmap, region->mmaps[index].size);
1973 object_unparent(OBJECT(&region->mmaps[index].mem));
1974 region->mmaps[index].mmap = NULL;
1975}
1976
db0da029
AW
1977int vfio_region_mmap(VFIORegion *region)
1978{
1979 int i, prot = 0;
1980 char *name;
1981
1982 if (!region->mem) {
1983 return 0;
1984 }
1985
1986 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1987 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1988
1989 for (i = 0; i < region->nr_mmaps; i++) {
1990 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1991 MAP_SHARED, region->vbasedev->fd,
1992 region->fd_offset +
1993 region->mmaps[i].offset);
1994 if (region->mmaps[i].mmap == MAP_FAILED) {
1995 int ret = -errno;
1996
1997 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1998 region->fd_offset +
1999 region->mmaps[i].offset,
2000 region->fd_offset +
2001 region->mmaps[i].offset +
2002 region->mmaps[i].size - 1, ret);
2003
2004 region->mmaps[i].mmap = NULL;
2005
2006 for (i--; i >= 0; i--) {
0f7a903b 2007 vfio_subregion_unmap(region, i);
db0da029
AW
2008 }
2009
2010 return ret;
e2c7d025
EA
2011 }
2012
db0da029
AW
2013 name = g_strdup_printf("%s mmaps[%d]",
2014 memory_region_name(region->mem), i);
21e00fa5
AW
2015 memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
2016 memory_region_owner(region->mem),
2017 name, region->mmaps[i].size,
2018 region->mmaps[i].mmap);
db0da029 2019 g_free(name);
db0da029
AW
2020 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
2021 &region->mmaps[i].mem);
2022
2023 trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
2024 region->mmaps[i].offset,
2025 region->mmaps[i].offset +
2026 region->mmaps[i].size - 1);
2027 }
2028
2029 return 0;
2030}
2031
0f7a903b
KW
2032void vfio_region_unmap(VFIORegion *region)
2033{
2034 int i;
2035
2036 if (!region->mem) {
2037 return;
2038 }
2039
2040 for (i = 0; i < region->nr_mmaps; i++) {
2041 if (region->mmaps[i].mmap) {
2042 vfio_subregion_unmap(region, i);
2043 }
2044 }
2045}
2046
db0da029
AW
2047void vfio_region_exit(VFIORegion *region)
2048{
2049 int i;
2050
2051 if (!region->mem) {
2052 return;
2053 }
2054
2055 for (i = 0; i < region->nr_mmaps; i++) {
2056 if (region->mmaps[i].mmap) {
2057 memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
e2c7d025 2058 }
db0da029 2059 }
e2c7d025 2060
db0da029
AW
2061 trace_vfio_region_exit(region->vbasedev->name, region->nr);
2062}
2063
2064void vfio_region_finalize(VFIORegion *region)
2065{
2066 int i;
2067
2068 if (!region->mem) {
2069 return;
e2c7d025
EA
2070 }
2071
db0da029
AW
2072 for (i = 0; i < region->nr_mmaps; i++) {
2073 if (region->mmaps[i].mmap) {
2074 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
2075 object_unparent(OBJECT(&region->mmaps[i].mem));
2076 }
2077 }
2078
2079 object_unparent(OBJECT(region->mem));
2080
2081 g_free(region->mem);
2082 g_free(region->mmaps);
2083
2084 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
92f86bff
GH
2085
2086 region->mem = NULL;
2087 region->mmaps = NULL;
2088 region->nr_mmaps = 0;
2089 region->size = 0;
2090 region->flags = 0;
2091 region->nr = 0;
db0da029
AW
2092}
2093
2094void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
2095{
2096 int i;
2097
2098 if (!region->mem) {
2099 return;
2100 }
2101
2102 for (i = 0; i < region->nr_mmaps; i++) {
2103 if (region->mmaps[i].mmap) {
2104 memory_region_set_enabled(&region->mmaps[i].mem, enabled);
2105 }
2106 }
e2c7d025 2107
db0da029
AW
2108 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
2109 enabled);
e2c7d025
EA
2110}
2111
2112void vfio_reset_handler(void *opaque)
2113{
2114 VFIOGroup *group;
2115 VFIODevice *vbasedev;
2116
2117 QLIST_FOREACH(group, &vfio_group_list, next) {
2118 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2
AW
2119 if (vbasedev->dev->realized) {
2120 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
2121 }
e2c7d025
EA
2122 }
2123 }
2124
2125 QLIST_FOREACH(group, &vfio_group_list, next) {
2126 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2 2127 if (vbasedev->dev->realized && vbasedev->needs_reset) {
e2c7d025
EA
2128 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
2129 }
2130 }
2131 }
2132}
2133
2134static void vfio_kvm_device_add_group(VFIOGroup *group)
2135{
2136#ifdef CONFIG_KVM
2137 struct kvm_device_attr attr = {
2138 .group = KVM_DEV_VFIO_GROUP,
2139 .attr = KVM_DEV_VFIO_GROUP_ADD,
2140 .addr = (uint64_t)(unsigned long)&group->fd,
2141 };
2142
2143 if (!kvm_enabled()) {
2144 return;
2145 }
2146
2147 if (vfio_kvm_device_fd < 0) {
2148 struct kvm_create_device cd = {
2149 .type = KVM_DEV_TYPE_VFIO,
2150 };
2151
2152 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
78e5b17f 2153 error_report("Failed to create KVM VFIO device: %m");
e2c7d025
EA
2154 return;
2155 }
2156
2157 vfio_kvm_device_fd = cd.fd;
2158 }
2159
2160 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
2161 error_report("Failed to add group %d to KVM VFIO device: %m",
2162 group->groupid);
2163 }
2164#endif
2165}
2166
2167static void vfio_kvm_device_del_group(VFIOGroup *group)
2168{
2169#ifdef CONFIG_KVM
2170 struct kvm_device_attr attr = {
2171 .group = KVM_DEV_VFIO_GROUP,
2172 .attr = KVM_DEV_VFIO_GROUP_DEL,
2173 .addr = (uint64_t)(unsigned long)&group->fd,
2174 };
2175
2176 if (vfio_kvm_device_fd < 0) {
2177 return;
2178 }
2179
2180 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
2181 error_report("Failed to remove group %d from KVM VFIO device: %m",
2182 group->groupid);
2183 }
2184#endif
2185}
2186
2187static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
2188{
2189 VFIOAddressSpace *space;
2190
2191 QLIST_FOREACH(space, &vfio_address_spaces, list) {
2192 if (space->as == as) {
2193 return space;
2194 }
2195 }
2196
2197 /* No suitable VFIOAddressSpace, create a new one */
2198 space = g_malloc0(sizeof(*space));
2199 space->as = as;
2200 QLIST_INIT(&space->containers);
2201
2202 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
2203
2204 return space;
2205}
2206
2207static void vfio_put_address_space(VFIOAddressSpace *space)
2208{
2209 if (QLIST_EMPTY(&space->containers)) {
2210 QLIST_REMOVE(space, list);
2211 g_free(space);
2212 }
2213}
2214
2b6326c0
EA
2215/*
2216 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
2217 */
2218static int vfio_get_iommu_type(VFIOContainer *container,
2219 Error **errp)
2220{
2221 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
2222 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
2223 int i;
2224
2225 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
2226 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
2227 return iommu_types[i];
2228 }
2229 }
2230 error_setg(errp, "No available IOMMU models");
2231 return -EINVAL;
2232}
2233
2234static int vfio_init_container(VFIOContainer *container, int group_fd,
2235 Error **errp)
2236{
2237 int iommu_type, ret;
2238
2239 iommu_type = vfio_get_iommu_type(container, errp);
2240 if (iommu_type < 0) {
2241 return iommu_type;
2242 }
2243
2244 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
2245 if (ret) {
2246 error_setg_errno(errp, errno, "Failed to set group container");
2247 return -errno;
2248 }
2249
2250 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
2251 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
2252 /*
2253 * On sPAPR, although the IOMMU subdriver always advertises v1 and
2254 * v2, the running platform may not support v2, and there is no
2255 * way to know until an IOMMU group gets added to the container.
2256 * So if setting v2 fails, try v1 as a fallback.
2257 */
2258 iommu_type = VFIO_SPAPR_TCE_IOMMU;
2259 continue;
2260 }
2261 error_setg_errno(errp, errno, "Failed to set iommu for container");
2262 return -errno;
2263 }
2264
2265 container->iommu_type = iommu_type;
2266 return 0;
2267}
2268
87ea529c
KW
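/*
 * Descriptive note: this follows the usual VFIO argsz handshake. The ioctl is
 * first issued with the base structure size; if the kernel indicates a larger
 * argsz (e.g. when capability chains are present), the buffer is grown and
 * the call retried so the full info, including capabilities, is captured.
 */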
2269static int vfio_get_iommu_info(VFIOContainer *container,
2270 struct vfio_iommu_type1_info **info)
2271{
2272
2273 size_t argsz = sizeof(struct vfio_iommu_type1_info);
2274
2275 *info = g_new0(struct vfio_iommu_type1_info, 1);
2276again:
2277 (*info)->argsz = argsz;
2278
2279 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
2280 g_free(*info);
2281 *info = NULL;
2282 return -errno;
2283 }
2284
2285 if (((*info)->argsz > argsz)) {
2286 argsz = (*info)->argsz;
2287 *info = g_realloc(*info, argsz);
2288 goto again;
2289 }
2290
2291 return 0;
2292}
2293
2294static struct vfio_info_cap_header *
2295vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
2296{
2297 struct vfio_info_cap_header *hdr;
2298 void *ptr = info;
2299
2300 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
2301 return NULL;
2302 }
2303
2304 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
2305 if (hdr->id == id) {
2306 return hdr;
2307 }
2308 }
2309
2310 return NULL;
2311}
2312
2313static void vfio_get_iommu_info_migration(VFIOContainer *container,
2314 struct vfio_iommu_type1_info *info)
2315{
2316 struct vfio_info_cap_header *hdr;
2317 struct vfio_iommu_type1_info_cap_migration *cap_mig;
2318
2319 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
2320 if (!hdr) {
2321 return;
2322 }
2323
2324 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
2325 header);
2326
2327 /*
1eb7f642
KJ
2328 * cpu_physical_memory_set_dirty_lebitmap() expects dirty bitmaps in units
2329 * of qemu_real_host_page_size, so require support for that page size here.
87ea529c 2330 */
8e3b0cbb 2331 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
87ea529c
KW
2332 container->dirty_pages_supported = true;
2333 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
2334 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
2335 }
2336}
2337
01905f58
EA
2338static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
2339 Error **errp)
e2c7d025
EA
2340{
2341 VFIOContainer *container;
2342 int ret, fd;
2343 VFIOAddressSpace *space;
2344
2345 space = vfio_get_address_space(as);
2346
c65ee433 2347 /*
aff92b82 2348 * VFIO is currently incompatible with discarding of RAM insofar as the
c65ee433
AW
2349 * madvise to purge (zap) the page from QEMU's address space does not
2350 * interact with the memory API and therefore leaves stale virtual to
2351 * physical mappings in the IOMMU if the page was previously pinned. We
aff92b82 2352 * therefore set discarding broken for each group added to a container,
c65ee433
AW
2353 * whether the container is used individually or shared. This provides
2354 * us with options to allow devices within a group to opt-in and allow
aff92b82 2355 * discarding, so long as it is done consistently for a group (for instance
c65ee433
AW
2356 * if the device is an mdev device where it is known that the host vendor
2357 * driver will never pin pages outside of the working set of the guest
aff92b82 2358 * driver, which would thus not be discarding candidates).
c65ee433
AW
2359 *
2360 * The first opportunity to induce pinning occurs here where we attempt to
2361 * attach the group to existing containers within the AddressSpace. If any
aff92b82
DH
2362 * pages are already zapped from the virtual address space, such as from
2363 * previous discards, new pinning will cause valid mappings to be
c65ee433
AW
2364 * re-established. Likewise, when the overall MemoryListener for a new
2365 * container is registered, a replay of mappings within the AddressSpace
2366 * will occur, re-establishing any previously zapped pages as well.
2367 *
aff92b82
DH
2368 * In particular, virtio-balloon is currently only prevented from discarding
2369 * new memory; it does not yet call ram_block_discard_set_required() and
2370 * therefore neither stops us here nor deals with the sudden memory
2371 * consumption of inflated memory.
53d1b5fc
DH
2372 *
2373 * We do support discarding of memory coordinated via the RamDiscardManager
2374 * with some IOMMU types. vfio_ram_block_discard_disable() handles the
2375 * details once we know which type of IOMMU we are using.
c65ee433 2376 */
c65ee433 2377
e2c7d025
EA
2378 QLIST_FOREACH(container, &space->containers, next) {
2379 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
53d1b5fc
DH
2380 ret = vfio_ram_block_discard_disable(container, true);
2381 if (ret) {
2382 error_setg_errno(errp, -ret,
2383 "Cannot set discarding of RAM broken");
2384 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2385 &container->fd)) {
2386 error_report("vfio: error disconnecting group %d from"
2387 " container", group->groupid);
2388 }
2389 return ret;
2390 }
e2c7d025
EA
2391 group->container = container;
2392 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2016986a 2393 vfio_kvm_device_add_group(group);
e2c7d025
EA
2394 return 0;
2395 }
2396 }
2397
448058aa 2398 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
e2c7d025 2399 if (fd < 0) {
01905f58 2400 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
e2c7d025
EA
2401 ret = -errno;
2402 goto put_space_exit;
2403 }
2404
2405 ret = ioctl(fd, VFIO_GET_API_VERSION);
2406 if (ret != VFIO_API_VERSION) {
01905f58
EA
2407 error_setg(errp, "supported vfio version: %d, "
2408 "reported version: %d", VFIO_API_VERSION, ret);
e2c7d025
EA
2409 ret = -EINVAL;
2410 goto close_fd_exit;
2411 }
2412
2413 container = g_malloc0(sizeof(*container));
2414 container->space = space;
2415 container->fd = fd;
d7d87836 2416 container->error = NULL;
87ea529c 2417 container->dirty_pages_supported = false;
3eed155c 2418 container->dma_max_mappings = 0;
f7f9c7b2
LY
2419 QLIST_INIT(&container->giommu_list);
2420 QLIST_INIT(&container->hostwin_list);
5e3b981c 2421 QLIST_INIT(&container->vrdl_list);
2e6e697e 2422
2b6326c0
EA
2423 ret = vfio_init_container(container, group->fd, errp);
2424 if (ret) {
2425 goto free_container_exit;
2426 }
e2c7d025 2427
53d1b5fc
DH
2428 ret = vfio_ram_block_discard_disable(container, true);
2429 if (ret) {
2430 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2431 goto free_container_exit;
2432 }
2433
2b6326c0
EA
2434 switch (container->iommu_type) {
2435 case VFIO_TYPE1v2_IOMMU:
2436 case VFIO_TYPE1_IOMMU:
2437 {
87ea529c 2438 struct vfio_iommu_type1_info *info;
3898aad3 2439
87ea529c 2440 ret = vfio_get_iommu_info(container, &info);
85b6d2b5
AW
2441 if (ret) {
2442 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2443 goto enable_discards_exit;
2444 }
87ea529c 2445
85b6d2b5
AW
2446 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2447 container->pgsizes = info->iova_pgsizes;
2448 } else {
2449 container->pgsizes = qemu_real_host_page_size();
87ea529c 2450 }
85b6d2b5
AW
2451
2452 if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2453 container->dma_max_mappings = 65535;
7a140a57 2454 }
85b6d2b5 2455 vfio_get_iommu_info_migration(container, info);
87ea529c 2456 g_free(info);
85b6d2b5
AW
2457
2458 /*
2459 * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
2460 * information to get the actual window extent rather than assume
2461 * a 64-bit IOVA address space.
2462 */
2463 vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2464
2b6326c0
EA
2465 break;
2466 }
2467 case VFIO_SPAPR_TCE_v2_IOMMU:
2468 case VFIO_SPAPR_TCE_IOMMU:
2469 {
3898aad3 2470 struct vfio_iommu_spapr_tce_info info;
2b6326c0 2471 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
e2c7d025
EA
2472
2473 /*
2474 * The host kernel code implementing VFIO_IOMMU_DISABLE is called
2475 * when the container fd is closed, so we do not call it explicitly
2476 * in this file.
2477 */
318f67ce
AK
2478 if (!v2) {
2479 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2480 if (ret) {
01905f58 2481 error_setg_errno(errp, errno, "failed to enable container");
318f67ce 2482 ret = -errno;
53d1b5fc 2483 goto enable_discards_exit;
318f67ce
AK
2484 }
2485 } else {
2486 container->prereg_listener = vfio_prereg_listener;
2487
2488 memory_listener_register(&container->prereg_listener,
2489 &address_space_memory);
2490 if (container->error) {
2491 memory_listener_unregister(&container->prereg_listener);
d7d87836
EA
2492 ret = -1;
2493 error_propagate_prepend(errp, container->error,
2494 "RAM memory listener initialization failed: ");
53d1b5fc 2495 goto enable_discards_exit;
318f67ce 2496 }
e2c7d025 2497 }
3898aad3 2498
3898aad3
DG
2499 info.argsz = sizeof(info);
2500 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2501 if (ret) {
01905f58
EA
2502 error_setg_errno(errp, errno,
2503 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
3898aad3 2504 ret = -errno;
318f67ce
AK
2505 if (v2) {
2506 memory_listener_unregister(&container->prereg_listener);
2507 }
53d1b5fc 2508 goto enable_discards_exit;
3898aad3 2509 }
7a140a57 2510
2e4109de 2511 if (v2) {
c26bc185 2512 container->pgsizes = info.ddw.pgsizes;
2e4109de
AK
2513 /*
2514 * There is a default window in the just-created container.
2515 * To keep region_add/del simple, remove this window now and
2516 * let the iommu_listener callbacks create/remove windows as
2517 * needed.
2518 */
2519 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2520 if (ret) {
01905f58
EA
2521 error_setg_errno(errp, -ret,
2522 "failed to remove existing window");
53d1b5fc 2523 goto enable_discards_exit;
2e4109de
AK
2524 }
2525 } else {
2526 /* The default table uses 4K pages */
c26bc185 2527 container->pgsizes = 0x1000;
2e4109de
AK
2528 vfio_host_win_add(container, info.dma32_window_start,
2529 info.dma32_window_start +
2530 info.dma32_window_size - 1,
2531 0x1000);
2532 }
2b6326c0 2533 }
e2c7d025
EA
2534 }
2535
8c37faa4
AK
2536 vfio_kvm_device_add_group(group);
2537
2538 QLIST_INIT(&container->group_list);
2539 QLIST_INSERT_HEAD(&space->containers, container, next);
2540
2541 group->container = container;
2542 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2543
ee0bf0e5
DG
2544 container->listener = vfio_memory_listener;
2545
2546 memory_listener_register(&container->listener, container->space->as);
2547
2548 if (container->error) {
d7d87836
EA
2549 ret = -1;
2550 error_propagate_prepend(errp, container->error,
2551 "memory listener initialization failed: ");
ee0bf0e5
DG
2552 goto listener_release_exit;
2553 }
2554
2555 container->initialized = true;
2556
e2c7d025
EA
2557 return 0;
2558listener_release_exit:
8c37faa4
AK
2559 QLIST_REMOVE(group, container_next);
2560 QLIST_REMOVE(container, next);
2561 vfio_kvm_device_del_group(group);
e2c7d025
EA
2562 vfio_listener_release(container);
2563
53d1b5fc
DH
2564enable_discards_exit:
2565 vfio_ram_block_discard_disable(container, false);
2566
e2c7d025
EA
2567free_container_exit:
2568 g_free(container);
2569
2570close_fd_exit:
2571 close(fd);
2572
2573put_space_exit:
2574 vfio_put_address_space(space);
2575
2576 return ret;
2577}
2578
2579static void vfio_disconnect_container(VFIOGroup *group)
2580{
2581 VFIOContainer *container = group->container;
2582
36968626
PX
2583 QLIST_REMOVE(group, container_next);
2584 group->container = NULL;
2585
2586 /*
2587 * Explicitly release the listener before unsetting the container,
2588 * since unsetting may destroy the backend container if this is
2589 * the last group.
2590 */
2591 if (QLIST_EMPTY(&container->group_list)) {
2592 vfio_listener_release(container);
2593 }
2594
e2c7d025
EA
2595 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2596 error_report("vfio: error disconnecting group %d from container",
2597 group->groupid);
2598 }
2599
e2c7d025
EA
2600 if (QLIST_EMPTY(&container->group_list)) {
2601 VFIOAddressSpace *space = container->space;
f8d8a944 2602 VFIOGuestIOMMU *giommu, *tmp;
f3bc3a73 2603 VFIOHostDMAWindow *hostwin, *next;
e2c7d025 2604
e2c7d025 2605 QLIST_REMOVE(container, next);
f8d8a944
AK
2606
2607 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
3df9d748 2608 memory_region_unregister_iommu_notifier(
44ee6aaa 2609 MEMORY_REGION(giommu->iommu_mr), &giommu->n);
f8d8a944
AK
2610 QLIST_REMOVE(giommu, giommu_next);
2611 g_free(giommu);
2612 }
2613
f3bc3a73
PL
2614 QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
2615 next) {
2616 QLIST_REMOVE(hostwin, hostwin_next);
2617 g_free(hostwin);
2618 }
2619
e2c7d025
EA
2620 trace_vfio_disconnect_container(container->fd);
2621 close(container->fd);
2622 g_free(container);
2623
2624 vfio_put_address_space(space);
2625 }
2626}
2627
1b808d5b 2628VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
e2c7d025
EA
2629{
2630 VFIOGroup *group;
2631 char path[32];
2632 struct vfio_group_status status = { .argsz = sizeof(status) };
2633
2634 QLIST_FOREACH(group, &vfio_group_list, next) {
2635 if (group->groupid == groupid) {
2636 /* Found it. Now is it already in the right context? */
2637 if (group->container->space->as == as) {
2638 return group;
2639 } else {
1b808d5b
EA
2640 error_setg(errp, "group %d used in multiple address spaces",
2641 group->groupid);
e2c7d025
EA
2642 return NULL;
2643 }
2644 }
2645 }
2646
2647 group = g_malloc0(sizeof(*group));
2648
2649 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
448058aa 2650 group->fd = qemu_open_old(path, O_RDWR);
e2c7d025 2651 if (group->fd < 0) {
1b808d5b 2652 error_setg_errno(errp, errno, "failed to open %s", path);
e2c7d025
EA
2653 goto free_group_exit;
2654 }
2655
2656 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
1b808d5b 2657 error_setg_errno(errp, errno, "failed to get group %d status", groupid);
e2c7d025
EA
2658 goto close_fd_exit;
2659 }
2660
2661 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
1b808d5b
EA
2662 error_setg(errp, "group %d is not viable", groupid);
2663 error_append_hint(errp,
2664 "Please ensure all devices within the iommu_group "
2665 "are bound to their vfio bus driver.\n");
e2c7d025
EA
2666 goto close_fd_exit;
2667 }
2668
2669 group->groupid = groupid;
2670 QLIST_INIT(&group->device_list);
2671
1b808d5b
EA
2672 if (vfio_connect_container(group, as, errp)) {
2673 error_prepend(errp, "failed to setup container for group %d: ",
2674 groupid);
e2c7d025
EA
2675 goto close_fd_exit;
2676 }
2677
2678 if (QLIST_EMPTY(&vfio_group_list)) {
2679 qemu_register_reset(vfio_reset_handler, NULL);
2680 }
2681
2682 QLIST_INSERT_HEAD(&vfio_group_list, group, next);
2683
e2c7d025
EA
2684 return group;
2685
2686close_fd_exit:
2687 close(group->fd);
2688
2689free_group_exit:
2690 g_free(group);
2691
2692 return NULL;
2693}
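/*
 * Illustrative sketch (assumptions: "groupid", "name" and "vdev" come from
 * the caller; error handling is omitted): the usual acquire/release pairing
 * for the group and device helpers in this file.
 *
 *     VFIOGroup *group = vfio_get_group(groupid, &address_space_memory, errp);
 *
 *     if (group && !vfio_get_device(group, name, &vdev->vbasedev, errp)) {
 *         ... use vdev->vbasedev.fd, regions, interrupts ...
 *         vfio_put_base_device(&vdev->vbasedev);
 *     }
 *     vfio_put_group(group);
 */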
2694
2695void vfio_put_group(VFIOGroup *group)
2696{
77a10d04 2697 if (!group || !QLIST_EMPTY(&group->device_list)) {
e2c7d025
EA
2698 return;
2699 }
2700
aff92b82 2701 if (!group->ram_block_discard_allowed) {
53d1b5fc 2702 vfio_ram_block_discard_disable(group->container, false);
238e9172 2703 }
e2c7d025
EA
2704 vfio_kvm_device_del_group(group);
2705 vfio_disconnect_container(group);
2706 QLIST_REMOVE(group, next);
2707 trace_vfio_put_group(group->fd);
2708 close(group->fd);
2709 g_free(group);
2710
2711 if (QLIST_EMPTY(&vfio_group_list)) {
2712 qemu_unregister_reset(vfio_reset_handler, NULL);
2713 }
2714}
2715
2716int vfio_get_device(VFIOGroup *group, const char *name,
59f7d674 2717 VFIODevice *vbasedev, Error **errp)
e2c7d025
EA
2718{
2719 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
217e9fdc 2720 int ret, fd;
e2c7d025 2721
217e9fdc
PB
2722 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2723 if (fd < 0) {
59f7d674
EA
2724 error_setg_errno(errp, errno, "error getting device from group %d",
2725 group->groupid);
2726 error_append_hint(errp,
2727 "Verify all devices in group %d are bound to vfio-<bus> "
2728 "or pci-stub and not already in use\n", group->groupid);
217e9fdc 2729 return fd;
e2c7d025
EA
2730 }
2731
217e9fdc 2732 ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
e2c7d025 2733 if (ret) {
59f7d674 2734 error_setg_errno(errp, errno, "error getting device info");
217e9fdc
PB
2735 close(fd);
2736 return ret;
e2c7d025
EA
2737 }
2738
238e9172 2739 /*
aff92b82
DH
2740 * Set discarding of RAM as not broken for this group if the driver knows
2741 * the device operates compatibly with discarding. Setting must be
2742 * consistent per group, but since compatibility is really only possible
2743 * with mdev currently, we expect singleton groups.
238e9172 2744 */
aff92b82
DH
2745 if (vbasedev->ram_block_discard_allowed !=
2746 group->ram_block_discard_allowed) {
238e9172 2747 if (!QLIST_EMPTY(&group->device_list)) {
aff92b82
DH
2748 error_setg(errp, "Inconsistent setting of support for discarding "
2749 "RAM (e.g., balloon) within group");
8709b395 2750 close(fd);
238e9172
AW
2751 return -1;
2752 }
2753
aff92b82
DH
2754 if (!group->ram_block_discard_allowed) {
2755 group->ram_block_discard_allowed = true;
53d1b5fc 2756 vfio_ram_block_discard_disable(group->container, false);
238e9172
AW
2757 }
2758 }
2759
217e9fdc
PB
2760 vbasedev->fd = fd;
2761 vbasedev->group = group;
2762 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
2763
e2c7d025
EA
2764 vbasedev->num_irqs = dev_info.num_irqs;
2765 vbasedev->num_regions = dev_info.num_regions;
2766 vbasedev->flags = dev_info.flags;
2767
2768 trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
2769 dev_info.num_irqs);
2770
2771 vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
217e9fdc 2772 return 0;
e2c7d025
EA
2773}
2774
2775void vfio_put_base_device(VFIODevice *vbasedev)
2776{
77a10d04
PB
2777 if (!vbasedev->group) {
2778 return;
2779 }
e2c7d025
EA
2780 QLIST_REMOVE(vbasedev, next);
2781 vbasedev->group = NULL;
2782 trace_vfio_put_base_device(vbasedev->fd);
2783 close(vbasedev->fd);
2784}
2785
46900226
AW
2786int vfio_get_region_info(VFIODevice *vbasedev, int index,
2787 struct vfio_region_info **info)
2788{
2789 size_t argsz = sizeof(struct vfio_region_info);
2790
2791 *info = g_malloc0(argsz);
2792
2793 (*info)->index = index;
b53b0f69 2794retry:
46900226
AW
2795 (*info)->argsz = argsz;
2796
2797 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2798 g_free(*info);
e61a424f 2799 *info = NULL;
46900226
AW
2800 return -errno;
2801 }
2802
b53b0f69
AW
2803 if ((*info)->argsz > argsz) {
2804 argsz = (*info)->argsz;
2805 *info = g_realloc(*info, argsz);
2806
2807 goto retry;
2808 }
2809
46900226
AW
2810 return 0;
2811}
2812
e61a424f
AW
2813int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2814 uint32_t subtype, struct vfio_region_info **info)
2815{
2816 int i;
2817
2818 for (i = 0; i < vbasedev->num_regions; i++) {
2819 struct vfio_info_cap_header *hdr;
2820 struct vfio_region_info_cap_type *cap_type;
2821
2822 if (vfio_get_region_info(vbasedev, i, info)) {
2823 continue;
2824 }
2825
2826 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2827 if (!hdr) {
2828 g_free(*info);
2829 continue;
2830 }
2831
2832 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2833
2834 trace_vfio_get_dev_region(vbasedev->name, i,
2835 cap_type->type, cap_type->subtype);
2836
2837 if (cap_type->type == type && cap_type->subtype == subtype) {
2838 return 0;
2839 }
2840
2841 g_free(*info);
2842 }
2843
2844 *info = NULL;
2845 return -ENODEV;
2846}
2847
ae0215b2
AK
2848bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2849{
2850 struct vfio_region_info *info = NULL;
2851 bool ret = false;
2852
2853 if (!vfio_get_region_info(vbasedev, region, &info)) {
2854 if (vfio_get_region_info_cap(info, cap_type)) {
2855 ret = true;
2856 }
2857 g_free(info);
2858 }
2859
2860 return ret;
2861}
2862
3153119e
DG
2863/*
2864 * Interfaces for IBM EEH (Enhanced Error Handling)
2865 */
2866static bool vfio_eeh_container_ok(VFIOContainer *container)
2867{
2868 /*
2869 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
2870 * implementation is broken if there are multiple groups in a
2871 * container. The hardware works in units of Partitionable
2872 * Endpoints (== IOMMU groups) and the EEH operations naively
2873 * iterate across all groups in the container, without any logic
2874 * to make sure the groups have their state synchronized. For
2875 * certain operations (ENABLE) that might be ok, until an error
2876 * occurs, but for others (GET_STATE) it's clearly broken.
2877 */
2878
2879 /*
2880 * XXX Once fixed kernels exist, test for them here
2881 */
2882
2883 if (QLIST_EMPTY(&container->group_list)) {
2884 return false;
2885 }
2886
2887 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
2888 return false;
2889 }
2890
2891 return true;
2892}
2893
2894static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
2895{
2896 struct vfio_eeh_pe_op pe_op = {
2897 .argsz = sizeof(pe_op),
2898 .op = op,
2899 };
2900 int ret;
2901
2902 if (!vfio_eeh_container_ok(container)) {
2903 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
2904 "kernel requires a container with exactly one group", op);
2905 return -EPERM;
2906 }
2907
2908 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
2909 if (ret < 0) {
2910 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
2911 return -errno;
2912 }
2913
d917e88d 2914 return ret;
3153119e
DG
2915}
2916
2917static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
2918{
2919 VFIOAddressSpace *space = vfio_get_address_space(as);
2920 VFIOContainer *container = NULL;
2921
2922 if (QLIST_EMPTY(&space->containers)) {
2923 /* No containers to act on */
2924 goto out;
2925 }
2926
2927 container = QLIST_FIRST(&space->containers);
2928
2929 if (QLIST_NEXT(container, next)) {
2930 /* We don't yet have logic to synchronize EEH state across
2931 * multiple containers */
2932 container = NULL;
2933 goto out;
2934 }
2935
2936out:
2937 vfio_put_address_space(space);
2938 return container;
2939}
2940
2941bool vfio_eeh_as_ok(AddressSpace *as)
2942{
2943 VFIOContainer *container = vfio_eeh_as_container(as);
2944
2945 return (container != NULL) && vfio_eeh_container_ok(container);
2946}
2947
2948int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
2949{
2950 VFIOContainer *container = vfio_eeh_as_container(as);
2951
2952 if (!container) {
2953 return -ENODEV;
2954 }
2955 return vfio_eeh_container_op(container, op);
2956}
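/*
 * Illustrative sketch (assumptions: "as" is the device's DMA address space;
 * VFIO_EEH_PE_ENABLE comes from <linux/vfio.h>): sPAPR PCI code is expected
 * to gate EEH operations on vfio_eeh_as_ok() and issue them through
 * vfio_eeh_as_op().
 *
 *     if (vfio_eeh_as_ok(as)) {
 *         vfio_eeh_as_op(as, VFIO_EEH_PE_ENABLE);
 *     }
 */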