]> git.proxmox.com Git - mirror_qemu.git/blame - hw/vfio/common.c
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into...
[mirror_qemu.git] / hw / vfio / common.c
CommitLineData
e2c7d025
EA
1/*
2 * generic functions used by VFIO devices
3 *
4 * Copyright Red Hat, Inc. 2012
5 *
6 * Authors:
7 * Alex Williamson <alex.williamson@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Based on qemu-kvm device-assignment:
13 * Adapted for KVM by Qumranet.
14 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
15 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
16 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
17 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
18 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
19 */
20
c6eacb1a 21#include "qemu/osdep.h"
e2c7d025 22#include <sys/ioctl.h>
a9c94277
MA
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
e2c7d025
EA
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
b6dd6504 32#include "exec/ram_addr.h"
e2c7d025
EA
33#include "hw/hw.h"
34#include "qemu/error-report.h"
db725815 35#include "qemu/main-loop.h"
f4ec5e26 36#include "qemu/range.h"
e2c7d025 37#include "sysemu/kvm.h"
71e8a915 38#include "sysemu/reset.h"
0fd7616e 39#include "sysemu/runstate.h"
e2c7d025 40#include "trace.h"
01905f58 41#include "qapi/error.h"
b6dd6504 42#include "migration/migration.h"
8b942af3 43#include "migration/misc.h"
29d81b71 44#include "migration/blocker.h"
236e0a45 45#include "migration/qemu-file.h"
851d6d1a 46#include "sysemu/tpm.h"
e2c7d025 47
f481ee2d 48VFIOGroupList vfio_group_list =
39cb514f 49 QLIST_HEAD_INITIALIZER(vfio_group_list);
10ca76b4 50static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
e2c7d025
EA
51 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
52
53#ifdef CONFIG_KVM
54/*
55 * We have a single VFIO pseudo device per KVM VM. Once created it lives
56 * for the life of the VM. Closing the file descriptor only drops our
57 * reference to it and the device's reference to kvm. Therefore once
58 * initialized, this file descriptor is only released on QEMU exit and
59 * we'll re-use it should another vfio device be attached before then.
60 */
61static int vfio_kvm_device_fd = -1;
62#endif
63
64/*
65 * Common VFIO interrupt disable
66 */
67void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
68{
69 struct vfio_irq_set irq_set = {
70 .argsz = sizeof(irq_set),
71 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
72 .index = index,
73 .start = 0,
74 .count = 0,
75 };
76
77 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
78}
79
80void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
81{
82 struct vfio_irq_set irq_set = {
83 .argsz = sizeof(irq_set),
84 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
85 .index = index,
86 .start = 0,
87 .count = 1,
88 };
89
90 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
91}
92
93void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
94{
95 struct vfio_irq_set irq_set = {
96 .argsz = sizeof(irq_set),
97 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
98 .index = index,
99 .start = 0,
100 .count = 1,
101 };
102
103 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
104}
105
201a7331
EA
106static inline const char *action_to_str(int action)
107{
108 switch (action) {
109 case VFIO_IRQ_SET_ACTION_MASK:
110 return "MASK";
111 case VFIO_IRQ_SET_ACTION_UNMASK:
112 return "UNMASK";
113 case VFIO_IRQ_SET_ACTION_TRIGGER:
114 return "TRIGGER";
115 default:
116 return "UNKNOWN ACTION";
117 }
118}
119
120static const char *index_to_str(VFIODevice *vbasedev, int index)
121{
122 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
123 return NULL;
124 }
125
126 switch (index) {
127 case VFIO_PCI_INTX_IRQ_INDEX:
128 return "INTX";
129 case VFIO_PCI_MSI_IRQ_INDEX:
130 return "MSI";
131 case VFIO_PCI_MSIX_IRQ_INDEX:
132 return "MSIX";
133 case VFIO_PCI_ERR_IRQ_INDEX:
134 return "ERR";
135 case VFIO_PCI_REQ_IRQ_INDEX:
136 return "REQ";
137 default:
138 return NULL;
139 }
140}
141
53d1b5fc
DH
142static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
143{
144 switch (container->iommu_type) {
145 case VFIO_TYPE1v2_IOMMU:
146 case VFIO_TYPE1_IOMMU:
147 /*
148 * We support coordinated discarding of RAM via the RamDiscardManager.
149 */
150 return ram_block_uncoordinated_discard_disable(state);
151 default:
152 /*
153 * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
154 * RamDiscardManager, however, it is completely untested.
155 *
156 * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
157 * completely the opposite of managing mapping/pinning dynamically as
158 * required by RamDiscardManager. We would have to special-case sections
159 * with a RamDiscardManager.
160 */
161 return ram_block_discard_disable(state);
162 }
163}
164
201a7331
EA
165int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
166 int action, int fd, Error **errp)
167{
168 struct vfio_irq_set *irq_set;
169 int argsz, ret = 0;
170 const char *name;
171 int32_t *pfd;
172
173 argsz = sizeof(*irq_set) + sizeof(*pfd);
174
175 irq_set = g_malloc0(argsz);
176 irq_set->argsz = argsz;
177 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
178 irq_set->index = index;
179 irq_set->start = subindex;
180 irq_set->count = 1;
181 pfd = (int32_t *)&irq_set->data;
182 *pfd = fd;
183
184 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
185 ret = -errno;
186 }
187 g_free(irq_set);
188
189 if (!ret) {
190 return 0;
191 }
192
193 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
194
195 name = index_to_str(vbasedev, index);
196 if (name) {
197 error_prepend(errp, "%s-%d: ", name, subindex);
198 } else {
199 error_prepend(errp, "index %d-%d: ", index, subindex);
200 }
201 error_prepend(errp,
202 "Failed to %s %s eventfd signaling for interrupt ",
203 fd < 0 ? "tear down" : "set up", action_to_str(action));
204 return ret;
205}
206
e2c7d025
EA
207/*
208 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
209 */
210void vfio_region_write(void *opaque, hwaddr addr,
211 uint64_t data, unsigned size)
212{
213 VFIORegion *region = opaque;
214 VFIODevice *vbasedev = region->vbasedev;
215 union {
216 uint8_t byte;
217 uint16_t word;
218 uint32_t dword;
219 uint64_t qword;
220 } buf;
221
222 switch (size) {
223 case 1:
224 buf.byte = data;
225 break;
226 case 2:
227 buf.word = cpu_to_le16(data);
228 break;
229 case 4:
230 buf.dword = cpu_to_le32(data);
231 break;
38d49e8c
JRZ
232 case 8:
233 buf.qword = cpu_to_le64(data);
234 break;
e2c7d025 235 default:
c624b6b3 236 hw_error("vfio: unsupported write size, %u bytes", size);
e2c7d025
EA
237 break;
238 }
239
240 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
241 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
242 ",%d) failed: %m",
243 __func__, vbasedev->name, region->nr,
244 addr, data, size);
245 }
246
247 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
248
249 /*
250 * A read or write to a BAR always signals an INTx EOI. This will
251 * do nothing if not pending (including not in INTx mode). We assume
252 * that a BAR access is in response to an interrupt and that BAR
253 * accesses will service the interrupt. Unfortunately, we don't know
254 * which access will service the interrupt, so we're potentially
255 * getting quite a few host interrupts per guest interrupt.
256 */
257 vbasedev->ops->vfio_eoi(vbasedev);
258}
259
260uint64_t vfio_region_read(void *opaque,
261 hwaddr addr, unsigned size)
262{
263 VFIORegion *region = opaque;
264 VFIODevice *vbasedev = region->vbasedev;
265 union {
266 uint8_t byte;
267 uint16_t word;
268 uint32_t dword;
269 uint64_t qword;
270 } buf;
271 uint64_t data = 0;
272
273 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
274 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
275 __func__, vbasedev->name, region->nr,
276 addr, size);
277 return (uint64_t)-1;
278 }
279 switch (size) {
280 case 1:
281 data = buf.byte;
282 break;
283 case 2:
284 data = le16_to_cpu(buf.word);
285 break;
286 case 4:
287 data = le32_to_cpu(buf.dword);
288 break;
38d49e8c
JRZ
289 case 8:
290 data = le64_to_cpu(buf.qword);
291 break;
e2c7d025 292 default:
c624b6b3 293 hw_error("vfio: unsupported read size, %u bytes", size);
e2c7d025
EA
294 break;
295 }
296
297 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
298
299 /* Same as write above */
300 vbasedev->ops->vfio_eoi(vbasedev);
301
302 return data;
303}
304
305const MemoryRegionOps vfio_region_ops = {
306 .read = vfio_region_read,
307 .write = vfio_region_write,
308 .endianness = DEVICE_LITTLE_ENDIAN,
15126cba
JRZ
309 .valid = {
310 .min_access_size = 1,
311 .max_access_size = 8,
312 },
38d49e8c
JRZ
313 .impl = {
314 .min_access_size = 1,
315 .max_access_size = 8,
316 },
e2c7d025
EA
317};
318
b6dd6504
KW
319/*
320 * Device state interfaces
321 */
322
725ccd7e
AH
323typedef struct {
324 unsigned long *bitmap;
325 hwaddr size;
326 hwaddr pages;
327} VFIOBitmap;
328
329static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
330{
331 vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
332 vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
333 BITS_PER_BYTE;
334 vbmap->bitmap = g_try_malloc0(vbmap->size);
335 if (!vbmap->bitmap) {
336 return -ENOMEM;
337 }
338
339 return 0;
340}
341
b153402a
JM
342static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
343 uint64_t size, ram_addr_t ram_addr);
344
3710586c
KW
345bool vfio_mig_active(void)
346{
347 VFIOGroup *group;
348 VFIODevice *vbasedev;
349
350 if (QLIST_EMPTY(&vfio_group_list)) {
351 return false;
352 }
353
354 QLIST_FOREACH(group, &vfio_group_list, next) {
355 QLIST_FOREACH(vbasedev, &group->device_list, next) {
356 if (vbasedev->migration_blocker) {
357 return false;
358 }
359 }
360 }
361 return true;
362}
363
29d81b71 364static Error *multiple_devices_migration_blocker;
e4688320 365static Error *giommu_migration_blocker;
29d81b71
AH
366
367static unsigned int vfio_migratable_device_num(void)
368{
369 VFIOGroup *group;
370 VFIODevice *vbasedev;
371 unsigned int device_num = 0;
372
373 QLIST_FOREACH(group, &vfio_group_list, next) {
374 QLIST_FOREACH(vbasedev, &group->device_list, next) {
375 if (vbasedev->migration) {
376 device_num++;
377 }
378 }
379 }
380
381 return device_num;
382}
383
384int vfio_block_multiple_devices_migration(Error **errp)
385{
386 int ret;
387
388 if (multiple_devices_migration_blocker ||
389 vfio_migratable_device_num() <= 1) {
390 return 0;
391 }
392
393 error_setg(&multiple_devices_migration_blocker,
394 "Migration is currently not supported with multiple "
395 "VFIO devices");
396 ret = migrate_add_blocker(multiple_devices_migration_blocker, errp);
397 if (ret < 0) {
398 error_free(multiple_devices_migration_blocker);
399 multiple_devices_migration_blocker = NULL;
400 }
401
402 return ret;
403}
404
405void vfio_unblock_multiple_devices_migration(void)
406{
407 if (!multiple_devices_migration_blocker ||
408 vfio_migratable_device_num() > 1) {
409 return;
410 }
411
412 migrate_del_blocker(multiple_devices_migration_blocker);
413 error_free(multiple_devices_migration_blocker);
414 multiple_devices_migration_blocker = NULL;
e4688320
JM
415}
416
417static bool vfio_viommu_preset(void)
418{
419 VFIOAddressSpace *space;
420
421 QLIST_FOREACH(space, &vfio_address_spaces, list) {
422 if (space->as != &address_space_memory) {
423 return true;
424 }
425 }
426
427 return false;
428}
429
430int vfio_block_giommu_migration(Error **errp)
431{
432 int ret;
433
434 if (giommu_migration_blocker ||
435 !vfio_viommu_preset()) {
436 return 0;
437 }
438
439 error_setg(&giommu_migration_blocker,
440 "Migration is currently not supported with vIOMMU enabled");
441 ret = migrate_add_blocker(giommu_migration_blocker, errp);
442 if (ret < 0) {
443 error_free(giommu_migration_blocker);
444 giommu_migration_blocker = NULL;
445 }
446
447 return ret;
448}
449
8249cffc 450void vfio_migration_finalize(void)
e4688320
JM
451{
452 if (!giommu_migration_blocker ||
453 vfio_viommu_preset()) {
454 return;
455 }
456
457 migrate_del_blocker(giommu_migration_blocker);
458 error_free(giommu_migration_blocker);
459 giommu_migration_blocker = NULL;
29d81b71
AH
460}
461
236e0a45
AH
462static void vfio_set_migration_error(int err)
463{
464 MigrationState *ms = migrate_get_current();
465
466 if (migration_is_setup_or_active(ms->state)) {
467 WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
468 if (ms->to_dst_file) {
469 qemu_file_set_error(ms->to_dst_file, err);
470 }
471 }
472 }
473}
474
758b96b6 475static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
b6dd6504
KW
476{
477 VFIOGroup *group;
478 VFIODevice *vbasedev;
479 MigrationState *ms = migrate_get_current();
480
481 if (!migration_is_setup_or_active(ms->state)) {
482 return false;
483 }
484
485 QLIST_FOREACH(group, &container->group_list, container_next) {
486 QLIST_FOREACH(vbasedev, &group->device_list, next) {
487 VFIOMigration *migration = vbasedev->migration;
488
489 if (!migration) {
490 return false;
491 }
492
7429aebe 493 if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
31bcbbb5
AH
494 migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
495 return false;
496 }
b6dd6504
KW
497 }
498 }
499 return true;
500}
501
5255bbf4
JM
502static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
503{
504 VFIOGroup *group;
505 VFIODevice *vbasedev;
506
507 QLIST_FOREACH(group, &container->group_list, container_next) {
508 QLIST_FOREACH(vbasedev, &group->device_list, next) {
509 if (!vbasedev->dirty_pages_supported) {
510 return false;
511 }
512 }
513 }
514
515 return true;
516}
517
8b942af3
AH
518/*
519 * Check if all VFIO devices are running and migration is active, which is
520 * essentially equivalent to the migration being in pre-copy phase.
521 */
522static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
9e7b0442
KW
523{
524 VFIOGroup *group;
525 VFIODevice *vbasedev;
9e7b0442 526
8b942af3 527 if (!migration_is_active(migrate_get_current())) {
9e7b0442
KW
528 return false;
529 }
530
531 QLIST_FOREACH(group, &container->group_list, container_next) {
532 QLIST_FOREACH(vbasedev, &group->device_list, next) {
533 VFIOMigration *migration = vbasedev->migration;
534
535 if (!migration) {
536 return false;
537 }
538
7429aebe 539 if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
9e7b0442
KW
540 continue;
541 } else {
542 return false;
543 }
544 }
545 }
546 return true;
547}
548
549static int vfio_dma_unmap_bitmap(VFIOContainer *container,
550 hwaddr iova, ram_addr_t size,
551 IOMMUTLBEntry *iotlb)
552{
553 struct vfio_iommu_type1_dma_unmap *unmap;
554 struct vfio_bitmap *bitmap;
725ccd7e 555 VFIOBitmap vbmap;
9e7b0442
KW
556 int ret;
557
725ccd7e
AH
558 ret = vfio_bitmap_alloc(&vbmap, size);
559 if (ret) {
560 return ret;
561 }
562
9e7b0442
KW
563 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
564
565 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
566 unmap->iova = iova;
567 unmap->size = size;
568 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
569 bitmap = (struct vfio_bitmap *)&unmap->data;
570
571 /*
1eb7f642
KJ
572 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
573 * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
574 * to qemu_real_host_page_size.
9e7b0442 575 */
8e3b0cbb 576 bitmap->pgsize = qemu_real_host_page_size();
725ccd7e
AH
577 bitmap->size = vbmap.size;
578 bitmap->data = (__u64 *)vbmap.bitmap;
9e7b0442 579
725ccd7e
AH
580 if (vbmap.size > container->max_dirty_bitmap_size) {
581 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
9e7b0442
KW
582 ret = -E2BIG;
583 goto unmap_exit;
584 }
585
9e7b0442
KW
586 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
587 if (!ret) {
725ccd7e
AH
588 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
589 iotlb->translated_addr, vbmap.pages);
9e7b0442
KW
590 } else {
591 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
592 }
593
9e7b0442
KW
594unmap_exit:
595 g_free(unmap);
725ccd7e
AH
596 g_free(vbmap.bitmap);
597
9e7b0442
KW
598 return ret;
599}
600
e2c7d025
EA
601/*
602 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
603 */
604static int vfio_dma_unmap(VFIOContainer *container,
9e7b0442
KW
605 hwaddr iova, ram_addr_t size,
606 IOMMUTLBEntry *iotlb)
e2c7d025
EA
607{
608 struct vfio_iommu_type1_dma_unmap unmap = {
609 .argsz = sizeof(unmap),
610 .flags = 0,
611 .iova = iova,
612 .size = size,
613 };
b153402a
JM
614 bool need_dirty_sync = false;
615 int ret;
616
617 if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
618 if (!vfio_devices_all_device_dirty_tracking(container) &&
619 container->dirty_pages_supported) {
620 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
621 }
e2c7d025 622
b153402a 623 need_dirty_sync = true;
9e7b0442
KW
624 }
625
567d7d3e
AW
626 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
627 /*
628 * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
629 * v4.15) where an overflow in its wrap-around check prevents us from
630 * unmapping the last page of the address space. Test for the error
631 * condition and re-try the unmap excluding the last page. The
632 * expectation is that we've never mapped the last page anyway and this
633 * unmap request comes via vIOMMU support which also makes it unlikely
634 * that this page is used. This bug was introduced well after type1 v2
635 * support was introduced, so we shouldn't need to test for v1. A fix
636 * is queued for kernel v5.0 so this workaround can be removed once
637 * affected kernels are sufficiently deprecated.
638 */
639 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
640 container->iommu_type == VFIO_TYPE1v2_IOMMU) {
641 trace_vfio_dma_unmap_overflow_workaround();
642 unmap.size -= 1ULL << ctz64(container->pgsizes);
643 continue;
644 }
b09d51c9 645 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
e2c7d025
EA
646 return -errno;
647 }
648
b153402a
JM
649 if (need_dirty_sync) {
650 ret = vfio_get_dirty_bitmap(container, iova, size,
651 iotlb->translated_addr);
652 if (ret) {
653 return ret;
654 }
b051a3f6
AH
655 }
656
e2c7d025
EA
657 return 0;
658}
659
660static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
661 ram_addr_t size, void *vaddr, bool readonly)
662{
663 struct vfio_iommu_type1_dma_map map = {
664 .argsz = sizeof(map),
665 .flags = VFIO_DMA_MAP_FLAG_READ,
666 .vaddr = (__u64)(uintptr_t)vaddr,
667 .iova = iova,
668 .size = size,
669 };
670
671 if (!readonly) {
672 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
673 }
674
675 /*
676 * Try the mapping, if it fails with EBUSY, unmap the region and try
677 * again. This shouldn't be necessary, but we sometimes see it in
b6af0975 678 * the VGA ROM space.
e2c7d025
EA
679 */
680 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
9e7b0442 681 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
e2c7d025
EA
682 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
683 return 0;
684 }
685
b09d51c9 686 error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
e2c7d025
EA
687 return -errno;
688}
689
f4ec5e26
AK
690static void vfio_host_win_add(VFIOContainer *container,
691 hwaddr min_iova, hwaddr max_iova,
692 uint64_t iova_pgsizes)
693{
694 VFIOHostDMAWindow *hostwin;
695
696 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
697 if (ranges_overlap(hostwin->min_iova,
698 hostwin->max_iova - hostwin->min_iova + 1,
699 min_iova,
700 max_iova - min_iova + 1)) {
701 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
702 }
703 }
704
705 hostwin = g_malloc0(sizeof(*hostwin));
706
707 hostwin->min_iova = min_iova;
708 hostwin->max_iova = max_iova;
709 hostwin->iova_pgsizes = iova_pgsizes;
710 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
711}
712
2e4109de
AK
713static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
714 hwaddr max_iova)
715{
716 VFIOHostDMAWindow *hostwin;
717
718 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
719 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
720 QLIST_REMOVE(hostwin, hostwin_next);
f3bc3a73 721 g_free(hostwin);
2e4109de
AK
722 return 0;
723 }
724 }
725
726 return -1;
727}
728
e2c7d025
EA
729static bool vfio_listener_skipped_section(MemoryRegionSection *section)
730{
731 return (!memory_region_is_ram(section->mr) &&
732 !memory_region_is_iommu(section->mr)) ||
56918a12 733 memory_region_is_protected(section->mr) ||
e2c7d025
EA
734 /*
735 * Sizing an enabled 64-bit BAR can cause spurious mappings to
736 * addresses in the upper part of the 64-bit address space. These
737 * are never accessed by the CPU and beyond the address width of
738 * some IOMMU hardware. TODO: VFIO should tell us the IOMMU width.
739 */
740 section->offset_within_address_space & (1ULL << 63);
741}
742
4a4b88fb 743/* Called with rcu_read_lock held. */
9a04fe09
KW
744static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
745 ram_addr_t *ram_addr, bool *read_only)
e2c7d025 746{
baa44bce 747 bool ret, mr_has_discard_manager;
0fd7616e 748
baa44bce
CL
749 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
750 &mr_has_discard_manager);
751 if (ret && mr_has_discard_manager) {
0fd7616e
DH
752 /*
753 * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
754 * pages will remain pinned inside vfio until unmapped, resulting in a
755 * higher memory consumption than expected. If memory would get
756 * populated again later, there would be an inconsistency between pages
757 * pinned by vfio and pages seen by QEMU. This is the case until
758 * unmapped from the IOMMU (e.g., during device reset).
759 *
760 * With malicious guests, we really only care about pinning more memory
761 * than expected. RLIMIT_MEMLOCK set for the user/process can never be
762 * exceeded and can be used to mitigate this problem.
763 */
764 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
765 " RAM (e.g., virtio-mem) works, however, malicious"
766 " guests can trigger pinning of more memory than"
767 " intended via an IOMMU. It's possible to mitigate "
768 " by setting/adjusting RLIMIT_MEMLOCK.");
e2c7d025 769 }
baa44bce 770 return ret;
4a4b88fb
PX
771}
772
773static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
774{
775 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
776 VFIOContainer *container = giommu->container;
777 hwaddr iova = iotlb->iova + giommu->iommu_offset;
4a4b88fb
PX
778 void *vaddr;
779 int ret;
780
781 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
782 iova, iova + iotlb->addr_mask);
783
784 if (iotlb->target_as != &address_space_memory) {
785 error_report("Wrong target AS \"%s\", only system memory is allowed",
786 iotlb->target_as->name ? iotlb->target_as->name : "none");
236e0a45 787 vfio_set_migration_error(-EINVAL);
4a4b88fb
PX
788 return;
789 }
790
791 rcu_read_lock();
792
e2c7d025 793 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
9a04fe09
KW
794 bool read_only;
795
796 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
dfbd90e5
PX
797 goto out;
798 }
4a4b88fb
PX
799 /*
800 * vaddr is only valid until rcu_read_unlock(). But after
801 * vfio_dma_map has set up the mapping the pages will be
802 * pinned by the kernel. This makes sure that the RAM backend
803 * of vaddr will always be there, even if the memory object is
804 * destroyed and its backing memory munmap-ed.
805 */
d78c19b5 806 ret = vfio_dma_map(container, iova,
e2c7d025 807 iotlb->addr_mask + 1, vaddr,
4a4b88fb 808 read_only);
e2c7d025
EA
809 if (ret) {
810 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
db9b829b 811 "0x%"HWADDR_PRIx", %p) = %d (%s)",
d78c19b5 812 container, iova,
db9b829b 813 iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
e2c7d025
EA
814 }
815 } else {
9e7b0442 816 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
e2c7d025
EA
817 if (ret) {
818 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
db9b829b 819 "0x%"HWADDR_PRIx") = %d (%s)",
d78c19b5 820 container, iova,
db9b829b 821 iotlb->addr_mask + 1, ret, strerror(-ret));
236e0a45 822 vfio_set_migration_error(ret);
e2c7d025
EA
823 }
824 }
41063e1e
PB
825out:
826 rcu_read_unlock();
e2c7d025
EA
827}
828
5e3b981c
DH
829static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
830 MemoryRegionSection *section)
831{
832 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
833 listener);
834 const hwaddr size = int128_get64(section->size);
835 const hwaddr iova = section->offset_within_address_space;
836 int ret;
837
838 /* Unmap with a single call. */
839 ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
840 if (ret) {
841 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
842 strerror(-ret));
843 }
844}
845
846static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
847 MemoryRegionSection *section)
848{
849 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
850 listener);
851 const hwaddr end = section->offset_within_region +
852 int128_get64(section->size);
853 hwaddr start, next, iova;
854 void *vaddr;
855 int ret;
856
857 /*
858 * Map in (aligned within memory region) minimum granularity, so we can
859 * unmap in minimum granularity later.
860 */
861 for (start = section->offset_within_region; start < end; start = next) {
862 next = ROUND_UP(start + 1, vrdl->granularity);
863 next = MIN(next, end);
864
865 iova = start - section->offset_within_region +
866 section->offset_within_address_space;
867 vaddr = memory_region_get_ram_ptr(section->mr) + start;
868
869 ret = vfio_dma_map(vrdl->container, iova, next - start,
870 vaddr, section->readonly);
871 if (ret) {
872 /* Rollback */
873 vfio_ram_discard_notify_discard(rdl, section);
874 return ret;
875 }
876 }
877 return 0;
878}
879
880static void vfio_register_ram_discard_listener(VFIOContainer *container,
881 MemoryRegionSection *section)
882{
883 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
884 VFIORamDiscardListener *vrdl;
885
886 /* Ignore some corner cases not relevant in practice. */
887 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
888 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
889 TARGET_PAGE_SIZE));
890 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
891
892 vrdl = g_new0(VFIORamDiscardListener, 1);
893 vrdl->container = container;
894 vrdl->mr = section->mr;
895 vrdl->offset_within_address_space = section->offset_within_address_space;
896 vrdl->size = int128_get64(section->size);
897 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
898 section->mr);
899
900 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
a5dba9bc
DH
901 g_assert(container->pgsizes &&
902 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
5e3b981c
DH
903
904 ram_discard_listener_init(&vrdl->listener,
905 vfio_ram_discard_notify_populate,
906 vfio_ram_discard_notify_discard, true);
907 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
908 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
a74317f6
DH
909
910 /*
911 * Sanity-check if we have a theoretically problematic setup where we could
912 * exceed the maximum number of possible DMA mappings over time. We assume
913 * that each mapped section in the same address space as a RamDiscardManager
914 * section consumes exactly one DMA mapping, with the exception of
915 * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
916 * in the same address space as RamDiscardManager sections.
917 *
918 * We assume that each section in the address space consumes one memslot.
919 * We take the number of KVM memory slots as a best guess for the maximum
920 * number of sections in the address space we could have over time,
921 * also consuming DMA mappings.
922 */
923 if (container->dma_max_mappings) {
924 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
925
926#ifdef CONFIG_KVM
927 if (kvm_enabled()) {
928 max_memslots = kvm_get_max_memslots();
929 }
930#endif
931
932 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
933 hwaddr start, end;
934
935 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
936 vrdl->granularity);
937 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
938 vrdl->granularity);
939 vrdl_mappings += (end - start) / vrdl->granularity;
940 vrdl_count++;
941 }
942
943 if (vrdl_mappings + max_memslots - vrdl_count >
944 container->dma_max_mappings) {
945 warn_report("%s: possibly running out of DMA mappings. E.g., try"
946 " increasing the 'block-size' of virtio-mem devies."
947 " Maximum possible DMA mappings: %d, Maximum possible"
948 " memslots: %d", __func__, container->dma_max_mappings,
949 max_memslots);
950 }
951 }
5e3b981c
DH
952}
953
954static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
955 MemoryRegionSection *section)
956{
957 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
958 VFIORamDiscardListener *vrdl = NULL;
959
960 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
961 if (vrdl->mr == section->mr &&
962 vrdl->offset_within_address_space ==
963 section->offset_within_address_space) {
964 break;
965 }
966 }
967
968 if (!vrdl) {
969 hw_error("vfio: Trying to unregister missing RAM discard listener");
970 }
971
972 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
973 QLIST_REMOVE(vrdl, next);
974 g_free(vrdl);
975}
976
fbc6c921
JM
977static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
978 hwaddr iova, hwaddr end)
979{
980 VFIOHostDMAWindow *hostwin;
981 bool hostwin_found = false;
982
983 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
984 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
985 hostwin_found = true;
986 break;
987 }
988 }
989
990 return hostwin_found ? hostwin : NULL;
991}
992
851d6d1a
EA
993static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
994{
995 MemoryRegion *mr = section->mr;
996
997 if (!TPM_IS_CRB(mr->owner)) {
998 return false;
999 }
1000
1001 /* this is a known safe misaligned region, just trace for debug purpose */
1002 trace_vfio_known_safe_misalignment(memory_region_name(mr),
1003 section->offset_within_address_space,
1004 section->offset_within_region,
1005 qemu_real_host_page_size());
1006 return true;
1007}
1008
b92f2376
JM
1009static bool vfio_listener_valid_section(MemoryRegionSection *section,
1010 const char *name)
e2c7d025 1011{
e2c7d025 1012 if (vfio_listener_skipped_section(section)) {
b92f2376 1013 trace_vfio_listener_region_skip(name,
e2c7d025
EA
1014 section->offset_within_address_space,
1015 section->offset_within_address_space +
1016 int128_get64(int128_sub(section->size, int128_one())));
b92f2376 1017 return false;
e2c7d025
EA
1018 }
1019
1eb7f642 1020 if (unlikely((section->offset_within_address_space &
8e3b0cbb
MAL
1021 ~qemu_real_host_page_mask()) !=
1022 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
851d6d1a
EA
1023 if (!vfio_known_safe_misalignment(section)) {
1024 error_report("%s received unaligned region %s iova=0x%"PRIx64
1025 " offset_within_region=0x%"PRIx64
1026 " qemu_real_host_page_size=0x%"PRIxPTR,
1027 __func__, memory_region_name(section->mr),
1028 section->offset_within_address_space,
1029 section->offset_within_region,
1030 qemu_real_host_page_size());
1031 }
b92f2376
JM
1032 return false;
1033 }
1034
1035 return true;
1036}
1037
4ead8308
JM
1038static bool vfio_get_section_iova_range(VFIOContainer *container,
1039 MemoryRegionSection *section,
1040 hwaddr *out_iova, hwaddr *out_end,
1041 Int128 *out_llend)
1042{
1043 Int128 llend;
1044 hwaddr iova;
1045
1046 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
1047 llend = int128_make64(section->offset_within_address_space);
1048 llend = int128_add(llend, section->size);
1049 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
1050
1051 if (int128_ge(int128_make64(iova), llend)) {
1052 return false;
1053 }
1054
1055 *out_iova = iova;
1056 *out_end = int128_get64(int128_sub(llend, int128_one()));
1057 if (out_llend) {
1058 *out_llend = llend;
1059 }
1060 return true;
1061}
1062
b92f2376
JM
1063static void vfio_listener_region_add(MemoryListener *listener,
1064 MemoryRegionSection *section)
1065{
1066 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1067 hwaddr iova, end;
1068 Int128 llend, llsize;
1069 void *vaddr;
1070 int ret;
1071 VFIOHostDMAWindow *hostwin;
1072 Error *err = NULL;
1073
1074 if (!vfio_listener_valid_section(section, "region_add")) {
e2c7d025
EA
1075 return;
1076 }
1077
4ead8308 1078 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
e4b34708
KJ
1079 if (memory_region_is_ram_device(section->mr)) {
1080 trace_vfio_listener_region_add_no_dma_map(
1081 memory_region_name(section->mr),
1082 section->offset_within_address_space,
1083 int128_getlo(section->size),
8e3b0cbb 1084 qemu_real_host_page_size());
e4b34708 1085 }
e2c7d025
EA
1086 return;
1087 }
3898aad3 1088
2e4109de 1089 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
2e4109de
AK
1090 hwaddr pgsize = 0;
1091
1092 /* For now intersections are not allowed, we may relax this later */
1093 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1094 if (ranges_overlap(hostwin->min_iova,
1095 hostwin->max_iova - hostwin->min_iova + 1,
1096 section->offset_within_address_space,
1097 int128_get64(section->size))) {
d7d87836
EA
1098 error_setg(&err,
1099 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
1100 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
1101 section->offset_within_address_space,
1102 section->offset_within_address_space +
1103 int128_get64(section->size) - 1,
1104 hostwin->min_iova, hostwin->max_iova);
2e4109de
AK
1105 goto fail;
1106 }
1107 }
1108
1109 ret = vfio_spapr_create_window(container, section, &pgsize);
1110 if (ret) {
d7d87836 1111 error_setg_errno(&err, -ret, "Failed to create SPAPR window");
2e4109de
AK
1112 goto fail;
1113 }
1114
1115 vfio_host_win_add(container, section->offset_within_address_space,
1116 section->offset_within_address_space +
1117 int128_get64(section->size) - 1, pgsize);
07bc681a
AK
1118#ifdef CONFIG_KVM
1119 if (kvm_enabled()) {
1120 VFIOGroup *group;
1121 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
1122 struct kvm_vfio_spapr_tce param;
1123 struct kvm_device_attr attr = {
1124 .group = KVM_DEV_VFIO_GROUP,
1125 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
1126 .addr = (uint64_t)(unsigned long)&param,
1127 };
1128
1129 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
1130 &param.tablefd)) {
1131 QLIST_FOREACH(group, &container->group_list, container_next) {
1132 param.groupfd = group->fd;
1133 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1134 error_report("vfio: failed to setup fd %d "
1135 "for a group with fd %d: %s",
1136 param.tablefd, param.groupfd,
1137 strerror(errno));
1138 return;
1139 }
1140 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
1141 }
1142 }
1143 }
1144#endif
2e4109de
AK
1145 }
1146
fbc6c921
JM
1147 hostwin = vfio_find_hostwin(container, iova, end);
1148 if (!hostwin) {
d7d87836
EA
1149 error_setg(&err, "Container %p can't map guest IOVA region"
1150 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
3898aad3
DG
1151 goto fail;
1152 }
e2c7d025
EA
1153
1154 memory_region_ref(section->mr);
1155
1156 if (memory_region_is_iommu(section->mr)) {
1157 VFIOGuestIOMMU *giommu;
3df9d748 1158 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
cb1efcf4 1159 int iommu_idx;
e2c7d025 1160
55efcc53 1161 trace_vfio_listener_region_add_iommu(iova, end);
e2c7d025 1162 /*
e2c7d025
EA
1163 * FIXME: For VFIO iommu types which have KVM acceleration to
1164 * avoid bouncing all map/unmaps through qemu this way, this
1165 * would be the right place to wire that up (tell the KVM
1166 * device emulation the VFIO iommu handles to use).
1167 */
e2c7d025 1168 giommu = g_malloc0(sizeof(*giommu));
44ee6aaa 1169 giommu->iommu_mr = iommu_mr;
d78c19b5
AK
1170 giommu->iommu_offset = section->offset_within_address_space -
1171 section->offset_within_region;
e2c7d025 1172 giommu->container = container;
698feb5e
PX
1173 llend = int128_add(int128_make64(section->offset_within_region),
1174 section->size);
1175 llend = int128_sub(llend, int128_one());
cb1efcf4
PM
1176 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
1177 MEMTXATTRS_UNSPECIFIED);
698feb5e 1178 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
8dca037b 1179 IOMMU_NOTIFIER_IOTLB_EVENTS,
698feb5e 1180 section->offset_within_region,
cb1efcf4
PM
1181 int128_get64(llend),
1182 iommu_idx);
508ce5eb 1183
44ee6aaa 1184 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
b9177498
BB
1185 container->pgsizes,
1186 &err);
1187 if (ret) {
1188 g_free(giommu);
1189 goto fail;
1190 }
1191
549d4005
EA
1192 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
1193 &err);
1194 if (ret) {
1195 g_free(giommu);
1196 goto fail;
1197 }
1198 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
44ee6aaa 1199 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
e2c7d025
EA
1200
1201 return;
1202 }
1203
1204 /* Here we assume that memory_region_is_ram(section->mr)==true */
1205
5e3b981c
DH
1206 /*
1207 * For RAM memory regions with a RamDiscardManager, we only want to map the
1208 * actually populated parts - and update the mapping whenever we're notified
1209 * about changes.
1210 */
1211 if (memory_region_has_ram_discard_manager(section->mr)) {
1212 vfio_register_ram_discard_listener(container, section);
1213 return;
1214 }
1215
e2c7d025
EA
1216 vaddr = memory_region_get_ram_ptr(section->mr) +
1217 section->offset_within_region +
1218 (iova - section->offset_within_address_space);
1219
55efcc53 1220 trace_vfio_listener_region_add_ram(iova, end, vaddr);
e2c7d025 1221
55efcc53
BD
1222 llsize = int128_sub(llend, int128_make64(iova));
1223
567b5b30
AK
1224 if (memory_region_is_ram_device(section->mr)) {
1225 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1226
1227 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
5c086005
EA
1228 trace_vfio_listener_region_add_no_dma_map(
1229 memory_region_name(section->mr),
1230 section->offset_within_address_space,
1231 int128_getlo(section->size),
1232 pgmask + 1);
567b5b30
AK
1233 return;
1234 }
1235 }
1236
55efcc53
BD
1237 ret = vfio_dma_map(container, iova, int128_get64(llsize),
1238 vaddr, section->readonly);
e2c7d025 1239 if (ret) {
d7d87836 1240 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
db9b829b
AH
1241 "0x%"HWADDR_PRIx", %p) = %d (%s)",
1242 container, iova, int128_get64(llsize), vaddr, ret,
1243 strerror(-ret));
567b5b30
AK
1244 if (memory_region_is_ram_device(section->mr)) {
1245 /* Allow unexpected mappings not to be fatal for RAM devices */
d7d87836 1246 error_report_err(err);
567b5b30
AK
1247 return;
1248 }
ac6dc389
DG
1249 goto fail;
1250 }
e2c7d025 1251
ac6dc389
DG
1252 return;
1253
1254fail:
567b5b30
AK
1255 if (memory_region_is_ram_device(section->mr)) {
1256 error_report("failed to vfio_dma_map. pci p2p may not work");
1257 return;
1258 }
ac6dc389
DG
1259 /*
1260 * On the initfn path, store the first error in the container so we
1261 * can gracefully fail. Runtime, there's not much we can do other
1262 * than throw a hardware error.
1263 */
1264 if (!container->initialized) {
1265 if (!container->error) {
d7d87836
EA
1266 error_propagate_prepend(&container->error, err,
1267 "Region %s: ",
1268 memory_region_name(section->mr));
1269 } else {
1270 error_free(err);
e2c7d025 1271 }
ac6dc389 1272 } else {
d7d87836 1273 error_report_err(err);
ac6dc389 1274 hw_error("vfio: DMA mapping failed, unable to continue");
e2c7d025
EA
1275 }
1276}
1277
1278static void vfio_listener_region_del(MemoryListener *listener,
1279 MemoryRegionSection *section)
1280{
ee0bf0e5 1281 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
e2c7d025 1282 hwaddr iova, end;
7a057b4f 1283 Int128 llend, llsize;
e2c7d025 1284 int ret;
567b5b30 1285 bool try_unmap = true;
e2c7d025 1286
b92f2376 1287 if (!vfio_listener_valid_section(section, "region_del")) {
e2c7d025
EA
1288 return;
1289 }
1290
1291 if (memory_region_is_iommu(section->mr)) {
1292 VFIOGuestIOMMU *giommu;
1293
1294 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
44ee6aaa 1295 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
698feb5e 1296 giommu->n.start == section->offset_within_region) {
3df9d748 1297 memory_region_unregister_iommu_notifier(section->mr,
d22d8956 1298 &giommu->n);
e2c7d025
EA
1299 QLIST_REMOVE(giommu, giommu_next);
1300 g_free(giommu);
1301 break;
1302 }
1303 }
1304
1305 /*
1306 * FIXME: We assume the one big unmap below is adequate to
1307 * remove any individual page mappings in the IOMMU which
1308 * might have been copied into VFIO. This works for a page table
1309 * based IOMMU where a big unmap flattens a large range of IO-PTEs.
1310 * That may not be true for all IOMMU types.
1311 */
1312 }
1313
4ead8308 1314 if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
e2c7d025
EA
1315 return;
1316 }
7a057b4f
AK
1317
1318 llsize = int128_sub(llend, int128_make64(iova));
e2c7d025 1319
7a057b4f 1320 trace_vfio_listener_region_del(iova, end);
e2c7d025 1321
567b5b30
AK
1322 if (memory_region_is_ram_device(section->mr)) {
1323 hwaddr pgmask;
1324 VFIOHostDMAWindow *hostwin;
567b5b30 1325
fbc6c921
JM
1326 hostwin = vfio_find_hostwin(container, iova, end);
1327 assert(hostwin); /* or region_add() would have failed */
567b5b30
AK
1328
1329 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1330 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
5e3b981c
DH
1331 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1332 vfio_unregister_ram_discard_listener(container, section);
1333 /* Unregistering will trigger an unmap. */
1334 try_unmap = false;
e2c7d025 1335 }
2e4109de 1336
567b5b30 1337 if (try_unmap) {
1b296c3d
JPB
1338 if (int128_eq(llsize, int128_2_64())) {
1339 /* The unmap ioctl doesn't accept a full 64-bit span. */
1340 llsize = int128_rshift(llsize, 1);
1341 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1342 if (ret) {
1343 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
db9b829b
AH
1344 "0x%"HWADDR_PRIx") = %d (%s)",
1345 container, iova, int128_get64(llsize), ret,
1346 strerror(-ret));
1b296c3d
JPB
1347 }
1348 iova += int128_get64(llsize);
1349 }
9e7b0442 1350 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
567b5b30
AK
1351 if (ret) {
1352 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
db9b829b
AH
1353 "0x%"HWADDR_PRIx") = %d (%s)",
1354 container, iova, int128_get64(llsize), ret,
1355 strerror(-ret));
567b5b30
AK
1356 }
1357 }
1358
1359 memory_region_unref(section->mr);
1360
2e4109de
AK
1361 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1362 vfio_spapr_remove_window(container,
1363 section->offset_within_address_space);
1364 if (vfio_host_win_del(container,
1365 section->offset_within_address_space,
1366 section->offset_within_address_space +
1367 int128_get64(section->size) - 1) < 0) {
1368 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1369 __func__, section->offset_within_address_space);
1370 }
1371 }
e2c7d025
EA
1372}
1373
236e0a45 1374static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
758b96b6
KZ
1375{
1376 int ret;
1377 struct vfio_iommu_type1_dirty_bitmap dirty = {
1378 .argsz = sizeof(dirty),
1379 };
1380
b051a3f6 1381 if (!container->dirty_pages_supported) {
236e0a45 1382 return 0;
b051a3f6
AH
1383 }
1384
758b96b6
KZ
1385 if (start) {
1386 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1387 } else {
1388 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1389 }
1390
1391 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1392 if (ret) {
236e0a45 1393 ret = -errno;
758b96b6
KZ
1394 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1395 dirty.flags, errno);
1396 }
236e0a45
AH
1397
1398 return ret;
758b96b6
KZ
1399}
1400
62c1b002
JM
1401typedef struct VFIODirtyRanges {
1402 hwaddr min32;
1403 hwaddr max32;
1404 hwaddr min64;
1405 hwaddr max64;
1406} VFIODirtyRanges;
1407
1408typedef struct VFIODirtyRangesListener {
1409 VFIOContainer *container;
1410 VFIODirtyRanges ranges;
1411 MemoryListener listener;
1412} VFIODirtyRangesListener;
1413
1414static void vfio_dirty_tracking_update(MemoryListener *listener,
1415 MemoryRegionSection *section)
1416{
1417 VFIODirtyRangesListener *dirty = container_of(listener,
1418 VFIODirtyRangesListener,
1419 listener);
1420 VFIODirtyRanges *range = &dirty->ranges;
1421 hwaddr iova, end, *min, *max;
1422
1423 if (!vfio_listener_valid_section(section, "tracking_update") ||
1424 !vfio_get_section_iova_range(dirty->container, section,
1425 &iova, &end, NULL)) {
1426 return;
1427 }
1428
1429 /*
1430 * The address space passed to the dirty tracker is reduced to two ranges:
1431 * one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
1432 * The underlying reports of dirty will query a sub-interval of each of
1433 * these ranges.
1434 *
1435 * The purpose of the dual range handling is to handle known cases of big
1436 * holes in the address space, like the x86 AMD 1T hole. The alternative
1437 * would be an IOVATree but that has a much bigger runtime overhead and
1438 * unnecessary complexity.
1439 */
1440 min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
1441 max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
1442
1443 if (*min > iova) {
1444 *min = iova;
1445 }
1446 if (*max < end) {
1447 *max = end;
1448 }
1449
1450 trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
1451 return;
1452}
1453
1454static const MemoryListener vfio_dirty_tracking_listener = {
1455 .name = "vfio-tracking",
1456 .region_add = vfio_dirty_tracking_update,
1457};
1458
1459static void vfio_dirty_tracking_init(VFIOContainer *container,
1460 VFIODirtyRanges *ranges)
1461{
1462 VFIODirtyRangesListener dirty;
1463
1464 memset(&dirty, 0, sizeof(dirty));
1465 dirty.ranges.min32 = UINT32_MAX;
1466 dirty.ranges.min64 = UINT64_MAX;
1467 dirty.listener = vfio_dirty_tracking_listener;
1468 dirty.container = container;
1469
1470 memory_listener_register(&dirty.listener,
1471 container->space->as);
1472
1473 *ranges = dirty.ranges;
1474
1475 /*
1476 * The memory listener is synchronous, and used to calculate the range
1477 * to dirty tracking. Unregister it after we are done as we are not
1478 * interested in any follow-up updates.
1479 */
1480 memory_listener_unregister(&dirty.listener);
1481}
1482
5255bbf4
JM
1483static void vfio_devices_dma_logging_stop(VFIOContainer *container)
1484{
1485 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
1486 sizeof(uint64_t))] = {};
1487 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
1488 VFIODevice *vbasedev;
1489 VFIOGroup *group;
1490
1491 feature->argsz = sizeof(buf);
1492 feature->flags = VFIO_DEVICE_FEATURE_SET |
1493 VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
1494
1495 QLIST_FOREACH(group, &container->group_list, container_next) {
1496 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1497 if (!vbasedev->dirty_tracking) {
1498 continue;
1499 }
1500
1501 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
1502 warn_report("%s: Failed to stop DMA logging, err %d (%s)",
1503 vbasedev->name, -errno, strerror(errno));
1504 }
1505 vbasedev->dirty_tracking = false;
1506 }
1507 }
1508}
1509
1510static struct vfio_device_feature *
1511vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
1512 VFIODirtyRanges *tracking)
1513{
1514 struct vfio_device_feature *feature;
1515 size_t feature_size;
1516 struct vfio_device_feature_dma_logging_control *control;
1517 struct vfio_device_feature_dma_logging_range *ranges;
1518
1519 feature_size = sizeof(struct vfio_device_feature) +
1520 sizeof(struct vfio_device_feature_dma_logging_control);
1521 feature = g_try_malloc0(feature_size);
1522 if (!feature) {
1523 errno = ENOMEM;
1524 return NULL;
1525 }
1526 feature->argsz = feature_size;
1527 feature->flags = VFIO_DEVICE_FEATURE_SET |
1528 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
1529
1530 control = (struct vfio_device_feature_dma_logging_control *)feature->data;
1531 control->page_size = qemu_real_host_page_size();
1532
1533 /*
1534 * DMA logging uAPI guarantees to support at least a number of ranges that
1535 * fits into a single host kernel base page.
1536 */
1537 control->num_ranges = !!tracking->max32 + !!tracking->max64;
1538 ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
1539 control->num_ranges);
1540 if (!ranges) {
1541 g_free(feature);
1542 errno = ENOMEM;
1543
1544 return NULL;
1545 }
1546
1547 control->ranges = (__u64)(uintptr_t)ranges;
1548 if (tracking->max32) {
1549 ranges->iova = tracking->min32;
1550 ranges->length = (tracking->max32 - tracking->min32) + 1;
1551 ranges++;
1552 }
1553 if (tracking->max64) {
1554 ranges->iova = tracking->min64;
1555 ranges->length = (tracking->max64 - tracking->min64) + 1;
1556 }
1557
1558 trace_vfio_device_dirty_tracking_start(control->num_ranges,
1559 tracking->min32, tracking->max32,
1560 tracking->min64, tracking->max64);
1561
1562 return feature;
1563}
1564
1565static void vfio_device_feature_dma_logging_start_destroy(
1566 struct vfio_device_feature *feature)
1567{
1568 struct vfio_device_feature_dma_logging_control *control =
1569 (struct vfio_device_feature_dma_logging_control *)feature->data;
1570 struct vfio_device_feature_dma_logging_range *ranges =
1571 (struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
1572
1573 g_free(ranges);
1574 g_free(feature);
1575}
1576
1577static int vfio_devices_dma_logging_start(VFIOContainer *container)
1578{
1579 struct vfio_device_feature *feature;
1580 VFIODirtyRanges ranges;
1581 VFIODevice *vbasedev;
1582 VFIOGroup *group;
1583 int ret = 0;
1584
1585 vfio_dirty_tracking_init(container, &ranges);
1586 feature = vfio_device_feature_dma_logging_start_create(container,
1587 &ranges);
1588 if (!feature) {
1589 return -errno;
1590 }
1591
1592 QLIST_FOREACH(group, &container->group_list, container_next) {
1593 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1594 if (vbasedev->dirty_tracking) {
1595 continue;
1596 }
1597
1598 ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
1599 if (ret) {
1600 ret = -errno;
1601 error_report("%s: Failed to start DMA logging, err %d (%s)",
1602 vbasedev->name, ret, strerror(errno));
1603 goto out;
1604 }
1605 vbasedev->dirty_tracking = true;
1606 }
1607 }
1608
1609out:
1610 if (ret) {
1611 vfio_devices_dma_logging_stop(container);
1612 }
1613
1614 vfio_device_feature_dma_logging_start_destroy(feature);
1615
1616 return ret;
1617}
1618
758b96b6
KZ
1619static void vfio_listener_log_global_start(MemoryListener *listener)
1620{
1621 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
236e0a45 1622 int ret;
758b96b6 1623
5255bbf4
JM
1624 if (vfio_devices_all_device_dirty_tracking(container)) {
1625 ret = vfio_devices_dma_logging_start(container);
1626 } else {
1627 ret = vfio_set_dirty_page_tracking(container, true);
1628 }
62c1b002 1629
236e0a45 1630 if (ret) {
5255bbf4
JM
1631 error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
1632 ret, strerror(-ret));
236e0a45
AH
1633 vfio_set_migration_error(ret);
1634 }
758b96b6
KZ
1635}
1636
1637static void vfio_listener_log_global_stop(MemoryListener *listener)
1638{
1639 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
5255bbf4
JM
1640 int ret = 0;
1641
1642 if (vfio_devices_all_device_dirty_tracking(container)) {
1643 vfio_devices_dma_logging_stop(container);
1644 } else {
1645 ret = vfio_set_dirty_page_tracking(container, false);
1646 }
758b96b6 1647
236e0a45 1648 if (ret) {
5255bbf4
JM
1649 error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
1650 ret, strerror(-ret));
236e0a45
AH
1651 vfio_set_migration_error(ret);
1652 }
758b96b6
KZ
1653}
1654
b153402a
JM
1655static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
1656 hwaddr size, void *bitmap)
1657{
1658 uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
1659 sizeof(struct vfio_device_feature_dma_logging_report),
1660 sizeof(__u64))] = {};
1661 struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
1662 struct vfio_device_feature_dma_logging_report *report =
1663 (struct vfio_device_feature_dma_logging_report *)feature->data;
1664
1665 report->iova = iova;
1666 report->length = size;
1667 report->page_size = qemu_real_host_page_size();
1668 report->bitmap = (__u64)(uintptr_t)bitmap;
1669
1670 feature->argsz = sizeof(buf);
1671 feature->flags = VFIO_DEVICE_FEATURE_GET |
1672 VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
1673
1674 if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
1675 return -errno;
1676 }
1677
1678 return 0;
1679}
1680
1681static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
1682 VFIOBitmap *vbmap, hwaddr iova,
1683 hwaddr size)
1684{
1685 VFIODevice *vbasedev;
1686 VFIOGroup *group;
1687 int ret;
1688
1689 QLIST_FOREACH(group, &container->group_list, container_next) {
1690 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1691 ret = vfio_device_dma_logging_report(vbasedev, iova, size,
1692 vbmap->bitmap);
1693 if (ret) {
1694 error_report("%s: Failed to get DMA logging report, iova: "
1695 "0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
1696 ", err: %d (%s)",
1697 vbasedev->name, iova, size, ret, strerror(-ret));
1698
1699 return ret;
1700 }
1701 }
1702 }
1703
1704 return 0;
1705}
1706
6607109f
AH
1707static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
1708 hwaddr iova, hwaddr size)
b6dd6504
KW
1709{
1710 struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1711 struct vfio_iommu_type1_dirty_bitmap_get *range;
b6dd6504
KW
1712 int ret;
1713
1714 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1715
1716 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1717 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1718 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1719 range->iova = iova;
1720 range->size = size;
1721
1722 /*
1eb7f642
KJ
1723 * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
1724 * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
1725 * to qemu_real_host_page_size.
b6dd6504 1726 */
8e3b0cbb 1727 range->bitmap.pgsize = qemu_real_host_page_size();
6607109f
AH
1728 range->bitmap.size = vbmap->size;
1729 range->bitmap.data = (__u64 *)vbmap->bitmap;
b6dd6504
KW
1730
1731 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1732 if (ret) {
3e2413a6 1733 ret = -errno;
b6dd6504
KW
1734 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1735 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1736 (uint64_t)range->size, errno);
6607109f
AH
1737 }
1738
1739 g_free(dbitmap);
1740
1741 return ret;
1742}
1743
1744static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1745 uint64_t size, ram_addr_t ram_addr)
1746{
b153402a
JM
1747 bool all_device_dirty_tracking =
1748 vfio_devices_all_device_dirty_tracking(container);
6607109f
AH
1749 VFIOBitmap vbmap;
1750 int ret;
1751
b153402a 1752 if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
6607109f
AH
1753 cpu_physical_memory_set_dirty_range(ram_addr, size,
1754 tcg_enabled() ? DIRTY_CLIENTS_ALL :
1755 DIRTY_CLIENTS_NOCODE);
1756 return 0;
1757 }
1758
1759 ret = vfio_bitmap_alloc(&vbmap, size);
1760 if (ret) {
1761 return ret;
1762 }
1763
b153402a
JM
1764 if (all_device_dirty_tracking) {
1765 ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
1766 } else {
1767 ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
1768 }
1769
6607109f
AH
1770 if (ret) {
1771 goto out;
b6dd6504
KW
1772 }
1773
725ccd7e
AH
1774 cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
1775 vbmap.pages);
b6dd6504 1776
6607109f
AH
1777 trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
1778 ram_addr);
1779out:
725ccd7e 1780 g_free(vbmap.bitmap);
b6dd6504
KW
1781
1782 return ret;
1783}
1784
9a04fe09
KW
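/*
 * For sections backed by a guest IOMMU the RAM address behind a given IOVA
 * is not known up front.  vfio_sync_dirty_bitmap() below therefore replays
 * the current IOMMU mappings through this notifier, and each mapping is
 * translated (vfio_get_xlat_addr) and queried for dirty pages individually.
 */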
1785typedef struct {
1786 IOMMUNotifier n;
1787 VFIOGuestIOMMU *giommu;
1788} vfio_giommu_dirty_notifier;
1789
1790static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1791{
1792 vfio_giommu_dirty_notifier *gdn = container_of(n,
1793 vfio_giommu_dirty_notifier, n);
1794 VFIOGuestIOMMU *giommu = gdn->giommu;
1795 VFIOContainer *container = giommu->container;
1796 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1797 ram_addr_t translated_addr;
236e0a45 1798 int ret = -EINVAL;
9a04fe09
KW
1799
1800 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1801
1802 if (iotlb->target_as != &address_space_memory) {
1803 error_report("Wrong target AS \"%s\", only system memory is allowed",
1804 iotlb->target_as->name ? iotlb->target_as->name : "none");
236e0a45 1805 goto out;
9a04fe09
KW
1806 }
1807
1808 rcu_read_lock();
1809 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
9a04fe09
KW
1810 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1811 translated_addr);
1812 if (ret) {
1813 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
db9b829b
AH
1814 "0x%"HWADDR_PRIx") = %d (%s)",
1815 container, iova, iotlb->addr_mask + 1, ret,
1816 strerror(-ret));
9a04fe09
KW
1817 }
1818 }
1819 rcu_read_unlock();
236e0a45
AH
1820
1821out:
1822 if (ret) {
1823 vfio_set_migration_error(ret);
1824 }
9a04fe09
KW
1825}
1826
5e3b981c
DH
1827static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1828 void *opaque)
1829{
1830 const hwaddr size = int128_get64(section->size);
1831 const hwaddr iova = section->offset_within_address_space;
1832 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1833 section->offset_within_region;
1834 VFIORamDiscardListener *vrdl = opaque;
1835
1836 /*
1837 * Sync the whole mapped region (spanning multiple individual mappings)
1838 * in one go.
1839 */
1840 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1841}
1842
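/*
 * Sections managed by a RamDiscardManager are matched to their
 * VFIORamDiscardListener by memory region and offset within the address
 * space; only the populated (i.e. actually mapped) parts are then replayed
 * for dirty bitmap collection via the callback above.
 */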
1843static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1844 MemoryRegionSection *section)
1845{
1846 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1847 VFIORamDiscardListener *vrdl = NULL;
1848
1849 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1850 if (vrdl->mr == section->mr &&
1851 vrdl->offset_within_address_space ==
1852 section->offset_within_address_space) {
1853 break;
1854 }
1855 }
1856
1857 if (!vrdl) {
1858 hw_error("vfio: Trying to sync missing RAM discard listener");
1859 }
1860
1861 /*
1862 * We can only synchronize the bitmap for parts that are actually mapped,
1863 * which correspond to the populated parts. Replay all populated parts.
1864 */
1865 return ram_discard_manager_replay_populated(rdm, section,
1866 vfio_ram_discard_get_dirty_bitmap,
1867 &vrdl);
1868}
1869
b6dd6504
KW
1870static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1871 MemoryRegionSection *section)
1872{
1873 ram_addr_t ram_addr;
1874
9a04fe09
KW
1875 if (memory_region_is_iommu(section->mr)) {
1876 VFIOGuestIOMMU *giommu;
1877
1878 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
44ee6aaa 1879 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
9a04fe09
KW
1880 giommu->n.start == section->offset_within_region) {
1881 Int128 llend;
1882 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
44ee6aaa 1883 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
9a04fe09
KW
1884 MEMTXATTRS_UNSPECIFIED);
1885
1886 llend = int128_add(int128_make64(section->offset_within_region),
1887 section->size);
1888 llend = int128_sub(llend, int128_one());
1889
1890 iommu_notifier_init(&gdn.n,
1891 vfio_iommu_map_dirty_notify,
1892 IOMMU_NOTIFIER_MAP,
1893 section->offset_within_region,
1894 int128_get64(llend),
1895 idx);
44ee6aaa 1896 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
9a04fe09
KW
1897 break;
1898 }
1899 }
1900 return 0;
5e3b981c
DH
1901 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1902 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
9a04fe09
KW
1903 }
1904
b6dd6504
KW
1905 ram_addr = memory_region_get_ram_addr(section->mr) +
1906 section->offset_within_region;
1907
1908 return vfio_get_dirty_bitmap(container,
1eb7f642
KJ
1909 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1910 int128_get64(section->size), ram_addr);
b6dd6504
KW
1911}
1912
4292d501 1913static void vfio_listener_log_sync(MemoryListener *listener,
b6dd6504
KW
1914 MemoryRegionSection *section)
1915{
1916 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
236e0a45 1917 int ret;
b6dd6504 1918
b051a3f6 1919 if (vfio_listener_skipped_section(section)) {
b6dd6504
KW
1920 return;
1921 }
1922
758b96b6 1923 if (vfio_devices_all_dirty_tracking(container)) {
236e0a45
AH
1924 ret = vfio_sync_dirty_bitmap(container, section);
1925 if (ret) {
1926 error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
1927 strerror(-ret));
1928 vfio_set_migration_error(ret);
1929 }
b6dd6504
KW
1930 }
1931}
1932
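/*
 * The MemoryListener glue: region_add/region_del keep the container's DMA
 * mappings in sync with the guest memory map, while the log_global_* and
 * log_sync callbacks drive dirty page tracking for migration.  Each
 * container registers a copy of this listener on its address space, as
 * done later in vfio_connect_container():
 *
 *     container->listener = vfio_memory_listener;
 *     memory_listener_register(&container->listener, container->space->as);
 */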
51b833f4 1933static const MemoryListener vfio_memory_listener = {
142518bd 1934 .name = "vfio",
e2c7d025
EA
1935 .region_add = vfio_listener_region_add,
1936 .region_del = vfio_listener_region_del,
758b96b6
KZ
1937 .log_global_start = vfio_listener_log_global_start,
1938 .log_global_stop = vfio_listener_log_global_stop,
4292d501 1939 .log_sync = vfio_listener_log_sync,
e2c7d025
EA
1940};
1941
51b833f4 1942static void vfio_listener_release(VFIOContainer *container)
e2c7d025 1943{
ee0bf0e5 1944 memory_listener_unregister(&container->listener);
318f67ce
AK
1945 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1946 memory_listener_unregister(&container->prereg_listener);
1947 }
e2c7d025
EA
1948}
1949
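/*
 * VFIO info structures can carry a chain of capabilities after the fixed
 * header.  Each vfio_info_cap_header holds an id and the offset of the
 * next capability relative to the start of the info structure; a next
 * offset of zero terminates the chain (which is why the loop below stops
 * when hdr wraps back to ptr).
 */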
3ab7a0b4
MR
1950static struct vfio_info_cap_header *
1951vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
b53b0f69
AW
1952{
1953 struct vfio_info_cap_header *hdr;
b53b0f69 1954
3ab7a0b4 1955 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
b53b0f69
AW
1956 if (hdr->id == id) {
1957 return hdr;
1958 }
1959 }
1960
1961 return NULL;
1962}
1963
3ab7a0b4
MR
1964struct vfio_info_cap_header *
1965vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1966{
1967 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1968 return NULL;
1969 }
1970
1971 return vfio_get_cap((void *)info, info->cap_offset, id);
1972}
1973
7486a628
MR
1974static struct vfio_info_cap_header *
1975vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1976{
1977 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1978 return NULL;
1979 }
1980
1981 return vfio_get_cap((void *)info, info->cap_offset, id);
1982}
1983
92fe289a
MR
1984struct vfio_info_cap_header *
1985vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1986{
1987 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1988 return NULL;
1989 }
1990
1991 return vfio_get_cap((void *)info, info->cap_offset, id);
1992}
1993
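/*
 * Extract the number of still-available DMA mappings from the optional
 * VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability.  Returns false when the
 * kernel does not report the capability; vfio_connect_container() below
 * then falls back to a default of 65535 mappings.
 */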
7486a628
MR
1994bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1995 unsigned int *avail)
1996{
1997 struct vfio_info_cap_header *hdr;
1998 struct vfio_iommu_type1_info_dma_avail *cap;
1999
2000 /* If the capability cannot be found, assume no DMA limiting */
2001 hdr = vfio_get_iommu_type1_info_cap(info,
2002 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
2003 if (hdr == NULL) {
2004 return false;
2005 }
2006
2007 if (avail != NULL) {
2008 cap = (void *) hdr;
2009 *avail = cap->avail;
2010 }
2011
2012 return true;
2013}
2014
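/*
 * If the region advertises VFIO_REGION_INFO_CAP_SPARSE_MMAP, build the
 * region->mmaps[] array from the capability's areas, skipping zero-sized
 * entries and shrinking the allocation to the number actually kept.
 * Without the capability the caller falls back to a single mmap covering
 * the whole region.
 */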
24acf72b
AW
2015static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
2016 struct vfio_region_info *info)
b53b0f69
AW
2017{
2018 struct vfio_info_cap_header *hdr;
2019 struct vfio_region_info_cap_sparse_mmap *sparse;
24acf72b 2020 int i, j;
b53b0f69
AW
2021
2022 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
2023 if (!hdr) {
24acf72b 2024 return -ENODEV;
b53b0f69
AW
2025 }
2026
2027 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
2028
2029 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
2030 region->nr, sparse->nr_areas);
2031
24acf72b
AW
2032 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
2033
2034 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
24acf72b 2035 if (sparse->areas[i].size) {
99510d27
XC
2036 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
2037 sparse->areas[i].offset +
2038 sparse->areas[i].size - 1);
24acf72b
AW
2039 region->mmaps[j].offset = sparse->areas[i].offset;
2040 region->mmaps[j].size = sparse->areas[i].size;
2041 j++;
2042 }
b53b0f69 2043 }
24acf72b
AW
2044
2045 region->nr_mmaps = j;
2046 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
2047
2048 return 0;
b53b0f69
AW
2049}
2050
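/*
 * Illustrative sketch of the region helpers defined below, roughly in the
 * order a device backend would use them (the exact call sites live in the
 * per-device code, e.g. vfio-pci, so treat this as an outline rather than
 * a prescription):
 *
 *     vfio_region_setup(obj, vbasedev, region, index, name);
 *     ... map region->mem into the device's address space ...
 *     vfio_region_mmap(region);                (optional mmap acceleration)
 *     vfio_region_mmaps_set_enabled(region, running);
 *     ...
 *     vfio_region_exit(region);
 *     vfio_region_finalize(region);
 */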
db0da029
AW
2051int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
2052 int index, const char *name)
e2c7d025 2053{
db0da029
AW
2054 struct vfio_region_info *info;
2055 int ret;
2056
2057 ret = vfio_get_region_info(vbasedev, index, &info);
2058 if (ret) {
2059 return ret;
2060 }
2061
2062 region->vbasedev = vbasedev;
2063 region->flags = info->flags;
2064 region->size = info->size;
2065 region->fd_offset = info->offset;
2066 region->nr = index;
2067
2068 if (region->size) {
2069 region->mem = g_new0(MemoryRegion, 1);
2070 memory_region_init_io(region->mem, obj, &vfio_region_ops,
2071 region, name, region->size);
e2c7d025 2072
db0da029 2073 if (!vbasedev->no_mmap &&
95251725 2074 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
e2c7d025 2075
24acf72b 2076 ret = vfio_setup_region_sparse_mmaps(region, info);
db0da029 2077
24acf72b 2078 if (ret) {
b53b0f69
AW
2079 region->nr_mmaps = 1;
2080 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
2081 region->mmaps[0].offset = 0;
2082 region->mmaps[0].size = region->size;
2083 }
e2c7d025 2084 }
db0da029
AW
2085 }
2086
2087 g_free(info);
2088
2089 trace_vfio_region_setup(vbasedev->name, index, name,
2090 region->flags, region->fd_offset, region->size);
2091 return 0;
2092}
e2c7d025 2093
0f7a903b
KW
2094static void vfio_subregion_unmap(VFIORegion *region, int index)
2095{
2096 trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
2097 region->mmaps[index].offset,
2098 region->mmaps[index].offset +
2099 region->mmaps[index].size - 1);
2100 memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
2101 munmap(region->mmaps[index].mmap, region->mmaps[index].size);
2102 object_unparent(OBJECT(&region->mmaps[index].mem));
2103 region->mmaps[index].mmap = NULL;
2104}
2105
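/*
 * mmap each sparse area of the region with protections derived from the
 * region flags.  On failure the areas mapped so far are unwound via
 * vfio_subregion_unmap() and the negative errno is returned; otherwise
 * each mapping is wrapped in a RAM device memory region and added as a
 * subregion of region->mem.
 */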
db0da029
AW
2106int vfio_region_mmap(VFIORegion *region)
2107{
2108 int i, prot = 0;
2109 char *name;
2110
2111 if (!region->mem) {
2112 return 0;
2113 }
2114
2115 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
2116 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
2117
2118 for (i = 0; i < region->nr_mmaps; i++) {
2119 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
2120 MAP_SHARED, region->vbasedev->fd,
2121 region->fd_offset +
2122 region->mmaps[i].offset);
2123 if (region->mmaps[i].mmap == MAP_FAILED) {
2124 int ret = -errno;
2125
2126 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
2127 region->fd_offset +
2128 region->mmaps[i].offset,
2129 region->fd_offset +
2130 region->mmaps[i].offset +
2131 region->mmaps[i].size - 1, ret);
2132
2133 region->mmaps[i].mmap = NULL;
2134
2135 for (i--; i >= 0; i--) {
0f7a903b 2136 vfio_subregion_unmap(region, i);
db0da029
AW
2137 }
2138
2139 return ret;
e2c7d025
EA
2140 }
2141
db0da029
AW
2142 name = g_strdup_printf("%s mmaps[%d]",
2143 memory_region_name(region->mem), i);
21e00fa5
AW
2144 memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
2145 memory_region_owner(region->mem),
2146 name, region->mmaps[i].size,
2147 region->mmaps[i].mmap);
db0da029 2148 g_free(name);
db0da029
AW
2149 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
2150 &region->mmaps[i].mem);
2151
2152 trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
2153 region->mmaps[i].offset,
2154 region->mmaps[i].offset +
2155 region->mmaps[i].size - 1);
2156 }
2157
2158 return 0;
2159}
2160
0f7a903b
KW
2161void vfio_region_unmap(VFIORegion *region)
2162{
2163 int i;
2164
2165 if (!region->mem) {
2166 return;
2167 }
2168
2169 for (i = 0; i < region->nr_mmaps; i++) {
2170 if (region->mmaps[i].mmap) {
2171 vfio_subregion_unmap(region, i);
2172 }
2173 }
2174}
2175
db0da029
AW
2176void vfio_region_exit(VFIORegion *region)
2177{
2178 int i;
2179
2180 if (!region->mem) {
2181 return;
2182 }
2183
2184 for (i = 0; i < region->nr_mmaps; i++) {
2185 if (region->mmaps[i].mmap) {
2186 memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
e2c7d025 2187 }
db0da029 2188 }
e2c7d025 2189
db0da029
AW
2190 trace_vfio_region_exit(region->vbasedev->name, region->nr);
2191}
2192
2193void vfio_region_finalize(VFIORegion *region)
2194{
2195 int i;
2196
2197 if (!region->mem) {
2198 return;
e2c7d025
EA
2199 }
2200
db0da029
AW
2201 for (i = 0; i < region->nr_mmaps; i++) {
2202 if (region->mmaps[i].mmap) {
2203 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
2204 object_unparent(OBJECT(&region->mmaps[i].mem));
2205 }
2206 }
2207
2208 object_unparent(OBJECT(region->mem));
2209
2210 g_free(region->mem);
2211 g_free(region->mmaps);
2212
2213 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
92f86bff
GH
2214
2215 region->mem = NULL;
2216 region->mmaps = NULL;
2217 region->nr_mmaps = 0;
2218 region->size = 0;
2219 region->flags = 0;
2220 region->nr = 0;
db0da029
AW
2221}
2222
2223void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
2224{
2225 int i;
2226
2227 if (!region->mem) {
2228 return;
2229 }
2230
2231 for (i = 0; i < region->nr_mmaps; i++) {
2232 if (region->mmaps[i].mmap) {
2233 memory_region_set_enabled(&region->mmaps[i].mem, enabled);
2234 }
2235 }
e2c7d025 2236
db0da029
AW
2237 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
2238 enabled);
e2c7d025
EA
2239}
2240
2241void vfio_reset_handler(void *opaque)
2242{
2243 VFIOGroup *group;
2244 VFIODevice *vbasedev;
2245
2246 QLIST_FOREACH(group, &vfio_group_list, next) {
2247 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2
AW
2248 if (vbasedev->dev->realized) {
2249 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
2250 }
e2c7d025
EA
2251 }
2252 }
2253
2254 QLIST_FOREACH(group, &vfio_group_list, next) {
2255 QLIST_FOREACH(vbasedev, &group->device_list, next) {
7da624e2 2256 if (vbasedev->dev->realized && vbasedev->needs_reset) {
e2c7d025
EA
2257 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
2258 }
2259 }
2260 }
2261}
2262
2263static void vfio_kvm_device_add_group(VFIOGroup *group)
2264{
2265#ifdef CONFIG_KVM
2266 struct kvm_device_attr attr = {
2267 .group = KVM_DEV_VFIO_GROUP,
2268 .attr = KVM_DEV_VFIO_GROUP_ADD,
2269 .addr = (uint64_t)(unsigned long)&group->fd,
2270 };
2271
2272 if (!kvm_enabled()) {
2273 return;
2274 }
2275
2276 if (vfio_kvm_device_fd < 0) {
2277 struct kvm_create_device cd = {
2278 .type = KVM_DEV_TYPE_VFIO,
2279 };
2280
2281 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
78e5b17f 2282 error_report("Failed to create KVM VFIO device: %m");
e2c7d025
EA
2283 return;
2284 }
2285
2286 vfio_kvm_device_fd = cd.fd;
2287 }
2288
2289 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
2290 error_report("Failed to add group %d to KVM VFIO device: %m",
2291 group->groupid);
2292 }
2293#endif
2294}
2295
2296static void vfio_kvm_device_del_group(VFIOGroup *group)
2297{
2298#ifdef CONFIG_KVM
2299 struct kvm_device_attr attr = {
2300 .group = KVM_DEV_VFIO_GROUP,
2301 .attr = KVM_DEV_VFIO_GROUP_DEL,
2302 .addr = (uint64_t)(unsigned long)&group->fd,
2303 };
2304
2305 if (vfio_kvm_device_fd < 0) {
2306 return;
2307 }
2308
2309 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
2310 error_report("Failed to remove group %d from KVM VFIO device: %m",
2311 group->groupid);
2312 }
2313#endif
2314}
2315
2316static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
2317{
2318 VFIOAddressSpace *space;
2319
2320 QLIST_FOREACH(space, &vfio_address_spaces, list) {
2321 if (space->as == as) {
2322 return space;
2323 }
2324 }
2325
2326 /* No suitable VFIOAddressSpace, create a new one */
2327 space = g_malloc0(sizeof(*space));
2328 space->as = as;
2329 QLIST_INIT(&space->containers);
2330
2331 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
2332
2333 return space;
2334}
2335
2336static void vfio_put_address_space(VFIOAddressSpace *space)
2337{
2338 if (QLIST_EMPTY(&space->containers)) {
2339 QLIST_REMOVE(space, list);
2340 g_free(space);
2341 }
2342}
2343
2b6326c0
EA
2344/*
2345 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
2346 */
2347static int vfio_get_iommu_type(VFIOContainer *container,
2348 Error **errp)
2349{
2350 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
2351 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
2352 int i;
2353
2354 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
2355 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
2356 return iommu_types[i];
2357 }
2358 }
2359 error_setg(errp, "No available IOMMU models");
2360 return -EINVAL;
2361}
2362
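/*
 * Bind the group to the container and pick the IOMMU backend: first
 * VFIO_GROUP_SET_CONTAINER attaches the group fd, then VFIO_SET_IOMMU
 * selects the best type reported by vfio_get_iommu_type().  On sPAPR the
 * v2 -> v1 fallback is retried in the same loop.
 */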
2363static int vfio_init_container(VFIOContainer *container, int group_fd,
2364 Error **errp)
2365{
2366 int iommu_type, ret;
2367
2368 iommu_type = vfio_get_iommu_type(container, errp);
2369 if (iommu_type < 0) {
2370 return iommu_type;
2371 }
2372
2373 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
2374 if (ret) {
2375 error_setg_errno(errp, errno, "Failed to set group container");
2376 return -errno;
2377 }
2378
2379 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
2380 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
2381 /*
2382 * On sPAPR, although the IOMMU subdriver always advertises v1 and
2383 * v2, the running platform may not support v2, and there is no
2384 * way to tell until an IOMMU group gets added to the container.
2385 * So if setting v2 fails, fall back and try v1.
2386 */
2387 iommu_type = VFIO_SPAPR_TCE_IOMMU;
2388 continue;
2389 }
2390 error_setg_errno(errp, errno, "Failed to set iommu for container");
2391 return -errno;
2392 }
2393
2394 container->iommu_type = iommu_type;
2395 return 0;
2396}
2397
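/*
 * VFIO_IOMMU_GET_INFO uses the usual VFIO argsz protocol: the ioctl is
 * first issued with argsz set to the base structure size; if the kernel
 * needs more room (for instance to append a capability chain) it reports
 * a larger argsz, and the buffer is reallocated to that size and the call
 * repeated.
 */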
87ea529c
KW
2398static int vfio_get_iommu_info(VFIOContainer *container,
2399 struct vfio_iommu_type1_info **info)
2400{
2401
2402 size_t argsz = sizeof(struct vfio_iommu_type1_info);
2403
2404 *info = g_new0(struct vfio_iommu_type1_info, 1);
2405again:
2406 (*info)->argsz = argsz;
2407
2408 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
2409 g_free(*info);
2410 *info = NULL;
2411 return -errno;
2412 }
2413
2414 if (((*info)->argsz > argsz)) {
2415 argsz = (*info)->argsz;
2416 *info = g_realloc(*info, argsz);
2417 goto again;
2418 }
2419
2420 return 0;
2421}
2422
2423static struct vfio_info_cap_header *
2424vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
2425{
2426 struct vfio_info_cap_header *hdr;
2427 void *ptr = info;
2428
2429 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
2430 return NULL;
2431 }
2432
2433 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
2434 if (hdr->id == id) {
2435 return hdr;
2436 }
2437 }
2438
2439 return NULL;
2440}
2441
2442static void vfio_get_iommu_info_migration(VFIOContainer *container,
2443 struct vfio_iommu_type1_info *info)
2444{
2445 struct vfio_info_cap_header *hdr;
2446 struct vfio_iommu_type1_info_cap_migration *cap_mig;
2447
2448 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
2449 if (!hdr) {
2450 return;
2451 }
2452
2453 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
2454 header);
2455
2456 /*
1eb7f642
KJ
2457 * cpu_physical_memory_set_dirty_lebitmap() expects the bitmap to be in
2458 * units of qemu_real_host_page_size when marking pages dirty.
87ea529c 2459 */
8e3b0cbb 2460 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
87ea529c
KW
2461 container->dirty_pages_supported = true;
2462 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
2463 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
2464 }
2465}
2466
01905f58
EA
2467static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
2468 Error **errp)
e2c7d025
EA
2469{
2470 VFIOContainer *container;
2471 int ret, fd;
2472 VFIOAddressSpace *space;
2473
2474 space = vfio_get_address_space(as);
2475
c65ee433 2476 /*
aff92b82 2477 * VFIO is currently incompatible with discarding of RAM insofar as the
c65ee433
AW
2478 * madvise to purge (zap) the page from QEMU's address space does not
2479 * interact with the memory API and therefore leaves stale virtual to
2480 * physical mappings in the IOMMU if the page was previously pinned. We
aff92b82 2481 * therefore set discarding broken for each group added to a container,
c65ee433
AW
2482 * whether the container is used individually or shared. This provides
2483 * us with options to allow devices within a group to opt-in and allow
aff92b82 2484 * discarding, so long as it is done consistently for a group (for instance
c65ee433
AW
2485 * if the device is an mdev device where it is known that the host vendor
2486 * driver will never pin pages outside of the working set of the guest
aff92b82 2487 * driver, which would thus not be discarding candidates).
c65ee433
AW
2488 *
2489 * The first opportunity to induce pinning occurs here where we attempt to
2490 * attach the group to existing containers within the AddressSpace. If any
aff92b82
DH
2491 * pages are already zapped from the virtual address space, such as from
2492 * previous discards, new pinning will cause valid mappings to be
c65ee433
AW
2493 * re-established. Likewise, when the overall MemoryListener for a new
2494 * container is registered, a replay of mappings within the AddressSpace
2495 * will occur, re-establishing any previously zapped pages as well.
2496 *
aff92b82
DH
2497 * In particular, virtio-balloon is currently only prevented from
2498 * discarding new memory; it does not yet set
2499 * ram_block_discard_set_required() and therefore neither stops us
2500 * here nor deals with the sudden memory consumption of inflated memory.
53d1b5fc
DH
2501 *
2502 * We do support discarding of memory coordinated via the RamDiscardManager
2503 * with some IOMMU types. vfio_ram_block_discard_disable() handles the
2504 * details once we know which type of IOMMU we are using.
c65ee433 2505 */
c65ee433 2506
e2c7d025
EA
2507 QLIST_FOREACH(container, &space->containers, next) {
2508 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
53d1b5fc
DH
2509 ret = vfio_ram_block_discard_disable(container, true);
2510 if (ret) {
2511 error_setg_errno(errp, -ret,
2512 "Cannot set discarding of RAM broken");
2513 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2514 &container->fd)) {
2515 error_report("vfio: error disconnecting group %d from"
2516 " container", group->groupid);
2517 }
2518 return ret;
2519 }
e2c7d025
EA
2520 group->container = container;
2521 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2016986a 2522 vfio_kvm_device_add_group(group);
e2c7d025
EA
2523 return 0;
2524 }
2525 }
2526
448058aa 2527 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
e2c7d025 2528 if (fd < 0) {
01905f58 2529 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
e2c7d025
EA
2530 ret = -errno;
2531 goto put_space_exit;
2532 }
2533
2534 ret = ioctl(fd, VFIO_GET_API_VERSION);
2535 if (ret != VFIO_API_VERSION) {
01905f58
EA
2536 error_setg(errp, "supported vfio version: %d, "
2537 "reported version: %d", VFIO_API_VERSION, ret);
e2c7d025
EA
2538 ret = -EINVAL;
2539 goto close_fd_exit;
2540 }
2541
2542 container = g_malloc0(sizeof(*container));
2543 container->space = space;
2544 container->fd = fd;
d7d87836 2545 container->error = NULL;
87ea529c 2546 container->dirty_pages_supported = false;
3eed155c 2547 container->dma_max_mappings = 0;
f7f9c7b2
LY
2548 QLIST_INIT(&container->giommu_list);
2549 QLIST_INIT(&container->hostwin_list);
5e3b981c 2550 QLIST_INIT(&container->vrdl_list);
2e6e697e 2551
2b6326c0
EA
2552 ret = vfio_init_container(container, group->fd, errp);
2553 if (ret) {
2554 goto free_container_exit;
2555 }
e2c7d025 2556
53d1b5fc
DH
2557 ret = vfio_ram_block_discard_disable(container, true);
2558 if (ret) {
2559 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2560 goto free_container_exit;
2561 }
2562
2b6326c0
EA
2563 switch (container->iommu_type) {
2564 case VFIO_TYPE1v2_IOMMU:
2565 case VFIO_TYPE1_IOMMU:
2566 {
87ea529c 2567 struct vfio_iommu_type1_info *info;
3898aad3 2568
87ea529c 2569 ret = vfio_get_iommu_info(container, &info);
85b6d2b5
AW
2570 if (ret) {
2571 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2572 goto enable_discards_exit;
2573 }
87ea529c 2574
85b6d2b5
AW
2575 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2576 container->pgsizes = info->iova_pgsizes;
2577 } else {
2578 container->pgsizes = qemu_real_host_page_size();
87ea529c 2579 }
85b6d2b5
AW
2580
2581 if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2582 container->dma_max_mappings = 65535;
7a140a57 2583 }
85b6d2b5 2584 vfio_get_iommu_info_migration(container, info);
87ea529c 2585 g_free(info);
85b6d2b5
AW
2586
2587 /*
2588 * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
2589 * information to get the actual window extent rather than assume
2590 * a 64-bit IOVA address space.
2591 */
2592 vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2593
2b6326c0
EA
2594 break;
2595 }
2596 case VFIO_SPAPR_TCE_v2_IOMMU:
2597 case VFIO_SPAPR_TCE_IOMMU:
2598 {
3898aad3 2599 struct vfio_iommu_spapr_tce_info info;
2b6326c0 2600 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
e2c7d025
EA
2601
2602 /*
2603 * The host kernel code implementing VFIO_IOMMU_DISABLE is called
2604 * when the container fd is closed, so we do not call it explicitly
2605 * in this file.
2606 */
318f67ce
AK
2607 if (!v2) {
2608 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2609 if (ret) {
01905f58 2610 error_setg_errno(errp, errno, "failed to enable container");
318f67ce 2611 ret = -errno;
53d1b5fc 2612 goto enable_discards_exit;
318f67ce
AK
2613 }
2614 } else {
2615 container->prereg_listener = vfio_prereg_listener;
2616
2617 memory_listener_register(&container->prereg_listener,
2618 &address_space_memory);
2619 if (container->error) {
2620 memory_listener_unregister(&container->prereg_listener);
d7d87836
EA
2621 ret = -1;
2622 error_propagate_prepend(errp, container->error,
2623 "RAM memory listener initialization failed: ");
53d1b5fc 2624 goto enable_discards_exit;
318f67ce 2625 }
e2c7d025 2626 }
3898aad3 2627
3898aad3
DG
2628 info.argsz = sizeof(info);
2629 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2630 if (ret) {
01905f58
EA
2631 error_setg_errno(errp, errno,
2632 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
3898aad3 2633 ret = -errno;
318f67ce
AK
2634 if (v2) {
2635 memory_listener_unregister(&container->prereg_listener);
2636 }
53d1b5fc 2637 goto enable_discards_exit;
3898aad3 2638 }
7a140a57 2639
2e4109de 2640 if (v2) {
c26bc185 2641 container->pgsizes = info.ddw.pgsizes;
2e4109de
AK
2642 /*
2643 * A just-created container comes with a default window.
2644 * To make region_add/del simpler, remove this window now
2645 * and let the iommu_listener callbacks create/remove
2646 * windows when needed.
2647 */
2648 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2649 if (ret) {
01905f58
EA
2650 error_setg_errno(errp, -ret,
2651 "failed to remove existing window");
53d1b5fc 2652 goto enable_discards_exit;
2e4109de
AK
2653 }
2654 } else {
2655 /* The default table uses 4K pages */
c26bc185 2656 container->pgsizes = 0x1000;
2e4109de
AK
2657 vfio_host_win_add(container, info.dma32_window_start,
2658 info.dma32_window_start +
2659 info.dma32_window_size - 1,
2660 0x1000);
2661 }
2b6326c0 2662 }
e2c7d025
EA
2663 }
2664
8c37faa4
AK
2665 vfio_kvm_device_add_group(group);
2666
2667 QLIST_INIT(&container->group_list);
2668 QLIST_INSERT_HEAD(&space->containers, container, next);
2669
2670 group->container = container;
2671 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2672
ee0bf0e5
DG
2673 container->listener = vfio_memory_listener;
2674
2675 memory_listener_register(&container->listener, container->space->as);
2676
2677 if (container->error) {
d7d87836
EA
2678 ret = -1;
2679 error_propagate_prepend(errp, container->error,
2680 "memory listener initialization failed: ");
ee0bf0e5
DG
2681 goto listener_release_exit;
2682 }
2683
2684 container->initialized = true;
2685
e2c7d025
EA
2686 return 0;
2687listener_release_exit:
8c37faa4
AK
2688 QLIST_REMOVE(group, container_next);
2689 QLIST_REMOVE(container, next);
2690 vfio_kvm_device_del_group(group);
e2c7d025
EA
2691 vfio_listener_release(container);
2692
53d1b5fc
DH
2693enable_discards_exit:
2694 vfio_ram_block_discard_disable(container, false);
2695
e2c7d025
EA
2696free_container_exit:
2697 g_free(container);
2698
2699close_fd_exit:
2700 close(fd);
2701
2702put_space_exit:
2703 vfio_put_address_space(space);
2704
2705 return ret;
2706}
2707
2708static void vfio_disconnect_container(VFIOGroup *group)
2709{
2710 VFIOContainer *container = group->container;
2711
36968626
PX
2712 QLIST_REMOVE(group, container_next);
2713 group->container = NULL;
2714
2715 /*
2716 * Explicitly release the listener first before unset container,
2717 * since unset may destroy the backend container if it's the last
2718 * group.
2719 */
2720 if (QLIST_EMPTY(&container->group_list)) {
2721 vfio_listener_release(container);
2722 }
2723
e2c7d025
EA
2724 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2725 error_report("vfio: error disconnecting group %d from container",
2726 group->groupid);
2727 }
2728
e2c7d025
EA
2729 if (QLIST_EMPTY(&container->group_list)) {
2730 VFIOAddressSpace *space = container->space;
f8d8a944 2731 VFIOGuestIOMMU *giommu, *tmp;
f3bc3a73 2732 VFIOHostDMAWindow *hostwin, *next;
e2c7d025 2733
e2c7d025 2734 QLIST_REMOVE(container, next);
f8d8a944
AK
2735
2736 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
3df9d748 2737 memory_region_unregister_iommu_notifier(
44ee6aaa 2738 MEMORY_REGION(giommu->iommu_mr), &giommu->n);
f8d8a944
AK
2739 QLIST_REMOVE(giommu, giommu_next);
2740 g_free(giommu);
2741 }
2742
f3bc3a73
PL
2743 QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
2744 next) {
2745 QLIST_REMOVE(hostwin, hostwin_next);
2746 g_free(hostwin);
2747 }
2748
e2c7d025
EA
2749 trace_vfio_disconnect_container(container->fd);
2750 close(container->fd);
2751 g_free(container);
2752
2753 vfio_put_address_space(space);
2754 }
2755}
2756
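/*
 * Group/device acquisition, as used by the device backends.  A minimal
 * sketch (error handling elided; the real call sites are in the per-device
 * realize paths, e.g. vfio-pci):
 *
 *     group = vfio_get_group(groupid, as, errp);
 *     if (group && vfio_get_device(group, name, vbasedev, errp)) {
 *         vfio_put_group(group);
 *     }
 *
 * vfio_get_group() opens /dev/vfio/<groupid>, checks that the group is
 * viable and connects it to a container in the requested AddressSpace;
 * vfio_get_device() then obtains the device fd from the group.
 */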
1b808d5b 2757VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
e2c7d025
EA
2758{
2759 VFIOGroup *group;
2760 char path[32];
2761 struct vfio_group_status status = { .argsz = sizeof(status) };
2762
2763 QLIST_FOREACH(group, &vfio_group_list, next) {
2764 if (group->groupid == groupid) {
2765 /* Found it. Now is it already in the right context? */
2766 if (group->container->space->as == as) {
2767 return group;
2768 } else {
1b808d5b
EA
2769 error_setg(errp, "group %d used in multiple address spaces",
2770 group->groupid);
e2c7d025
EA
2771 return NULL;
2772 }
2773 }
2774 }
2775
2776 group = g_malloc0(sizeof(*group));
2777
2778 snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
448058aa 2779 group->fd = qemu_open_old(path, O_RDWR);
e2c7d025 2780 if (group->fd < 0) {
1b808d5b 2781 error_setg_errno(errp, errno, "failed to open %s", path);
e2c7d025
EA
2782 goto free_group_exit;
2783 }
2784
2785 if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
1b808d5b 2786 error_setg_errno(errp, errno, "failed to get group %d status", groupid);
e2c7d025
EA
2787 goto close_fd_exit;
2788 }
2789
2790 if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
1b808d5b
EA
2791 error_setg(errp, "group %d is not viable", groupid);
2792 error_append_hint(errp,
2793 "Please ensure all devices within the iommu_group "
2794 "are bound to their vfio bus driver.\n");
e2c7d025
EA
2795 goto close_fd_exit;
2796 }
2797
2798 group->groupid = groupid;
2799 QLIST_INIT(&group->device_list);
2800
1b808d5b
EA
2801 if (vfio_connect_container(group, as, errp)) {
2802 error_prepend(errp, "failed to setup container for group %d: ",
2803 groupid);
e2c7d025
EA
2804 goto close_fd_exit;
2805 }
2806
2807 if (QLIST_EMPTY(&vfio_group_list)) {
2808 qemu_register_reset(vfio_reset_handler, NULL);
2809 }
2810
2811 QLIST_INSERT_HEAD(&vfio_group_list, group, next);
2812
e2c7d025
EA
2813 return group;
2814
2815close_fd_exit:
2816 close(group->fd);
2817
2818free_group_exit:
2819 g_free(group);
2820
2821 return NULL;
2822}
2823
2824void vfio_put_group(VFIOGroup *group)
2825{
77a10d04 2826 if (!group || !QLIST_EMPTY(&group->device_list)) {
e2c7d025
EA
2827 return;
2828 }
2829
aff92b82 2830 if (!group->ram_block_discard_allowed) {
53d1b5fc 2831 vfio_ram_block_discard_disable(group->container, false);
238e9172 2832 }
e2c7d025
EA
2833 vfio_kvm_device_del_group(group);
2834 vfio_disconnect_container(group);
2835 QLIST_REMOVE(group, next);
2836 trace_vfio_put_group(group->fd);
2837 close(group->fd);
2838 g_free(group);
2839
2840 if (QLIST_EMPTY(&vfio_group_list)) {
2841 qemu_unregister_reset(vfio_reset_handler, NULL);
2842 }
2843}
2844
2845int vfio_get_device(VFIOGroup *group, const char *name,
59f7d674 2846 VFIODevice *vbasedev, Error **errp)
e2c7d025
EA
2847{
2848 struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
217e9fdc 2849 int ret, fd;
e2c7d025 2850
217e9fdc
PB
2851 fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
2852 if (fd < 0) {
59f7d674
EA
2853 error_setg_errno(errp, errno, "error getting device from group %d",
2854 group->groupid);
2855 error_append_hint(errp,
2856 "Verify all devices in group %d are bound to vfio-<bus> "
2857 "or pci-stub and not already in use\n", group->groupid);
217e9fdc 2858 return fd;
e2c7d025
EA
2859 }
2860
217e9fdc 2861 ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
e2c7d025 2862 if (ret) {
59f7d674 2863 error_setg_errno(errp, errno, "error getting device info");
217e9fdc
PB
2864 close(fd);
2865 return ret;
e2c7d025
EA
2866 }
2867
238e9172 2868 /*
aff92b82
DH
2869 * Set discarding of RAM as not broken for this group if the driver knows
2870 * the device operates compatibly with discarding. Setting must be
2871 * consistent per group, but since compatibility is really only possible
2872 * with mdev currently, we expect singleton groups.
238e9172 2873 */
aff92b82
DH
2874 if (vbasedev->ram_block_discard_allowed !=
2875 group->ram_block_discard_allowed) {
238e9172 2876 if (!QLIST_EMPTY(&group->device_list)) {
aff92b82
DH
2877 error_setg(errp, "Inconsistent setting of support for discarding "
2878 "RAM (e.g., balloon) within group");
8709b395 2879 close(fd);
238e9172
AW
2880 return -1;
2881 }
2882
aff92b82
DH
2883 if (!group->ram_block_discard_allowed) {
2884 group->ram_block_discard_allowed = true;
53d1b5fc 2885 vfio_ram_block_discard_disable(group->container, false);
238e9172
AW
2886 }
2887 }
2888
217e9fdc
PB
2889 vbasedev->fd = fd;
2890 vbasedev->group = group;
2891 QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
2892
e2c7d025
EA
2893 vbasedev->num_irqs = dev_info.num_irqs;
2894 vbasedev->num_regions = dev_info.num_regions;
2895 vbasedev->flags = dev_info.flags;
2896
2897 trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
2898 dev_info.num_irqs);
2899
2900 vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
217e9fdc 2901 return 0;
e2c7d025
EA
2902}
2903
2904void vfio_put_base_device(VFIODevice *vbasedev)
2905{
77a10d04
PB
2906 if (!vbasedev->group) {
2907 return;
2908 }
e2c7d025
EA
2909 QLIST_REMOVE(vbasedev, next);
2910 vbasedev->group = NULL;
2911 trace_vfio_put_base_device(vbasedev->fd);
2912 close(vbasedev->fd);
2913}
2914
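/*
 * Region info queries follow the same grow-and-retry argsz pattern as the
 * IOMMU info above; on success *info is a freshly allocated buffer that
 * the caller must g_free().  vfio_get_dev_region_info() builds on this to
 * scan all regions for a matching VFIO_REGION_INFO_CAP_TYPE capability.
 */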
46900226
AW
2915int vfio_get_region_info(VFIODevice *vbasedev, int index,
2916 struct vfio_region_info **info)
2917{
2918 size_t argsz = sizeof(struct vfio_region_info);
2919
2920 *info = g_malloc0(argsz);
2921
2922 (*info)->index = index;
b53b0f69 2923retry:
46900226
AW
2924 (*info)->argsz = argsz;
2925
2926 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2927 g_free(*info);
e61a424f 2928 *info = NULL;
46900226
AW
2929 return -errno;
2930 }
2931
b53b0f69
AW
2932 if ((*info)->argsz > argsz) {
2933 argsz = (*info)->argsz;
2934 *info = g_realloc(*info, argsz);
2935
2936 goto retry;
2937 }
2938
46900226
AW
2939 return 0;
2940}
2941
e61a424f
AW
2942int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2943 uint32_t subtype, struct vfio_region_info **info)
2944{
2945 int i;
2946
2947 for (i = 0; i < vbasedev->num_regions; i++) {
2948 struct vfio_info_cap_header *hdr;
2949 struct vfio_region_info_cap_type *cap_type;
2950
2951 if (vfio_get_region_info(vbasedev, i, info)) {
2952 continue;
2953 }
2954
2955 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2956 if (!hdr) {
2957 g_free(*info);
2958 continue;
2959 }
2960
2961 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2962
2963 trace_vfio_get_dev_region(vbasedev->name, i,
2964 cap_type->type, cap_type->subtype);
2965
2966 if (cap_type->type == type && cap_type->subtype == subtype) {
2967 return 0;
2968 }
2969
2970 g_free(*info);
2971 }
2972
2973 *info = NULL;
2974 return -ENODEV;
2975}
2976
ae0215b2
AK
2977bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2978{
2979 struct vfio_region_info *info = NULL;
2980 bool ret = false;
2981
2982 if (!vfio_get_region_info(vbasedev, region, &info)) {
2983 if (vfio_get_region_info_cap(info, cap_type)) {
2984 ret = true;
2985 }
2986 g_free(info);
2987 }
2988
2989 return ret;
2990}
2991
3153119e
DG
2992/*
2993 * Interfaces for IBM EEH (Enhanced Error Handling)
2994 */
2995static bool vfio_eeh_container_ok(VFIOContainer *container)
2996{
2997 /*
2998 * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
2999 * implementation is broken if there are multiple groups in a
3000 * container. The hardware works in units of Partitionable
3001 * Endpoints (== IOMMU groups) and the EEH operations naively
3002 * iterate across all groups in the container, without any logic
3003 * to make sure the groups have their state synchronized. For
3004 * certain operations (ENABLE) that might be ok, until an error
3005 * occurs, but for others (GET_STATE) it's clearly broken.
3006 */
3007
3008 /*
3009 * XXX Once fixed kernels exist, test for them here
3010 */
3011
3012 if (QLIST_EMPTY(&container->group_list)) {
3013 return false;
3014 }
3015
3016 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
3017 return false;
3018 }
3019
3020 return true;
3021}
3022
3023static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
3024{
3025 struct vfio_eeh_pe_op pe_op = {
3026 .argsz = sizeof(pe_op),
3027 .op = op,
3028 };
3029 int ret;
3030
3031 if (!vfio_eeh_container_ok(container)) {
3032 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
3033 "kernel requires a container with exactly one group", op);
3034 return -EPERM;
3035 }
3036
3037 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
3038 if (ret < 0) {
3039 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
3040 return -errno;
3041 }
3042
d917e88d 3043 return ret;
3153119e
DG
3044}
3045
3046static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
3047{
3048 VFIOAddressSpace *space = vfio_get_address_space(as);
3049 VFIOContainer *container = NULL;
3050
3051 if (QLIST_EMPTY(&space->containers)) {
3052 /* No containers to act on */
3053 goto out;
3054 }
3055
3056 container = QLIST_FIRST(&space->containers);
3057
3058 if (QLIST_NEXT(container, next)) {
3059 /* We don't yet have logic to synchronize EEH state across
3060 * multiple containers */
3061 container = NULL;
3062 goto out;
3063 }
3064
3065out:
3066 vfio_put_address_space(space);
3067 return container;
3068}
3069
3070bool vfio_eeh_as_ok(AddressSpace *as)
3071{
3072 VFIOContainer *container = vfio_eeh_as_container(as);
3073
3074 return (container != NULL) && vfio_eeh_container_ok(container);
3075}
3076
3077int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
3078{
3079 VFIOContainer *container = vfio_eeh_as_container(as);
3080
3081 if (!container) {
3082 return -ENODEV;
3083 }
3084 return vfio_eeh_container_op(container, op);
3085}