// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/mdev.h>
#include <linux/notifier.h>
#include <linux/dma-iommu.h>
#include <linux/irqdomain.h>
#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"Type1 IOMMU driver for VFIO"
static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");
static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
		 "Maximum number of user DMA mappings per container (65535).");
struct vfio_iommu {
	struct list_head	domain_list;
	struct list_head	iova_list;
	struct vfio_domain	*external_domain; /* domain for external user */
	struct mutex		lock;
	struct rb_root		dma_list;
	struct blocking_notifier_head notifier;
	unsigned int		dma_avail;
	unsigned int		vaddr_invalid_count;
	uint64_t		pgsize_bitmap;
	uint64_t		num_non_pinned_groups;
	wait_queue_head_t	vaddr_wait;
	bool			v2;
	bool			nesting;
	bool			dirty_page_tracking;
	bool			container_open;
};
struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
	bool			fgsp;		/* Fine-grained super pages */
};
struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
	bool			iommu_mapped;
	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
	bool			vaddr_invalid;
	struct task_struct	*task;
	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
	unsigned long		*bitmap;
};
struct vfio_batch {
	struct page		**pages;	/* for pin_user_pages_remote */
	struct page		*fallback_page; /* if pages alloc fails */
	int			capacity;	/* length of pages array */
	int			size;		/* of batch currently */
	int			offset;		/* of next entry in pages */
};
struct vfio_iommu_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
	bool			mdev_group;	/* An mdev group */
	bool			pinned_page_dirty_scope;
};
struct vfio_iova {
	struct list_head	list;
	dma_addr_t		start;
	dma_addr_t		end;
};

/*
 * Guest RAM pinning working set or DMA target
 */
struct vfio_pfn {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		pfn;		/* Host pfn */
	unsigned int		ref_count;
};

struct vfio_regions {
	struct list_head	list;
	dma_addr_t		iova;		/* Device address */
	phys_addr_t		phys;		/* Physical address */
	size_t			len;
};
#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)	\
			(!list_empty(&iommu->domain_list))

#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

/*
 * Input argument of number of bits to bitmap_set() is unsigned integer, which
 * further casts to signed integer for unaligned multi-bit operation,
 * __bitmap_set().
 * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 * system.
 */
#define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
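/*
 * Worked example (editorial note, not part of the original source): with
 * DIRTY_BITMAP_PAGES_MAX = INT_MAX the bitmap holds ~2^31 bits, i.e.
 * 2^31 / 8 = 256 MB of bitmap memory, which at 4K per page covers
 * 2^31 * 4K = 8 TB of tracked memory for a single mapping.
 */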
static int put_pfn(unsigned long pfn, int prot);

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
			    struct iommu_group *iommu_group);
/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}
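/*
 * Illustrative note (editorial): the search above treats each vfio_dma as the
 * interval [iova, iova + size) and returns any mapping that overlaps the
 * queried [start, start + size) range, e.g. querying one page that lies in
 * the middle of a larger mapping returns that mapping.
 */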
static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
						dma_addr_t start, u64 size)
{
	struct rb_node *res = NULL;
	struct rb_node *node = iommu->dma_list.rb_node;
	struct vfio_dma *dma_res = NULL;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start < dma->iova + dma->size) {
			res = node;
			dma_res = dma;
			if (start >= dma->iova)
				break;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	if (res && size && dma_res->iova >= start + size)
		res = NULL;
	return res;
}
static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}
static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
{
	uint64_t npages = dma->size / pgsize;

	if (npages > DIRTY_BITMAP_PAGES_MAX)
		return -EINVAL;

	/*
	 * Allocate extra 64 bits that are used to calculate shift required for
	 * bitmap_shift_left() to manipulate and club unaligned number of pages
	 * in adjacent vfio_dma ranges.
	 */
	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
			       GFP_KERNEL);
	if (!dma->bitmap)
		return -ENOMEM;

	return 0;
}
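/*
 * Worked example (editorial note): a 1 GB mapping tracked at 4K granularity
 * needs 262144 bits, i.e. 32 KB of bitmap, plus the single extra u64 above so
 * that the unaligned-shift handling in update_user_bitmap() has room to club
 * spill-over bits with the adjacent vfio_dma range.
 */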
static void vfio_dma_bitmap_free(struct vfio_dma *dma)
{
	kvfree(dma->bitmap);
	dma->bitmap = NULL;
}

static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
{
	struct rb_node *p;
	unsigned long pgshift = __ffs(pgsize);

	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);

		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
	}
}

static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
{
	struct rb_node *n;
	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
	}
}

static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
		int ret;

		ret = vfio_dma_bitmap_alloc(dma, pgsize);
		if (ret) {
			struct rb_node *p;

			/* Unwind bitmaps already allocated on earlier nodes. */
			for (p = rb_prev(n); p; p = rb_prev(p)) {
				struct vfio_dma *dma = rb_entry(p,
							struct vfio_dma, node);

				vfio_dma_bitmap_free(dma);
			}
			return ret;
		}
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}

static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
{
	struct rb_node *n;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		vfio_dma_bitmap_free(dma);
	}
}
/*
 * Helper Functions for host iova-pfn list
 */
static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
{
	struct vfio_pfn *vpfn;
	struct rb_node *node = dma->pfn_list.rb_node;

	while (node) {
		vpfn = rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}
	return NULL;
}
static void vfio_link_pfn(struct vfio_dma *dma,
			  struct vfio_pfn *new)
{
	struct rb_node **link, *parent = NULL;
	struct vfio_pfn *vpfn;

	link = &dma->pfn_list.rb_node;
	while (*link) {
		parent = *link;
		vpfn = rb_entry(parent, struct vfio_pfn, node);

		if (new->iova < vpfn->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &dma->pfn_list);
}

static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
{
	rb_erase(&old->node, &dma->pfn_list);
}
static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
				unsigned long pfn)
{
	struct vfio_pfn *vpfn;

	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
	if (!vpfn)
		return -ENOMEM;

	vpfn->iova = iova;
	vpfn->pfn = pfn;
	vpfn->ref_count = 1;
	vfio_link_pfn(dma, vpfn);
	return 0;
}

static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
				      struct vfio_pfn *vpfn)
{
	vfio_unlink_pfn(dma, vpfn);
	kfree(vpfn);
}

static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
					       unsigned long iova)
{
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (vpfn)
		vpfn->ref_count++;
	return vpfn;
}

static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
{
	int ret = 0;

	vpfn->ref_count--;
	if (!vpfn->ref_count) {
		ret = put_pfn(vpfn->pfn, dma->prot);
		vfio_remove_from_pfn_list(dma, vpfn);
	}
	return ret;
}
static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
	struct mm_struct *mm;
	int ret;

	if (!npage)
		return 0;

	mm = async ? get_task_mm(dma->task) : dma->task->mm;
	if (!mm)
		return -ESRCH; /* process exited */

	ret = mmap_write_lock_killable(mm);
	if (!ret) {
		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
					  dma->lock_cap);
		mmap_write_unlock(mm);
	}

	if (async)
		mmput(mm);

	return ret;
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 * For compound pages, any driver that sets the reserved bit in head
 * page needs to set the reserved bit in all subpages to be safe.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}
static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);

		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
		return 1;
	}
	return 0;
}
#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
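/*
 * Editorial note: on a 64-bit system with 4K pages this evaluates to
 * 4096 / 8 = 512 page pointers per batch.
 */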
static void vfio_batch_init(struct vfio_batch *batch)
{
	batch->size = 0;
	batch->offset = 0;

	if (unlikely(disable_hugepages))
		goto fallback;

	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
	if (!batch->pages)
		goto fallback;

	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
	return;

fallback:
	batch->pages = &batch->fallback_page;
	batch->capacity = 1;
}

static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
{
	while (batch->size) {
		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);

		put_pfn(pfn, dma->prot);
		batch->offset++;
		batch->size--;
	}
}

static void vfio_batch_fini(struct vfio_batch *batch)
{
	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
		free_page((unsigned long)batch->pages);
}
static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
			    unsigned long vaddr, unsigned long *pfn,
			    bool write_fault)
{
	pte_t *ptep;
	spinlock_t *ptl;
	int ret;

	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
	if (ret) {
		bool unlocked = false;

		ret = fixup_user_fault(mm, vaddr,
				       FAULT_FLAG_REMOTE |
				       (write_fault ?  FAULT_FLAG_WRITE : 0),
				       &unlocked);
		if (unlocked)
			return -EAGAIN;

		if (ret)
			return ret;

		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
		if (ret)
			return ret;
	}

	if (write_fault && !pte_write(*ptep))
		ret = -EFAULT;
	else
		*pfn = pte_pfn(*ptep);

	pte_unmap_unlock(ptep, ptl);
	return ret;
}
/*
 * Returns the positive number of pfns successfully obtained or a negative
 * error code.
 */
static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
			  long npages, int prot, unsigned long *pfn,
			  struct page **pages)
{
	struct vm_area_struct *vma;
	unsigned int flags = 0;
	int ret;

	if (prot & IOMMU_WRITE)
		flags |= FOLL_WRITE;

	mmap_read_lock(mm);
	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
				    pages, NULL, NULL);
	if (ret > 0) {
		*pfn = page_to_pfn(pages[0]);
		goto done;
	}

	vaddr = untagged_addr(vaddr);

retry:
	vma = vma_lookup(mm, vaddr);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
		if (ret == -EAGAIN)
			goto retry;

		if (!ret) {
			if (is_invalid_reserved_pfn(*pfn))
				ret = 1;
			else
				ret = -EFAULT;
		}
	}
done:
	mmap_read_unlock(mm);
	return ret;
}
static int vfio_wait(struct vfio_iommu *iommu)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
	mutex_unlock(&iommu->lock);
	schedule();
	mutex_lock(&iommu->lock);
	finish_wait(&iommu->vaddr_wait, &wait);
	if (kthread_should_stop() || !iommu->container_open ||
	    fatal_signal_pending(current)) {
		return -EFAULT;
	}
	return WAITED;
}
/*
 * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
 * if the task waits, but is re-locked on return.  Return result in *dma_p.
 * Return 0 on success with no waiting, WAITED on success if waited, and -errno
 * on error.
 */
static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
			       size_t size, struct vfio_dma **dma_p)
{
	int ret = 0;

	do {
		*dma_p = vfio_find_dma(iommu, start, size);
		if (!*dma_p)
			return -EINVAL;
		else if (!(*dma_p)->vaddr_invalid)
			return ret;
		else
			ret = vfio_wait(iommu);
	} while (ret == WAITED);

	return ret;
}
/*
 * Wait for all vaddr in the dma_list to become valid.  iommu lock is dropped
 * if the task waits, but is re-locked on return.  Return 0 on success with no
 * waiting, WAITED on success if waited, and -errno on error.
 */
static int vfio_wait_all_valid(struct vfio_iommu *iommu)
{
	int ret = 0;

	while (iommu->vaddr_invalid_count && ret >= 0)
		ret = vfio_wait(iommu);

	return ret;
}
/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
				  long npage, unsigned long *pfn_base,
				  unsigned long limit, struct vfio_batch *batch)
{
	unsigned long pfn;
	struct mm_struct *mm = current->mm;
	long ret, pinned = 0, lock_acct = 0;
	bool rsvd;
	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

	/* This code path is only user initiated */
	if (!mm)
		return -ENODEV;

	if (batch->size) {
		/* Leftover pages in batch from an earlier call. */
		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
		pfn = *pfn_base;
		rsvd = is_invalid_reserved_pfn(*pfn_base);
	} else {
		*pfn_base = 0;
	}

	while (npage) {
		if (!batch->size) {
			/* Empty batch, so refill it. */
			long req_pages = min_t(long, npage, batch->capacity);

			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
					     &pfn, batch->pages);
			if (ret < 0)
				goto unpin_out;

			batch->size = ret;
			batch->offset = 0;

			if (!*pfn_base) {
				*pfn_base = pfn;
				rsvd = is_invalid_reserved_pfn(*pfn_base);
			}
		}

		/*
		 * pfn is preset for the first iteration of this inner loop and
		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
		 * batch->pages isn't valid (there's no struct page), so allow
		 * batch->pages to be touched only when there's more than one
		 * pfn to check, which guarantees the pfns are from a
		 * contiguous range.
		 */
		while (true) {
			if (pfn != *pfn_base + pinned ||
			    rsvd != is_invalid_reserved_pfn(pfn))
				goto out;

			/*
			 * Reserved pages aren't counted against the user,
			 * externally pinned pages are already counted against
			 * the user.
			 */
			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
				if (!dma->lock_cap &&
				    mm->locked_vm + lock_acct + 1 > limit) {
					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
						__func__, limit << PAGE_SHIFT);
					ret = -ENOMEM;
					goto unpin_out;
				}
				lock_acct++;
			}

			pinned++;
			npage--;
			vaddr += PAGE_SIZE;
			iova += PAGE_SIZE;
			batch->offset++;
			batch->size--;

			if (!batch->size)
				break;

			pfn = page_to_pfn(batch->pages[batch->offset]);
		}

		if (unlikely(disable_hugepages))
			break;
	}

out:
	ret = vfio_lock_acct(dma, lock_acct, false);

unpin_out:
	if (batch->size == 1 && !batch->offset) {
		/* May be a VM_PFNMAP pfn, which the batch can't remember. */
		put_pfn(pfn, dma->prot);
		batch->size = 0;
	}

	if (ret < 0) {
		if (pinned && !rsvd) {
			for (pfn = *pfn_base; pinned; pfn++, pinned--)
				put_pfn(pfn, dma->prot);
		}
		vfio_batch_unpin(batch, dma);

		return ret;
	}

	return pinned;
}
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
				    unsigned long pfn, long npage,
				    bool do_accounting)
{
	long unlocked = 0, locked = 0;
	long i;

	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
		if (put_pfn(pfn++, dma->prot)) {
			unlocked++;
			if (vfio_find_vpfn(dma, iova))
				locked++;
		}
	}

	if (do_accounting)
		vfio_lock_acct(dma, locked - unlocked, true);

	return unlocked;
}
static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
				  unsigned long *pfn_base, bool do_accounting)
{
	struct page *pages[1];
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(dma->task);
	if (!mm)
		return -ENODEV;

	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
	if (ret != 1)
		goto out;

	ret = 0;

	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
		ret = vfio_lock_acct(dma, 1, true);
		if (ret) {
			put_pfn(*pfn_base, dma->prot);
			if (ret == -ENOMEM)
				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
					"(%ld) exceeded\n", __func__,
					dma->task->comm, task_pid_nr(dma->task),
					task_rlimit(dma->task, RLIMIT_MEMLOCK));
		}
	}

out:
	mmput(mm);
	return ret;
}
static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
				    bool do_accounting)
{
	int unlocked;
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);

	if (!vpfn)
		return 0;

	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);

	if (do_accounting)
		vfio_lock_acct(dma, -unlocked, true);

	return unlocked;
}
static int vfio_iommu_type1_pin_pages(void *iommu_data,
				      struct iommu_group *iommu_group,
				      unsigned long *user_pfn,
				      int npage, int prot,
				      unsigned long *phys_pfn)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_iommu_group *group;
	int i, j, ret;
	unsigned long remote_vaddr;
	struct vfio_dma *dma;
	bool do_accounting;
	dma_addr_t iova;

	if (!iommu || !user_pfn || !phys_pfn)
		return -EINVAL;

	/* Supported for v2 version only */
	if (!iommu->v2)
		return -EACCES;

	mutex_lock(&iommu->lock);

	/*
	 * Wait for all necessary vaddr's to be valid so they can be used in
	 * the main loop without dropping the lock, to avoid racing vs unmap.
	 */
	if (iommu->vaddr_invalid_count) {
		for (i = 0; i < npage; i++) {
			iova = user_pfn[i] << PAGE_SHIFT;
			ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
			if (ret < 0)
				goto pin_done;
		}
	}

	/* Fail if notifier list is empty */
	if (!iommu->notifier.head) {
		ret = -EINVAL;
		goto pin_done;
	}

	/*
	 * If iommu capable domain exist in the container then all pages are
	 * already pinned and accounted. Accounting should be done if there is no
	 * iommu capable domain in the container.
	 */
	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

	for (i = 0; i < npage; i++) {
		struct vfio_pfn *vpfn;

		iova = user_pfn[i] << PAGE_SHIFT;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma) {
			ret = -EINVAL;
			goto pin_unwind;
		}

		if ((dma->prot & prot) != prot) {
			ret = -EPERM;
			goto pin_unwind;
		}

		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
		if (vpfn) {
			phys_pfn[i] = vpfn->pfn;
			continue;
		}

		remote_vaddr = dma->vaddr + (iova - dma->iova);
		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
					     do_accounting);
		if (ret)
			goto pin_unwind;

		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
		if (ret) {
			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
				vfio_lock_acct(dma, -1, true);
			goto pin_unwind;
		}

		if (iommu->dirty_page_tracking) {
			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

			/*
			 * Bitmap populated with the smallest supported page
			 * size
			 */
			bitmap_set(dma->bitmap,
				   (iova - dma->iova) >> pgshift, 1);
		}
	}
	ret = i;

	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
	if (!group->pinned_page_dirty_scope) {
		group->pinned_page_dirty_scope = true;
		iommu->num_non_pinned_groups--;
	}

	goto pin_done;

pin_unwind:
	phys_pfn[i] = 0;
	for (j = 0; j < i; j++) {
		iova = user_pfn[j] << PAGE_SHIFT;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		vfio_unpin_page_external(dma, iova, do_accounting);
		phys_pfn[j] = 0;
	}
pin_done:
	mutex_unlock(&iommu->lock);
	return ret;
}
static int vfio_iommu_type1_unpin_pages(void *iommu_data,
					unsigned long *user_pfn,
					int npage)
{
	struct vfio_iommu *iommu = iommu_data;
	bool do_accounting;
	int i;

	if (!iommu || !user_pfn || npage <= 0)
		return -EINVAL;

	/* Supported for v2 version only */
	if (!iommu->v2)
		return -EACCES;

	mutex_lock(&iommu->lock);

	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
	for (i = 0; i < npage; i++) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		iova = user_pfn[i] << PAGE_SHIFT;
		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
		if (!dma)
			break;

		vfio_unpin_page_external(dma, iova, do_accounting);
	}

	mutex_unlock(&iommu->lock);
	return i > 0 ? i : -EINVAL;
}
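/*
 * Editorial note: like the pin path above, the unpin path reports how many of
 * the caller's entries were actually processed (i > 0) before any lookup
 * failure; -EINVAL is returned only when nothing at all was unpinned.
 */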
static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
			    struct list_head *regions,
			    struct iommu_iotlb_gather *iotlb_gather)
{
	long unlocked = 0;
	struct vfio_regions *entry, *next;

	iommu_iotlb_sync(domain->domain, iotlb_gather);

	list_for_each_entry_safe(entry, next, regions, list) {
		unlocked += vfio_unpin_pages_remote(dma,
						    entry->iova,
						    entry->phys >> PAGE_SHIFT,
						    entry->len >> PAGE_SHIFT,
						    false);
		list_del(&entry->list);
		kfree(entry);
	}

	cond_resched();

	return unlocked;
}
/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies maximum number of regions for each IOTLB flush sync.
 */
#define VFIO_IOMMU_TLB_SYNC_MAX		512
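/*
 * Editorial note: batching up to 512 unmapped regions per IOTLB sync trades a
 * small, bounded region-list allocation against issuing far fewer hardware
 * TLB flushes on the fast unmap path below.
 */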
static size_t unmap_unpin_fast(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys, long *unlocked,
			       struct list_head *unmapped_list,
			       int *unmapped_cnt,
			       struct iommu_iotlb_gather *iotlb_gather)
{
	size_t unmapped = 0;
	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

	if (entry) {
		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
					    iotlb_gather);

		if (!unmapped) {
			kfree(entry);
		} else {
			entry->iova = *iova;
			entry->phys = phys;
			entry->len = unmapped;
			list_add_tail(&entry->list, unmapped_list);

			*iova += unmapped;
			(*unmapped_cnt)++;
		}
	}

	/*
	 * Sync if the number of fast-unmap regions hits the limit
	 * or in case of errors.
	 */
	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
					     iotlb_gather);
		*unmapped_cnt = 0;
	}

	return unmapped;
}
static size_t unmap_unpin_slow(struct vfio_domain *domain,
			       struct vfio_dma *dma, dma_addr_t *iova,
			       size_t len, phys_addr_t phys,
			       long *unlocked)
{
	size_t unmapped = iommu_unmap(domain->domain, *iova, len);

	if (unmapped) {
		*unlocked += vfio_unpin_pages_remote(dma, *iova,
						     phys >> PAGE_SHIFT,
						     unmapped >> PAGE_SHIFT,
						     false);
		*iova += unmapped;
		cond_resched();
	}
	return unmapped;
}
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
			     bool do_accounting)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	LIST_HEAD(unmapped_region_list);
	struct iommu_iotlb_gather iotlb_gather;
	int unmapped_region_cnt = 0;
	long unlocked = 0;

	if (!dma->size)
		return 0;

	if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
		return 0;

	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, dma->iova, dma->size);
		cond_resched();
	}

	iommu_iotlb_gather_init(&iotlb_gather);
	while (iova < end) {
		size_t unmapped, len;
		phys_addr_t phys, next;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
		for (len = PAGE_SIZE;
		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
			next = iommu_iova_to_phys(domain->domain, iova + len);
			if (next != phys + len)
				break;
		}

		/*
		 * First, try to use fast unmap/unpin. In case of failure,
		 * switch to slow unmap/unpin path.
		 */
		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
					    &unlocked, &unmapped_region_list,
					    &unmapped_region_cnt,
					    &iotlb_gather);
		if (!unmapped) {
			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
						    phys, &unlocked);
			if (WARN_ON(!unmapped))
				break;
		}
	}

	dma->iommu_mapped = false;

	if (unmapped_region_cnt) {
		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
					    &iotlb_gather);
	}

	if (do_accounting) {
		vfio_lock_acct(dma, -unlocked, true);
		return 0;
	}
	return unlocked;
}
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
	vfio_unmap_unpin(iommu, dma, true);
	vfio_unlink_dma(iommu, dma);
	put_task_struct(dma->task);
	vfio_dma_bitmap_free(dma);
	if (dma->vaddr_invalid) {
		iommu->vaddr_invalid_count--;
		wake_up_all(&iommu->vaddr_wait);
	}
	kfree(dma);
	iommu->dma_avail++;
}
static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;

	iommu->pgsize_bitmap = ULONG_MAX;

	list_for_each_entry(domain, &iommu->domain_list, next)
		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;

	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE. Pinning code uses that
	 * granularity while iommu driver can use the sub-PAGE_SIZE size
	 * to map the buffer.
	 */
	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
		iommu->pgsize_bitmap &= PAGE_MASK;
		iommu->pgsize_bitmap |= PAGE_SIZE;
	}
}
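/*
 * Illustrative example (editorial): if one domain supports 4K|2M|1G pages and
 * another supports only 4K, the intersection above keeps 4K as the minimum
 * granularity; an IOMMU advertising sub-PAGE_SIZE pages would instead be
 * rounded up to PAGE_SIZE by the masking above.
 */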
static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
			      struct vfio_dma *dma, dma_addr_t base_iova,
			      size_t pgsize)
{
	unsigned long pgshift = __ffs(pgsize);
	unsigned long nbits = dma->size >> pgshift;
	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
	unsigned long shift = bit_offset % BITS_PER_LONG;
	unsigned long leftover;

	/*
	 * mark all pages dirty if any IOMMU capable device is not able
	 * to report dirty pages and all pages are pinned and mapped.
	 */
	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
		bitmap_set(dma->bitmap, 0, nbits);

	if (shift) {
		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
				  nbits + shift);

		if (copy_from_user(&leftover,
				   (void __user *)(bitmap + copy_offset),
				   sizeof(leftover)))
			return -EFAULT;

		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
	}

	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
			 DIRTY_BITMAP_BYTES(nbits + shift)))
		return -EFAULT;

	return 0;
}
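/*
 * Worked example (editorial): with a 4K page size, a vfio_dma that starts
 * 20 pages past base_iova has bit_offset = 20, so copy_offset = 0 and
 * shift = 20 on a 64-bit host; the per-range bitmap is shifted up by 20 bits
 * and the low 20 bits already present in the user buffer are OR-ed back in
 * via "leftover" so adjacent ranges do not clobber each other's bits.
 */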
static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
				  dma_addr_t iova, size_t size, size_t pgsize)
{
	struct vfio_dma *dma;
	struct rb_node *n;
	unsigned long pgshift = __ffs(pgsize);
	int ret;

	/*
	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
	 * vfio_dma mappings may be clubbed by specifying large ranges, but
	 * there must not be any previous mappings bisected by the range.
	 * An error will be returned if these conditions are not met.
	 */
	dma = vfio_find_dma(iommu, iova, 1);
	if (dma && dma->iova != iova)
		return -EINVAL;

	dma = vfio_find_dma(iommu, iova + size - 1, 0);
	if (dma && dma->iova + dma->size != iova + size)
		return -EINVAL;

	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);

		if (dma->iova < iova)
			continue;

		if (dma->iova > iova + size - 1)
			break;

		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
		if (ret)
			return ret;

		/*
		 * Re-populate bitmap to include all pinned pages which are
		 * considered as dirty but exclude pages which are unpinned and
		 * pages which are marked dirty by vfio_dma_rw()
		 */
		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
		vfio_dma_populate_bitmap(dma, pgsize);
	}
	return 0;
}
static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
{
	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
		return -EINVAL;

	return 0;
}
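/*
 * Worked example (editorial): for npages = 2^20 (4 GB of guest memory at 4K
 * pages) the minimum acceptable bitmap_size is DIRTY_BITMAP_BYTES(2^20) =
 * 2^20 / 8 = 131072 bytes, and anything above DIRTY_BITMAP_SIZE_MAX (256 MB)
 * is refused.
 */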
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap,
			     struct vfio_bitmap *bitmap)
{
	struct vfio_dma *dma, *dma_last = NULL;
	size_t unmapped = 0, pgsize;
	int ret = -EINVAL, retries = 0;
	unsigned long pgshift;
	dma_addr_t iova = unmap->iova;
	u64 size = unmap->size;
	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
	struct rb_node *n, *first_n;

	mutex_lock(&iommu->lock);

	pgshift = __ffs(iommu->pgsize_bitmap);
	pgsize = (size_t)1 << pgshift;

	if (iova & (pgsize - 1))
		goto unlock;

	if (unmap_all) {
		if (iova || size)
			goto unlock;
		size = U64_MAX;
	} else if (!size || size & (pgsize - 1) ||
		   iova + size - 1 < iova || size > SIZE_MAX) {
		goto unlock;
	}

	/* When dirty tracking is enabled, allow only min supported pgsize */
	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
		goto unlock;
	}

	WARN_ON((pgsize - 1) & PAGE_MASK);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings.  This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range.  Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked.  We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings.  This
	 * resulted in a couple unusual behaviors.  First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap.  Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap.  And an unmap request covering
	 * the first iova of mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings.  Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range.  An error
	 * will be returned if these conditions are not met.  The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
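	/*
	 * Illustrative example (editorial): under the v2 semantics described
	 * above, if userspace mapped [0x100000, 0x300000) as one vfio_dma, an
	 * unmap request for [0x200000, 0x300000) fails with -EINVAL because it
	 * bisects the mapping, while a request covering [0x100000, 0x300000)
	 * (or any larger range that fully contains it) removes the whole
	 * mapping.
	 */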
	if (iommu->v2 && !unmap_all) {
		dma = vfio_find_dma(iommu, iova, 1);
		if (dma && dma->iova != iova)
			goto unlock;

		dma = vfio_find_dma(iommu, iova + size - 1, 0);
		if (dma && dma->iova + dma->size != iova + size)
			goto unlock;
	}

	ret = 0;
again:
	n = first_n = vfio_find_dma_first_node(iommu, iova, size);

	while (n) {
		dma = rb_entry(n, struct vfio_dma, node);
		if (dma->iova >= iova + size)
			break;

		if (!iommu->v2 && iova > dma->iova)
			break;
		/*
		 * Task with same address space who mapped this iova range is
		 * allowed to unmap the iova range.
		 */
		if (dma->task->mm != current->mm)
			break;

		if (invalidate_vaddr) {
			if (dma->vaddr_invalid) {
				struct rb_node *last_n = n;

				for (n = first_n; n != last_n; n = rb_next(n)) {
					dma = rb_entry(n,
						       struct vfio_dma, node);
					dma->vaddr_invalid = false;
					iommu->vaddr_invalid_count--;
				}
				ret = -EINVAL;
				unmapped = 0;
				break;
			}
			dma->vaddr_invalid = true;
			iommu->vaddr_invalid_count++;
			unmapped += dma->size;
			n = rb_next(n);
			continue;
		}

		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
			struct vfio_iommu_type1_dma_unmap nb_unmap;

			if (dma_last == dma) {
				BUG_ON(++retries > 10);
			} else {
				dma_last = dma;
				retries = 0;
			}

			nb_unmap.iova = dma->iova;
			nb_unmap.size = dma->size;

			/*
			 * Notify anyone (mdev vendor drivers) to invalidate and
			 * unmap iovas within the range we're about to unmap.
			 * Vendor drivers MUST unpin pages in response to an
			 * invalidation.
			 */
			mutex_unlock(&iommu->lock);
			blocking_notifier_call_chain(&iommu->notifier,
						     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
						     &nb_unmap);
			mutex_lock(&iommu->lock);
			goto again;
		}

		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
			ret = update_user_bitmap(bitmap->data, iommu, dma,
						 iova, pgsize);
			if (ret)
				break;
		}

		unmapped += dma->size;
		n = rb_next(n);
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret)
			goto unwind;

		cond_resched();
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
		cond_resched();
	}

	return ret;
}
static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
			    size_t map_size)
{
	dma_addr_t iova = dma->iova;
	unsigned long vaddr = dma->vaddr;
	struct vfio_batch batch;
	size_t size = map_size;
	long npage;
	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	int ret = 0;

	vfio_batch_init(&batch);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
					      size >> PAGE_SHIFT, &pfn, limit,
					      &batch);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
				     dma->prot);
		if (ret) {
			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
						npage, true);
			vfio_batch_unpin(&batch, dma);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	vfio_batch_fini(&batch);
	dma->iommu_mapped = true;

	if (ret)
		vfio_remove_dma(iommu, dma);

	return ret;
}
/*
 * Check dma map request is within a valid iova range
 */
static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
				      dma_addr_t start, dma_addr_t end)
{
	struct list_head *iova = &iommu->iova_list;
	struct vfio_iova *node;

	list_for_each_entry(node, iova, list) {
		if (start >= node->start && end <= node->end)
			return true;
	}

	/*
	 * Check for list_empty() as well since a container with
	 * a single mdev device will have an empty list.
	 */
	return list_empty(iova);
}
1554 static int vfio_dma_do_map(struct vfio_iommu
*iommu
,
1555 struct vfio_iommu_type1_dma_map
*map
)
1557 bool set_vaddr
= map
->flags
& VFIO_DMA_MAP_FLAG_VADDR
;
1558 dma_addr_t iova
= map
->iova
;
1559 unsigned long vaddr
= map
->vaddr
;
1560 size_t size
= map
->size
;
1561 int ret
= 0, prot
= 0;
1563 struct vfio_dma
*dma
;
1565 /* Verify that none of our __u64 fields overflow */
1566 if (map
->size
!= size
|| map
->vaddr
!= vaddr
|| map
->iova
!= iova
)
1569 /* READ/WRITE from device perspective */
1570 if (map
->flags
& VFIO_DMA_MAP_FLAG_WRITE
)
1571 prot
|= IOMMU_WRITE
;
1572 if (map
->flags
& VFIO_DMA_MAP_FLAG_READ
)
1575 if ((prot
&& set_vaddr
) || (!prot
&& !set_vaddr
))
1578 mutex_lock(&iommu
->lock
);
1580 pgsize
= (size_t)1 << __ffs(iommu
->pgsize_bitmap
);
1582 WARN_ON((pgsize
- 1) & PAGE_MASK
);
1584 if (!size
|| (size
| iova
| vaddr
) & (pgsize
- 1)) {
1589 /* Don't allow IOVA or virtual address wrap */
1590 if (iova
+ size
- 1 < iova
|| vaddr
+ size
- 1 < vaddr
) {
1595 dma
= vfio_find_dma(iommu
, iova
, size
);
1599 } else if (!dma
->vaddr_invalid
|| dma
->iova
!= iova
||
1600 dma
->size
!= size
) {
1604 dma
->vaddr_invalid
= false;
1605 iommu
->vaddr_invalid_count
--;
1606 wake_up_all(&iommu
->vaddr_wait
);
1614 if (!iommu
->dma_avail
) {
1619 if (!vfio_iommu_iova_dma_valid(iommu
, iova
, iova
+ size
- 1)) {
1624 dma
= kzalloc(sizeof(*dma
), GFP_KERNEL
);
1636 * We need to be able to both add to a task's locked memory and test
1637 * against the locked memory limit and we need to be able to do both
1638 * outside of this call path as pinning can be asynchronous via the
1639 * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
1640 * task_struct and VM locked pages requires an mm_struct, however
1641 * holding an indefinite mm reference is not recommended, therefore we
1642 * only hold a reference to a task. We could hold a reference to
1643 * current, however QEMU uses this call path through vCPU threads,
1644 * which can be killed resulting in a NULL mm and failure in the unmap
1645 * path when called via a different thread. Avoid this problem by
1646 * using the group_leader as threads within the same group require
1647 * both CLONE_THREAD and CLONE_VM and will therefore use the same
1650 * Previously we also used the task for testing CAP_IPC_LOCK at the
1651 * time of pinning and accounting, however has_capability() makes use
1652 * of real_cred, a copy-on-write field, so we can't guarantee that it
1653 * matches group_leader, or in fact that it might not change by the
1654 * time it's evaluated. If a process were to call MAP_DMA with
1655 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
1656 * possibly see different results for an iommu_mapped vfio_dma vs
1657 * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
1658 * time of calling MAP_DMA.
1660 get_task_struct(current
->group_leader
);
1661 dma
->task
= current
->group_leader
;
1662 dma
->lock_cap
= capable(CAP_IPC_LOCK
);
1664 dma
->pfn_list
= RB_ROOT
;
1666 /* Insert zero-sized and grow as we map chunks of it */
1667 vfio_link_dma(iommu
, dma
);
1669 /* Don't pin and map if container doesn't contain IOMMU capable domain*/
1670 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu
))
1673 ret
= vfio_pin_map_dma(iommu
, dma
, size
);
1675 if (!ret
&& iommu
->dirty_page_tracking
) {
1676 ret
= vfio_dma_bitmap_alloc(dma
, pgsize
);
1678 vfio_remove_dma(iommu
, dma
);
1682 mutex_unlock(&iommu
->lock
);
1686 static int vfio_bus_type(struct device
*dev
, void *data
)
1688 struct bus_type
**bus
= data
;
1690 if (*bus
&& *bus
!= dev
->bus
)
1698 static int vfio_iommu_replay(struct vfio_iommu
*iommu
,
1699 struct vfio_domain
*domain
)
1701 struct vfio_batch batch
;
1702 struct vfio_domain
*d
= NULL
;
1704 unsigned long limit
= rlimit(RLIMIT_MEMLOCK
) >> PAGE_SHIFT
;
1707 ret
= vfio_wait_all_valid(iommu
);
1711 /* Arbitrarily pick the first domain in the list for lookups */
1712 if (!list_empty(&iommu
->domain_list
))
1713 d
= list_first_entry(&iommu
->domain_list
,
1714 struct vfio_domain
, next
);
1716 vfio_batch_init(&batch
);
1718 n
= rb_first(&iommu
->dma_list
);
1720 for (; n
; n
= rb_next(n
)) {
1721 struct vfio_dma
*dma
;
1724 dma
= rb_entry(n
, struct vfio_dma
, node
);
1727 while (iova
< dma
->iova
+ dma
->size
) {
1731 if (dma
->iommu_mapped
) {
1735 if (WARN_ON(!d
)) { /* mapped w/o a domain?! */
1740 phys
= iommu_iova_to_phys(d
->domain
, iova
);
1742 if (WARN_ON(!phys
)) {
1750 while (i
< dma
->iova
+ dma
->size
&&
1751 p
== iommu_iova_to_phys(d
->domain
, i
)) {
1758 unsigned long vaddr
= dma
->vaddr
+
1760 size_t n
= dma
->iova
+ dma
->size
- iova
;
1763 npage
= vfio_pin_pages_remote(dma
, vaddr
,
1773 phys
= pfn
<< PAGE_SHIFT
;
1774 size
= npage
<< PAGE_SHIFT
;
1777 ret
= iommu_map(domain
->domain
, iova
, phys
,
1778 size
, dma
->prot
| domain
->prot
);
1780 if (!dma
->iommu_mapped
) {
1781 vfio_unpin_pages_remote(dma
, iova
,
1785 vfio_batch_unpin(&batch
, dma
);
1794 /* All dmas are now mapped, defer to second tree walk for unwind */
1795 for (n
= rb_first(&iommu
->dma_list
); n
; n
= rb_next(n
)) {
1796 struct vfio_dma
*dma
= rb_entry(n
, struct vfio_dma
, node
);
1798 dma
->iommu_mapped
= true;
1801 vfio_batch_fini(&batch
);
1805 for (; n
; n
= rb_prev(n
)) {
1806 struct vfio_dma
*dma
= rb_entry(n
, struct vfio_dma
, node
);
1809 if (dma
->iommu_mapped
) {
1810 iommu_unmap(domain
->domain
, dma
->iova
, dma
->size
);
1815 while (iova
< dma
->iova
+ dma
->size
) {
1816 phys_addr_t phys
, p
;
1820 phys
= iommu_iova_to_phys(domain
->domain
, iova
);
1829 while (i
< dma
->iova
+ dma
->size
&&
1830 p
== iommu_iova_to_phys(domain
->domain
, i
)) {
1836 iommu_unmap(domain
->domain
, iova
, size
);
1837 vfio_unpin_pages_remote(dma
, iova
, phys
>> PAGE_SHIFT
,
1838 size
>> PAGE_SHIFT
, true);
1842 vfio_batch_fini(&batch
);
1847 * We change our unmap behavior slightly depending on whether the IOMMU
1848 * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
1849 * for practically any contiguous power-of-two mapping we give it. This means
1850 * we don't need to look for contiguous chunks ourselves to make unmapping
1851 * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
1852 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1853 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1854 * hugetlbfs is in use.
1856 static void vfio_test_domain_fgsp(struct vfio_domain
*domain
)
1859 int ret
, order
= get_order(PAGE_SIZE
* 2);
1861 pages
= alloc_pages(GFP_KERNEL
| __GFP_ZERO
, order
);
1865 ret
= iommu_map(domain
->domain
, 0, page_to_phys(pages
), PAGE_SIZE
* 2,
1866 IOMMU_READ
| IOMMU_WRITE
| domain
->prot
);
1868 size_t unmapped
= iommu_unmap(domain
->domain
, 0, PAGE_SIZE
);
1870 if (unmapped
== PAGE_SIZE
)
1871 iommu_unmap(domain
->domain
, PAGE_SIZE
, PAGE_SIZE
);
1873 domain
->fgsp
= true;
1876 __free_pages(pages
, order
);
1879 static struct vfio_iommu_group
*find_iommu_group(struct vfio_domain
*domain
,
1880 struct iommu_group
*iommu_group
)
1882 struct vfio_iommu_group
*g
;
1884 list_for_each_entry(g
, &domain
->group_list
, next
) {
1885 if (g
->iommu_group
== iommu_group
)
1892 static struct vfio_iommu_group
*
1893 vfio_iommu_find_iommu_group(struct vfio_iommu
*iommu
,
1894 struct iommu_group
*iommu_group
)
1896 struct vfio_domain
*domain
;
1897 struct vfio_iommu_group
*group
= NULL
;
1899 list_for_each_entry(domain
, &iommu
->domain_list
, next
) {
1900 group
= find_iommu_group(domain
, iommu_group
);
1905 if (iommu
->external_domain
)
1906 group
= find_iommu_group(iommu
->external_domain
, iommu_group
);
1911 static bool vfio_iommu_has_sw_msi(struct list_head
*group_resv_regions
,
1914 struct iommu_resv_region
*region
;
1917 list_for_each_entry(region
, group_resv_regions
, list
) {
1919 * The presence of any 'real' MSI regions should take
1920 * precedence over the software-managed one if the
1921 * IOMMU driver happens to advertise both types.
1923 if (region
->type
== IOMMU_RESV_MSI
) {
1928 if (region
->type
== IOMMU_RESV_SW_MSI
) {
1929 *base
= region
->start
;
1937 static int vfio_mdev_attach_domain(struct device
*dev
, void *data
)
1939 struct mdev_device
*mdev
= to_mdev_device(dev
);
1940 struct iommu_domain
*domain
= data
;
1941 struct device
*iommu_device
;
1943 iommu_device
= mdev_get_iommu_device(mdev
);
1945 if (iommu_dev_feature_enabled(iommu_device
, IOMMU_DEV_FEAT_AUX
))
1946 return iommu_aux_attach_device(domain
, iommu_device
);
1948 return iommu_attach_device(domain
, iommu_device
);
1954 static int vfio_mdev_detach_domain(struct device
*dev
, void *data
)
1956 struct mdev_device
*mdev
= to_mdev_device(dev
);
1957 struct iommu_domain
*domain
= data
;
1958 struct device
*iommu_device
;
1960 iommu_device
= mdev_get_iommu_device(mdev
);
1962 if (iommu_dev_feature_enabled(iommu_device
, IOMMU_DEV_FEAT_AUX
))
1963 iommu_aux_detach_device(domain
, iommu_device
);
1965 iommu_detach_device(domain
, iommu_device
);
1971 static int vfio_iommu_attach_group(struct vfio_domain
*domain
,
1972 struct vfio_iommu_group
*group
)
1974 if (group
->mdev_group
)
1975 return iommu_group_for_each_dev(group
->iommu_group
,
1977 vfio_mdev_attach_domain
);
1979 return iommu_attach_group(domain
->domain
, group
->iommu_group
);
1982 static void vfio_iommu_detach_group(struct vfio_domain
*domain
,
1983 struct vfio_iommu_group
*group
)
1985 if (group
->mdev_group
)
1986 iommu_group_for_each_dev(group
->iommu_group
, domain
->domain
,
1987 vfio_mdev_detach_domain
);
1989 iommu_detach_group(domain
->domain
, group
->iommu_group
);
1992 static bool vfio_bus_is_mdev(struct bus_type
*bus
)
1994 struct bus_type
*mdev_bus
;
1997 mdev_bus
= symbol_get(mdev_bus_type
);
1999 ret
= (bus
== mdev_bus
);
2000 symbol_put(mdev_bus_type
);
2006 static int vfio_mdev_iommu_device(struct device
*dev
, void *data
)
2008 struct mdev_device
*mdev
= to_mdev_device(dev
);
2009 struct device
**old
= data
, *new;
2011 new = mdev_get_iommu_device(mdev
);
2012 if (!new || (*old
&& *old
!= new))
2021 * This is a helper function to insert an address range to iova list.
2022 * The list is initially created with a single entry corresponding to
2023 * the IOMMU domain geometry to which the device group is attached.
2024 * The list aperture gets modified when a new domain is added to the
2025 * container if the new aperture doesn't conflict with the current one
2026 * or with any existing dma mappings. The list is also modified to
2027 * exclude any reserved regions associated with the device group.
2029 static int vfio_iommu_iova_insert(struct list_head
*head
,
2030 dma_addr_t start
, dma_addr_t end
)
2032 struct vfio_iova
*region
;
2034 region
= kmalloc(sizeof(*region
), GFP_KERNEL
);
2038 INIT_LIST_HEAD(®ion
->list
);
2039 region
->start
= start
;
2042 list_add_tail(®ion
->list
, head
);
2047 * Check the new iommu aperture conflicts with existing aper or with any
2048 * existing dma mappings.
2050 static bool vfio_iommu_aper_conflict(struct vfio_iommu
*iommu
,
2051 dma_addr_t start
, dma_addr_t end
)
2053 struct vfio_iova
*first
, *last
;
2054 struct list_head
*iova
= &iommu
->iova_list
;
2056 if (list_empty(iova
))
2059 /* Disjoint sets, return conflict */
2060 first
= list_first_entry(iova
, struct vfio_iova
, list
);
2061 last
= list_last_entry(iova
, struct vfio_iova
, list
);
2062 if (start
> last
->end
|| end
< first
->start
)
2065 /* Check for any existing dma mappings below the new start */
2066 if (start
> first
->start
) {
2067 if (vfio_find_dma(iommu
, first
->start
, start
- first
->start
))
2071 /* Check for any existing dma mappings beyond the new end */
2072 if (end
< last
->end
) {
2073 if (vfio_find_dma(iommu
, end
+ 1, last
->end
- end
))
2081 * Resize iommu iova aperture window. This is called only if the new
2082 * aperture has no conflict with existing aperture and dma mappings.
2084 static int vfio_iommu_aper_resize(struct list_head
*iova
,
2085 dma_addr_t start
, dma_addr_t end
)
2087 struct vfio_iova
*node
, *next
;
2089 if (list_empty(iova
))
2090 return vfio_iommu_iova_insert(iova
, start
, end
);
2092 /* Adjust iova list start */
2093 list_for_each_entry_safe(node
, next
, iova
, list
) {
2094 if (start
< node
->start
)
2096 if (start
>= node
->start
&& start
< node
->end
) {
2097 node
->start
= start
;
2100 /* Delete nodes before new start */
2101 list_del(&node
->list
);
2105 /* Adjust iova list end */
2106 list_for_each_entry_safe(node
, next
, iova
, list
) {
2107 if (end
> node
->end
)
2109 if (end
> node
->start
&& end
<= node
->end
) {
2113 /* Delete nodes after new end */
2114 list_del(&node
->list
);
2122 * Check reserved region conflicts with existing dma mappings
2124 static bool vfio_iommu_resv_conflict(struct vfio_iommu
*iommu
,
2125 struct list_head
*resv_regions
)
2127 struct iommu_resv_region
*region
;
2129 /* Check for conflict with existing dma mappings */
2130 list_for_each_entry(region
, resv_regions
, list
) {
2131 if (region
->type
== IOMMU_RESV_DIRECT_RELAXABLE
)
2134 if (vfio_find_dma(iommu
, region
->start
, region
->length
))
2142 * Check iova region overlap with reserved regions and
2143 * exclude them from the iommu iova range
2145 static int vfio_iommu_resv_exclude(struct list_head
*iova
,
2146 struct list_head
*resv_regions
)
2148 struct iommu_resv_region
*resv
;
2149 struct vfio_iova
*n
, *next
;
2151 list_for_each_entry(resv
, resv_regions
, list
) {
2152 phys_addr_t start
, end
;
2154 if (resv
->type
== IOMMU_RESV_DIRECT_RELAXABLE
)
2157 start
= resv
->start
;
2158 end
= resv
->start
+ resv
->length
- 1;
2160 list_for_each_entry_safe(n
, next
, iova
, list
) {
2164 if (start
> n
->end
|| end
< n
->start
)
2167 * Insert a new node if current node overlaps with the
2168 * reserve region to exclude that from valid iova range.
2169 * Note that, new node is inserted before the current
2170 * node and finally the current node is deleted keeping
2171 * the list updated and sorted.
2173 if (start
> n
->start
)
2174 ret
= vfio_iommu_iova_insert(&n
->list
, n
->start
,
2176 if (!ret
&& end
< n
->end
)
2177 ret
= vfio_iommu_iova_insert(&n
->list
, end
+ 1,
2187 if (list_empty(iova
))
2193 static void vfio_iommu_resv_free(struct list_head
*resv_regions
)
2195 struct iommu_resv_region
*n
, *next
;
2197 list_for_each_entry_safe(n
, next
, resv_regions
, list
) {
2203 static void vfio_iommu_iova_free(struct list_head
*iova
)
2205 struct vfio_iova
*n
, *next
;
2207 list_for_each_entry_safe(n
, next
, iova
, list
) {
2213 static int vfio_iommu_iova_get_copy(struct vfio_iommu
*iommu
,
2214 struct list_head
*iova_copy
)
2216 struct list_head
*iova
= &iommu
->iova_list
;
2217 struct vfio_iova
*n
;
2220 list_for_each_entry(n
, iova
, list
) {
2221 ret
= vfio_iommu_iova_insert(iova_copy
, n
->start
, n
->end
);
2229 vfio_iommu_iova_free(iova_copy
);
2233 static void vfio_iommu_iova_insert_copy(struct vfio_iommu
*iommu
,
2234 struct list_head
*iova_copy
)
2236 struct list_head
*iova
= &iommu
->iova_list
;
2238 vfio_iommu_iova_free(iova
);
2240 list_splice_tail(iova_copy
, iova
);
2243 static int vfio_iommu_type1_attach_group(void *iommu_data
,
2244 struct iommu_group
*iommu_group
)
2246 struct vfio_iommu
*iommu
= iommu_data
;
2247 struct vfio_iommu_group
*group
;
2248 struct vfio_domain
*domain
, *d
;
2249 struct bus_type
*bus
= NULL
;
2251 bool resv_msi
, msi_remap
;
2252 phys_addr_t resv_msi_base
= 0;
2253 struct iommu_domain_geometry
*geo
;
2254 LIST_HEAD(iova_copy
);
2255 LIST_HEAD(group_resv_regions
);
2257 mutex_lock(&iommu
->lock
);
2259 /* Check for duplicates */
2260 if (vfio_iommu_find_iommu_group(iommu
, iommu_group
)) {
2261 mutex_unlock(&iommu
->lock
);
2265 group
= kzalloc(sizeof(*group
), GFP_KERNEL
);
2266 domain
= kzalloc(sizeof(*domain
), GFP_KERNEL
);
2267 if (!group
|| !domain
) {
2272 group
->iommu_group
= iommu_group
;
2274 /* Determine bus_type in order to allocate a domain */
2275 ret
= iommu_group_for_each_dev(iommu_group
, &bus
, vfio_bus_type
);
2279 if (vfio_bus_is_mdev(bus
)) {
2280 struct device
*iommu_device
= NULL
;
2282 group
->mdev_group
= true;
2284 /* Determine the isolation type */
2285 ret
= iommu_group_for_each_dev(iommu_group
, &iommu_device
,
2286 vfio_mdev_iommu_device
);
2287 if (ret
|| !iommu_device
) {
2288 if (!iommu
->external_domain
) {
2289 INIT_LIST_HEAD(&domain
->group_list
);
2290 iommu
->external_domain
= domain
;
2291 vfio_update_pgsize_bitmap(iommu
);
2296 list_add(&group
->next
,
2297 &iommu
->external_domain
->group_list
);
2299 * Non-iommu backed group cannot dirty memory directly,
2300 * it can only use interfaces that provide dirty
2302 * The iommu scope can only be promoted with the
2303 * addition of a dirty tracking group.
2305 group
->pinned_page_dirty_scope
= true;
2306 mutex_unlock(&iommu
->lock
);
2311 bus
= iommu_device
->bus
;
2314 domain
->domain
= iommu_domain_alloc(bus
);
2315 if (!domain
->domain
) {
2320 if (iommu
->nesting
) {
2321 ret
= iommu_enable_nesting(domain
->domain
);
2326 ret
= vfio_iommu_attach_group(domain
, group
);
2330 /* Get aperture info */
2331 geo
= &domain
->domain
->geometry
;
2332 if (vfio_iommu_aper_conflict(iommu
, geo
->aperture_start
,
2333 geo
->aperture_end
)) {
2338 ret
= iommu_get_group_resv_regions(iommu_group
, &group_resv_regions
);
2342 if (vfio_iommu_resv_conflict(iommu
, &group_resv_regions
)) {
2348 * We don't want to work on the original iova list as the list
2349 * gets modified and in case of failure we have to retain the
2350 * original list. Get a copy here.
2352 ret
= vfio_iommu_iova_get_copy(iommu
, &iova_copy
);
2356 ret
= vfio_iommu_aper_resize(&iova_copy
, geo
->aperture_start
,
2361 ret
= vfio_iommu_resv_exclude(&iova_copy
, &group_resv_regions
);
2365 resv_msi
= vfio_iommu_has_sw_msi(&group_resv_regions
, &resv_msi_base
);
2367 INIT_LIST_HEAD(&domain
->group_list
);
2368 list_add(&group
->next
, &domain
->group_list
);
2370 msi_remap
= irq_domain_check_msi_remap() ||
2371 iommu_capable(bus
, IOMMU_CAP_INTR_REMAP
);
2373 if (!allow_unsafe_interrupts
&& !msi_remap
) {
2374 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2380 if (iommu_capable(bus
, IOMMU_CAP_CACHE_COHERENCY
))
2381 domain
->prot
|= IOMMU_CACHE
;
2384 * Try to match an existing compatible domain. We don't want to
2385 * preclude an IOMMU driver supporting multiple bus_types and being
2386 * able to include different bus_types in the same IOMMU domain, so
2387 * we test whether the domains use the same iommu_ops rather than
2388 * testing if they're on the same bus_type.
2390 list_for_each_entry(d
, &iommu
->domain_list
, next
) {
2391 if (d
->domain
->ops
== domain
->domain
->ops
&&
2392 d
->prot
== domain
->prot
) {
2393 vfio_iommu_detach_group(domain
, group
);
2394 if (!vfio_iommu_attach_group(d
, group
)) {
2395 list_add(&group
->next
, &d
->group_list
);
2396 iommu_domain_free(domain
->domain
);
2401 ret
= vfio_iommu_attach_group(domain
, group
);
2407 vfio_test_domain_fgsp(domain
);
2409 /* replay mappings on new domains */
2410 ret
= vfio_iommu_replay(iommu
, domain
);
2415 ret
= iommu_get_msi_cookie(domain
->domain
, resv_msi_base
);
2416 if (ret
&& ret
!= -ENODEV
)
2420 list_add(&domain
->next
, &iommu
->domain_list
);
2421 vfio_update_pgsize_bitmap(iommu
);
2423 /* Delete the old one and insert new iova list */
2424 vfio_iommu_iova_insert_copy(iommu
, &iova_copy
);
2427 * An iommu backed group can dirty memory directly and therefore
2428 * demotes the iommu scope until it declares itself dirty tracking
2429 * capable via the page pinning interface.
2431 iommu
->num_non_pinned_groups
++;
2432 mutex_unlock(&iommu
->lock
);
2433 vfio_iommu_resv_free(&group_resv_regions
);
2438 vfio_iommu_detach_group(domain
, group
);
2440 iommu_domain_free(domain
->domain
);
2441 vfio_iommu_iova_free(&iova_copy
);
2442 vfio_iommu_resv_free(&group_resv_regions
);
2446 mutex_unlock(&iommu
->lock
);
2450 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu
*iommu
)
2452 struct rb_node
*node
;
2454 while ((node
= rb_first(&iommu
->dma_list
)))
2455 vfio_remove_dma(iommu
, rb_entry(node
, struct vfio_dma
, node
));
2458 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu
*iommu
)
2460 struct rb_node
*n
, *p
;
2462 n
= rb_first(&iommu
->dma_list
);
2463 for (; n
; n
= rb_next(n
)) {
2464 struct vfio_dma
*dma
;
2465 long locked
= 0, unlocked
= 0;
2467 dma
= rb_entry(n
, struct vfio_dma
, node
);
2468 unlocked
+= vfio_unmap_unpin(iommu
, dma
, false);
2469 p
= rb_first(&dma
->pfn_list
);
2470 for (; p
; p
= rb_next(p
)) {
2471 struct vfio_pfn
*vpfn
= rb_entry(p
, struct vfio_pfn
,
2474 if (!is_invalid_reserved_pfn(vpfn
->pfn
))
2477 vfio_lock_acct(dma
, locked
- unlocked
, true);
2482 * Called when a domain is removed in detach. It is possible that
2483 * the removed domain decided the iova aperture window. Modify the
2484 * iova aperture with the smallest window among existing domains.
2486 static void vfio_iommu_aper_expand(struct vfio_iommu
*iommu
,
2487 struct list_head
*iova_copy
)
2489 struct vfio_domain
*domain
;
2490 struct vfio_iova
*node
;
2491 dma_addr_t start
= 0;
2492 dma_addr_t end
= (dma_addr_t
)~0;
2494 if (list_empty(iova_copy
))
2497 list_for_each_entry(domain
, &iommu
->domain_list
, next
) {
2498 struct iommu_domain_geometry
*geo
= &domain
->domain
->geometry
;
2500 if (geo
->aperture_start
> start
)
2501 start
= geo
->aperture_start
;
2502 if (geo
->aperture_end
< end
)
2503 end
= geo
->aperture_end
;
2506 /* Modify aperture limits. The new aper is either same or bigger */
2507 node
= list_first_entry(iova_copy
, struct vfio_iova
, list
);
2508 node
->start
= start
;
2509 node
= list_last_entry(iova_copy
, struct vfio_iova
, list
);
/*
 * Called when a group is detached. The reserved regions for that
 * group can be part of valid iova now. But since reserved regions
 * may be duplicated among groups, populate the iova valid regions
 * list again.
 */
static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
				   struct list_head *iova_copy)
{
	struct vfio_domain *d;
	struct vfio_iommu_group *g;
	struct vfio_iova *node;
	dma_addr_t start, end;
	LIST_HEAD(resv_regions);
	int ret;

	if (list_empty(iova_copy))
		return -EINVAL;

	list_for_each_entry(d, &iommu->domain_list, next) {
		list_for_each_entry(g, &d->group_list, next) {
			ret = iommu_get_group_resv_regions(g->iommu_group,
							   &resv_regions);
			if (ret)
				goto done;
		}
	}

	node = list_first_entry(iova_copy, struct vfio_iova, list);
	start = node->start;
	node = list_last_entry(iova_copy, struct vfio_iova, list);
	end = node->end;

	/* purge the iova list and create new one */
	vfio_iommu_iova_free(iova_copy);

	ret = vfio_iommu_aper_resize(iova_copy, start, end);
	if (ret)
		goto done;

	/* Exclude current reserved regions from iova ranges */
	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
done:
	vfio_iommu_resv_free(&resv_regions);
	return ret;
}
static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_iommu_group *group;
	bool update_dirty_scope = false;
	LIST_HEAD(iova_copy);

	mutex_lock(&iommu->lock);

	if (iommu->external_domain) {
		group = find_iommu_group(iommu->external_domain, iommu_group);
		if (group) {
			update_dirty_scope = !group->pinned_page_dirty_scope;
			list_del(&group->next);
			kfree(group);

			if (list_empty(&iommu->external_domain->group_list)) {
				if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
					WARN_ON(iommu->notifier.head);
					vfio_iommu_unmap_unpin_all(iommu);
				}

				kfree(iommu->external_domain);
				iommu->external_domain = NULL;
			}
			goto detach_group_done;
		}
	}

	/*
	 * Get a copy of iova list. This will be used to update
	 * and to replace the current one later. Please note that
	 * we will leave the original list as it is if update fails.
	 */
	vfio_iommu_iova_get_copy(iommu, &iova_copy);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		group = find_iommu_group(domain, iommu_group);
		if (!group)
			continue;

		vfio_iommu_detach_group(domain, group);
		update_dirty_scope = !group->pinned_page_dirty_scope;
		list_del(&group->next);
		kfree(group);
		/*
		 * Group ownership provides privilege, if the group list is
		 * empty, the domain goes away. If it's the last domain with
		 * iommu and external domain doesn't exist, then all the
		 * mappings go away too. If it's the last domain with iommu and
		 * external domain exist, update accounting
		 */
		if (list_empty(&domain->group_list)) {
			if (list_is_singular(&iommu->domain_list)) {
				if (!iommu->external_domain) {
					WARN_ON(iommu->notifier.head);
					vfio_iommu_unmap_unpin_all(iommu);
				} else {
					vfio_iommu_unmap_unpin_reaccount(iommu);
				}
			}
			iommu_domain_free(domain->domain);
			list_del(&domain->next);
			kfree(domain);
			vfio_iommu_aper_expand(iommu, &iova_copy);
			vfio_update_pgsize_bitmap(iommu);
		}
		break;
	}

	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
	else
		vfio_iommu_iova_free(&iova_copy);

detach_group_done:
	/*
	 * Removal of a group without dirty tracking may allow the iommu scope
	 * to be promoted.
	 */
	if (update_dirty_scope) {
		iommu->num_non_pinned_groups--;
		if (iommu->dirty_page_tracking)
			vfio_iommu_populate_bitmap_full(iommu);
	}
	mutex_unlock(&iommu->lock);
}
static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
		fallthrough;
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	INIT_LIST_HEAD(&iommu->iova_list);
	iommu->dma_list = RB_ROOT;
	iommu->dma_avail = dma_entry_limit;
	iommu->container_open = true;
	mutex_init(&iommu->lock);
	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
	init_waitqueue_head(&iommu->vaddr_wait);

	return iommu;
}
static void vfio_release_domain(struct vfio_domain *domain, bool external)
{
	struct vfio_iommu_group *group, *group_tmp;

	list_for_each_entry_safe(group, group_tmp,
				 &domain->group_list, next) {
		if (!external)
			vfio_iommu_detach_group(domain, group);
		list_del(&group->next);
		kfree(group);
	}

	if (!external)
		iommu_domain_free(domain->domain);
}
static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;

	if (iommu->external_domain) {
		vfio_release_domain(iommu->external_domain, true);
		kfree(iommu->external_domain);
	}

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		vfio_release_domain(domain, false);
		list_del(&domain->next);
		kfree(domain);
	}

	vfio_iommu_iova_free(&iommu->iova_list);

	kfree(iommu);
}
static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->prot & IOMMU_CACHE)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}
static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
					    unsigned long arg)
{
	switch (arg) {
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_TYPE1_NESTING_IOMMU:
	case VFIO_UNMAP_ALL:
	case VFIO_UPDATE_VADDR:
		return 1;
	case VFIO_DMA_CC_IOMMU:
		if (!iommu)
			return 0;
		return vfio_domains_have_iommu_cache(iommu);
	default:
		return 0;
	}
}
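/*
 * Illustrative only: userspace probes these extensions on the container fd
 * with VFIO_CHECK_EXTENSION, which returns 1 if the given capability is
 * supported and 0 otherwise (0 is not an error), e.g.:
 *
 *	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU) == 1)
 *		// all attached domains enforce cache coherency
 */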
static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
		 size_t size)
{
	struct vfio_info_cap_header *header;
	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;

	header = vfio_info_cap_add(caps, size,
				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	iova_cap = container_of(header,
				struct vfio_iommu_type1_info_cap_iova_range,
				header);
	iova_cap->nr_iovas = cap_iovas->nr_iovas;
	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));

	return 0;
}
static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
				      struct vfio_info_cap *caps)
{
	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
	struct vfio_iova *iova;
	size_t size;
	int iovas = 0, i = 0, ret;

	list_for_each_entry(iova, &iommu->iova_list, list)
		iovas++;

	if (!iovas) {
		/*
		 * Return 0 as a container with a single mdev device
		 * will have an empty list
		 */
		return 0;
	}

	size = struct_size(cap_iovas, iova_ranges, iovas);

	cap_iovas = kzalloc(size, GFP_KERNEL);
	if (!cap_iovas)
		return -ENOMEM;

	cap_iovas->nr_iovas = iovas;

	list_for_each_entry(iova, &iommu->iova_list, list) {
		cap_iovas->iova_ranges[i].start = iova->start;
		cap_iovas->iova_ranges[i].end = iova->end;
		i++;
	}

	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);

	kfree(cap_iovas);

	return ret;
}
static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
					   struct vfio_info_cap *caps)
{
	struct vfio_iommu_type1_info_cap_migration cap_mig;

	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
	cap_mig.header.version = 1;

	cap_mig.flags = 0;
	/* support minimum pgsize */
	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;

	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
}
static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
					   struct vfio_info_cap *caps)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;

	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
	cap_dma_avail.header.version = 1;

	cap_dma_avail.avail = iommu->dma_avail;

	return vfio_info_add_capability(caps, &cap_dma_avail.header,
					sizeof(cap_dma_avail));
}
static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
				     unsigned long arg)
{
	struct vfio_iommu_type1_info info;
	unsigned long minsz;
	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
	unsigned long capsz;
	int ret;

	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

	/* For backward compatibility, cannot require this */
	capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	if (info.argsz >= capsz) {
		minsz = capsz;
		info.cap_offset = 0; /* output, no-recopy necessary */
	}

	mutex_lock(&iommu->lock);
	info.flags = VFIO_IOMMU_INFO_PGSIZES;

	info.iova_pgsizes = iommu->pgsize_bitmap;

	ret = vfio_iommu_migration_build_caps(iommu, &caps);

	if (!ret)
		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);

	if (!ret)
		ret = vfio_iommu_iova_build_caps(iommu, &caps);

	mutex_unlock(&iommu->lock);

	if (ret)
		return ret;

	if (caps.size) {
		info.flags |= VFIO_IOMMU_INFO_CAPS;

		if (info.argsz < sizeof(info) + caps.size) {
			info.argsz = sizeof(info) + caps.size;
		} else {
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user((void __user *)arg +
					sizeof(info), caps.buf,
					caps.size)) {
				kfree(caps.buf);
				return -EFAULT;
			}
			info.cap_offset = sizeof(info);
		}

		kfree(caps.buf);
	}

	return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
}
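/*
 * Illustrative userspace sketch (assumes "container" is a container fd with
 * the Type1 backend already set). The fixed part of the structure reports
 * the supported IOVA page sizes; any extra data is chained as capabilities
 * starting at info.cap_offset:
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);
 *	if ((info.flags & VFIO_IOMMU_INFO_CAPS) && info.argsz > sizeof(info)) {
 *		// re-issue with a buffer of info.argsz bytes and walk the
 *		// vfio_info_cap_header chain beginning at info.cap_offset
 *	}
 */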
static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
				    unsigned long arg)
{
	struct vfio_iommu_type1_dma_map map;
	unsigned long minsz;
	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
			VFIO_DMA_MAP_FLAG_VADDR;

	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

	if (copy_from_user(&map, (void __user *)arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz || map.flags & ~mask)
		return -EINVAL;

	return vfio_dma_do_map(iommu, &map);
}
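/*
 * Illustrative userspace sketch, not part of this driver: mapping a
 * page-aligned process buffer "buf" of "len" bytes at IOVA 0 for device
 * DMA, assuming "container" is a Type1 container fd:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0,
 *		.size  = len,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */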
static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
				      unsigned long arg)
{
	struct vfio_iommu_type1_dma_unmap unmap;
	struct vfio_bitmap bitmap = { 0 };
	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
			VFIO_DMA_UNMAP_FLAG_VADDR |
			VFIO_DMA_UNMAP_FLAG_ALL;
	unsigned long minsz;
	int ret;

	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

	if (copy_from_user(&unmap, (void __user *)arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~mask)
		return -EINVAL;

	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
			    VFIO_DMA_UNMAP_FLAG_VADDR)))
		return -EINVAL;

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
		unsigned long pgshift;

		if (unmap.argsz < (minsz + sizeof(bitmap)))
			return -EINVAL;

		if (copy_from_user(&bitmap,
				   (void __user *)(arg + minsz),
				   sizeof(bitmap)))
			return -EFAULT;

		if (!access_ok((void __user *)bitmap.data, bitmap.size))
			return -EINVAL;

		pgshift = __ffs(bitmap.pgsize);
		ret = verify_bitmap_size(unmap.size >> pgshift,
					 bitmap.size);
		if (ret)
			return ret;
	}

	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
	if (ret)
		return ret;

	return copy_to_user((void __user *)arg, &unmap, minsz) ?
			-EFAULT : 0;
}
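/*
 * Illustrative userspace sketch: undoing the mapping shown above. On
 * return the structure is copied back so userspace can read how much was
 * actually unmapped:
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0,
 *		.size  = len,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */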
static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
					unsigned long arg)
{
	struct vfio_iommu_type1_dirty_bitmap dirty;
	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	unsigned long minsz;
	int ret = 0;

	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);

	if (copy_from_user(&dirty, (void __user *)arg, minsz))
		return -EFAULT;

	if (dirty.argsz < minsz || dirty.flags & ~mask)
		return -EINVAL;

	/* only one flag should be set at a time */
	if (__ffs(dirty.flags) != __fls(dirty.flags))
		return -EINVAL;

	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
		size_t pgsize;

		mutex_lock(&iommu->lock);
		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
		if (!iommu->dirty_page_tracking) {
			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
			if (!ret)
				iommu->dirty_page_tracking = true;
		}
		mutex_unlock(&iommu->lock);
		return ret;
	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
		mutex_lock(&iommu->lock);
		if (iommu->dirty_page_tracking) {
			iommu->dirty_page_tracking = false;
			vfio_dma_bitmap_free_all(iommu);
		}
		mutex_unlock(&iommu->lock);
		return 0;
	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
		struct vfio_iommu_type1_dirty_bitmap_get range;
		unsigned long pgshift;
		size_t data_size = dirty.argsz - minsz;
		size_t iommu_pgsize;

		if (!data_size || data_size < sizeof(range))
			return -EINVAL;

		if (copy_from_user(&range, (void __user *)(arg + minsz),
				   sizeof(range)))
			return -EFAULT;

		if (range.iova + range.size < range.iova)
			return -EINVAL;
		if (!access_ok((void __user *)range.bitmap.data,
			       range.bitmap.size))
			return -EINVAL;

		pgshift = __ffs(range.bitmap.pgsize);
		ret = verify_bitmap_size(range.size >> pgshift,
					 range.bitmap.size);
		if (ret)
			return ret;

		mutex_lock(&iommu->lock);

		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

		/* allow only smallest supported pgsize */
		if (range.bitmap.pgsize != iommu_pgsize) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (range.iova & (iommu_pgsize - 1)) {
			ret = -EINVAL;
			goto out_unlock;
		}
		if (!range.size || range.size & (iommu_pgsize - 1)) {
			ret = -EINVAL;
			goto out_unlock;
		}

		if (iommu->dirty_page_tracking)
			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
						     iommu, range.iova,
						     range.size,
						     range.bitmap.pgsize);
		else
			ret = -EINVAL;
out_unlock:
		mutex_unlock(&iommu->lock);

		return ret;
	}

	return -EINVAL;
}
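/*
 * Illustrative userspace sketch of the dirty tracking flow handled above
 * (assumes "container" is a Type1v2 container fd and the bitmap page size
 * is the minimum page size reported by VFIO_IOMMU_GET_INFO):
 *
 *	struct vfio_iommu_type1_dirty_bitmap start = {
 *		.argsz = sizeof(start),
 *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
 *	};
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &start);
 *
 *	// ... device runs; later, fetch per-range bitmaps by passing
 *	// VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP with a trailing
 *	// vfio_iommu_type1_dirty_bitmap_get payload, then stop tracking
 *	// with VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP.
 */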
static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		return vfio_iommu_type1_check_extension(iommu, arg);
	case VFIO_IOMMU_GET_INFO:
		return vfio_iommu_type1_get_info(iommu, arg);
	case VFIO_IOMMU_MAP_DMA:
		return vfio_iommu_type1_map_dma(iommu, arg);
	case VFIO_IOMMU_UNMAP_DMA:
		return vfio_iommu_type1_unmap_dma(iommu, arg);
	case VFIO_IOMMU_DIRTY_PAGES:
		return vfio_iommu_type1_dirty_pages(iommu, arg);
	default:
		return -ENOTTY;
	}
}
static int vfio_iommu_type1_register_notifier(void *iommu_data,
					      unsigned long *events,
					      struct notifier_block *nb)
{
	struct vfio_iommu *iommu = iommu_data;

	/* clear known events */
	*events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	/* refuse to register if still events remaining */
	if (*events)
		return -EINVAL;

	return blocking_notifier_chain_register(&iommu->notifier, nb);
}
static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
						struct notifier_block *nb)
{
	struct vfio_iommu *iommu = iommu_data;

	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
}
static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
					 dma_addr_t user_iova, void *data,
					 size_t count, bool write,
					 size_t *copied)
{
	struct mm_struct *mm;
	unsigned long vaddr;
	struct vfio_dma *dma;
	bool kthread = current->mm == NULL;
	size_t offset;
	int ret;

	*copied = 0;

	ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
	if (ret < 0)
		return ret;

	if ((write && !(dma->prot & IOMMU_WRITE)) ||
			!(dma->prot & IOMMU_READ))
		return -EPERM;

	mm = get_task_mm(dma->task);

	if (!mm)
		return -EPERM;

	if (kthread)
		kthread_use_mm(mm);
	else if (current->mm != mm)
		goto out;

	offset = user_iova - dma->iova;

	if (count > dma->size - offset)
		count = dma->size - offset;

	vaddr = dma->vaddr + offset;

	if (write) {
		*copied = copy_to_user((void __user *)vaddr, data,
					 count) ? 0 : count;
		if (*copied && iommu->dirty_page_tracking) {
			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

			/*
			 * Bitmap populated with the smallest supported page
			 * size
			 */
			bitmap_set(dma->bitmap, offset >> pgshift,
				   ((offset + *copied - 1) >> pgshift) -
				   (offset >> pgshift) + 1);
		}
	} else
		*copied = copy_from_user(data, (void __user *)vaddr,
					   count) ? 0 : count;
	if (kthread)
		kthread_unuse_mm(mm);
out:
	mmput(mm);

	return *copied ? 0 : -EFAULT;
}
static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
				   void *data, size_t count, bool write)
{
	struct vfio_iommu *iommu = iommu_data;
	int ret = 0;
	size_t done;

	mutex_lock(&iommu->lock);
	while (count > 0) {
		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
						    count, write, &done);
		if (ret)
			break;

		count -= done;
		data += done;
		user_iova += done;
	}

	mutex_unlock(&iommu->lock);
	return ret;
}
static struct iommu_domain *
vfio_iommu_type1_group_iommu_domain(void *iommu_data,
				    struct iommu_group *iommu_group)
{
	struct iommu_domain *domain = ERR_PTR(-ENODEV);
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *d;

	if (!iommu || !iommu_group)
		return ERR_PTR(-EINVAL);

	mutex_lock(&iommu->lock);
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (find_iommu_group(d, iommu_group)) {
			domain = d->domain;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return domain;
}
static void vfio_iommu_type1_notify(void *iommu_data,
				    enum vfio_iommu_notify_type event)
{
	struct vfio_iommu *iommu = iommu_data;

	if (event != VFIO_IOMMU_CONTAINER_CLOSE)
		return;
	mutex_lock(&iommu->lock);
	iommu->container_open = false;
	mutex_unlock(&iommu->lock);
	wake_up_all(&iommu->vaddr_wait);
}
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name			= "vfio-iommu-type1",
	.owner			= THIS_MODULE,
	.open			= vfio_iommu_type1_open,
	.release		= vfio_iommu_type1_release,
	.ioctl			= vfio_iommu_type1_ioctl,
	.attach_group		= vfio_iommu_type1_attach_group,
	.detach_group		= vfio_iommu_type1_detach_group,
	.pin_pages		= vfio_iommu_type1_pin_pages,
	.unpin_pages		= vfio_iommu_type1_unpin_pages,
	.register_notifier	= vfio_iommu_type1_register_notifier,
	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
	.dma_rw			= vfio_iommu_type1_dma_rw,
	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
	.notify			= vfio_iommu_type1_notify,
};
static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);