/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

struct vfio_iommu {
	struct list_head	domain_list;
	struct mutex		lock;
	struct rb_root		dma_list;
	bool			v2;
	bool			nesting;
};

struct vfio_domain {
	struct iommu_domain	*domain;
	struct list_head	next;
	struct list_head	group_list;
	int			prot;		/* IOMMU_CACHE */
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;		/* Device address */
	unsigned long		vaddr;		/* Process virtual addr */
	size_t			size;		/* Map size (bytes) */
	int			prot;		/* IOMMU_READ/WRITE */
};

struct vfio_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
};

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

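/*
 * The dma_list rb-tree tracks one vfio_dma per userspace mapping request,
 * sorted by IOVA.  vfio_find_dma() returns a tracked mapping overlapping
 * the range [start, start + size), or NULL if none exists.
 */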
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + dma->size)
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + new->size <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

struct vwork {
	struct mm_struct	*mm;
	long			npage;
	struct work_struct	work;
};

/* delayed decrement/increment for locked_vm */
static void vfio_lock_acct_bg(struct work_struct *work)
{
	struct vwork *vwork = container_of(work, struct vwork, work);
	struct mm_struct *mm;

	mm = vwork->mm;
	down_write(&mm->mmap_sem);
	mm->locked_vm += vwork->npage;
	up_write(&mm->mmap_sem);
	mmput(mm);
	kfree(vwork);
}

static void vfio_lock_acct(long npage)
{
	struct vwork *vwork;
	struct mm_struct *mm;

	if (!current->mm || !npage)
		return; /* process exited or nothing to do */

	if (down_write_trylock(&current->mm->mmap_sem)) {
		current->mm->locked_vm += npage;
		up_write(&current->mm->mmap_sem);
		return;
	}

	/*
	 * Couldn't get mmap_sem lock, so must setup to update
	 * mm->locked_vm later. If locked_vm were atomic, we
	 * wouldn't need this silliness
	 */
	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
	if (!vwork)
		return;
	mm = get_task_mm(current);
	if (!mm) {
		kfree(vwork);
		return;
	}
	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
	vwork->mm = mm;
	vwork->npage = npage;
	schedule_work(&vwork->work);
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_head(tail);
		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we've to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (prot & IOMMU_WRITE)
			SetPageDirty(page);
		put_page(page);
		return 1;
	}
	return 0;
}

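/*
 * Resolve a single page of a process virtual address to a host pfn.  Normal
 * memory is pinned via get_user_pages_fast(); if that fails, fall back to a
 * VM_PFNMAP vma lookup so that mmap'd MMIO (not backed by struct page) can
 * still be mapped, provided the pfn is reserved/invalid.
 */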
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	down_read(&current->mm->mmap_sem);

	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}

	up_read(&current->mm->mmap_sem);

	return ret;
}

/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
static long vfio_pin_pages(unsigned long vaddr, long npage,
			   int prot, unsigned long *pfn_base)
{
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	bool lock_cap = capable(CAP_IPC_LOCK);
	long ret, i;

	if (!current->mm)
		return -ENODEV;

	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
	if (ret)
		return ret;

	if (is_invalid_reserved_pfn(*pfn_base))
		return 1;

	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
		put_pfn(*pfn_base, prot);
		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
			limit << PAGE_SHIFT);
		return -ENOMEM;
	}

	if (unlikely(disable_hugepages)) {
		vfio_lock_acct(1);
		return 1;
	}

	/* Lock all the consecutive pages from pfn_base */
	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
		unsigned long pfn = 0;

		ret = vaddr_get_pfn(vaddr, prot, &pfn);
		if (ret)
			break;

		if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
			put_pfn(pfn, prot);
			break;
		}

		if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
			put_pfn(pfn, prot);
			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
				__func__, limit << PAGE_SHIFT);
			break;
		}
	}

	vfio_lock_acct(i);

	return i;
}

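/*
 * Drop the references taken by vfio_pin_pages() on npage consecutive pfns,
 * optionally crediting the pages back against the mm's locked_vm accounting.
 * Returns the number of pages that were actually backed by struct page.
 */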
static long vfio_unpin_pages(unsigned long pfn, long npage,
			     int prot, bool do_accounting)
{
	unsigned long unlocked = 0;
	long i;

	for (i = 0; i < npage; i++)
		unlocked += put_pfn(pfn++, prot);

	if (do_accounting)
		vfio_lock_acct(-unlocked);

	return unlocked;
}

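/*
 * Tear down the IOMMU mappings for a single vfio_dma and unpin the pages
 * backing it.  One domain is kept mapped and used to translate IOVAs back
 * to pfns; all other domains are unmapped up front (see the comment below).
 */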
static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
	struct vfio_domain *domain, *d;
	long unlocked = 0;

	if (!dma->size)
		return;
	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
	domain = d = list_first_entry(&iommu->domain_list,
				      struct vfio_domain, next);

	list_for_each_entry_continue(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, dma->iova, dma->size);

	while (iova < end) {
		size_t unmapped;
		phys_addr_t phys;

		phys = iommu_iova_to_phys(domain->domain, iova);
		if (WARN_ON(!phys)) {
			iova += PAGE_SIZE;
			continue;
		}

		unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE);
		if (WARN_ON(!unmapped))
			break;

		unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
					     unmapped >> PAGE_SHIFT,
					     dma->prot, false);
		iova += unmapped;
	}

	vfio_lock_acct(-unlocked);
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
	vfio_unmap_unpin(iommu, dma);
	vfio_unlink_dma(iommu, dma);
	kfree(dma);
}

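/*
 * The effective page-size bitmap for a container is the intersection of the
 * page sizes supported by every attached IOMMU domain; its lowest set bit
 * determines the minimum mapping granularity enforced on map/unmap requests.
 */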
static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	unsigned long bitmap = PAGE_MASK;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next)
		bitmap &= domain->domain->ops->pgsize_bitmap;
	mutex_unlock(&iommu->lock);

	return bitmap;
}

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma;
	size_t unmapped = 0;
	int ret = 0;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (!unmap->size || unmap->size & mask)
		return -EINVAL;

	WARN_ON(mask & PAGE_MASK);

	mutex_lock(&iommu->lock);

	/*
	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
	 * avoid tracking individual mappings.  This means that the granularity
	 * of the original mapping was lost and the user was allowed to attempt
	 * to unmap any range.  Depending on the contiguousness of physical
	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
	 * or may not have worked.  We only guaranteed unmap granularity
	 * matching the original mapping; even though it was untracked here,
	 * the original mappings are reflected in IOMMU mappings.  This
	 * resulted in a couple unusual behaviors.  First, if a range is not
	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
	 * a zero sized unmap.  Also, if an unmap request overlaps the first
	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
	 * This also returns success and the returned unmap size reflects the
	 * actual size unmapped.
	 *
	 * We attempt to maintain compatibility with this "v1" interface, but
	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
	 * request offset from the beginning of the original mapping will
	 * return success with zero sized unmap.  And an unmap request covering
	 * the first iova of a mapping will unmap the entire range.
	 *
	 * The v2 version of this interface intends to be more deterministic.
	 * Unmap requests must fully cover previous mappings.  Multiple
	 * mappings may still be unmapped by specifying large ranges, but there
	 * must not be any previous mappings bisected by the range.  An error
	 * will be returned if these conditions are not met.  The v2 interface
	 * will only return success and a size of zero if there were no
	 * mappings within the range.
	 */
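	/*
	 * Concretely (illustrative example): if userspace previously mapped a
	 * single 2MB range at IOVA 0x100000-0x2fffff, then under both v1 and
	 * v2 an unmap of {iova = 0x100000, size = 0x200000} removes the whole
	 * mapping.  An unmap of {iova = 0x101000, size = 0x1000} succeeds but
	 * reports a zero-sized unmap under v1 (the request is offset from the
	 * start of the tracked mapping, so nothing is removed), while under
	 * v2 it fails with -EINVAL because it would bisect the mapping.
	 */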
	if (iommu->v2) {
		dma = vfio_find_dma(iommu, unmap->iova, 0);
		if (dma && dma->iova != unmap->iova) {
			ret = -EINVAL;
			goto unlock;
		}
		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
			ret = -EINVAL;
			goto unlock;
		}
	}

	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
		if (!iommu->v2 && unmap->iova > dma->iova)
			break;
		unmapped += dma->size;
		vfio_remove_dma(iommu, dma);
	}

unlock:
	mutex_unlock(&iommu->lock);

	/* Report how much was unmapped */
	unmap->size = unmapped;

	return ret;
}

/*
 * Turns out AMD IOMMU has a page table bug where it won't map large pages
 * to a region that previously mapped smaller pages.  This should be fixed
 * soon, so this is just a temporary workaround to break mappings down into
 * PAGE_SIZE.  Better to map smaller pages than nothing.
 */
static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	long i;
	int ret;

	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
		ret = iommu_map(domain->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot | domain->prot);
		if (ret)
			break;
	}

	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
		iommu_unmap(domain->domain, iova, PAGE_SIZE);

	return ret;
}

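/*
 * Map a run of physically contiguous, pinned pages at the given IOVA into
 * every domain in the container.  On failure the mapping is unwound from any
 * domains that had already accepted it.
 */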
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long pfn, long npage, int prot)
{
	struct vfio_domain *d;
	int ret;

	list_for_each_entry(d, &iommu->domain_list, next) {
		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
				npage << PAGE_SHIFT, prot | d->prot);
		if (ret) {
			if (ret != -EBUSY ||
			    map_try_harder(d, iova, pfn, npage, prot))
				goto unwind;
		}
	}

	return 0;

unwind:
	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);

	return ret;
}

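/*
 * Handle VFIO_IOMMU_MAP_DMA: validate alignment and overlap, allocate a
 * vfio_dma to track the request, then pin the user memory in physically
 * contiguous chunks and map each chunk into all domains, growing dma->size
 * as chunks succeed.  Any failure tears the partial mapping back down.
 */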
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	dma_addr_t iova = map->iova;
	unsigned long vaddr = map->vaddr;
	size_t size = map->size;
	long npage;
	int ret = 0, prot = 0;
	uint64_t mask;
	struct vfio_dma *dma;
	unsigned long pfn;

	/* Verify that none of our __u64 fields overflow */
	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
		return -EINVAL;

	mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;

	WARN_ON(mask & PAGE_MASK);

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot || !size || (size | iova | vaddr) & mask)
		return -EINVAL;

	/* Don't allow IOVA or virtual address wrap */
	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
		return -EINVAL;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		mutex_unlock(&iommu->lock);
		return -EEXIST;
	}

	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
	if (!dma) {
		mutex_unlock(&iommu->lock);
		return -ENOMEM;
	}

	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/* Insert zero-sized and grow as we map chunks of it */
	vfio_link_dma(iommu, dma);

	while (size) {
		/* Pin a contiguous chunk of memory */
		npage = vfio_pin_pages(vaddr + dma->size,
				       size >> PAGE_SHIFT, prot, &pfn);
		if (npage <= 0) {
			WARN_ON(!npage);
			ret = (int)npage;
			break;
		}

		/* Map it! */
		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot);
		if (ret) {
			vfio_unpin_pages(pfn, npage, prot, true);
			break;
		}

		size -= npage << PAGE_SHIFT;
		dma->size += npage << PAGE_SHIFT;
	}

	if (ret)
		vfio_remove_dma(iommu, dma);

	mutex_unlock(&iommu->lock);
	return ret;
}

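/*
 * iommu_group_for_each_dev() callback: record the bus_type of the devices in
 * the group, failing if the group somehow spans more than one bus.
 */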
static int vfio_bus_type(struct device *dev, void *data)
{
	struct bus_type **bus = data;

	if (*bus && *bus != dev->bus)
		return -EINVAL;

	*bus = dev->bus;

	return 0;
}

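/*
 * When a new domain is added to a container that already holds mappings,
 * walk the dma_list and re-create every mapping in the new domain.  An
 * existing domain is used to translate IOVAs back to physical addresses,
 * and physically contiguous runs are coalesced into single iommu_map calls.
 */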
static int vfio_iommu_replay(struct vfio_iommu *iommu,
			     struct vfio_domain *domain)
{
	struct vfio_domain *d;
	struct rb_node *n;
	int ret;

	/* Arbitrarily pick the first domain in the list for lookups */
	d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
	n = rb_first(&iommu->dma_list);

	/* If there's not a domain, there better not be any mappings */
	if (WARN_ON(n && !d))
		return -EINVAL;

	for (; n; n = rb_next(n)) {
		struct vfio_dma *dma;
		dma_addr_t iova;

		dma = rb_entry(n, struct vfio_dma, node);
		iova = dma->iova;

		while (iova < dma->iova + dma->size) {
			phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
			size_t size;

			if (WARN_ON(!phys)) {
				iova += PAGE_SIZE;
				continue;
			}

			size = PAGE_SIZE;

			while (iova + size < dma->iova + dma->size &&
			       phys + size == iommu_iova_to_phys(d->domain,
								 iova + size))
				size += PAGE_SIZE;

			ret = iommu_map(domain->domain, iova, phys,
					size, dma->prot | domain->prot);
			if (ret)
				return ret;

			iova += size;
		}
	}

	return 0;
}

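/*
 * Attach an IOMMU group to the container.  A new IOMMU domain is allocated
 * for the group's bus; if an existing domain with identical iommu_ops and
 * protection flags can accept the group, the new domain is discarded and the
 * group shares that domain instead.  Otherwise existing mappings are replayed
 * into the new domain before it joins the container's domain_list.
 */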
static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group, *g;
	struct vfio_domain *domain, *d;
	struct bus_type *bus = NULL;
	int ret;

	mutex_lock(&iommu->lock);

	list_for_each_entry(d, &iommu->domain_list, next) {
		list_for_each_entry(g, &d->group_list, next) {
			if (g->iommu_group != iommu_group)
				continue;

			mutex_unlock(&iommu->lock);
			return -EINVAL;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	if (!group || !domain) {
		ret = -ENOMEM;
		goto out_free;
	}

	group->iommu_group = iommu_group;

	/* Determine bus_type in order to allocate a domain */
	ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
	if (ret)
		goto out_free;

	domain->domain = iommu_domain_alloc(bus);
	if (!domain->domain) {
		ret = -EIO;
		goto out_free;
	}

	if (iommu->nesting) {
		int attr = 1;

		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
					    &attr);
		if (ret)
			goto out_domain;
	}

	ret = iommu_attach_group(domain->domain, iommu_group);
	if (ret)
		goto out_domain;

	INIT_LIST_HEAD(&domain->group_list);
	list_add(&group->next, &domain->group_list);

	if (!allow_unsafe_interrupts &&
	    !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) {
		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
		       __func__);
		ret = -EPERM;
		goto out_detach;
	}

	if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
		domain->prot |= IOMMU_CACHE;

	/*
	 * Try to match an existing compatible domain.  We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
	list_for_each_entry(d, &iommu->domain_list, next) {
		if (d->domain->ops == domain->domain->ops &&
		    d->prot == domain->prot) {
			iommu_detach_group(domain->domain, iommu_group);
			if (!iommu_attach_group(d->domain, iommu_group)) {
				list_add(&group->next, &d->group_list);
				iommu_domain_free(domain->domain);
				kfree(domain);
				mutex_unlock(&iommu->lock);
				return 0;
			}

			ret = iommu_attach_group(domain->domain, iommu_group);
			if (ret)
				goto out_domain;
		}
	}

	/* replay mappings on new domains */
	ret = vfio_iommu_replay(iommu, domain);
	if (ret)
		goto out_detach;

	list_add(&domain->next, &iommu->domain_list);

	mutex_unlock(&iommu->lock);

	return 0;

out_detach:
	iommu_detach_group(domain->domain, iommu_group);
out_domain:
	iommu_domain_free(domain->domain);
out_free:
	kfree(domain);
	kfree(group);
	mutex_unlock(&iommu->lock);
	return ret;
}

static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
{
	struct rb_node *node;

	while ((node = rb_first(&iommu->dma_list)))
		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
}

static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain;
	struct vfio_group *group;

	mutex_lock(&iommu->lock);

	list_for_each_entry(domain, &iommu->domain_list, next) {
		list_for_each_entry(group, &domain->group_list, next) {
			if (group->iommu_group != iommu_group)
				continue;

			iommu_detach_group(domain->domain, iommu_group);
			list_del(&group->next);
			kfree(group);
			/*
			 * Group ownership provides privilege, if the group
			 * list is empty, the domain goes away.  If it's the
			 * last domain, then all the mappings go away too.
			 */
			if (list_empty(&domain->group_list)) {
				if (list_is_singular(&iommu->domain_list))
					vfio_iommu_unmap_unpin_all(iommu);
				iommu_domain_free(domain->domain);
				list_del(&domain->next);
				kfree(domain);
			}
			goto done;
		}
	}

done:
	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	switch (arg) {
	case VFIO_TYPE1_IOMMU:
		break;
	case VFIO_TYPE1_NESTING_IOMMU:
		iommu->nesting = true;
	case VFIO_TYPE1v2_IOMMU:
		iommu->v2 = true;
		break;
	default:
		kfree(iommu);
		return ERR_PTR(-EINVAL);
	}

	INIT_LIST_HEAD(&iommu->domain_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	return iommu;
}

static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_domain *domain, *domain_tmp;
	struct vfio_group *group, *group_tmp;

	vfio_iommu_unmap_unpin_all(iommu);

	list_for_each_entry_safe(domain, domain_tmp,
				 &iommu->domain_list, next) {
		list_for_each_entry_safe(group, group_tmp,
					 &domain->group_list, next) {
			iommu_detach_group(domain->domain, group->iommu_group);
			list_del(&group->next);
			kfree(group);
		}
		iommu_domain_free(domain->domain);
		list_del(&domain->next);
		kfree(domain);
	}

	kfree(iommu);
}

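/*
 * Back the VFIO_DMA_CC_IOMMU extension check: return 1 only if every domain
 * in the container provides cache-coherent DMA (IOMMU_CACHE), 0 otherwise.
 */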
static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
{
	struct vfio_domain *domain;
	int ret = 1;

	mutex_lock(&iommu->lock);
	list_for_each_entry(domain, &iommu->domain_list, next) {
		if (!(domain->prot & IOMMU_CACHE)) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&iommu->lock);

	return ret;
}

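/*
 * ioctl entry point for the type1 backend.  As an illustrative (userspace)
 * example, once a container fd is set to VFIO_TYPE1v2_IOMMU, a 2MB buffer
 * could be made available to the device at IOVA 0 with something like:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0,
 *		.size  = 2 * 1024 * 1024,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 *
 * "container_fd" and "buf" are assumed to have been set up by the caller.
 */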
static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
		case VFIO_TYPE1v2_IOMMU:
		case VFIO_TYPE1_NESTING_IOMMU:
			return 1;
		case VFIO_DMA_CC_IOMMU:
			if (!iommu)
				return 0;
			return vfio_domains_have_iommu_cache(iommu);
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = 0;

		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);

		return copy_to_user((void __user *)arg, &info, minsz);

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;
		long ret;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		ret = vfio_dma_do_unmap(iommu, &unmap);
		if (ret)
			return ret;

		return copy_to_user((void __user *)arg, &unmap, minsz);
	}

	return -ENOTTY;
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};

static int __init vfio_iommu_type1_init(void)
{
	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);