]>
Commit | Line | Data |
---|---|---|
73fa0d10 AW |
1 | /* |
2 | * VFIO: IOMMU DMA mapping support for Type1 IOMMU | |
3 | * | |
4 | * Copyright (C) 2012 Red Hat, Inc. All rights reserved. | |
5 | * Author: Alex Williamson <alex.williamson@redhat.com> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License version 2 as | |
9 | * published by the Free Software Foundation. | |
10 | * | |
11 | * Derived from original vfio: | |
12 | * Copyright 2010 Cisco Systems, Inc. All rights reserved. | |
13 | * Author: Tom Lyon, pugs@cisco.com | |
14 | * | |
15 | * We arbitrarily define a Type1 IOMMU as one matching the below code. | |
16 | * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel | |
17 | * VT-d, but that makes it harder to re-use as theoretically anyone | |
18 | * implementing a similar IOMMU could make use of this. We expect the | |
19 | * IOMMU to support the IOMMU API and have few to no restrictions around | |
20 | * the IOVA range that can be mapped. The Type1 IOMMU is currently | |
21 | * optimized for relatively static mappings of a userspace process with | |
22 | * userspace pages pinned into memory. We also assume devices and IOMMU | |
23 | * domains are PCI based as the IOMMU API is still centered around a | |
24 | * device/bus interface rather than a group interface. | |
25 | */ | |
26 | ||
27 | #include <linux/compat.h> | |
28 | #include <linux/device.h> | |
29 | #include <linux/fs.h> | |
30 | #include <linux/iommu.h> | |
31 | #include <linux/module.h> | |
32 | #include <linux/mm.h> | |
33 | #include <linux/pci.h> /* pci_bus_type */ | |
cd9b2268 | 34 | #include <linux/rbtree.h> |
73fa0d10 AW |
35 | #include <linux/sched.h> |
36 | #include <linux/slab.h> | |
37 | #include <linux/uaccess.h> | |
38 | #include <linux/vfio.h> | |
39 | #include <linux/workqueue.h> | |
40 | ||
41 | #define DRIVER_VERSION "0.2" | |
42 | #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" | |
43 | #define DRIVER_DESC "Type1 IOMMU driver for VFIO" | |
44 | ||
/*
 * Writable module parameter.  When set, vfio_iommu_type1_open() skips its
 * IOMMU_CAP_INTR_REMAP capability check and allows the domain to be used
 * on platforms without interrupt remapping.
 */
static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
50 | ||
5c6c2b21 AW |
/*
 * Writable module parameter.  When set, vfio_pin_pages() accounts and
 * returns a single page per call instead of extending the pin across
 * physically consecutive pages.
 */
static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");
56 | ||
73fa0d10 AW |
/* State allocated by vfio_iommu_type1_open() and freed by the release op. */
struct vfio_iommu {
	struct iommu_domain *domain;	/* IOMMU API domain used for map/unmap */
	struct mutex lock;		/* held while dma_list/group_list change */
	struct rb_root dma_list;	/* rb-tree of struct vfio_dma, keyed by iova */
	struct list_head group_list;	/* list of attached struct vfio_group */
	bool cache;			/* if set, IOMMU_CACHE is OR'ed into map prot */
};
64 | ||
/* One tracked mapping; the range is contiguous in both iova and vaddr. */
struct vfio_dma {
	struct rb_node node;		/* Linkage in vfio_iommu.dma_list */
	dma_addr_t iova;		/* Device address */
	unsigned long vaddr;		/* Process virtual addr */
	size_t size;			/* Map size (bytes) */
	int prot;			/* IOMMU_READ/WRITE */
};
72 | ||
/* Record of one iommu_group attached to a vfio_iommu's domain. */
struct vfio_group {
	struct iommu_group *iommu_group;	/* the attached group */
	struct list_head next;			/* linkage in vfio_iommu.group_list */
};
77 | ||
78 | /* | |
79 | * This code handles mapping and unmapping of user data buffers | |
80 | * into DMA'ble space using the IOMMU | |
81 | */ | |
82 | ||
cd9b2268 AW |
83 | static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, |
84 | dma_addr_t start, size_t size) | |
85 | { | |
86 | struct rb_node *node = iommu->dma_list.rb_node; | |
87 | ||
88 | while (node) { | |
89 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); | |
90 | ||
91 | if (start + size <= dma->iova) | |
92 | node = node->rb_left; | |
166fd7d9 | 93 | else if (start >= dma->iova + dma->size) |
cd9b2268 AW |
94 | node = node->rb_right; |
95 | else | |
96 | return dma; | |
97 | } | |
98 | ||
99 | return NULL; | |
100 | } | |
101 | ||
102 | static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) | |
103 | { | |
104 | struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; | |
105 | struct vfio_dma *dma; | |
106 | ||
107 | while (*link) { | |
108 | parent = *link; | |
109 | dma = rb_entry(parent, struct vfio_dma, node); | |
110 | ||
166fd7d9 | 111 | if (new->iova + new->size <= dma->iova) |
cd9b2268 AW |
112 | link = &(*link)->rb_left; |
113 | else | |
114 | link = &(*link)->rb_right; | |
115 | } | |
116 | ||
117 | rb_link_node(&new->node, parent, link); | |
118 | rb_insert_color(&new->node, &iommu->dma_list); | |
119 | } | |
120 | ||
121 | static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) | |
122 | { | |
123 | rb_erase(&old->node, &iommu->dma_list); | |
124 | } | |
125 | ||
73fa0d10 AW |
/* Deferred locked_vm adjustment queued by vfio_lock_acct(). */
struct vwork {
	struct mm_struct *mm;		/* mm to adjust; reference dropped by the worker */
	long npage;			/* signed page delta added to mm->locked_vm */
	struct work_struct work;	/* work item running vfio_lock_acct_bg() */
};
131 | ||
132 | /* delayed decrement/increment for locked_vm */ | |
133 | static void vfio_lock_acct_bg(struct work_struct *work) | |
134 | { | |
135 | struct vwork *vwork = container_of(work, struct vwork, work); | |
136 | struct mm_struct *mm; | |
137 | ||
138 | mm = vwork->mm; | |
139 | down_write(&mm->mmap_sem); | |
140 | mm->locked_vm += vwork->npage; | |
141 | up_write(&mm->mmap_sem); | |
142 | mmput(mm); | |
143 | kfree(vwork); | |
144 | } | |
145 | ||
146 | static void vfio_lock_acct(long npage) | |
147 | { | |
148 | struct vwork *vwork; | |
149 | struct mm_struct *mm; | |
150 | ||
166fd7d9 AW |
151 | if (!current->mm || !npage) |
152 | return; /* process exited or nothing to do */ | |
73fa0d10 AW |
153 | |
154 | if (down_write_trylock(¤t->mm->mmap_sem)) { | |
155 | current->mm->locked_vm += npage; | |
156 | up_write(¤t->mm->mmap_sem); | |
157 | return; | |
158 | } | |
159 | ||
160 | /* | |
161 | * Couldn't get mmap_sem lock, so must setup to update | |
162 | * mm->locked_vm later. If locked_vm were atomic, we | |
163 | * wouldn't need this silliness | |
164 | */ | |
165 | vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); | |
166 | if (!vwork) | |
167 | return; | |
168 | mm = get_task_mm(current); | |
169 | if (!mm) { | |
170 | kfree(vwork); | |
171 | return; | |
172 | } | |
173 | INIT_WORK(&vwork->work, vfio_lock_acct_bg); | |
174 | vwork->mm = mm; | |
175 | vwork->npage = npage; | |
176 | schedule_work(&vwork->work); | |
177 | } | |
178 | ||
/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device. These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 *
 * Returns true when @pfn has no valid struct page or when the page (or
 * its compound head, for a tail page) is marked reserved.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_trans_head(tail);
		/* Sample the head's reserved bit before re-checking PageTail */
		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_trans_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we've to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		/* Not (or no longer) a tail page: judge the page itself */
		return PageReserved(tail);
	}

	/* No struct page behind this pfn */
	return true;
}
211 | ||
212 | static int put_pfn(unsigned long pfn, int prot) | |
213 | { | |
214 | if (!is_invalid_reserved_pfn(pfn)) { | |
215 | struct page *page = pfn_to_page(pfn); | |
216 | if (prot & IOMMU_WRITE) | |
217 | SetPageDirty(page); | |
218 | put_page(page); | |
219 | return 1; | |
220 | } | |
221 | return 0; | |
222 | } | |
223 | ||
73fa0d10 AW |
224 | static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) |
225 | { | |
226 | struct page *page[1]; | |
227 | struct vm_area_struct *vma; | |
228 | int ret = -EFAULT; | |
229 | ||
230 | if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { | |
231 | *pfn = page_to_pfn(page[0]); | |
232 | return 0; | |
233 | } | |
234 | ||
235 | down_read(¤t->mm->mmap_sem); | |
236 | ||
237 | vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); | |
238 | ||
239 | if (vma && vma->vm_flags & VM_PFNMAP) { | |
240 | *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | |
241 | if (is_invalid_reserved_pfn(*pfn)) | |
242 | ret = 0; | |
243 | } | |
244 | ||
245 | up_read(¤t->mm->mmap_sem); | |
246 | ||
247 | return ret; | |
248 | } | |
249 | ||
166fd7d9 AW |
250 | /* |
251 | * Attempt to pin pages. We really don't want to track all the pfns and | |
252 | * the iommu can only map chunks of consecutive pfns anyway, so get the | |
253 | * first page and all consecutive pages with the same locking. | |
254 | */ | |
255 | static long vfio_pin_pages(unsigned long vaddr, long npage, | |
256 | int prot, unsigned long *pfn_base) | |
73fa0d10 | 257 | { |
166fd7d9 AW |
258 | unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
259 | bool lock_cap = capable(CAP_IPC_LOCK); | |
260 | long ret, i; | |
73fa0d10 | 261 | |
166fd7d9 AW |
262 | if (!current->mm) |
263 | return -ENODEV; | |
73fa0d10 | 264 | |
166fd7d9 AW |
265 | ret = vaddr_get_pfn(vaddr, prot, pfn_base); |
266 | if (ret) | |
267 | return ret; | |
73fa0d10 | 268 | |
166fd7d9 AW |
269 | if (is_invalid_reserved_pfn(*pfn_base)) |
270 | return 1; | |
73fa0d10 | 271 | |
166fd7d9 AW |
272 | if (!lock_cap && current->mm->locked_vm + 1 > limit) { |
273 | put_pfn(*pfn_base, prot); | |
274 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, | |
275 | limit << PAGE_SHIFT); | |
276 | return -ENOMEM; | |
277 | } | |
278 | ||
5c6c2b21 AW |
279 | if (unlikely(disable_hugepages)) { |
280 | vfio_lock_acct(1); | |
281 | return 1; | |
282 | } | |
283 | ||
166fd7d9 AW |
284 | /* Lock all the consecutive pages from pfn_base */ |
285 | for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { | |
73fa0d10 AW |
286 | unsigned long pfn = 0; |
287 | ||
288 | ret = vaddr_get_pfn(vaddr, prot, &pfn); | |
166fd7d9 AW |
289 | if (ret) |
290 | break; | |
291 | ||
292 | if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { | |
293 | put_pfn(pfn, prot); | |
294 | break; | |
73fa0d10 AW |
295 | } |
296 | ||
166fd7d9 AW |
297 | if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { |
298 | put_pfn(pfn, prot); | |
299 | pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", | |
300 | __func__, limit << PAGE_SHIFT); | |
301 | break; | |
302 | } | |
303 | } | |
304 | ||
305 | vfio_lock_acct(i); | |
306 | ||
307 | return i; | |
308 | } | |
309 | ||
310 | static long vfio_unpin_pages(unsigned long pfn, long npage, | |
311 | int prot, bool do_accounting) | |
312 | { | |
313 | unsigned long unlocked = 0; | |
314 | long i; | |
315 | ||
316 | for (i = 0; i < npage; i++) | |
317 | unlocked += put_pfn(pfn++, prot); | |
318 | ||
319 | if (do_accounting) | |
320 | vfio_lock_acct(-unlocked); | |
321 | ||
322 | return unlocked; | |
323 | } | |
324 | ||
325 | static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, | |
326 | dma_addr_t iova, size_t *size) | |
327 | { | |
328 | dma_addr_t start = iova, end = iova + *size; | |
329 | long unlocked = 0; | |
330 | ||
331 | while (iova < end) { | |
332 | size_t unmapped; | |
333 | phys_addr_t phys; | |
334 | ||
73fa0d10 | 335 | /* |
166fd7d9 AW |
336 | * We use the IOMMU to track the physical address. This |
337 | * saves us from having a lot more entries in our mapping | |
338 | * tree. The downside is that we don't track the size | |
339 | * used to do the mapping. We request unmap of a single | |
340 | * page, but expect IOMMUs that support large pages to | |
341 | * unmap a larger chunk. | |
73fa0d10 | 342 | */ |
166fd7d9 AW |
343 | phys = iommu_iova_to_phys(iommu->domain, iova); |
344 | if (WARN_ON(!phys)) { | |
345 | iova += PAGE_SIZE; | |
346 | continue; | |
73fa0d10 | 347 | } |
166fd7d9 AW |
348 | |
349 | unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); | |
350 | if (!unmapped) | |
351 | break; | |
352 | ||
353 | unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, | |
354 | unmapped >> PAGE_SHIFT, | |
355 | dma->prot, false); | |
356 | iova += unmapped; | |
73fa0d10 | 357 | } |
166fd7d9 AW |
358 | |
359 | vfio_lock_acct(-unlocked); | |
360 | ||
361 | *size = iova - start; | |
362 | ||
73fa0d10 AW |
363 | return 0; |
364 | } | |
365 | ||
cd9b2268 | 366 | static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, |
166fd7d9 | 367 | size_t *size, struct vfio_dma *dma) |
73fa0d10 | 368 | { |
166fd7d9 | 369 | size_t offset, overlap, tmp; |
73fa0d10 | 370 | struct vfio_dma *split; |
166fd7d9 AW |
371 | int ret; |
372 | ||
373 | /* | |
374 | * Existing dma region is completely covered, unmap all. This is | |
375 | * the likely case since userspace tends to map and unmap buffers | |
376 | * in one shot rather than multiple mappings within a buffer. | |
377 | */ | |
378 | if (likely(start <= dma->iova && | |
379 | start + *size >= dma->iova + dma->size)) { | |
380 | *size = dma->size; | |
381 | ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); | |
382 | if (ret) | |
383 | return ret; | |
384 | ||
385 | /* | |
386 | * Did we remove more than we have? Should never happen | |
387 | * since a vfio_dma is contiguous in iova and vaddr. | |
388 | */ | |
389 | WARN_ON(*size != dma->size); | |
73fa0d10 | 390 | |
cd9b2268 | 391 | vfio_remove_dma(iommu, dma); |
73fa0d10 | 392 | kfree(dma); |
cd9b2268 | 393 | return 0; |
73fa0d10 AW |
394 | } |
395 | ||
396 | /* Overlap low address of existing range */ | |
397 | if (start <= dma->iova) { | |
166fd7d9 AW |
398 | overlap = start + *size - dma->iova; |
399 | ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); | |
400 | if (ret) | |
401 | return ret; | |
73fa0d10 | 402 | |
166fd7d9 | 403 | vfio_remove_dma(iommu, dma); |
73fa0d10 | 404 | |
166fd7d9 AW |
405 | /* |
406 | * Check, we may have removed to whole vfio_dma. If not | |
407 | * fixup and re-insert. | |
408 | */ | |
409 | if (overlap < dma->size) { | |
410 | dma->iova += overlap; | |
411 | dma->vaddr += overlap; | |
412 | dma->size -= overlap; | |
413 | vfio_insert_dma(iommu, dma); | |
414 | } | |
415 | *size = overlap; | |
cd9b2268 | 416 | return 0; |
73fa0d10 AW |
417 | } |
418 | ||
419 | /* Overlap high address of existing range */ | |
166fd7d9 AW |
420 | if (start + *size >= dma->iova + dma->size) { |
421 | offset = start - dma->iova; | |
422 | overlap = dma->size - offset; | |
73fa0d10 | 423 | |
166fd7d9 AW |
424 | ret = vfio_unmap_unpin(iommu, dma, start, &overlap); |
425 | if (ret) | |
426 | return ret; | |
427 | ||
428 | /* | |
429 | * We may have unmapped the entire vfio_dma if the user is | |
430 | * trying to unmap a sub-region of what was originally | |
431 | * mapped. If anything left, we can resize in place since | |
432 | * iova is unchanged. | |
433 | */ | |
434 | if (overlap < dma->size) | |
435 | dma->size -= overlap; | |
436 | else | |
437 | vfio_remove_dma(iommu, dma); | |
73fa0d10 | 438 | |
166fd7d9 | 439 | *size = overlap; |
cd9b2268 | 440 | return 0; |
73fa0d10 AW |
441 | } |
442 | ||
443 | /* Split existing */ | |
166fd7d9 | 444 | offset = start - dma->iova; |
73fa0d10 | 445 | |
166fd7d9 AW |
446 | ret = vfio_unmap_unpin(iommu, dma, start, size); |
447 | if (ret) | |
448 | return ret; | |
73fa0d10 | 449 | |
166fd7d9 AW |
450 | WARN_ON(!*size); |
451 | tmp = dma->size; | |
73fa0d10 | 452 | |
166fd7d9 AW |
453 | /* |
454 | * Resize the lower vfio_dma in place, insert new for remaining | |
455 | * upper segment. | |
456 | */ | |
457 | dma->size = offset; | |
458 | ||
459 | if (offset + *size < tmp) { | |
460 | split = kzalloc(sizeof(*split), GFP_KERNEL); | |
461 | if (!split) | |
462 | return -ENOMEM; | |
463 | ||
464 | split->size = tmp - offset - *size; | |
465 | split->iova = dma->iova + offset + *size; | |
466 | split->vaddr = dma->vaddr + offset + *size; | |
467 | split->prot = dma->prot; | |
468 | vfio_insert_dma(iommu, split); | |
469 | } | |
73fa0d10 | 470 | |
cd9b2268 | 471 | return 0; |
73fa0d10 AW |
472 | } |
473 | ||
474 | static int vfio_dma_do_unmap(struct vfio_iommu *iommu, | |
475 | struct vfio_iommu_type1_dma_unmap *unmap) | |
476 | { | |
73fa0d10 | 477 | uint64_t mask; |
cd9b2268 | 478 | struct vfio_dma *dma; |
166fd7d9 | 479 | size_t unmapped = 0, size; |
cd9b2268 | 480 | int ret = 0; |
73fa0d10 AW |
481 | |
482 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | |
483 | ||
484 | if (unmap->iova & mask) | |
485 | return -EINVAL; | |
486 | if (unmap->size & mask) | |
487 | return -EINVAL; | |
488 | ||
73fa0d10 AW |
489 | WARN_ON(mask & PAGE_MASK); |
490 | ||
491 | mutex_lock(&iommu->lock); | |
492 | ||
166fd7d9 AW |
493 | while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { |
494 | size = unmap->size; | |
495 | ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); | |
496 | if (ret) | |
497 | break; | |
498 | unmapped += size; | |
499 | } | |
cd9b2268 | 500 | |
73fa0d10 | 501 | mutex_unlock(&iommu->lock); |
166fd7d9 AW |
502 | |
503 | /* | |
504 | * We may unmap more than requested, update the unmap struct so | |
505 | * userspace can know. | |
506 | */ | |
507 | unmap->size = unmapped; | |
508 | ||
509 | return ret; | |
510 | } | |
511 | ||
512 | /* | |
513 | * Turns out AMD IOMMU has a page table bug where it won't map large pages | |
514 | * to a region that previously mapped smaller pages. This should be fixed | |
515 | * soon, so this is just a temporary workaround to break mappings down into | |
516 | * PAGE_SIZE. Better to map smaller pages than nothing. | |
517 | */ | |
518 | static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, | |
519 | unsigned long pfn, long npage, int prot) | |
520 | { | |
521 | long i; | |
522 | int ret; | |
523 | ||
524 | for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { | |
525 | ret = iommu_map(iommu->domain, iova, | |
526 | (phys_addr_t)pfn << PAGE_SHIFT, | |
527 | PAGE_SIZE, prot); | |
528 | if (ret) | |
529 | break; | |
530 | } | |
531 | ||
532 | for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) | |
533 | iommu_unmap(iommu->domain, iova, PAGE_SIZE); | |
534 | ||
cd9b2268 | 535 | return ret; |
73fa0d10 AW |
536 | } |
537 | ||
538 | static int vfio_dma_do_map(struct vfio_iommu *iommu, | |
539 | struct vfio_iommu_type1_dma_map *map) | |
540 | { | |
166fd7d9 AW |
541 | dma_addr_t end, iova; |
542 | unsigned long vaddr = map->vaddr; | |
73fa0d10 | 543 | size_t size = map->size; |
166fd7d9 | 544 | long npage; |
73fa0d10 AW |
545 | int ret = 0, prot = 0; |
546 | uint64_t mask; | |
166fd7d9 AW |
547 | |
548 | end = map->iova + map->size; | |
73fa0d10 AW |
549 | |
550 | mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; | |
551 | ||
552 | /* READ/WRITE from device perspective */ | |
553 | if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) | |
554 | prot |= IOMMU_WRITE; | |
555 | if (map->flags & VFIO_DMA_MAP_FLAG_READ) | |
556 | prot |= IOMMU_READ; | |
557 | ||
558 | if (!prot) | |
559 | return -EINVAL; /* No READ/WRITE? */ | |
560 | ||
166fd7d9 AW |
561 | if (iommu->cache) |
562 | prot |= IOMMU_CACHE; | |
563 | ||
73fa0d10 AW |
564 | if (vaddr & mask) |
565 | return -EINVAL; | |
166fd7d9 | 566 | if (map->iova & mask) |
73fa0d10 | 567 | return -EINVAL; |
166fd7d9 | 568 | if (!map->size || map->size & mask) |
73fa0d10 AW |
569 | return -EINVAL; |
570 | ||
73fa0d10 AW |
571 | WARN_ON(mask & PAGE_MASK); |
572 | ||
573 | /* Don't allow IOVA wrap */ | |
166fd7d9 | 574 | if (end && end < map->iova) |
73fa0d10 AW |
575 | return -EINVAL; |
576 | ||
577 | /* Don't allow virtual address wrap */ | |
166fd7d9 | 578 | if (vaddr + map->size && vaddr + map->size < vaddr) |
73fa0d10 AW |
579 | return -EINVAL; |
580 | ||
581 | mutex_lock(&iommu->lock); | |
582 | ||
166fd7d9 AW |
583 | if (vfio_find_dma(iommu, map->iova, map->size)) { |
584 | mutex_unlock(&iommu->lock); | |
585 | return -EEXIST; | |
73fa0d10 AW |
586 | } |
587 | ||
166fd7d9 AW |
588 | for (iova = map->iova; iova < end; iova += size, vaddr += size) { |
589 | struct vfio_dma *dma = NULL; | |
590 | unsigned long pfn; | |
591 | long i; | |
592 | ||
593 | /* Pin a contiguous chunk of memory */ | |
594 | npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, | |
595 | prot, &pfn); | |
596 | if (npage <= 0) { | |
597 | WARN_ON(!npage); | |
598 | ret = (int)npage; | |
599 | break; | |
600 | } | |
73fa0d10 | 601 | |
166fd7d9 AW |
602 | /* Verify pages are not already mapped */ |
603 | for (i = 0; i < npage; i++) { | |
604 | if (iommu_iova_to_phys(iommu->domain, | |
605 | iova + (i << PAGE_SHIFT))) { | |
606 | vfio_unpin_pages(pfn, npage, prot, true); | |
607 | ret = -EBUSY; | |
608 | break; | |
609 | } | |
610 | } | |
611 | ||
612 | ret = iommu_map(iommu->domain, iova, | |
613 | (phys_addr_t)pfn << PAGE_SHIFT, | |
614 | npage << PAGE_SHIFT, prot); | |
615 | if (ret) { | |
616 | if (ret != -EBUSY || | |
617 | map_try_harder(iommu, iova, pfn, npage, prot)) { | |
618 | vfio_unpin_pages(pfn, npage, prot, true); | |
619 | break; | |
620 | } | |
621 | } | |
622 | ||
623 | size = npage << PAGE_SHIFT; | |
624 | ||
625 | /* | |
626 | * Check if we abut a region below - nothing below 0. | |
627 | * This is the most likely case when mapping chunks of | |
628 | * physically contiguous regions within a virtual address | |
629 | * range. Update the abutting entry in place since iova | |
630 | * doesn't change. | |
631 | */ | |
632 | if (likely(iova)) { | |
633 | struct vfio_dma *tmp; | |
634 | tmp = vfio_find_dma(iommu, iova - 1, 1); | |
635 | if (tmp && tmp->prot == prot && | |
636 | tmp->vaddr + tmp->size == vaddr) { | |
637 | tmp->size += size; | |
638 | ||
639 | iova = tmp->iova; | |
640 | size = tmp->size; | |
641 | vaddr = tmp->vaddr; | |
642 | dma = tmp; | |
643 | } | |
644 | } | |
645 | ||
646 | /* Check if we abut a region above - nothing above ~0 + 1 */ | |
647 | if (likely(iova + size)) { | |
648 | struct vfio_dma *tmp; | |
649 | ||
650 | tmp = vfio_find_dma(iommu, iova + size, 1); | |
651 | if (tmp && tmp->prot == prot && | |
652 | tmp->vaddr == vaddr + size) { | |
653 | vfio_remove_dma(iommu, tmp); | |
654 | if (dma) | |
655 | dma->size += tmp->size; | |
656 | else | |
657 | size += tmp->size; | |
658 | kfree(tmp); | |
659 | } | |
73fa0d10 | 660 | } |
73fa0d10 | 661 | |
166fd7d9 AW |
662 | if (!dma) { |
663 | dma = kzalloc(sizeof(*dma), GFP_KERNEL); | |
664 | if (!dma) { | |
665 | iommu_unmap(iommu->domain, iova, size); | |
666 | vfio_unpin_pages(pfn, npage, prot, true); | |
667 | ret = -ENOMEM; | |
668 | break; | |
669 | } | |
670 | ||
671 | dma->size = size; | |
672 | dma->iova = iova; | |
673 | dma->vaddr = vaddr; | |
674 | dma->prot = prot; | |
675 | vfio_insert_dma(iommu, dma); | |
73fa0d10 AW |
676 | } |
677 | } | |
678 | ||
166fd7d9 AW |
679 | if (ret) { |
680 | struct vfio_dma *tmp; | |
681 | iova = map->iova; | |
682 | size = map->size; | |
683 | while ((tmp = vfio_find_dma(iommu, iova, size))) { | |
684 | if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { | |
685 | pr_warn("%s: Error rolling back failed map\n", | |
686 | __func__); | |
687 | break; | |
688 | } | |
689 | } | |
690 | } | |
73fa0d10 | 691 | |
73fa0d10 AW |
692 | mutex_unlock(&iommu->lock); |
693 | return ret; | |
694 | } | |
695 | ||
696 | static int vfio_iommu_type1_attach_group(void *iommu_data, | |
697 | struct iommu_group *iommu_group) | |
698 | { | |
699 | struct vfio_iommu *iommu = iommu_data; | |
700 | struct vfio_group *group, *tmp; | |
701 | int ret; | |
702 | ||
703 | group = kzalloc(sizeof(*group), GFP_KERNEL); | |
704 | if (!group) | |
705 | return -ENOMEM; | |
706 | ||
707 | mutex_lock(&iommu->lock); | |
708 | ||
709 | list_for_each_entry(tmp, &iommu->group_list, next) { | |
710 | if (tmp->iommu_group == iommu_group) { | |
711 | mutex_unlock(&iommu->lock); | |
712 | kfree(group); | |
713 | return -EINVAL; | |
714 | } | |
715 | } | |
716 | ||
717 | /* | |
718 | * TODO: Domain have capabilities that might change as we add | |
719 | * groups (see iommu->cache, currently never set). Check for | |
720 | * them and potentially disallow groups to be attached when it | |
721 | * would change capabilities (ugh). | |
722 | */ | |
723 | ret = iommu_attach_group(iommu->domain, iommu_group); | |
724 | if (ret) { | |
725 | mutex_unlock(&iommu->lock); | |
726 | kfree(group); | |
727 | return ret; | |
728 | } | |
729 | ||
730 | group->iommu_group = iommu_group; | |
731 | list_add(&group->next, &iommu->group_list); | |
732 | ||
733 | mutex_unlock(&iommu->lock); | |
734 | ||
735 | return 0; | |
736 | } | |
737 | ||
738 | static void vfio_iommu_type1_detach_group(void *iommu_data, | |
739 | struct iommu_group *iommu_group) | |
740 | { | |
741 | struct vfio_iommu *iommu = iommu_data; | |
742 | struct vfio_group *group; | |
743 | ||
744 | mutex_lock(&iommu->lock); | |
745 | ||
746 | list_for_each_entry(group, &iommu->group_list, next) { | |
747 | if (group->iommu_group == iommu_group) { | |
748 | iommu_detach_group(iommu->domain, iommu_group); | |
749 | list_del(&group->next); | |
750 | kfree(group); | |
751 | break; | |
752 | } | |
753 | } | |
754 | ||
755 | mutex_unlock(&iommu->lock); | |
756 | } | |
757 | ||
758 | static void *vfio_iommu_type1_open(unsigned long arg) | |
759 | { | |
760 | struct vfio_iommu *iommu; | |
761 | ||
762 | if (arg != VFIO_TYPE1_IOMMU) | |
763 | return ERR_PTR(-EINVAL); | |
764 | ||
765 | iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); | |
766 | if (!iommu) | |
767 | return ERR_PTR(-ENOMEM); | |
768 | ||
769 | INIT_LIST_HEAD(&iommu->group_list); | |
cd9b2268 | 770 | iommu->dma_list = RB_ROOT; |
73fa0d10 AW |
771 | mutex_init(&iommu->lock); |
772 | ||
773 | /* | |
774 | * Wish we didn't have to know about bus_type here. | |
775 | */ | |
776 | iommu->domain = iommu_domain_alloc(&pci_bus_type); | |
777 | if (!iommu->domain) { | |
778 | kfree(iommu); | |
779 | return ERR_PTR(-EIO); | |
780 | } | |
781 | ||
782 | /* | |
783 | * Wish we could specify required capabilities rather than create | |
784 | * a domain, see what comes out and hope it doesn't change along | |
785 | * the way. Fortunately we know interrupt remapping is global for | |
786 | * our iommus. | |
787 | */ | |
788 | if (!allow_unsafe_interrupts && | |
789 | !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { | |
790 | pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", | |
791 | __func__); | |
792 | iommu_domain_free(iommu->domain); | |
793 | kfree(iommu); | |
794 | return ERR_PTR(-EPERM); | |
795 | } | |
796 | ||
797 | return iommu; | |
798 | } | |
799 | ||
800 | static void vfio_iommu_type1_release(void *iommu_data) | |
801 | { | |
802 | struct vfio_iommu *iommu = iommu_data; | |
803 | struct vfio_group *group, *group_tmp; | |
cd9b2268 | 804 | struct rb_node *node; |
73fa0d10 AW |
805 | |
806 | list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { | |
807 | iommu_detach_group(iommu->domain, group->iommu_group); | |
808 | list_del(&group->next); | |
809 | kfree(group); | |
810 | } | |
811 | ||
cd9b2268 AW |
812 | while ((node = rb_first(&iommu->dma_list))) { |
813 | struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); | |
166fd7d9 AW |
814 | size_t size = dma->size; |
815 | vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); | |
73fa0d10 AW |
816 | } |
817 | ||
818 | iommu_domain_free(iommu->domain); | |
819 | iommu->domain = NULL; | |
820 | kfree(iommu); | |
821 | } | |
822 | ||
823 | static long vfio_iommu_type1_ioctl(void *iommu_data, | |
824 | unsigned int cmd, unsigned long arg) | |
825 | { | |
826 | struct vfio_iommu *iommu = iommu_data; | |
827 | unsigned long minsz; | |
828 | ||
829 | if (cmd == VFIO_CHECK_EXTENSION) { | |
830 | switch (arg) { | |
831 | case VFIO_TYPE1_IOMMU: | |
832 | return 1; | |
833 | default: | |
834 | return 0; | |
835 | } | |
836 | } else if (cmd == VFIO_IOMMU_GET_INFO) { | |
837 | struct vfio_iommu_type1_info info; | |
838 | ||
839 | minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); | |
840 | ||
841 | if (copy_from_user(&info, (void __user *)arg, minsz)) | |
842 | return -EFAULT; | |
843 | ||
844 | if (info.argsz < minsz) | |
845 | return -EINVAL; | |
846 | ||
847 | info.flags = 0; | |
848 | ||
849 | info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; | |
850 | ||
851 | return copy_to_user((void __user *)arg, &info, minsz); | |
852 | ||
853 | } else if (cmd == VFIO_IOMMU_MAP_DMA) { | |
854 | struct vfio_iommu_type1_dma_map map; | |
855 | uint32_t mask = VFIO_DMA_MAP_FLAG_READ | | |
856 | VFIO_DMA_MAP_FLAG_WRITE; | |
857 | ||
858 | minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); | |
859 | ||
860 | if (copy_from_user(&map, (void __user *)arg, minsz)) | |
861 | return -EFAULT; | |
862 | ||
863 | if (map.argsz < minsz || map.flags & ~mask) | |
864 | return -EINVAL; | |
865 | ||
866 | return vfio_dma_do_map(iommu, &map); | |
867 | ||
868 | } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { | |
869 | struct vfio_iommu_type1_dma_unmap unmap; | |
166fd7d9 | 870 | long ret; |
73fa0d10 AW |
871 | |
872 | minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); | |
873 | ||
874 | if (copy_from_user(&unmap, (void __user *)arg, minsz)) | |
875 | return -EFAULT; | |
876 | ||
877 | if (unmap.argsz < minsz || unmap.flags) | |
878 | return -EINVAL; | |
879 | ||
166fd7d9 AW |
880 | ret = vfio_dma_do_unmap(iommu, &unmap); |
881 | if (ret) | |
882 | return ret; | |
883 | ||
884 | return copy_to_user((void __user *)arg, &unmap, minsz); | |
73fa0d10 AW |
885 | } |
886 | ||
887 | return -ENOTTY; | |
888 | } | |
889 | ||
/* Callback table registered with VFIO by vfio_iommu_type1_init(). */
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};
899 | ||
900 | static int __init vfio_iommu_type1_init(void) | |
901 | { | |
902 | if (!iommu_present(&pci_bus_type)) | |
903 | return -ENODEV; | |
904 | ||
905 | return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); | |
906 | } | |
907 | ||
/* Module exit: unregister the Type1 driver ops from VFIO. */
static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}
912 | ||
913 | module_init(vfio_iommu_type1_init); | |
914 | module_exit(vfio_iommu_type1_cleanup); | |
915 | ||
916 | MODULE_VERSION(DRIVER_VERSION); | |
917 | MODULE_LICENSE("GPL v2"); | |
918 | MODULE_AUTHOR(DRIVER_AUTHOR); | |
919 | MODULE_DESCRIPTION(DRIVER_DESC); |