mirror_ubuntu-hirsute-kernel.git: drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
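/*
 * Worked example: for gaw == 48, __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1,
 * which fits in an unsigned long on 64-bit builds, so DOMAIN_MAX_PFN(48)
 * is 2^36 - 1; on a 32-bit build it would be clamped to ULONG_MAX.
 */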
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
81
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
93 *
94 * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96 * mapping has natural alignment.
97 *
98 * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100 *
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
103 */
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
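/*
 * Note: ~0xFFFUL leaves every bit from bit 12 upwards set, i.e. it
 * advertises 4KiB, 8KiB, 16KiB, ... up to the largest power-of-two
 * size representable in an unsigned long.
 */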
105
106 static inline int agaw_to_level(int agaw)
107 {
108 return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
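/*
 * For reference (derived from the helpers above): agaw 1 maps to a
 * 39-bit address width and a 3-level page table, agaw 2 to 48 bits
 * and 4 levels, agaw 3 to 57 bits and 5 levels. width_to_agaw() is
 * the inverse, e.g. width_to_agaw(48) == 2.
 */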
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123 return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133 return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138 return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143 return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164 return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168 return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
180 */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
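/*
 * struct root_entry is two u64s (lo/hi, 16 bytes), so a 4KiB root table
 * holds 256 entries, one per PCI bus number.
 */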
186
187 /*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213 context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218 return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223 context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228 return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233 return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245 context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250 context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255 {
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262 {
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269 {
270 context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275 {
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281 return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286 context->lo = 0;
287 context->hi = 0;
288 }
289
290 /*
291  * This domain is a static identity mapping domain.
292  * 1. This domain creates a static 1:1 mapping to all usable memory.
293  * 2. It maps to each iommu if successful.
294  * 3. Each iommu maps to this domain if successful.
295 */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301
302 /*
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
307 */
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
309
310 /*
311 * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either a first-level or a second-level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first-level page table; otherwise, it goes through the second level.
315 */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
317
318 /*
319  * The domain represents a virtual machine which requires IOMMU nested
320 * translation mode support.
321 */
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
323
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of registered IOMMUs; used to size the g_iommus array */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359 struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364 struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366 dma_addr_t iova);
367
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
373
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
379
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
382
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
389
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
392
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
400
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
403
404 /*
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
407 */
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
410 {
411 int ret = 0;
412 unsigned long flags;
413 struct device_domain_info *info;
414
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
418 if (ret) {
419 spin_unlock_irqrestore(&device_domain_lock, flags);
420 return ret;
421 }
422 }
423 spin_unlock_irqrestore(&device_domain_lock, flags);
424
425 return 0;
426 }
427
428 const struct iommu_ops intel_iommu_ops;
429
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
431 {
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
433 }
434
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
436 {
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
438 }
439
440 static void init_translation_status(struct intel_iommu *iommu)
441 {
442 u32 gsts;
443
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
447 }
448
449 /* Convert a generic struct iommu_domain to the private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
451 {
452 return container_of(dom, struct dmar_domain, domain);
453 }
454
455 static int __init intel_iommu_setup(char *str)
456 {
457 if (!str)
458 return -EINVAL;
459 while (*str) {
460 if (!strncmp(str, "on", 2)) {
461 dmar_disabled = 0;
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
464 dmar_disabled = 1;
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
468 dmar_map_gfx = 0;
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
472 dmar_forcedac = 1;
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
481 intel_iommu_sm = 1;
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
483 printk(KERN_INFO
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488 intel_no_bounce = 1;
489 }
490
491 str += strcspn(str, ",");
492 while (*str == ',')
493 str++;
494 }
495 return 0;
496 }
497 __setup("intel_iommu=", intel_iommu_setup);
498
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
501
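/*
 * iommu->domains is a two-level table: the top level is indexed by the
 * high byte of the domain id and points to 256-entry pages of struct
 * dmar_domain pointers, indexed by the low byte. Pages are allocated
 * lazily in set_iommu_domain().
 */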
502 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
503 {
504 struct dmar_domain **domains;
505 int idx = did >> 8;
506
507 domains = iommu->domains[idx];
508 if (!domains)
509 return NULL;
510
511 return domains[did & 0xff];
512 }
513
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
516 {
517 struct dmar_domain **domains;
518 int idx = did >> 8;
519
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
523 }
524
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
527 return;
528 else
529 domains[did & 0xff] = domain;
530 }
531
532 void *alloc_pgtable_page(int node)
533 {
534 struct page *page;
535 void *vaddr = NULL;
536
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538 if (page)
539 vaddr = page_address(page);
540 return vaddr;
541 }
542
543 void free_pgtable_page(void *vaddr)
544 {
545 free_page((unsigned long)vaddr);
546 }
547
548 static inline void *alloc_domain_mem(void)
549 {
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
551 }
552
553 static void free_domain_mem(void *vaddr)
554 {
555 kmem_cache_free(iommu_domain_cache, vaddr);
556 }
557
558 static inline void *alloc_devinfo_mem(void)
559 {
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
561 }
562
563 static inline void free_devinfo_mem(void *vaddr)
564 {
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
566 }
567
568 static inline int domain_type_is_si(struct dmar_domain *domain)
569 {
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
571 }
572
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
574 {
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
576 }
577
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579 unsigned long pfn)
580 {
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
582
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588 unsigned long sagaw;
589 int agaw = -1;
590
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
593 agaw >= 0; agaw--) {
594 if (test_bit(agaw, &sagaw))
595 break;
596 }
597
598 return agaw;
599 }
600
601 /*
602 * Calculate max SAGAW for each iommu.
603 */
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 {
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
607 }
608
609 /*
610  * Calculate the agaw for each iommu.
611  * "SAGAW" may be different across iommus: use a default agaw, and
612  * fall back to a smaller supported agaw for iommus that don't support the default.
613 */
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 {
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
617 }
618
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
621 {
622 int iommu_id;
623
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626 return NULL;
627
628 for_each_domain_iommu(iommu_id, domain)
629 break;
630
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632 return NULL;
633
634 return g_iommus[iommu_id];
635 }
636
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
638 {
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
641 bool found = false;
642 int i;
643
644 domain->iommu_coherency = 1;
645
646 for_each_domain_iommu(i, domain) {
647 found = true;
648 if (!ecap_coherent(g_iommus[i]->ecap)) {
649 domain->iommu_coherency = 0;
650 break;
651 }
652 }
653 if (found)
654 return;
655
656 /* No hardware attached; use lowest common denominator */
657 rcu_read_lock();
658 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_coherent(iommu->ecap)) {
660 domain->iommu_coherency = 0;
661 break;
662 }
663 }
664 rcu_read_unlock();
665 }
666
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
668 {
669 struct dmar_drhd_unit *drhd;
670 struct intel_iommu *iommu;
671 int ret = 1;
672
673 rcu_read_lock();
674 for_each_active_iommu(iommu, drhd) {
675 if (iommu != skip) {
676 if (!ecap_sc_support(iommu->ecap)) {
677 ret = 0;
678 break;
679 }
680 }
681 }
682 rcu_read_unlock();
683
684 return ret;
685 }
686
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688 struct intel_iommu *skip)
689 {
690 struct dmar_drhd_unit *drhd;
691 struct intel_iommu *iommu;
692 int mask = 0x3;
693
694 if (!intel_iommu_superpage) {
695 return 0;
696 }
697
698 /* set iommu_superpage to the smallest common denominator */
699 rcu_read_lock();
700 for_each_active_iommu(iommu, drhd) {
701 if (iommu != skip) {
702 if (domain && domain_use_first_level(domain)) {
703 if (!cap_fl1gp_support(iommu->cap))
704 mask = 0x1;
705 } else {
706 mask &= cap_super_page_val(iommu->cap);
707 }
708
709 if (!mask)
710 break;
711 }
712 }
713 rcu_read_unlock();
714
715 return fls(mask);
716 }
717
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
720 {
721 domain_update_iommu_coherency(domain);
722 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
724 }
725
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
727 u8 devfn, int alloc)
728 {
729 struct root_entry *root = &iommu->root_entry[bus];
730 struct context_entry *context;
731 u64 *entry;
732
733 entry = &root->lo;
734 if (sm_supported(iommu)) {
735 if (devfn >= 0x80) {
736 devfn -= 0x80;
737 entry = &root->hi;
738 }
739 devfn *= 2;
740 }
741 if (*entry & 1)
742 context = phys_to_virt(*entry & VTD_PAGE_MASK);
743 else {
744 unsigned long phy_addr;
745 if (!alloc)
746 return NULL;
747
748 context = alloc_pgtable_page(iommu->node);
749 if (!context)
750 return NULL;
751
752 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753 phy_addr = virt_to_phys((void *)context);
754 *entry = phy_addr | 1;
755 __iommu_flush_cache(iommu, entry, sizeof(*entry));
756 }
757 return &context[devfn];
758 }
759
760 static int iommu_dummy(struct device *dev)
761 {
762 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
763 }
764
765 static bool attach_deferred(struct device *dev)
766 {
767 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
768 }
769
770 /**
771 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772 * sub-hierarchy of a candidate PCI-PCI bridge
773 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774 * @bridge: the candidate PCI-PCI bridge
775 *
776 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
777 */
778 static bool
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
780 {
781 struct pci_dev *pdev, *pbridge;
782
783 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
784 return false;
785
786 pdev = to_pci_dev(dev);
787 pbridge = to_pci_dev(bridge);
788
789 if (pbridge->subordinate &&
790 pbridge->subordinate->number <= pdev->bus->number &&
791 pbridge->subordinate->busn_res.end >= pdev->bus->number)
792 return true;
793
794 return false;
795 }
796
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
798 {
799 struct dmar_drhd_unit *drhd = NULL;
800 struct intel_iommu *iommu;
801 struct device *tmp;
802 struct pci_dev *pdev = NULL;
803 u16 segment = 0;
804 int i;
805
806 if (iommu_dummy(dev))
807 return NULL;
808
809 if (dev_is_pci(dev)) {
810 struct pci_dev *pf_pdev;
811
812 pdev = pci_real_dma_dev(to_pci_dev(dev));
813
814 /* VFs aren't listed in scope tables; we need to look up
815 * the PF instead to find the IOMMU. */
816 pf_pdev = pci_physfn(pdev);
817 dev = &pf_pdev->dev;
818 segment = pci_domain_nr(pdev->bus);
819 } else if (has_acpi_companion(dev))
820 dev = &ACPI_COMPANION(dev)->dev;
821
822 rcu_read_lock();
823 for_each_active_iommu(iommu, drhd) {
824 if (pdev && segment != drhd->segment)
825 continue;
826
827 for_each_active_dev_scope(drhd->devices,
828 drhd->devices_cnt, i, tmp) {
829 if (tmp == dev) {
830 /* For a VF use its original BDF# not that of the PF
831 * which we used for the IOMMU lookup. Strictly speaking
832 * we could do this for all PCI devices; we only need to
833 * get the BDF# from the scope table for ACPI matches. */
834 if (pdev && pdev->is_virtfn)
835 goto got_pdev;
836
837 *bus = drhd->devices[i].bus;
838 *devfn = drhd->devices[i].devfn;
839 goto out;
840 }
841
842 if (is_downstream_to_pci_bridge(dev, tmp))
843 goto got_pdev;
844 }
845
846 if (pdev && drhd->include_all) {
847 got_pdev:
848 *bus = pdev->bus->number;
849 *devfn = pdev->devfn;
850 goto out;
851 }
852 }
853 iommu = NULL;
854 out:
855 rcu_read_unlock();
856
857 return iommu;
858 }
859
860 static void domain_flush_cache(struct dmar_domain *domain,
861 void *addr, int size)
862 {
863 if (!domain->iommu_coherency)
864 clflush_cache_range(addr, size);
865 }
866
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
868 {
869 struct context_entry *context;
870 int ret = 0;
871 unsigned long flags;
872
873 spin_lock_irqsave(&iommu->lock, flags);
874 context = iommu_context_addr(iommu, bus, devfn, 0);
875 if (context)
876 ret = context_present(context);
877 spin_unlock_irqrestore(&iommu->lock, flags);
878 return ret;
879 }
880
881 static void free_context_table(struct intel_iommu *iommu)
882 {
883 int i;
884 unsigned long flags;
885 struct context_entry *context;
886
887 spin_lock_irqsave(&iommu->lock, flags);
888 if (!iommu->root_entry) {
889 goto out;
890 }
891 for (i = 0; i < ROOT_ENTRY_NR; i++) {
892 context = iommu_context_addr(iommu, i, 0, 0);
893 if (context)
894 free_pgtable_page(context);
895
896 if (!sm_supported(iommu))
897 continue;
898
899 context = iommu_context_addr(iommu, i, 0x80, 0);
900 if (context)
901 free_pgtable_page(context);
902
903 }
904 free_pgtable_page(iommu->root_entry);
905 iommu->root_entry = NULL;
906 out:
907 spin_unlock_irqrestore(&iommu->lock, flags);
908 }
909
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
912 {
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
915 int offset;
916
917 BUG_ON(!domain->pgd);
918
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
921 return NULL;
922
923 parent = domain->pgd;
924
925 while (1) {
926 void *tmp_page;
927
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 break;
932 if (level == *target_level)
933 break;
934
935 if (!dma_pte_present(pte)) {
936 uint64_t pteval;
937
938 tmp_page = alloc_pgtable_page(domain->nid);
939
940 if (!tmp_page)
941 return NULL;
942
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain_use_first_level(domain))
946 pteval |= DMA_FL_PTE_XD;
947 if (cmpxchg64(&pte->val, 0ULL, pteval))
948 /* Someone else set it while we were thinking; use theirs. */
949 free_pgtable_page(tmp_page);
950 else
951 domain_flush_cache(domain, pte, sizeof(*pte));
952 }
953 if (level == 1)
954 break;
955
956 parent = phys_to_virt(dma_pte_addr(pte));
957 level--;
958 }
959
960 if (!*target_level)
961 *target_level = level;
962
963 return pte;
964 }
965
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
968 unsigned long pfn,
969 int level, int *large_page)
970 {
971 struct dma_pte *parent, *pte;
972 int total = agaw_to_level(domain->agaw);
973 int offset;
974
975 parent = domain->pgd;
976 while (level <= total) {
977 offset = pfn_level_offset(pfn, total);
978 pte = &parent[offset];
979 if (level == total)
980 return pte;
981
982 if (!dma_pte_present(pte)) {
983 *large_page = total;
984 break;
985 }
986
987 if (dma_pte_superpage(pte)) {
988 *large_page = total;
989 return pte;
990 }
991
992 parent = phys_to_virt(dma_pte_addr(pte));
993 total--;
994 }
995 return NULL;
996 }
997
998 /* clear last level pte; a TLB flush should follow */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000 unsigned long start_pfn,
1001 unsigned long last_pfn)
1002 {
1003 unsigned int large_page;
1004 struct dma_pte *first_pte, *pte;
1005
1006 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008 BUG_ON(start_pfn > last_pfn);
1009
1010 /* we don't need lock here; nobody else touches the iova range */
1011 do {
1012 large_page = 1;
1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1014 if (!pte) {
1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1016 continue;
1017 }
1018 do {
1019 dma_clear_pte(pte);
1020 start_pfn += lvl_to_nr_pages(large_page);
1021 pte++;
1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1023
1024 domain_flush_cache(domain, first_pte,
1025 (void *)pte - (void *)first_pte);
1026
1027 } while (start_pfn && start_pfn <= last_pfn);
1028 }
1029
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031 int retain_level, struct dma_pte *pte,
1032 unsigned long pfn, unsigned long start_pfn,
1033 unsigned long last_pfn)
1034 {
1035 pfn = max(start_pfn, pfn);
1036 pte = &pte[pfn_level_offset(pfn, level)];
1037
1038 do {
1039 unsigned long level_pfn;
1040 struct dma_pte *level_pte;
1041
1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1043 goto next;
1044
1045 level_pfn = pfn & level_mask(level);
1046 level_pte = phys_to_virt(dma_pte_addr(pte));
1047
1048 if (level > 2) {
1049 dma_pte_free_level(domain, level - 1, retain_level,
1050 level_pte, level_pfn, start_pfn,
1051 last_pfn);
1052 }
1053
1054 /*
1055 * Free the page table if we're below the level we want to
1056 * retain and the range covers the entire table.
1057 */
1058 if (level < retain_level && !(start_pfn > level_pfn ||
1059 last_pfn < level_pfn + level_size(level) - 1)) {
1060 dma_clear_pte(pte);
1061 domain_flush_cache(domain, pte, sizeof(*pte));
1062 free_pgtable_page(level_pte);
1063 }
1064 next:
1065 pfn += level_size(level);
1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1067 }
1068
1069 /*
1070 * clear last level (leaf) ptes and free page table pages below the
1071 * level we wish to keep intact.
1072 */
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 unsigned long start_pfn,
1075 unsigned long last_pfn,
1076 int retain_level)
1077 {
1078 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080 BUG_ON(start_pfn > last_pfn);
1081
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1083
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1087
1088 /* free pgd */
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1091 domain->pgd = NULL;
1092 }
1093 }
1094
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1100 be freed. */
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct page *freelist)
1104 {
1105 struct page *pg;
1106
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 pg->freelist = freelist;
1109 freelist = pg;
1110
1111 if (level == 1)
1112 return freelist;
1113
1114 pte = page_address(pg);
1115 do {
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 freelist = dma_pte_list_pagetables(domain, level - 1,
1118 pte, freelist);
1119 pte++;
1120 } while (!first_pte_in_page(pte));
1121
1122 return freelist;
1123 }
1124
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126 struct dma_pte *pte, unsigned long pfn,
1127 unsigned long start_pfn,
1128 unsigned long last_pfn,
1129 struct page *freelist)
1130 {
1131 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1132
1133 pfn = max(start_pfn, pfn);
1134 pte = &pte[pfn_level_offset(pfn, level)];
1135
1136 do {
1137 unsigned long level_pfn;
1138
1139 if (!dma_pte_present(pte))
1140 goto next;
1141
1142 level_pfn = pfn & level_mask(level);
1143
1144 /* If range covers entire pagetable, free it */
1145 if (start_pfn <= level_pfn &&
1146 last_pfn >= level_pfn + level_size(level) - 1) {
1147 			/* These subordinate page tables are going away entirely. Don't
1148 bother to clear them; we're just going to *free* them. */
1149 if (level > 1 && !dma_pte_superpage(pte))
1150 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1151
1152 dma_clear_pte(pte);
1153 if (!first_pte)
1154 first_pte = pte;
1155 last_pte = pte;
1156 } else if (level > 1) {
1157 /* Recurse down into a level that isn't *entirely* obsolete */
1158 freelist = dma_pte_clear_level(domain, level - 1,
1159 phys_to_virt(dma_pte_addr(pte)),
1160 level_pfn, start_pfn, last_pfn,
1161 freelist);
1162 }
1163 next:
1164 pfn += level_size(level);
1165 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1166
1167 if (first_pte)
1168 domain_flush_cache(domain, first_pte,
1169 (void *)++last_pte - (void *)first_pte);
1170
1171 return freelist;
1172 }
1173
1174 /* We can't just free the pages because the IOMMU may still be walking
1175 the page tables, and may have cached the intermediate levels. The
1176 pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178 unsigned long start_pfn,
1179 unsigned long last_pfn)
1180 {
1181 struct page *freelist;
1182
1183 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185 BUG_ON(start_pfn > last_pfn);
1186
1187 /* we don't need lock here; nobody else touches the iova range */
1188 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189 domain->pgd, 0, start_pfn, last_pfn, NULL);
1190
1191 /* free pgd */
1192 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193 struct page *pgd_page = virt_to_page(domain->pgd);
1194 pgd_page->freelist = freelist;
1195 freelist = pgd_page;
1196
1197 domain->pgd = NULL;
1198 }
1199
1200 return freelist;
1201 }
1202
1203 static void dma_free_pagelist(struct page *freelist)
1204 {
1205 struct page *pg;
1206
1207 while ((pg = freelist)) {
1208 freelist = pg->freelist;
1209 free_pgtable_page(page_address(pg));
1210 }
1211 }
1212
1213 static void iova_entry_free(unsigned long data)
1214 {
1215 struct page *freelist = (struct page *)data;
1216
1217 dma_free_pagelist(freelist);
1218 }
1219
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1222 {
1223 struct root_entry *root;
1224 unsigned long flags;
1225
1226 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1227 if (!root) {
1228 pr_err("Allocating root entry for %s failed\n",
1229 iommu->name);
1230 return -ENOMEM;
1231 }
1232
1233 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1234
1235 spin_lock_irqsave(&iommu->lock, flags);
1236 iommu->root_entry = root;
1237 spin_unlock_irqrestore(&iommu->lock, flags);
1238
1239 return 0;
1240 }
1241
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1243 {
1244 u64 addr;
1245 u32 sts;
1246 unsigned long flag;
1247
1248 addr = virt_to_phys(iommu->root_entry);
1249 if (sm_supported(iommu))
1250 addr |= DMA_RTADDR_SMT;
1251
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1254
1255 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1256
1257 /* Make sure hardware complete it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (sts & DMA_GSTS_RTPS), sts);
1260
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1265 {
1266 u32 val;
1267 unsigned long flag;
1268
1269 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1270 return;
1271
1272 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1274
1275 /* Make sure hardware complete it */
1276 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277 readl, (!(val & DMA_GSTS_WBFS)), val);
1278
1279 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1280 }
1281
1282 /* return value determines if we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284 u16 did, u16 source_id, u8 function_mask,
1285 u64 type)
1286 {
1287 u64 val = 0;
1288 unsigned long flag;
1289
1290 switch (type) {
1291 case DMA_CCMD_GLOBAL_INVL:
1292 val = DMA_CCMD_GLOBAL_INVL;
1293 break;
1294 case DMA_CCMD_DOMAIN_INVL:
1295 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1296 break;
1297 case DMA_CCMD_DEVICE_INVL:
1298 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1300 break;
1301 default:
1302 BUG();
1303 }
1304 val |= DMA_CCMD_ICC;
1305
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1308
1309 /* Make sure hardware complete it */
1310 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1312
1313 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1314 }
1315
1316 /* return value determines if we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318 u64 addr, unsigned int size_order, u64 type)
1319 {
1320 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321 u64 val = 0, val_iva = 0;
1322 unsigned long flag;
1323
1324 switch (type) {
1325 case DMA_TLB_GLOBAL_FLUSH:
1326 		/* global flush doesn't need to set IVA_REG */
1327 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1328 break;
1329 case DMA_TLB_DSI_FLUSH:
1330 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1331 break;
1332 case DMA_TLB_PSI_FLUSH:
1333 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334 /* IH bit is passed in as part of address */
1335 val_iva = size_order | addr;
1336 break;
1337 default:
1338 BUG();
1339 }
1340 /* Note: set drain read/write */
1341 #if 0
1342 /*
1343 	 * This is probably just for extra safety. Looks like we can
1344 * ignore it without any impact.
1345 */
1346 if (cap_read_drain(iommu->cap))
1347 val |= DMA_TLB_READ_DRAIN;
1348 #endif
1349 if (cap_write_drain(iommu->cap))
1350 val |= DMA_TLB_WRITE_DRAIN;
1351
1352 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353 /* Note: Only uses first TLB reg currently */
1354 if (val_iva)
1355 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1357
1358 /* Make sure hardware complete it */
1359 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1361
1362 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1363
1364 /* check IOTLB invalidation granularity */
1365 if (DMA_TLB_IAIG(val) == 0)
1366 pr_err("Flush IOTLB failed\n");
1367 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369 (unsigned long long)DMA_TLB_IIRG(type),
1370 (unsigned long long)DMA_TLB_IAIG(val));
1371 }
1372
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1375 u8 bus, u8 devfn)
1376 {
1377 struct device_domain_info *info;
1378
1379 assert_spin_locked(&device_domain_lock);
1380
1381 if (!iommu->qi)
1382 return NULL;
1383
1384 list_for_each_entry(info, &domain->devices, link)
1385 if (info->iommu == iommu && info->bus == bus &&
1386 info->devfn == devfn) {
1387 if (info->ats_supported && info->dev)
1388 return info;
1389 break;
1390 }
1391
1392 return NULL;
1393 }
1394
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1396 {
1397 struct device_domain_info *info;
1398 bool has_iotlb_device = false;
1399
1400 assert_spin_locked(&device_domain_lock);
1401
1402 list_for_each_entry(info, &domain->devices, link) {
1403 struct pci_dev *pdev;
1404
1405 if (!info->dev || !dev_is_pci(info->dev))
1406 continue;
1407
1408 pdev = to_pci_dev(info->dev);
1409 if (pdev->ats_enabled) {
1410 has_iotlb_device = true;
1411 break;
1412 }
1413 }
1414
1415 domain->has_iotlb_device = has_iotlb_device;
1416 }
1417
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1419 {
1420 struct pci_dev *pdev;
1421
1422 assert_spin_locked(&device_domain_lock);
1423
1424 if (!info || !dev_is_pci(info->dev))
1425 return;
1426
1427 pdev = to_pci_dev(info->dev);
1428 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1429 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1430 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1431 * reserved, which should be set to 0.
1432 */
1433 if (!ecap_dit(info->iommu->ecap))
1434 info->pfsid = 0;
1435 else {
1436 struct pci_dev *pf_pdev;
1437
1438 /* pdev will be returned if device is not a vf */
1439 pf_pdev = pci_physfn(pdev);
1440 info->pfsid = pci_dev_id(pf_pdev);
1441 }
1442
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 /* The PCIe spec, in its wisdom, declares that the behaviour of
1445 the device if you enable PASID support after ATS support is
1446 undefined. So always enable PASID support on devices which
1447 have it, even if we can't yet know if we're ever going to
1448 use it. */
1449 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450 info->pasid_enabled = 1;
1451
1452 if (info->pri_supported &&
1453 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1454 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455 info->pri_enabled = 1;
1456 #endif
1457 if (!pdev->untrusted && info->ats_supported &&
1458 pci_ats_page_aligned(pdev) &&
1459 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460 info->ats_enabled = 1;
1461 domain_update_iotlb(info->domain);
1462 info->ats_qdep = pci_ats_queue_depth(pdev);
1463 }
1464 }
1465
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1467 {
1468 struct pci_dev *pdev;
1469
1470 assert_spin_locked(&device_domain_lock);
1471
1472 if (!dev_is_pci(info->dev))
1473 return;
1474
1475 pdev = to_pci_dev(info->dev);
1476
1477 if (info->ats_enabled) {
1478 pci_disable_ats(pdev);
1479 info->ats_enabled = 0;
1480 domain_update_iotlb(info->domain);
1481 }
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483 if (info->pri_enabled) {
1484 pci_disable_pri(pdev);
1485 info->pri_enabled = 0;
1486 }
1487 if (info->pasid_enabled) {
1488 pci_disable_pasid(pdev);
1489 info->pasid_enabled = 0;
1490 }
1491 #endif
1492 }
1493
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495 u64 addr, unsigned mask)
1496 {
1497 u16 sid, qdep;
1498 unsigned long flags;
1499 struct device_domain_info *info;
1500
1501 if (!domain->has_iotlb_device)
1502 return;
1503
1504 spin_lock_irqsave(&device_domain_lock, flags);
1505 list_for_each_entry(info, &domain->devices, link) {
1506 if (!info->ats_enabled)
1507 continue;
1508
1509 sid = info->bus << 8 | info->devfn;
1510 qdep = info->ats_qdep;
1511 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1512 qdep, addr, mask);
1513 }
1514 spin_unlock_irqrestore(&device_domain_lock, flags);
1515 }
1516
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518 struct dmar_domain *domain,
1519 u64 addr, unsigned long npages, bool ih)
1520 {
1521 u16 did = domain->iommu_did[iommu->seq_id];
1522
1523 if (domain->default_pasid)
1524 qi_flush_piotlb(iommu, did, domain->default_pasid,
1525 addr, npages, ih);
1526
1527 if (!list_empty(&domain->devices))
1528 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1529 }
1530
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532 struct dmar_domain *domain,
1533 unsigned long pfn, unsigned int pages,
1534 int ih, int map)
1535 {
1536 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538 u16 did = domain->iommu_did[iommu->seq_id];
1539
1540 BUG_ON(pages == 0);
1541
1542 if (ih)
1543 ih = 1 << 6;
1544
1545 if (domain_use_first_level(domain)) {
1546 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1547 } else {
1548 /*
1549 * Fallback to domain selective flush if no PSI support or
1550 * the size is too big. PSI requires page size to be 2 ^ x,
1551 * and the base address is naturally aligned to the size.
1552 */
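		/*
		 * Worked example: pages == 9 rounds up to 16, so mask == 4
		 * and the PSI covers 16 VT-d pages starting at a 16-page
		 * aligned address.
		 */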
1553 if (!cap_pgsel_inv(iommu->cap) ||
1554 mask > cap_max_amask_val(iommu->cap))
1555 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1556 DMA_TLB_DSI_FLUSH);
1557 else
1558 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1559 DMA_TLB_PSI_FLUSH);
1560 }
1561
1562 /*
1563 * In caching mode, changes of pages from non-present to present require
1564 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1565 */
1566 if (!cap_caching_mode(iommu->cap) || !map)
1567 iommu_flush_dev_iotlb(domain, addr, mask);
1568 }
1569
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572 struct dmar_domain *domain,
1573 unsigned long pfn, unsigned int pages)
1574 {
1575 /*
1576 * It's a non-present to present mapping. Only flush if caching mode
1577 * and second level.
1578 */
1579 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1581 else
1582 iommu_flush_write_buffer(iommu);
1583 }
1584
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1586 {
1587 struct dmar_domain *domain;
1588 int idx;
1589
1590 domain = container_of(iovad, struct dmar_domain, iovad);
1591
1592 for_each_domain_iommu(idx, domain) {
1593 struct intel_iommu *iommu = g_iommus[idx];
1594 u16 did = domain->iommu_did[iommu->seq_id];
1595
1596 if (domain_use_first_level(domain))
1597 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1598 else
1599 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1600 DMA_TLB_DSI_FLUSH);
1601
1602 if (!cap_caching_mode(iommu->cap))
1603 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604 0, MAX_AGAW_PFN_WIDTH);
1605 }
1606 }
1607
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1609 {
1610 u32 pmen;
1611 unsigned long flags;
1612
1613 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1614 return;
1615
1616 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618 pmen &= ~DMA_PMEN_EPM;
1619 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1620
1621 /* wait for the protected region status bit to clear */
1622 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623 readl, !(pmen & DMA_PMEN_PRS), pmen);
1624
1625 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1626 }
1627
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1629 {
1630 u32 sts;
1631 unsigned long flags;
1632
1633 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634 iommu->gcmd |= DMA_GCMD_TE;
1635 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636
1637 /* Make sure hardware complete it */
1638 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639 readl, (sts & DMA_GSTS_TES), sts);
1640
1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1642 }
1643
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1645 {
1646 u32 sts;
1647 unsigned long flag;
1648
1649 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650 iommu->gcmd &= ~DMA_GCMD_TE;
1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1652
1653 /* Make sure hardware complete it */
1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 readl, (!(sts & DMA_GSTS_TES)), sts);
1656
1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1658 }
1659
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1661 {
1662 u32 ndomains, nlongs;
1663 size_t size;
1664
1665 ndomains = cap_ndoms(iommu->cap);
1666 pr_debug("%s: Number of Domains supported <%d>\n",
1667 iommu->name, ndomains);
1668 nlongs = BITS_TO_LONGS(ndomains);
1669
1670 spin_lock_init(&iommu->lock);
1671
1672 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673 if (!iommu->domain_ids) {
1674 pr_err("%s: Allocating domain id array failed\n",
1675 iommu->name);
1676 return -ENOMEM;
1677 }
1678
1679 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680 iommu->domains = kzalloc(size, GFP_KERNEL);
1681
1682 if (iommu->domains) {
1683 size = 256 * sizeof(struct dmar_domain *);
1684 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1685 }
1686
1687 if (!iommu->domains || !iommu->domains[0]) {
1688 pr_err("%s: Allocating domain array failed\n",
1689 iommu->name);
1690 kfree(iommu->domain_ids);
1691 kfree(iommu->domains);
1692 iommu->domain_ids = NULL;
1693 iommu->domains = NULL;
1694 return -ENOMEM;
1695 }
1696
1697 /*
1698 * If Caching mode is set, then invalid translations are tagged
1699 * with domain-id 0, hence we need to pre-allocate it. We also
1700 * use domain-id 0 as a marker for non-allocated domain-id, so
1701 * make sure it is not used for a real domain.
1702 */
1703 set_bit(0, iommu->domain_ids);
1704
1705 /*
1706 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1707 * entry for first-level or pass-through translation modes should
1708 * be programmed with a domain id different from those used for
1709 * second-level or nested translation. We reserve a domain id for
1710 * this purpose.
1711 */
1712 if (sm_supported(iommu))
1713 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1714
1715 return 0;
1716 }
1717
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1719 {
1720 struct device_domain_info *info, *tmp;
1721 unsigned long flags;
1722
1723 if (!iommu->domains || !iommu->domain_ids)
1724 return;
1725
1726 spin_lock_irqsave(&device_domain_lock, flags);
1727 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728 if (info->iommu != iommu)
1729 continue;
1730
1731 if (!info->dev || !info->domain)
1732 continue;
1733
1734 __dmar_remove_one_dev_info(info);
1735 }
1736 spin_unlock_irqrestore(&device_domain_lock, flags);
1737
1738 if (iommu->gcmd & DMA_GCMD_TE)
1739 iommu_disable_translation(iommu);
1740 }
1741
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1743 {
1744 if ((iommu->domains) && (iommu->domain_ids)) {
1745 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1746 int i;
1747
1748 for (i = 0; i < elems; i++)
1749 kfree(iommu->domains[i]);
1750 kfree(iommu->domains);
1751 kfree(iommu->domain_ids);
1752 iommu->domains = NULL;
1753 iommu->domain_ids = NULL;
1754 }
1755
1756 g_iommus[iommu->seq_id] = NULL;
1757
1758 /* free context mapping */
1759 free_context_table(iommu);
1760
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762 if (pasid_supported(iommu)) {
1763 if (ecap_prs(iommu->ecap))
1764 intel_svm_finish_prq(iommu);
1765 }
1766 #endif
1767 }
1768
1769 /*
1770 * Check and return whether first level is used by default for
1771 * DMA translation.
1772 */
1773 static bool first_level_by_default(void)
1774 {
1775 struct dmar_drhd_unit *drhd;
1776 struct intel_iommu *iommu;
1777 static int first_level_support = -1;
1778
1779 if (likely(first_level_support != -1))
1780 return first_level_support;
1781
1782 first_level_support = 1;
1783
1784 rcu_read_lock();
1785 for_each_active_iommu(iommu, drhd) {
1786 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787 first_level_support = 0;
1788 break;
1789 }
1790 }
1791 rcu_read_unlock();
1792
1793 return first_level_support;
1794 }
1795
1796 static struct dmar_domain *alloc_domain(int flags)
1797 {
1798 struct dmar_domain *domain;
1799
1800 domain = alloc_domain_mem();
1801 if (!domain)
1802 return NULL;
1803
1804 memset(domain, 0, sizeof(*domain));
1805 domain->nid = NUMA_NO_NODE;
1806 domain->flags = flags;
1807 if (first_level_by_default())
1808 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809 domain->has_iotlb_device = false;
1810 INIT_LIST_HEAD(&domain->devices);
1811
1812 return domain;
1813 }
1814
1815 /* Must be called with iommu->lock */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817 struct intel_iommu *iommu)
1818 {
1819 unsigned long ndomains;
1820 int num;
1821
1822 assert_spin_locked(&device_domain_lock);
1823 assert_spin_locked(&iommu->lock);
1824
1825 domain->iommu_refcnt[iommu->seq_id] += 1;
1826 domain->iommu_count += 1;
1827 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828 ndomains = cap_ndoms(iommu->cap);
1829 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1830
1831 if (num >= ndomains) {
1832 pr_err("%s: No free domain ids\n", iommu->name);
1833 domain->iommu_refcnt[iommu->seq_id] -= 1;
1834 domain->iommu_count -= 1;
1835 return -ENOSPC;
1836 }
1837
1838 set_bit(num, iommu->domain_ids);
1839 set_iommu_domain(iommu, num, domain);
1840
1841 domain->iommu_did[iommu->seq_id] = num;
1842 domain->nid = iommu->node;
1843
1844 domain_update_iommu_cap(domain);
1845 }
1846
1847 return 0;
1848 }
1849
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851 struct intel_iommu *iommu)
1852 {
1853 int num, count;
1854
1855 assert_spin_locked(&device_domain_lock);
1856 assert_spin_locked(&iommu->lock);
1857
1858 domain->iommu_refcnt[iommu->seq_id] -= 1;
1859 count = --domain->iommu_count;
1860 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861 num = domain->iommu_did[iommu->seq_id];
1862 clear_bit(num, iommu->domain_ids);
1863 set_iommu_domain(iommu, num, NULL);
1864
1865 domain_update_iommu_cap(domain);
1866 domain->iommu_did[iommu->seq_id] = 0;
1867 }
1868
1869 return count;
1870 }
1871
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
1874
1875 static int dmar_init_reserved_ranges(void)
1876 {
1877 struct pci_dev *pdev = NULL;
1878 struct iova *iova;
1879 int i;
1880
1881 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1882
1883 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884 &reserved_rbtree_key);
1885
1886 /* IOAPIC ranges shouldn't be accessed by DMA */
1887 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888 IOVA_PFN(IOAPIC_RANGE_END));
1889 if (!iova) {
1890 pr_err("Reserve IOAPIC range failed\n");
1891 return -ENODEV;
1892 }
1893
1894 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895 for_each_pci_dev(pdev) {
1896 struct resource *r;
1897
1898 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899 r = &pdev->resource[i];
1900 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1901 continue;
1902 iova = reserve_iova(&reserved_iova_list,
1903 IOVA_PFN(r->start),
1904 IOVA_PFN(r->end));
1905 if (!iova) {
1906 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1907 return -ENODEV;
1908 }
1909 }
1910 }
1911 return 0;
1912 }
1913
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1915 {
1916 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1917 }
1918
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1920 {
1921 int agaw;
1922 int r = (gaw - 12) % 9;
1923
1924 if (r == 0)
1925 agaw = gaw;
1926 else
1927 agaw = gaw + 9 - r;
1928 if (agaw > 64)
1929 agaw = 64;
1930 return agaw;
1931 }
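/*
 * Worked example: a guest width of 40 bits gives r == (40 - 12) % 9 == 1,
 * so the width is rounded up to 48 bits, the next size the page-table
 * levels can fully cover.
 */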
1932
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1934 int guest_width)
1935 {
1936 int adjust_width, agaw;
1937 unsigned long sagaw;
1938 int ret;
1939
1940 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1941
1942 if (!intel_iommu_strict) {
1943 ret = init_iova_flush_queue(&domain->iovad,
1944 iommu_flush_iova, iova_entry_free);
1945 if (ret)
1946 pr_info("iova flush queue initialization failed\n");
1947 }
1948
1949 domain_reserve_special_ranges(domain);
1950
1951 /* calculate AGAW */
1952 if (guest_width > cap_mgaw(iommu->cap))
1953 guest_width = cap_mgaw(iommu->cap);
1954 domain->gaw = guest_width;
1955 adjust_width = guestwidth_to_adjustwidth(guest_width);
1956 agaw = width_to_agaw(adjust_width);
1957 sagaw = cap_sagaw(iommu->cap);
1958 if (!test_bit(agaw, &sagaw)) {
1959 /* hardware doesn't support it, choose a bigger one */
1960 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961 agaw = find_next_bit(&sagaw, 5, agaw);
1962 if (agaw >= 5)
1963 return -ENODEV;
1964 }
1965 domain->agaw = agaw;
1966
1967 if (ecap_coherent(iommu->ecap))
1968 domain->iommu_coherency = 1;
1969 else
1970 domain->iommu_coherency = 0;
1971
1972 if (ecap_sc_support(iommu->ecap))
1973 domain->iommu_snooping = 1;
1974 else
1975 domain->iommu_snooping = 0;
1976
1977 if (intel_iommu_superpage)
1978 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1979 else
1980 domain->iommu_superpage = 0;
1981
1982 domain->nid = iommu->node;
1983
1984 /* always allocate the top pgd */
1985 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1986 if (!domain->pgd)
1987 return -ENOMEM;
1988 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1989 return 0;
1990 }
1991
1992 static void domain_exit(struct dmar_domain *domain)
1993 {
1994
1995 /* Remove associated devices and clear attached or cached domains */
1996 domain_remove_dev_info(domain);
1997
1998 /* destroy iovas */
1999 put_iova_domain(&domain->iovad);
2000
2001 if (domain->pgd) {
2002 struct page *freelist;
2003
2004 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005 dma_free_pagelist(freelist);
2006 }
2007
2008 free_domain_mem(domain);
2009 }
2010
2011 /*
2012 * Get the PASID directory size for scalable mode context entry.
2013 * Value of X in the PDTS field of a scalable mode context entry
2014 * indicates PASID directory with 2^(X + 7) entries.
2015 */
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2017 {
2018 int pds, max_pde;
2019
2020 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2022 if (pds < 7)
2023 return 0;
2024
2025 return pds - 7;
2026 }
2027
2028 /*
2029 * Set the RID_PASID field of a scalable mode context entry. The
2030 * IOMMU hardware will use the PASID value set in this field for
2031 * DMA translations of DMA requests without PASID.
2032 */
2033 static inline void
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2035 {
2036 context->hi |= pasid & ((1 << 20) - 1);
2037 context->hi |= (1 << 20);
2038 }
2039
2040 /*
2041 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2042 * entry.
2043 */
2044 static inline void context_set_sm_dte(struct context_entry *context)
2045 {
2046 context->lo |= (1 << 2);
2047 }
2048
2049 /*
2050 * Set the PRE(Page Request Enable) field of a scalable mode context
2051 * entry.
2052 */
2053 static inline void context_set_sm_pre(struct context_entry *context)
2054 {
2055 context->lo |= (1 << 4);
2056 }
2057
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds) (((pds) & 0x7) << 9)
2060
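/*
 * Program the context entry for (@bus, @devfn) on @iommu so that DMA
 * from the device is translated by @domain.  Handles both legacy and
 * scalable-mode entry formats, flushes stale entries copied from a
 * previous (kdump) kernel, and performs the invalidation required for
 * the non-present to present transition.
 */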
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062 struct intel_iommu *iommu,
2063 struct pasid_table *table,
2064 u8 bus, u8 devfn)
2065 {
2066 u16 did = domain->iommu_did[iommu->seq_id];
2067 int translation = CONTEXT_TT_MULTI_LEVEL;
2068 struct device_domain_info *info = NULL;
2069 struct context_entry *context;
2070 unsigned long flags;
2071 int ret;
2072
2073 WARN_ON(did == 0);
2074
2075 if (hw_pass_through && domain_type_is_si(domain))
2076 translation = CONTEXT_TT_PASS_THROUGH;
2077
2078 pr_debug("Set context mapping for %02x:%02x.%d\n",
2079 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2080
2081 BUG_ON(!domain->pgd);
2082
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 spin_lock(&iommu->lock);
2085
2086 ret = -ENOMEM;
2087 context = iommu_context_addr(iommu, bus, devfn, 1);
2088 if (!context)
2089 goto out_unlock;
2090
2091 ret = 0;
2092 if (context_present(context))
2093 goto out_unlock;
2094
2095 /*
2096 * For kdump cases, old valid entries may be cached due to the
2097 * in-flight DMA and copied pgtable, but there is no unmapping
2098 * behaviour for them, thus we need an explicit cache flush for
2099 * the newly-mapped device. For kdump, at this point, the device
2100 * is supposed to finish reset at its driver probe stage, so no
2101 * in-flight DMA will exist, and we don't need to worry anymore
2102 * hereafter.
2103 */
2104 if (context_copied(context)) {
2105 u16 did_old = context_domain_id(context);
2106
2107 if (did_old < cap_ndoms(iommu->cap)) {
2108 iommu->flush.flush_context(iommu, did_old,
2109 (((u16)bus) << 8) | devfn,
2110 DMA_CCMD_MASK_NOBIT,
2111 DMA_CCMD_DEVICE_INVL);
2112 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2113 DMA_TLB_DSI_FLUSH);
2114 }
2115 }
2116
2117 context_clear_entry(context);
2118
2119 if (sm_supported(iommu)) {
2120 unsigned long pds;
2121
2122 WARN_ON(!table);
2123
2124 /* Setup the PASID DIR pointer: */
2125 pds = context_get_sm_pds(table);
2126 context->lo = (u64)virt_to_phys(table->table) |
2127 context_pdts(pds);
2128
2129 /* Setup the RID_PASID field: */
2130 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2131
2132 /*
2133 * Setup the Device-TLB enable bit and Page request
2134 * Enable bit:
2135 */
2136 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137 if (info && info->ats_supported)
2138 context_set_sm_dte(context);
2139 if (info && info->pri_supported)
2140 context_set_sm_pre(context);
2141 } else {
2142 struct dma_pte *pgd = domain->pgd;
2143 int agaw;
2144
2145 context_set_domain_id(context, did);
2146
2147 if (translation != CONTEXT_TT_PASS_THROUGH) {
2148 /*
2149 * Skip top levels of page tables for an iommu which has
2150 * a smaller agaw than the default. Unnecessary for PT mode.
2151 */
2152 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2153 ret = -ENOMEM;
2154 pgd = phys_to_virt(dma_pte_addr(pgd));
2155 if (!dma_pte_present(pgd))
2156 goto out_unlock;
2157 }
2158
2159 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160 if (info && info->ats_supported)
2161 translation = CONTEXT_TT_DEV_IOTLB;
2162 else
2163 translation = CONTEXT_TT_MULTI_LEVEL;
2164
2165 context_set_address_root(context, virt_to_phys(pgd));
2166 context_set_address_width(context, agaw);
2167 } else {
2168 /*
2169 * In pass through mode, AW must be programmed to
2170 * indicate the largest AGAW value supported by
2171 * hardware. And ASR is ignored by hardware.
2172 */
2173 context_set_address_width(context, iommu->msagaw);
2174 }
2175
2176 context_set_translation_type(context, translation);
2177 }
2178
2179 context_set_fault_enable(context);
2180 context_set_present(context);
2181 domain_flush_cache(domain, context, sizeof(*context));
2182
2183 /*
2184 * It's a non-present to present mapping. If hardware doesn't cache
2185 * non-present entries we only need to flush the write-buffer. If it
2186 * _does_ cache non-present entries, then it does so in the special
2187 * domain #0, which we have to flush:
2188 */
2189 if (cap_caching_mode(iommu->cap)) {
2190 iommu->flush.flush_context(iommu, 0,
2191 (((u16)bus) << 8) | devfn,
2192 DMA_CCMD_MASK_NOBIT,
2193 DMA_CCMD_DEVICE_INVL);
2194 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2195 } else {
2196 iommu_flush_write_buffer(iommu);
2197 }
2198 iommu_enable_dev_iotlb(info);
2199
2200 ret = 0;
2201
2202 out_unlock:
2203 spin_unlock(&iommu->lock);
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
2205
2206 return ret;
2207 }
2208
2209 struct domain_context_mapping_data {
2210 struct dmar_domain *domain;
2211 struct intel_iommu *iommu;
2212 struct pasid_table *table;
2213 };
2214
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216 u16 alias, void *opaque)
2217 {
2218 struct domain_context_mapping_data *data = opaque;
2219
2220 return domain_context_mapping_one(data->domain, data->iommu,
2221 data->table, PCI_BUS_NUM(alias),
2222 alias & 0xff);
2223 }
2224
2225 static int
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2227 {
2228 struct domain_context_mapping_data data;
2229 struct pasid_table *table;
2230 struct intel_iommu *iommu;
2231 u8 bus, devfn;
2232
2233 iommu = device_to_iommu(dev, &bus, &devfn);
2234 if (!iommu)
2235 return -ENODEV;
2236
2237 table = intel_pasid_get_table(dev);
2238
2239 if (!dev_is_pci(dev))
2240 return domain_context_mapping_one(domain, iommu, table,
2241 bus, devfn);
2242
2243 data.domain = domain;
2244 data.iommu = iommu;
2245 data.table = table;
2246
2247 return pci_for_each_dma_alias(to_pci_dev(dev),
2248 &domain_context_mapping_cb, &data);
2249 }
2250
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252 u16 alias, void *opaque)
2253 {
2254 struct intel_iommu *iommu = opaque;
2255
2256 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2257 }
2258
2259 static int domain_context_mapped(struct device *dev)
2260 {
2261 struct intel_iommu *iommu;
2262 u8 bus, devfn;
2263
2264 iommu = device_to_iommu(dev, &bus, &devfn);
2265 if (!iommu)
2266 return -ENODEV;
2267
2268 if (!dev_is_pci(dev))
2269 return device_context_mapped(iommu, bus, devfn);
2270
2271 return !pci_for_each_dma_alias(to_pci_dev(dev),
2272 domain_context_mapped_cb, iommu);
2273 }
2274
2275 /* Returns a number of VTD pages, but aligned to MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2277 size_t size)
2278 {
2279 host_addr &= ~PAGE_MASK;
2280 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2281 }
2282
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285 unsigned long iov_pfn,
2286 unsigned long phy_pfn,
2287 unsigned long pages)
2288 {
2289 int support, level = 1;
2290 unsigned long pfnmerge;
2291
2292 support = domain->iommu_superpage;
2293
2294 /* To use a large page, the virtual *and* physical addresses
2295 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296 of them will mean we have to use smaller pages. So just
2297 merge them and check both at once. */
2298 pfnmerge = iov_pfn | phy_pfn;
2299
2300 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301 pages >>= VTD_STRIDE_SHIFT;
2302 if (!pages)
2303 break;
2304 pfnmerge >>= VTD_STRIDE_SHIFT;
2305 level++;
2306 support--;
2307 }
2308 return level;
2309 }
2310
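/*
 * Populate the page tables of @domain for @nr_pages pages starting at
 * @iov_pfn, taking the physical addresses either from @sg or from the
 * contiguous range starting at @phys_pfn.  Superpages are used when
 * the IOVA, the physical address and the remaining length all allow
 * it and the hardware supports that level.
 */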
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312 struct scatterlist *sg, unsigned long phys_pfn,
2313 unsigned long nr_pages, int prot)
2314 {
2315 struct dma_pte *first_pte = NULL, *pte = NULL;
2316 phys_addr_t uninitialized_var(pteval);
2317 unsigned long sg_res = 0;
2318 unsigned int largepage_lvl = 0;
2319 unsigned long lvl_pages = 0;
2320 u64 attr;
2321
2322 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2323
2324 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2325 return -EINVAL;
2326
2327 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328 if (domain_use_first_level(domain))
2329 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2330
2331 if (!sg) {
2332 sg_res = nr_pages;
2333 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2334 }
2335
2336 while (nr_pages > 0) {
2337 uint64_t tmp;
2338
2339 if (!sg_res) {
2340 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2341
2342 sg_res = aligned_nrpages(sg->offset, sg->length);
2343 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344 sg->dma_length = sg->length;
2345 pteval = (sg_phys(sg) - pgoff) | attr;
2346 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2347 }
2348
2349 if (!pte) {
2350 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2351
2352 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2353 if (!pte)
2354 return -ENOMEM;
2355 /* It is a large page */
2356 if (largepage_lvl > 1) {
2357 unsigned long nr_superpages, end_pfn;
2358
2359 pteval |= DMA_PTE_LARGE_PAGE;
2360 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2361
2362 nr_superpages = sg_res / lvl_pages;
2363 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2364
2365 /*
2366 * Ensure that old small page tables are
2367 * removed to make room for superpage(s).
2368 * We're adding new large pages, so make sure
2369 * we don't remove their parent tables.
2370 */
2371 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2372 largepage_lvl + 1);
2373 } else {
2374 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2375 }
2376
2377 }
2378 /* We don't need a lock here; nobody else
2379 * touches this iova range
2380 */
2381 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2382 if (tmp) {
2383 static int dumps = 5;
2384 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385 iov_pfn, tmp, (unsigned long long)pteval);
2386 if (dumps) {
2387 dumps--;
2388 debug_dma_dump_mappings(NULL);
2389 }
2390 WARN_ON(1);
2391 }
2392
2393 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2394
2395 BUG_ON(nr_pages < lvl_pages);
2396 BUG_ON(sg_res < lvl_pages);
2397
2398 nr_pages -= lvl_pages;
2399 iov_pfn += lvl_pages;
2400 phys_pfn += lvl_pages;
2401 pteval += lvl_pages * VTD_PAGE_SIZE;
2402 sg_res -= lvl_pages;
2403
2404 /* If the next PTE would be the first in a new page, then we
2405 need to flush the cache on the entries we've just written.
2406 And then we'll need to recalculate 'pte', so clear it and
2407 let it get set again in the if (!pte) block above.
2408
2409 If we're done (!nr_pages) we need to flush the cache too.
2410
2411 Also if we've been setting superpages, we may need to
2412 recalculate 'pte' and switch back to smaller pages for the
2413 end of the mapping, if the trailing size is not enough to
2414 use another superpage (i.e. sg_res < lvl_pages). */
2415 pte++;
2416 if (!nr_pages || first_pte_in_page(pte) ||
2417 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418 domain_flush_cache(domain, first_pte,
2419 (void *)pte - (void *)first_pte);
2420 pte = NULL;
2421 }
2422
2423 if (!sg_res && nr_pages)
2424 sg = sg_next(sg);
2425 }
2426 return 0;
2427 }
2428
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430 struct scatterlist *sg, unsigned long phys_pfn,
2431 unsigned long nr_pages, int prot)
2432 {
2433 int iommu_id, ret;
2434 struct intel_iommu *iommu;
2435
2436 /* Do the real mapping first */
2437 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2438 if (ret)
2439 return ret;
2440
2441 for_each_domain_iommu(iommu_id, domain) {
2442 iommu = g_iommus[iommu_id];
2443 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2444 }
2445
2446 return 0;
2447 }
2448
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450 struct scatterlist *sg, unsigned long nr_pages,
2451 int prot)
2452 {
2453 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2454 }
2455
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457 unsigned long phys_pfn, unsigned long nr_pages,
2458 int prot)
2459 {
2460 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2461 }
2462
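/*
 * Clear the context entry for (@bus, @devfn) and invalidate the
 * context-cache and IOTLB entries tagged with the old domain id.
 */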
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2464 {
2465 unsigned long flags;
2466 struct context_entry *context;
2467 u16 did_old;
2468
2469 if (!iommu)
2470 return;
2471
2472 spin_lock_irqsave(&iommu->lock, flags);
2473 context = iommu_context_addr(iommu, bus, devfn, 0);
2474 if (!context) {
2475 spin_unlock_irqrestore(&iommu->lock, flags);
2476 return;
2477 }
2478 did_old = context_domain_id(context);
2479 context_clear_entry(context);
2480 __iommu_flush_cache(iommu, context, sizeof(*context));
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 iommu->flush.flush_context(iommu,
2483 did_old,
2484 (((u16)bus) << 8) | devfn,
2485 DMA_CCMD_MASK_NOBIT,
2486 DMA_CCMD_DEVICE_INVL);
2487 iommu->flush.flush_iotlb(iommu,
2488 did_old,
2489 0,
2490 0,
2491 DMA_TLB_DSI_FLUSH);
2492 }
2493
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2495 {
2496 assert_spin_locked(&device_domain_lock);
2497 list_del(&info->link);
2498 list_del(&info->global);
2499 if (info->dev)
2500 info->dev->archdata.iommu = NULL;
2501 }
2502
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2504 {
2505 struct device_domain_info *info, *tmp;
2506 unsigned long flags;
2507
2508 spin_lock_irqsave(&device_domain_lock, flags);
2509 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510 __dmar_remove_one_dev_info(info);
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
2512 }
2513
2514 struct dmar_domain *find_domain(struct device *dev)
2515 {
2516 struct device_domain_info *info;
2517
2518 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2519 return NULL;
2520
2521 if (dev_is_pci(dev))
2522 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2523
2524 /* No lock here, assumes no domain exit in normal case */
2525 info = dev->archdata.iommu;
2526 if (likely(info))
2527 return info->domain;
2528
2529 return NULL;
2530 }
2531
2532 static void do_deferred_attach(struct device *dev)
2533 {
2534 struct iommu_domain *domain;
2535
2536 dev->archdata.iommu = NULL;
2537 domain = iommu_get_domain_for_dev(dev);
2538 if (domain)
2539 intel_iommu_attach_device(domain, dev);
2540 }
2541
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2544 {
2545 struct device_domain_info *info;
2546
2547 list_for_each_entry(info, &device_domain_list, global)
2548 if (info->iommu->segment == segment && info->bus == bus &&
2549 info->devfn == devfn)
2550 return info;
2551
2552 return NULL;
2553 }
2554
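/*
 * Set up first-level (scalable mode) translation for @pasid of @dev:
 * skip any top page-table levels the IOMMU cannot walk, verify that
 * the result is 4- or 5-level paging, and program the PASID entry.
 */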
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556 struct dmar_domain *domain,
2557 struct device *dev,
2558 int pasid)
2559 {
2560 int flags = PASID_FLAG_SUPERVISOR_MODE;
2561 struct dma_pte *pgd = domain->pgd;
2562 int agaw, level;
2563
2564 /*
2565 * Skip top levels of page tables for an iommu which has
2566 * a smaller agaw than the default. Unnecessary for PT mode.
2567 */
2568 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569 pgd = phys_to_virt(dma_pte_addr(pgd));
2570 if (!dma_pte_present(pgd))
2571 return -ENOMEM;
2572 }
2573
2574 level = agaw_to_level(agaw);
2575 if (level != 4 && level != 5)
2576 return -EINVAL;
2577
2578 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2579
2580 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581 domain->iommu_did[iommu->seq_id],
2582 flags);
2583 }
2584
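/*
 * Allocate and register a device_domain_info for (@bus, @devfn)/@dev,
 * probe ATS/PASID/PRI support, attach @domain to @iommu, set up the
 * RID2PASID entry in scalable mode and program the context entry.
 * If the device (or one of its DMA aliases) already has a domain,
 * that existing domain is returned and the caller must free the one
 * it passed in.
 */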
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2586 int bus, int devfn,
2587 struct device *dev,
2588 struct dmar_domain *domain)
2589 {
2590 struct dmar_domain *found = NULL;
2591 struct device_domain_info *info;
2592 unsigned long flags;
2593 int ret;
2594
2595 info = alloc_devinfo_mem();
2596 if (!info)
2597 return NULL;
2598
2599 info->bus = bus;
2600 info->devfn = devfn;
2601 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2603 info->ats_qdep = 0;
2604 info->dev = dev;
2605 info->domain = domain;
2606 info->iommu = iommu;
2607 info->pasid_table = NULL;
2608 info->auxd_enabled = 0;
2609 INIT_LIST_HEAD(&info->auxiliary_domains);
2610
2611 if (dev && dev_is_pci(dev)) {
2612 struct pci_dev *pdev = to_pci_dev(info->dev);
2613
2614 if (!pdev->untrusted &&
2615 !pci_ats_disabled() &&
2616 ecap_dev_iotlb_support(iommu->ecap) &&
2617 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618 dmar_find_matched_atsr_unit(pdev))
2619 info->ats_supported = 1;
2620
2621 if (sm_supported(iommu)) {
2622 if (pasid_supported(iommu)) {
2623 int features = pci_pasid_features(pdev);
2624 if (features >= 0)
2625 info->pasid_supported = features | 1;
2626 }
2627
2628 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630 info->pri_supported = 1;
2631 }
2632 }
2633
2634 spin_lock_irqsave(&device_domain_lock, flags);
2635 if (dev)
2636 found = find_domain(dev);
2637
2638 if (!found) {
2639 struct device_domain_info *info2;
2640 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2641 if (info2) {
2642 found = info2->domain;
2643 info2->dev = dev;
2644 }
2645 }
2646
2647 if (found) {
2648 spin_unlock_irqrestore(&device_domain_lock, flags);
2649 free_devinfo_mem(info);
2650 /* Caller must free the original domain */
2651 return found;
2652 }
2653
2654 spin_lock(&iommu->lock);
2655 ret = domain_attach_iommu(domain, iommu);
2656 spin_unlock(&iommu->lock);
2657
2658 if (ret) {
2659 spin_unlock_irqrestore(&device_domain_lock, flags);
2660 free_devinfo_mem(info);
2661 return NULL;
2662 }
2663
2664 list_add(&info->link, &domain->devices);
2665 list_add(&info->global, &device_domain_list);
2666 if (dev)
2667 dev->archdata.iommu = info;
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2669
2670 /* PASID table is mandatory for a PCI device in scalable mode. */
2671 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672 ret = intel_pasid_alloc_table(dev);
2673 if (ret) {
2674 dev_err(dev, "PASID table allocation failed\n");
2675 dmar_remove_one_dev_info(dev);
2676 return NULL;
2677 }
2678
2679 /* Setup the PASID entry for requests without PASID: */
2680 spin_lock(&iommu->lock);
2681 if (hw_pass_through && domain_type_is_si(domain))
2682 ret = intel_pasid_setup_pass_through(iommu, domain,
2683 dev, PASID_RID2PASID);
2684 else if (domain_use_first_level(domain))
2685 ret = domain_setup_first_level(iommu, domain, dev,
2686 PASID_RID2PASID);
2687 else
2688 ret = intel_pasid_setup_second_level(iommu, domain,
2689 dev, PASID_RID2PASID);
2690 spin_unlock(&iommu->lock);
2691 if (ret) {
2692 dev_err(dev, "Setup RID2PASID failed\n");
2693 dmar_remove_one_dev_info(dev);
2694 return NULL;
2695 }
2696 }
2697
2698 if (dev && domain_context_mapping(domain, dev)) {
2699 dev_err(dev, "Domain context map failed\n");
2700 dmar_remove_one_dev_info(dev);
2701 return NULL;
2702 }
2703
2704 return domain;
2705 }
2706
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2708 {
2709 *(u16 *)opaque = alias;
2710 return 0;
2711 }
2712
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2714 {
2715 struct device_domain_info *info;
2716 struct dmar_domain *domain = NULL;
2717 struct intel_iommu *iommu;
2718 u16 dma_alias;
2719 unsigned long flags;
2720 u8 bus, devfn;
2721
2722 iommu = device_to_iommu(dev, &bus, &devfn);
2723 if (!iommu)
2724 return NULL;
2725
2726 if (dev_is_pci(dev)) {
2727 struct pci_dev *pdev = to_pci_dev(dev);
2728
2729 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2730
2731 spin_lock_irqsave(&device_domain_lock, flags);
2732 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733 PCI_BUS_NUM(dma_alias),
2734 dma_alias & 0xff);
2735 if (info) {
2736 iommu = info->iommu;
2737 domain = info->domain;
2738 }
2739 spin_unlock_irqrestore(&device_domain_lock, flags);
2740
2741 /* DMA alias already has a domain, use it */
2742 if (info)
2743 goto out;
2744 }
2745
2746 /* Allocate and initialize new domain for the device */
2747 domain = alloc_domain(0);
2748 if (!domain)
2749 return NULL;
2750 if (domain_init(domain, iommu, gaw)) {
2751 domain_exit(domain);
2752 return NULL;
2753 }
2754
2755 out:
2756 return domain;
2757 }
2758
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760 struct dmar_domain *domain)
2761 {
2762 struct intel_iommu *iommu;
2763 struct dmar_domain *tmp;
2764 u16 req_id, dma_alias;
2765 u8 bus, devfn;
2766
2767 iommu = device_to_iommu(dev, &bus, &devfn);
2768 if (!iommu)
2769 return NULL;
2770
2771 req_id = ((u16)bus << 8) | devfn;
2772
2773 if (dev_is_pci(dev)) {
2774 struct pci_dev *pdev = to_pci_dev(dev);
2775
2776 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2777
2778 /* register PCI DMA alias device */
2779 if (req_id != dma_alias) {
2780 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781 dma_alias & 0xff, NULL, domain);
2782
2783 if (!tmp || tmp != domain)
2784 return tmp;
2785 }
2786 }
2787
2788 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789 if (!tmp || tmp != domain)
2790 return tmp;
2791
2792 return domain;
2793 }
2794
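/*
 * Reserve the IOVA range covering [@start, @end] in @domain and
 * install a 1:1 mapping for it.
 */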
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796 unsigned long long start,
2797 unsigned long long end)
2798 {
2799 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2801
2802 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803 dma_to_mm_pfn(last_vpfn))) {
2804 pr_err("Reserving iova failed\n");
2805 return -ENOMEM;
2806 }
2807
2808 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2809 /*
2810 * The RMRR range might overlap with a physical memory range,
2811 * so clear it first.
2812 */
2813 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2814
2815 return __domain_mapping(domain, first_vpfn, NULL,
2816 first_vpfn, last_vpfn - first_vpfn + 1,
2817 DMA_PTE_READ|DMA_PTE_WRITE);
2818 }
2819
2820 static int domain_prepare_identity_map(struct device *dev,
2821 struct dmar_domain *domain,
2822 unsigned long long start,
2823 unsigned long long end)
2824 {
2825 /* For _hardware_ passthrough, don't bother. But for software
2826 passthrough, we do it anyway -- it may indicate a memory
2827 range which is reserved in E820 and therefore didn't get
2828 set up in si_domain to start with */
2829 if (domain == si_domain && hw_pass_through) {
2830 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2831 start, end);
2832 return 0;
2833 }
2834
2835 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2836
2837 if (end < start) {
2838 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840 dmi_get_system_info(DMI_BIOS_VENDOR),
2841 dmi_get_system_info(DMI_BIOS_VERSION),
2842 dmi_get_system_info(DMI_PRODUCT_VERSION));
2843 return -EIO;
2844 }
2845
2846 if (end >> agaw_to_width(domain->agaw)) {
2847 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849 agaw_to_width(domain->agaw),
2850 dmi_get_system_info(DMI_BIOS_VENDOR),
2851 dmi_get_system_info(DMI_BIOS_VERSION),
2852 dmi_get_system_info(DMI_PRODUCT_VERSION));
2853 return -EIO;
2854 }
2855
2856 return iommu_domain_identity_map(domain, start, end);
2857 }
2858
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2860
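/*
 * Create the static identity domain (si_domain) used for passthrough.
 * For software passthrough, identity map every usable memory range of
 * every online node as well as all RMRR regions; with hardware
 * passthrough (@hw) no mappings are needed.
 */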
2861 static int __init si_domain_init(int hw)
2862 {
2863 struct dmar_rmrr_unit *rmrr;
2864 struct device *dev;
2865 int i, nid, ret;
2866
2867 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2868 if (!si_domain)
2869 return -EFAULT;
2870
2871 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872 domain_exit(si_domain);
2873 return -EFAULT;
2874 }
2875
2876 if (hw)
2877 return 0;
2878
2879 for_each_online_node(nid) {
2880 unsigned long start_pfn, end_pfn;
2881 int i;
2882
2883 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884 ret = iommu_domain_identity_map(si_domain,
2885 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2886 if (ret)
2887 return ret;
2888 }
2889 }
2890
2891 /*
2892 * Identity map the RMRRs so that devices with RMRRs can also use
2893 * the si_domain.
2894 */
2895 for_each_rmrr_units(rmrr) {
2896 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2897 i, dev) {
2898 unsigned long long start = rmrr->base_address;
2899 unsigned long long end = rmrr->end_address;
2900
2901 if (WARN_ON(end < start ||
2902 end >> agaw_to_width(si_domain->agaw)))
2903 continue;
2904
2905 ret = iommu_domain_identity_map(si_domain, start, end);
2906 if (ret)
2907 return ret;
2908 }
2909 }
2910
2911 return 0;
2912 }
2913
2914 static int identity_mapping(struct device *dev)
2915 {
2916 struct device_domain_info *info;
2917
2918 info = dev->archdata.iommu;
2919 if (info)
2920 return (info->domain == si_domain);
2921
2922 return 0;
2923 }
2924
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2926 {
2927 struct dmar_domain *ndomain;
2928 struct intel_iommu *iommu;
2929 u8 bus, devfn;
2930
2931 iommu = device_to_iommu(dev, &bus, &devfn);
2932 if (!iommu)
2933 return -ENODEV;
2934
2935 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936 if (ndomain != domain)
2937 return -EBUSY;
2938
2939 return 0;
2940 }
2941
2942 static bool device_has_rmrr(struct device *dev)
2943 {
2944 struct dmar_rmrr_unit *rmrr;
2945 struct device *tmp;
2946 int i;
2947
2948 rcu_read_lock();
2949 for_each_rmrr_units(rmrr) {
2950 /*
2951 * Return TRUE if this RMRR contains the device that
2952 * is passed in.
2953 */
2954 for_each_active_dev_scope(rmrr->devices,
2955 rmrr->devices_cnt, i, tmp)
2956 if (tmp == dev ||
2957 is_downstream_to_pci_bridge(dev, tmp)) {
2958 rcu_read_unlock();
2959 return true;
2960 }
2961 }
2962 rcu_read_unlock();
2963 return false;
2964 }
2965
2966 /**
2967 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968 * is relaxable (ie. is allowed to be not enforced under some conditions)
2969 * @dev: device handle
2970 *
2971 * We assume that PCI USB devices with RMRRs have them largely
2972 * for historical reasons and that the RMRR space is not actively used post
2973 * boot. This exclusion may change if vendors begin to abuse it.
2974 *
2975 * The same exception is made for graphics devices, with the requirement that
2976 * any use of the RMRR regions will be torn down before assigning the device
2977 * to a guest.
2978 *
2979 * Return: true if the RMRR is relaxable, false otherwise
2980 */
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2982 {
2983 struct pci_dev *pdev;
2984
2985 if (!dev_is_pci(dev))
2986 return false;
2987
2988 pdev = to_pci_dev(dev);
2989 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2990 return true;
2991 else
2992 return false;
2993 }
2994
2995 /*
2996 * There are a couple cases where we need to restrict the functionality of
2997 * devices associated with RMRRs. The first is when evaluating a device for
2998 * identity mapping because problems exist when devices are moved in and out
2999 * of domains and their respective RMRR information is lost. This means that
3000 * a device with associated RMRRs will never be in a "passthrough" domain.
3001 * The second is use of the device through the IOMMU API. This interface
3002 * expects to have full control of the IOVA space for the device. We cannot
3003 * satisfy both the requirement that RMRR access is maintained and have an
3004 * unencumbered IOVA space. We also have no ability to quiesce the device's
3005 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006 * We therefore prevent devices associated with an RMRR from participating in
3007 * the IOMMU API, which eliminates them from device assignment.
3008 *
3009 * In both cases, devices which have relaxable RMRRs are not concerned by this
3010 * restriction. See device_rmrr_is_relaxable comment.
3011 */
3012 static bool device_is_rmrr_locked(struct device *dev)
3013 {
3014 if (!device_has_rmrr(dev))
3015 return false;
3016
3017 if (device_rmrr_is_relaxable(dev))
3018 return false;
3019
3020 return true;
3021 }
3022
3023 /*
3024 * Return the required default domain type for a specific device.
3025 *
3026 * @dev: the device in query
3027 * @startup: true if this is during early boot
3028 *
3029 * Returns:
3030 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3032 * - 0: both identity and dynamic domains work for this device
3033 */
3034 static int device_def_domain_type(struct device *dev)
3035 {
3036 if (dev_is_pci(dev)) {
3037 struct pci_dev *pdev = to_pci_dev(dev);
3038
3039 /*
3040 * Prevent any device marked as untrusted from getting
3041 * placed into the static identity mapping domain.
3042 */
3043 if (pdev->untrusted)
3044 return IOMMU_DOMAIN_DMA;
3045
3046 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047 return IOMMU_DOMAIN_IDENTITY;
3048
3049 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050 return IOMMU_DOMAIN_IDENTITY;
3051
3052 /*
3053 * We want to start off with all devices in the 1:1 domain, and
3054 * take them out later if we find they can't access all of memory.
3055 *
3056 * However, we can't do this for PCI devices behind bridges,
3057 * because all PCI devices behind the same bridge will end up
3058 * with the same source-id on their transactions.
3059 *
3060 * Practically speaking, we can't change things around for these
3061 * devices at run-time, because we can't be sure there'll be no
3062 * DMA transactions in flight for any of their siblings.
3063 *
3064 * So PCI devices (unless they're on the root bus) as well as
3065 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066 * the 1:1 domain, just in _case_ one of their siblings turns out
3067 * not to be able to map all of memory.
3068 */
3069 if (!pci_is_pcie(pdev)) {
3070 if (!pci_is_root_bus(pdev->bus))
3071 return IOMMU_DOMAIN_DMA;
3072 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073 return IOMMU_DOMAIN_DMA;
3074 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075 return IOMMU_DOMAIN_DMA;
3076 }
3077
3078 return 0;
3079 }
3080
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3082 {
3083 /*
3084 * Start from the sane iommu hardware state.
3085 * If queued invalidation was already initialized by us
3086 * (for example, while enabling interrupt remapping) then
3087 * things are already rolling from a sane state.
3088 */
3089 if (!iommu->qi) {
3090 /*
3091 * Clear any previous faults.
3092 */
3093 dmar_fault(-1, iommu);
3094 /*
3095 * Disable queued invalidation if supported and already enabled
3096 * before OS handover.
3097 */
3098 dmar_disable_qi(iommu);
3099 }
3100
3101 if (dmar_enable_qi(iommu)) {
3102 /*
3103 * Queued Invalidate not enabled, use Register Based Invalidate
3104 */
3105 iommu->flush.flush_context = __iommu_flush_context;
3106 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107 pr_info("%s: Using Register based invalidation\n",
3108 iommu->name);
3109 } else {
3110 iommu->flush.flush_context = qi_flush_context;
3111 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112 pr_info("%s: Using Queued invalidation\n", iommu->name);
3113 }
3114 }
3115
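/*
 * Copy one bus worth of context entries from the previous kernel's
 * tables into freshly allocated tables, reserving the domain ids
 * found there and marking every entry as copied so that later updates
 * know to flush the stale cached entries.
 */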
3116 static int copy_context_table(struct intel_iommu *iommu,
3117 struct root_entry *old_re,
3118 struct context_entry **tbl,
3119 int bus, bool ext)
3120 {
3121 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122 struct context_entry *new_ce = NULL, ce;
3123 struct context_entry *old_ce = NULL;
3124 struct root_entry re;
3125 phys_addr_t old_ce_phys;
3126
3127 tbl_idx = ext ? bus * 2 : bus;
3128 memcpy(&re, old_re, sizeof(re));
3129
3130 for (devfn = 0; devfn < 256; devfn++) {
3131 /* First calculate the correct index */
3132 idx = (ext ? devfn * 2 : devfn) % 256;
3133
3134 if (idx == 0) {
3135 /* First save what we may have and clean up */
3136 if (new_ce) {
3137 tbl[tbl_idx] = new_ce;
3138 __iommu_flush_cache(iommu, new_ce,
3139 VTD_PAGE_SIZE);
3140 pos = 1;
3141 }
3142
3143 if (old_ce)
3144 memunmap(old_ce);
3145
3146 ret = 0;
3147 if (devfn < 0x80)
3148 old_ce_phys = root_entry_lctp(&re);
3149 else
3150 old_ce_phys = root_entry_uctp(&re);
3151
3152 if (!old_ce_phys) {
3153 if (ext && devfn == 0) {
3154 /* No LCTP, try UCTP */
3155 devfn = 0x7f;
3156 continue;
3157 } else {
3158 goto out;
3159 }
3160 }
3161
3162 ret = -ENOMEM;
3163 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3164 MEMREMAP_WB);
3165 if (!old_ce)
3166 goto out;
3167
3168 new_ce = alloc_pgtable_page(iommu->node);
3169 if (!new_ce)
3170 goto out_unmap;
3171
3172 ret = 0;
3173 }
3174
3175 /* Now copy the context entry */
3176 memcpy(&ce, old_ce + idx, sizeof(ce));
3177
3178 if (!__context_present(&ce))
3179 continue;
3180
3181 did = context_domain_id(&ce);
3182 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183 set_bit(did, iommu->domain_ids);
3184
3185 /*
3186 * We need a marker for copied context entries. This
3187 * marker needs to work for the old format as well as
3188 * for extended context entries.
3189 *
3190 * Bit 67 of the context entry is used. In the old
3191 * format this bit is available to software, in the
3192 * extended format it is the PGE bit, but PGE is ignored
3193 * by HW if PASIDs are disabled (and thus still
3194 * available).
3195 *
3196 * So disable PASIDs first and then mark the entry
3197 * copied. This means that we don't copy PASID
3198 * translations from the old kernel, but this is fine as
3199 * faults there are not fatal.
3200 */
3201 context_clear_pasid_enable(&ce);
3202 context_set_copied(&ce);
3203
3204 new_ce[idx] = ce;
3205 }
3206
3207 tbl[tbl_idx + pos] = new_ce;
3208
3209 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3210
3211 out_unmap:
3212 memunmap(old_ce);
3213
3214 out:
3215 return ret;
3216 }
3217
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3219 {
3220 struct context_entry **ctxt_tbls;
3221 struct root_entry *old_rt;
3222 phys_addr_t old_rt_phys;
3223 int ctxt_table_entries;
3224 unsigned long flags;
3225 u64 rtaddr_reg;
3226 int bus, ret;
3227 bool new_ext, ext;
3228
3229 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231 new_ext = !!ecap_ecs(iommu->ecap);
3232
3233 /*
3234 * The RTT bit can only be changed when translation is disabled,
3235 * but disabling translation means to open a window for data
3236 * corruption. So bail out and don't copy anything if we would
3237 * have to change the bit.
3238 */
3239 if (new_ext != ext)
3240 return -EINVAL;
3241
3242 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3243 if (!old_rt_phys)
3244 return -EINVAL;
3245
3246 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3247 if (!old_rt)
3248 return -ENOMEM;
3249
3250 /* This is too big for the stack - allocate it from slab */
3251 ctxt_table_entries = ext ? 512 : 256;
3252 ret = -ENOMEM;
3253 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3254 if (!ctxt_tbls)
3255 goto out_unmap;
3256
3257 for (bus = 0; bus < 256; bus++) {
3258 ret = copy_context_table(iommu, &old_rt[bus],
3259 ctxt_tbls, bus, ext);
3260 if (ret) {
3261 pr_err("%s: Failed to copy context table for bus %d\n",
3262 iommu->name, bus);
3263 continue;
3264 }
3265 }
3266
3267 spin_lock_irqsave(&iommu->lock, flags);
3268
3269 /* Context tables are copied, now write them to the root_entry table */
3270 for (bus = 0; bus < 256; bus++) {
3271 int idx = ext ? bus * 2 : bus;
3272 u64 val;
3273
3274 if (ctxt_tbls[idx]) {
3275 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276 iommu->root_entry[bus].lo = val;
3277 }
3278
3279 if (!ext || !ctxt_tbls[idx + 1])
3280 continue;
3281
3282 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283 iommu->root_entry[bus].hi = val;
3284 }
3285
3286 spin_unlock_irqrestore(&iommu->lock, flags);
3287
3288 kfree(ctxt_tbls);
3289
3290 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3291
3292 ret = 0;
3293
3294 out_unmap:
3295 memunmap(old_rt);
3296
3297 return ret;
3298 }
3299
3300 static int __init init_dmars(void)
3301 {
3302 struct dmar_drhd_unit *drhd;
3303 struct intel_iommu *iommu;
3304 int ret;
3305
3306 /*
3307 * for each drhd
3308 * allocate root
3309 * initialize and program root entry to not present
3310 * endfor
3311 */
3312 for_each_drhd_unit(drhd) {
3313 /*
3314 * No lock needed as this is only incremented in the single-
3315 * threaded kernel __init code path; all other accesses are
3316 * read-only.
3317 */
3318 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3319 g_num_of_iommus++;
3320 continue;
3321 }
3322 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3323 }
3324
3325 /* Preallocate enough resources for IOMMU hot-addition */
3326 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3328
3329 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3330 GFP_KERNEL);
3331 if (!g_iommus) {
3332 pr_err("Allocating global iommu array failed\n");
3333 ret = -ENOMEM;
3334 goto error;
3335 }
3336
3337 for_each_iommu(iommu, drhd) {
3338 if (drhd->ignored) {
3339 iommu_disable_translation(iommu);
3340 continue;
3341 }
3342
3343 /*
3344 * Find the max pasid size of all IOMMUs in the system.
3345 * We need to ensure the system pasid table is no bigger
3346 * than the smallest supported size.
3347 */
3348 if (pasid_supported(iommu)) {
3349 u32 temp = 2 << ecap_pss(iommu->ecap);
3350
3351 intel_pasid_max_id = min_t(u32, temp,
3352 intel_pasid_max_id);
3353 }
3354
3355 g_iommus[iommu->seq_id] = iommu;
3356
3357 intel_iommu_init_qi(iommu);
3358
3359 ret = iommu_init_domains(iommu);
3360 if (ret)
3361 goto free_iommu;
3362
3363 init_translation_status(iommu);
3364
3365 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366 iommu_disable_translation(iommu);
3367 clear_translation_pre_enabled(iommu);
3368 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3369 iommu->name);
3370 }
3371
3372 /*
3373 * TBD:
3374 * we could share the same root & context tables
3375 * among all IOMMUs. Need to split this out later.
3376 */
3377 ret = iommu_alloc_root_entry(iommu);
3378 if (ret)
3379 goto free_iommu;
3380
3381 if (translation_pre_enabled(iommu)) {
3382 pr_info("Translation already enabled - trying to copy translation structures\n");
3383
3384 ret = copy_translation_tables(iommu);
3385 if (ret) {
3386 /*
3387 * We found the IOMMU with translation
3388 * enabled - but failed to copy over the
3389 * old root-entry table. Try to proceed
3390 * by disabling translation now and
3391 * allocating a clean root-entry table.
3392 * This might cause DMAR faults, but
3393 * probably the dump will still succeed.
3394 */
3395 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3396 iommu->name);
3397 iommu_disable_translation(iommu);
3398 clear_translation_pre_enabled(iommu);
3399 } else {
3400 pr_info("Copied translation tables from previous kernel for %s\n",
3401 iommu->name);
3402 }
3403 }
3404
3405 if (!ecap_pass_through(iommu->ecap))
3406 hw_pass_through = 0;
3407 intel_svm_check(iommu);
3408 }
3409
3410 /*
3411 * Now that qi is enabled on all iommus, set the root entry and flush
3412 * caches. This is required on some Intel X58 chipsets, otherwise the
3413 * flush_context function will loop forever and the boot hangs.
3414 */
3415 for_each_active_iommu(iommu, drhd) {
3416 iommu_flush_write_buffer(iommu);
3417 iommu_set_root_entry(iommu);
3418 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3420 }
3421
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3423 dmar_map_gfx = 0;
3424 #endif
3425
3426 if (!dmar_map_gfx)
3427 iommu_identity_mapping |= IDENTMAP_GFX;
3428
3429 check_tylersburg_isoch();
3430
3431 ret = si_domain_init(hw_pass_through);
3432 if (ret)
3433 goto free_iommu;
3434
3435 /*
3436 * for each drhd
3437 * enable fault log
3438 * global invalidate context cache
3439 * global invalidate iotlb
3440 * enable translation
3441 */
3442 for_each_iommu(iommu, drhd) {
3443 if (drhd->ignored) {
3444 /*
3445 * we always have to disable PMRs or DMA may fail on
3446 * this device
3447 */
3448 if (force_on)
3449 iommu_disable_protect_mem_regions(iommu);
3450 continue;
3451 }
3452
3453 iommu_flush_write_buffer(iommu);
3454
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 /*
3458 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3459 * could cause a lock race, so drop the lock around it.
3460 */
3461 up_write(&dmar_global_lock);
3462 ret = intel_svm_enable_prq(iommu);
3463 down_write(&dmar_global_lock);
3464 if (ret)
3465 goto free_iommu;
3466 }
3467 #endif
3468 ret = dmar_set_interrupt(iommu);
3469 if (ret)
3470 goto free_iommu;
3471 }
3472
3473 return 0;
3474
3475 free_iommu:
3476 for_each_active_iommu(iommu, drhd) {
3477 disable_dmar_iommu(iommu);
3478 free_dmar_iommu(iommu);
3479 }
3480
3481 kfree(g_iommus);
3482
3483 error:
3484 return ret;
3485 }
3486
3487 /* This takes a number of _MM_ pages, not VTD pages */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489 struct dmar_domain *domain,
3490 unsigned long nrpages, uint64_t dma_mask)
3491 {
3492 unsigned long iova_pfn;
3493
3494 /*
3495 * Restrict dma_mask to the width that the iommu can handle.
3496 * First-level translation restricts the input-address to a
3497 * canonical address (i.e., address bits 63:N have the same
3498 * value as address bit [N-1], where N is 48-bits with 4-level
3499 * paging and 57-bits with 5-level paging). Hence, skip bit
3500 * [N-1].
3501 */
3502 if (domain_use_first_level(domain))
3503 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3504 dma_mask);
3505 else
3506 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3507 dma_mask);
3508
3509 /* Ensure we reserve the whole size-aligned region */
3510 nrpages = __roundup_pow_of_two(nrpages);
3511
3512 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3513 /*
3514 * First try to allocate an io virtual address in
3515 * DMA_BIT_MASK(32) and if that fails then try allocating
3516 * from higher range
3517 */
3518 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519 IOVA_PFN(DMA_BIT_MASK(32)), false);
3520 if (iova_pfn)
3521 return iova_pfn;
3522 }
3523 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524 IOVA_PFN(dma_mask), true);
3525 if (unlikely(!iova_pfn)) {
3526 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3527 nrpages);
3528 return 0;
3529 }
3530
3531 return iova_pfn;
3532 }
3533
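/*
 * Allocate a private DMA-API domain for @dev (which must not already
 * have one), set up identity mappings for any RMRRs that target the
 * device, and make the new domain the device's domain.
 */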
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3535 {
3536 struct dmar_domain *domain, *tmp;
3537 struct dmar_rmrr_unit *rmrr;
3538 struct device *i_dev;
3539 int i, ret;
3540
3541 /* Device shouldn't be attached to any domain yet. */
3542 domain = find_domain(dev);
3543 if (domain)
3544 return NULL;
3545
3546 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3547 if (!domain)
3548 goto out;
3549
3550 /* We have a new domain - setup possible RMRRs for the device */
3551 rcu_read_lock();
3552 for_each_rmrr_units(rmrr) {
3553 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3554 i, i_dev) {
3555 if (i_dev != dev)
3556 continue;
3557
3558 ret = domain_prepare_identity_map(dev, domain,
3559 rmrr->base_address,
3560 rmrr->end_address);
3561 if (ret)
3562 dev_err(dev, "Mapping reserved region failed\n");
3563 }
3564 }
3565 rcu_read_unlock();
3566
3567 tmp = set_domain_for_dev(dev, domain);
3568 if (!tmp || domain != tmp) {
3569 domain_exit(domain);
3570 domain = tmp;
3571 }
3572
3573 out:
3574 if (!domain)
3575 dev_err(dev, "Allocating domain failed\n");
3576 else
3577 domain->domain.type = IOMMU_DOMAIN_DMA;
3578
3579 return domain;
3580 }
3581
3582 /* Check if the dev needs to go through the non-identity map and unmap process. */
3583 static bool iommu_need_mapping(struct device *dev)
3584 {
3585 int ret;
3586
3587 if (iommu_dummy(dev))
3588 return false;
3589
3590 if (unlikely(attach_deferred(dev)))
3591 do_deferred_attach(dev);
3592
3593 ret = identity_mapping(dev);
3594 if (ret) {
3595 u64 dma_mask = *dev->dma_mask;
3596
3597 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598 dma_mask = dev->coherent_dma_mask;
3599
3600 if (dma_mask >= dma_direct_get_required_mask(dev))
3601 return false;
3602
3603 /*
3604 * The 32-bit DMA device is removed from si_domain and falls
3605 * back to non-identity mapping.
3606 */
3607 dmar_remove_one_dev_info(dev);
3608 ret = iommu_request_dma_domain_for_dev(dev);
3609 if (ret) {
3610 struct iommu_domain *domain;
3611 struct dmar_domain *dmar_domain;
3612
3613 domain = iommu_get_domain_for_dev(dev);
3614 if (domain) {
3615 dmar_domain = to_dmar_domain(domain);
3616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3617 }
3618 dmar_remove_one_dev_info(dev);
3619 get_private_domain_for_dev(dev);
3620 }
3621
3622 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3623 }
3624
3625 return true;
3626 }
3627
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629 size_t size, int dir, u64 dma_mask)
3630 {
3631 struct dmar_domain *domain;
3632 phys_addr_t start_paddr;
3633 unsigned long iova_pfn;
3634 int prot = 0;
3635 int ret;
3636 struct intel_iommu *iommu;
3637 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3638
3639 BUG_ON(dir == DMA_NONE);
3640
3641 domain = find_domain(dev);
3642 if (!domain)
3643 return DMA_MAPPING_ERROR;
3644
3645 iommu = domain_get_iommu(domain);
3646 size = aligned_nrpages(paddr, size);
3647
3648 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3649 if (!iova_pfn)
3650 goto error;
3651
3652 /*
3653 * Check if DMAR supports zero-length reads on write-only
3654 * mappings.
3655 */
3656 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657 !cap_zlr(iommu->cap))
3658 prot |= DMA_PTE_READ;
3659 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660 prot |= DMA_PTE_WRITE;
3661 /*
3662 * paddr..(paddr + size) might span a partial page, so we should map
3663 * the whole page. Note: if two parts of one page are separately
3664 * mapped, we might have two guest addresses mapping to the same host
3665 * paddr, but this is not a big problem
3666 */
3667 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668 mm_to_dma_pfn(paddr_pfn), size, prot);
3669 if (ret)
3670 goto error;
3671
3672 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673 start_paddr += paddr & ~PAGE_MASK;
3674
3675 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3676
3677 return start_paddr;
3678
3679 error:
3680 if (iova_pfn)
3681 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683 size, (unsigned long long)paddr, dir);
3684 return DMA_MAPPING_ERROR;
3685 }
3686
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688 unsigned long offset, size_t size,
3689 enum dma_data_direction dir,
3690 unsigned long attrs)
3691 {
3692 if (iommu_need_mapping(dev))
3693 return __intel_map_single(dev, page_to_phys(page) + offset,
3694 size, dir, *dev->dma_mask);
3695 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3696 }
3697
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699 size_t size, enum dma_data_direction dir,
3700 unsigned long attrs)
3701 {
3702 if (iommu_need_mapping(dev))
3703 return __intel_map_single(dev, phys_addr, size, dir,
3704 *dev->dma_mask);
3705 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3706 }
3707
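/*
 * Common DMA-API unmap path: tear down the page tables for the IOVA
 * range and either flush the IOTLB synchronously and free the IOVA
 * right away (strict mode, untrusted devices, or no flush queue), or
 * defer both through the IOVA flush queue.
 */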
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3709 {
3710 struct dmar_domain *domain;
3711 unsigned long start_pfn, last_pfn;
3712 unsigned long nrpages;
3713 unsigned long iova_pfn;
3714 struct intel_iommu *iommu;
3715 struct page *freelist;
3716 struct pci_dev *pdev = NULL;
3717
3718 domain = find_domain(dev);
3719 BUG_ON(!domain);
3720
3721 iommu = domain_get_iommu(domain);
3722
3723 iova_pfn = IOVA_PFN(dev_addr);
3724
3725 nrpages = aligned_nrpages(dev_addr, size);
3726 start_pfn = mm_to_dma_pfn(iova_pfn);
3727 last_pfn = start_pfn + nrpages - 1;
3728
3729 if (dev_is_pci(dev))
3730 pdev = to_pci_dev(dev);
3731
3732 freelist = domain_unmap(domain, start_pfn, last_pfn);
3733 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734 !has_iova_flush_queue(&domain->iovad)) {
3735 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736 nrpages, !freelist, 0);
3737 /* free iova */
3738 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739 dma_free_pagelist(freelist);
3740 } else {
3741 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742 (unsigned long)freelist);
3743 /*
3744 * Queue up the release of the unmap to save the roughly 1/6th of
3745 * the CPU time otherwise used up by the iotlb flush operation.
3746 */
3747 }
3748
3749 trace_unmap_single(dev, dev_addr, size);
3750 }
3751
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753 size_t size, enum dma_data_direction dir,
3754 unsigned long attrs)
3755 {
3756 if (iommu_need_mapping(dev))
3757 intel_unmap(dev, dev_addr, size);
3758 else
3759 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3760 }
3761
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763 size_t size, enum dma_data_direction dir, unsigned long attrs)
3764 {
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
3767 }
3768
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770 dma_addr_t *dma_handle, gfp_t flags,
3771 unsigned long attrs)
3772 {
3773 struct page *page = NULL;
3774 int order;
3775
3776 if (!iommu_need_mapping(dev))
3777 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3778
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3781
3782 if (gfpflags_allow_blocking(flags)) {
3783 unsigned int count = size >> PAGE_SHIFT;
3784
3785 page = dma_alloc_from_contiguous(dev, count, order,
3786 flags & __GFP_NOWARN);
3787 }
3788
3789 if (!page)
3790 page = alloc_pages(flags, order);
3791 if (!page)
3792 return NULL;
3793 memset(page_address(page), 0, size);
3794
3795 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3796 DMA_BIDIRECTIONAL,
3797 dev->coherent_dma_mask);
3798 if (*dma_handle != DMA_MAPPING_ERROR)
3799 return page_address(page);
3800 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801 __free_pages(page, order);
3802
3803 return NULL;
3804 }
3805
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807 dma_addr_t dma_handle, unsigned long attrs)
3808 {
3809 int order;
3810 struct page *page = virt_to_page(vaddr);
3811
3812 if (!iommu_need_mapping(dev))
3813 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3814
3815 size = PAGE_ALIGN(size);
3816 order = get_order(size);
3817
3818 intel_unmap(dev, dma_handle, size);
3819 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820 __free_pages(page, order);
3821 }
3822
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824 int nelems, enum dma_data_direction dir,
3825 unsigned long attrs)
3826 {
3827 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828 unsigned long nrpages = 0;
3829 struct scatterlist *sg;
3830 int i;
3831
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3834
3835 for_each_sg(sglist, sg, nelems, i) {
3836 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3837 }
3838
3839 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3840
3841 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3842 }
3843
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845 enum dma_data_direction dir, unsigned long attrs)
3846 {
3847 int i;
3848 struct dmar_domain *domain;
3849 size_t size = 0;
3850 int prot = 0;
3851 unsigned long iova_pfn;
3852 int ret;
3853 struct scatterlist *sg;
3854 unsigned long start_vpfn;
3855 struct intel_iommu *iommu;
3856
3857 BUG_ON(dir == DMA_NONE);
3858 if (!iommu_need_mapping(dev))
3859 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3860
3861 domain = find_domain(dev);
3862 if (!domain)
3863 return 0;
3864
3865 iommu = domain_get_iommu(domain);
3866
3867 for_each_sg(sglist, sg, nelems, i)
3868 size += aligned_nrpages(sg->offset, sg->length);
3869
3870 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3871 *dev->dma_mask);
3872 if (!iova_pfn) {
3873 sglist->dma_length = 0;
3874 return 0;
3875 }
3876
3877 /*
3878 * Check if DMAR supports zero-length reads on write-only
3879 * mappings.
3880 */
3881 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882 !cap_zlr(iommu->cap))
3883 prot |= DMA_PTE_READ;
3884 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885 prot |= DMA_PTE_WRITE;
3886
3887 start_vpfn = mm_to_dma_pfn(iova_pfn);
3888
3889 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890 if (unlikely(ret)) {
3891 dma_pte_free_pagetable(domain, start_vpfn,
3892 start_vpfn + size - 1,
3893 agaw_to_level(domain->agaw) + 1);
3894 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3895 return 0;
3896 }
3897
3898 for_each_sg(sglist, sg, nelems, i)
3899 trace_map_sg(dev, i + 1, nelems, sg);
3900
3901 return nelems;
3902 }
3903
3904 static u64 intel_get_required_mask(struct device *dev)
3905 {
3906 if (!iommu_need_mapping(dev))
3907 return dma_direct_get_required_mask(dev);
3908 return DMA_BIT_MASK(32);
3909 }
3910
3911 static const struct dma_map_ops intel_dma_ops = {
3912 .alloc = intel_alloc_coherent,
3913 .free = intel_free_coherent,
3914 .map_sg = intel_map_sg,
3915 .unmap_sg = intel_unmap_sg,
3916 .map_page = intel_map_page,
3917 .unmap_page = intel_unmap_page,
3918 .map_resource = intel_map_resource,
3919 .unmap_resource = intel_unmap_resource,
3920 .dma_supported = dma_direct_supported,
3921 .mmap = dma_common_mmap,
3922 .get_sgtable = dma_common_get_sgtable,
3923 .get_required_mask = intel_get_required_mask,
3924 };
3925
3926 static void
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928 enum dma_data_direction dir, enum dma_sync_target target)
3929 {
3930 struct dmar_domain *domain;
3931 phys_addr_t tlb_addr;
3932
3933 domain = find_domain(dev);
3934 if (WARN_ON(!domain))
3935 return;
3936
3937 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938 if (is_swiotlb_buffer(tlb_addr))
3939 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3940 }
3941
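/*
 * Bounce-buffer mapping for untrusted devices: if either the start or the
 * size of the buffer is not VTD_PAGE aligned, the data is staged through a
 * swiotlb slot so the device never gets an IOMMU mapping that exposes
 * unrelated data sharing the same page; fully aligned buffers are mapped
 * in place without copying.
 */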
3942 static dma_addr_t
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs,
3945 u64 dma_mask)
3946 {
3947 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948 struct dmar_domain *domain;
3949 struct intel_iommu *iommu;
3950 unsigned long iova_pfn;
3951 unsigned long nrpages;
3952 phys_addr_t tlb_addr;
3953 int prot = 0;
3954 int ret;
3955
3956 if (unlikely(attach_deferred(dev)))
3957 do_deferred_attach(dev);
3958
3959 domain = find_domain(dev);
3960
3961 if (WARN_ON(dir == DMA_NONE || !domain))
3962 return DMA_MAPPING_ERROR;
3963
3964 iommu = domain_get_iommu(domain);
3965 if (WARN_ON(!iommu))
3966 return DMA_MAPPING_ERROR;
3967
3968 nrpages = aligned_nrpages(0, size);
3969 iova_pfn = intel_alloc_iova(dev, domain,
3970 dma_to_mm_pfn(nrpages), dma_mask);
3971 if (!iova_pfn)
3972 return DMA_MAPPING_ERROR;
3973
3974 /*
3975 * Check if DMAR supports zero-length reads on write only
3976 * mappings.
3977 */
3978 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979 !cap_zlr(iommu->cap))
3980 prot |= DMA_PTE_READ;
3981 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982 prot |= DMA_PTE_WRITE;
3983
3984 /*
3985 * If both the physical buffer start address and size are
3986 * page aligned, we don't need to use a bounce page.
3987 */
3988 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989 tlb_addr = swiotlb_tbl_map_single(dev,
3990 __phys_to_dma(dev, io_tlb_start),
3991 paddr, size, aligned_size, dir, attrs);
3992 if (tlb_addr == DMA_MAPPING_ERROR) {
3993 goto swiotlb_error;
3994 } else {
3995 /* Cleanup the padding area. */
3996 void *padding_start = phys_to_virt(tlb_addr);
3997 size_t padding_size = aligned_size;
3998
3999 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000 (dir == DMA_TO_DEVICE ||
4001 dir == DMA_BIDIRECTIONAL)) {
4002 padding_start += size;
4003 padding_size -= size;
4004 }
4005
4006 memset(padding_start, 0, padding_size);
4007 }
4008 } else {
4009 tlb_addr = paddr;
4010 }
4011
4012 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4014 if (ret)
4015 goto mapping_error;
4016
4017 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4018
4019 return (dma_addr_t)iova_pfn << PAGE_SHIFT;
4020
4021 mapping_error:
4022 if (is_swiotlb_buffer(tlb_addr))
4023 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024 aligned_size, dir, attrs);
4025 swiotlb_error:
4026 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028 size, (unsigned long long)paddr, dir);
4029
4030 return DMA_MAPPING_ERROR;
4031 }
4032
4033 static void
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035 enum dma_data_direction dir, unsigned long attrs)
4036 {
4037 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038 struct dmar_domain *domain;
4039 phys_addr_t tlb_addr;
4040
4041 domain = find_domain(dev);
4042 if (WARN_ON(!domain))
4043 return;
4044
4045 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046 if (WARN_ON(!tlb_addr))
4047 return;
4048
4049 intel_unmap(dev, dev_addr, size);
4050 if (is_swiotlb_buffer(tlb_addr))
4051 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052 aligned_size, dir, attrs);
4053
4054 trace_bounce_unmap_single(dev, dev_addr, size);
4055 }
4056
4057 static dma_addr_t
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059 size_t size, enum dma_data_direction dir, unsigned long attrs)
4060 {
4061 return bounce_map_single(dev, page_to_phys(page) + offset,
4062 size, dir, attrs, *dev->dma_mask);
4063 }
4064
4065 static dma_addr_t
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067 enum dma_data_direction dir, unsigned long attrs)
4068 {
4069 return bounce_map_single(dev, phys_addr, size,
4070 dir, attrs, *dev->dma_mask);
4071 }
4072
4073 static void
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075 enum dma_data_direction dir, unsigned long attrs)
4076 {
4077 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4078 }
4079
4080 static void
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082 enum dma_data_direction dir, unsigned long attrs)
4083 {
4084 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4085 }
4086
4087 static void
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089 enum dma_data_direction dir, unsigned long attrs)
4090 {
4091 struct scatterlist *sg;
4092 int i;
4093
4094 for_each_sg(sglist, sg, nelems, i)
4095 bounce_unmap_page(dev, sg->dma_address,
4096 sg_dma_len(sg), dir, attrs);
4097 }
4098
4099 static int
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101 enum dma_data_direction dir, unsigned long attrs)
4102 {
4103 int i;
4104 struct scatterlist *sg;
4105
4106 for_each_sg(sglist, sg, nelems, i) {
4107 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108 sg->offset, sg->length,
4109 dir, attrs);
4110 if (sg->dma_address == DMA_MAPPING_ERROR)
4111 goto out_unmap;
4112 sg_dma_len(sg) = sg->length;
4113 }
4114
4115 for_each_sg(sglist, sg, nelems, i)
4116 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4117
4118 return nelems;
4119
4120 out_unmap:
4121 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4122 return 0;
4123 }
4124
4125 static void
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127 size_t size, enum dma_data_direction dir)
4128 {
4129 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4130 }
4131
4132 static void
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134 size_t size, enum dma_data_direction dir)
4135 {
4136 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4137 }
4138
4139 static void
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141 int nelems, enum dma_data_direction dir)
4142 {
4143 struct scatterlist *sg;
4144 int i;
4145
4146 for_each_sg(sglist, sg, nelems, i)
4147 bounce_sync_single(dev, sg_dma_address(sg),
4148 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4149 }
4150
4151 static void
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153 int nelems, enum dma_data_direction dir)
4154 {
4155 struct scatterlist *sg;
4156 int i;
4157
4158 for_each_sg(sglist, sg, nelems, i)
4159 bounce_sync_single(dev, sg_dma_address(sg),
4160 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4161 }
4162
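/*
 * Installed per device via set_dma_ops() in intel_iommu_add_device() when
 * device_needs_bounce() flags the device, so only untrusted devices pay
 * the bounce-buffer cost.
 */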
4163 static const struct dma_map_ops bounce_dma_ops = {
4164 .alloc = intel_alloc_coherent,
4165 .free = intel_free_coherent,
4166 .map_sg = bounce_map_sg,
4167 .unmap_sg = bounce_unmap_sg,
4168 .map_page = bounce_map_page,
4169 .unmap_page = bounce_unmap_page,
4170 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4171 .sync_single_for_device = bounce_sync_single_for_device,
4172 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4173 .sync_sg_for_device = bounce_sync_sg_for_device,
4174 .map_resource = bounce_map_resource,
4175 .unmap_resource = bounce_unmap_resource,
4176 .dma_supported = dma_direct_supported,
4177 };
4178
4179 static inline int iommu_domain_cache_init(void)
4180 {
4181 int ret = 0;
4182
4183 iommu_domain_cache = kmem_cache_create("iommu_domain",
4184 sizeof(struct dmar_domain),
4185 0,
4186 SLAB_HWCACHE_ALIGN,
4188 NULL);
4189 if (!iommu_domain_cache) {
4190 pr_err("Couldn't create iommu_domain cache\n");
4191 ret = -ENOMEM;
4192 }
4193
4194 return ret;
4195 }
4196
4197 static inline int iommu_devinfo_cache_init(void)
4198 {
4199 int ret = 0;
4200
4201 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202 sizeof(struct device_domain_info),
4203 0,
4204 SLAB_HWCACHE_ALIGN,
4205 NULL);
4206 if (!iommu_devinfo_cache) {
4207 pr_err("Couldn't create devinfo cache\n");
4208 ret = -ENOMEM;
4209 }
4210
4211 return ret;
4212 }
4213
4214 static int __init iommu_init_mempool(void)
4215 {
4216 int ret;
4217 ret = iova_cache_get();
4218 if (ret)
4219 return ret;
4220
4221 ret = iommu_domain_cache_init();
4222 if (ret)
4223 goto domain_error;
4224
4225 ret = iommu_devinfo_cache_init();
4226 if (!ret)
4227 return ret;
4228
4229 kmem_cache_destroy(iommu_domain_cache);
4230 domain_error:
4231 iova_cache_put();
4232
4233 return -ENOMEM;
4234 }
4235
4236 static void __init iommu_exit_mempool(void)
4237 {
4238 kmem_cache_destroy(iommu_devinfo_cache);
4239 kmem_cache_destroy(iommu_domain_cache);
4240 iova_cache_put();
4241 }
4242
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4244 {
4245 struct dmar_drhd_unit *drhd;
4246 u32 vtbar;
4247 int rc;
4248
4249 /* We know that this device on this chipset has its own IOMMU.
4250 * If we find it under a different IOMMU, then the BIOS is lying
4251 * to us. Hope that the IOMMU for this device is actually
4252 * disabled, and it needs no translation...
4253 */
4254 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4255 if (rc) {
4256 /* "can't" happen */
4257 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4258 return;
4259 }
4260 vtbar &= 0xffff0000;
4261
4262 /* we know that this iommu should be at offset 0xa000 from vtbar */
4263 drhd = dmar_find_matched_drhd_unit(pdev);
4264 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4265 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4266 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4267 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4268 }
4269 }
4270 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4271
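/*
 * Mark DMAR units that can be bypassed: units with an empty device scope
 * are ignored outright, and units whose scope contains only graphics
 * devices are ignored when dmar_map_gfx is clear, with their devices
 * stubbed out via DUMMY_DEVICE_DOMAIN_INFO.
 */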
4272 static void __init init_no_remapping_devices(void)
4273 {
4274 struct dmar_drhd_unit *drhd;
4275 struct device *dev;
4276 int i;
4277
4278 for_each_drhd_unit(drhd) {
4279 if (!drhd->include_all) {
4280 for_each_active_dev_scope(drhd->devices,
4281 drhd->devices_cnt, i, dev)
4282 break;
4283 /* ignore DMAR unit if no devices exist */
4284 if (i == drhd->devices_cnt)
4285 drhd->ignored = 1;
4286 }
4287 }
4288
4289 for_each_active_drhd_unit(drhd) {
4290 if (drhd->include_all)
4291 continue;
4292
4293 for_each_active_dev_scope(drhd->devices,
4294 drhd->devices_cnt, i, dev)
4295 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4296 break;
4297 if (i < drhd->devices_cnt)
4298 continue;
4299
4300 /* This IOMMU has *only* gfx devices. Either bypass it or
4301 set the gfx_mapped flag, as appropriate */
4302 if (!dmar_map_gfx) {
4303 drhd->ignored = 1;
4304 for_each_active_dev_scope(drhd->devices,
4305 drhd->devices_cnt, i, dev)
4306 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4307 }
4308 }
4309 }
4310
4311 #ifdef CONFIG_SUSPEND
4312 static int init_iommu_hw(void)
4313 {
4314 struct dmar_drhd_unit *drhd;
4315 struct intel_iommu *iommu = NULL;
4316
4317 for_each_active_iommu(iommu, drhd)
4318 if (iommu->qi)
4319 dmar_reenable_qi(iommu);
4320
4321 for_each_iommu(iommu, drhd) {
4322 if (drhd->ignored) {
4323 /*
4324 * we always have to disable PMRs or DMA may fail on
4325 * this device
4326 */
4327 if (force_on)
4328 iommu_disable_protect_mem_regions(iommu);
4329 continue;
4330 }
4331
4332 iommu_flush_write_buffer(iommu);
4333
4334 iommu_set_root_entry(iommu);
4335
4336 iommu->flush.flush_context(iommu, 0, 0, 0,
4337 DMA_CCMD_GLOBAL_INVL);
4338 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4339 iommu_enable_translation(iommu);
4340 iommu_disable_protect_mem_regions(iommu);
4341 }
4342
4343 return 0;
4344 }
4345
4346 static void iommu_flush_all(void)
4347 {
4348 struct dmar_drhd_unit *drhd;
4349 struct intel_iommu *iommu;
4350
4351 for_each_active_iommu(iommu, drhd) {
4352 iommu->flush.flush_context(iommu, 0, 0, 0,
4353 DMA_CCMD_GLOBAL_INVL);
4354 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4355 DMA_TLB_GLOBAL_FLUSH);
4356 }
4357 }
4358
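/*
 * Suspend: flush all caches, disable translation and save the fault-event
 * registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active IOMMU;
 * iommu_resume() below re-initializes the hardware and restores them.
 */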
4359 static int iommu_suspend(void)
4360 {
4361 struct dmar_drhd_unit *drhd;
4362 struct intel_iommu *iommu = NULL;
4363 unsigned long flag;
4364
4365 for_each_active_iommu(iommu, drhd) {
4366 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4367 GFP_ATOMIC);
4368 if (!iommu->iommu_state)
4369 goto nomem;
4370 }
4371
4372 iommu_flush_all();
4373
4374 for_each_active_iommu(iommu, drhd) {
4375 iommu_disable_translation(iommu);
4376
4377 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4378
4379 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4380 readl(iommu->reg + DMAR_FECTL_REG);
4381 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4382 readl(iommu->reg + DMAR_FEDATA_REG);
4383 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4384 readl(iommu->reg + DMAR_FEADDR_REG);
4385 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4386 readl(iommu->reg + DMAR_FEUADDR_REG);
4387
4388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4389 }
4390 return 0;
4391
4392 nomem:
4393 for_each_active_iommu(iommu, drhd)
4394 kfree(iommu->iommu_state);
4395
4396 return -ENOMEM;
4397 }
4398
4399 static void iommu_resume(void)
4400 {
4401 struct dmar_drhd_unit *drhd;
4402 struct intel_iommu *iommu = NULL;
4403 unsigned long flag;
4404
4405 if (init_iommu_hw()) {
4406 if (force_on)
4407 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4408 else
4409 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4410 return;
4411 }
4412
4413 for_each_active_iommu(iommu, drhd) {
4414
4415 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4416
4417 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4418 iommu->reg + DMAR_FECTL_REG);
4419 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4420 iommu->reg + DMAR_FEDATA_REG);
4421 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4422 iommu->reg + DMAR_FEADDR_REG);
4423 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4424 iommu->reg + DMAR_FEUADDR_REG);
4425
4426 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4427 }
4428
4429 for_each_active_iommu(iommu, drhd)
4430 kfree(iommu->iommu_state);
4431 }
4432
4433 static struct syscore_ops iommu_syscore_ops = {
4434 .resume = iommu_resume,
4435 .suspend = iommu_suspend,
4436 };
4437
4438 static void __init init_iommu_pm_ops(void)
4439 {
4440 register_syscore_ops(&iommu_syscore_ops);
4441 }
4442
4443 #else
4444 static inline void init_iommu_pm_ops(void) {}
4445 #endif /* CONFIG_SUSPEND */
4446
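/*
 * RMRR regions reported by firmware must be page aligned and non-empty.
 * A malformed entry is flagged as a firmware bug and taints the kernel,
 * but is still recorded so the affected devices keep working.
 */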
4447 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4448 {
4449 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4450 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4451 rmrr->end_address <= rmrr->base_address ||
4452 arch_rmrr_sanity_check(rmrr))
4453 return -EINVAL;
4454
4455 return 0;
4456 }
4457
4458 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4459 {
4460 struct acpi_dmar_reserved_memory *rmrr;
4461 struct dmar_rmrr_unit *rmrru;
4462
4463 rmrr = (struct acpi_dmar_reserved_memory *)header;
4464 if (rmrr_sanity_check(rmrr)) {
4465 pr_warn(FW_BUG
4466 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4467 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4468 rmrr->base_address, rmrr->end_address,
4469 dmi_get_system_info(DMI_BIOS_VENDOR),
4470 dmi_get_system_info(DMI_BIOS_VERSION),
4471 dmi_get_system_info(DMI_PRODUCT_VERSION));
4472 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4473 }
4474
4475 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4476 if (!rmrru)
4477 goto out;
4478
4479 rmrru->hdr = header;
4480
4481 rmrru->base_address = rmrr->base_address;
4482 rmrru->end_address = rmrr->end_address;
4483
4484 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4485 ((void *)rmrr) + rmrr->header.length,
4486 &rmrru->devices_cnt);
4487 if (rmrru->devices_cnt && rmrru->devices == NULL)
4488 goto free_rmrru;
4489
4490 list_add(&rmrru->list, &dmar_rmrr_units);
4491
4492 return 0;
4493 free_rmrru:
4494 kfree(rmrru);
4495 out:
4496 return -ENOMEM;
4497 }
4498
4499 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4500 {
4501 struct dmar_atsr_unit *atsru;
4502 struct acpi_dmar_atsr *tmp;
4503
4504 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4505 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4506 if (atsr->segment != tmp->segment)
4507 continue;
4508 if (atsr->header.length != tmp->header.length)
4509 continue;
4510 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4511 return atsru;
4512 }
4513
4514 return NULL;
4515 }
4516
4517 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4518 {
4519 struct acpi_dmar_atsr *atsr;
4520 struct dmar_atsr_unit *atsru;
4521
4522 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4523 return 0;
4524
4525 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4526 atsru = dmar_find_atsr(atsr);
4527 if (atsru)
4528 return 0;
4529
4530 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4531 if (!atsru)
4532 return -ENOMEM;
4533
4534 /*
4535 * If the memory was allocated from slab by the ACPI _DSM method, we
4536 * need to copy its contents because the buffer will be freed on
4537 * return.
4538 */
4539 atsru->hdr = (void *)(atsru + 1);
4540 memcpy(atsru->hdr, hdr, hdr->length);
4541 atsru->include_all = atsr->flags & 0x1;
4542 if (!atsru->include_all) {
4543 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4544 (void *)atsr + atsr->header.length,
4545 &atsru->devices_cnt);
4546 if (atsru->devices_cnt && atsru->devices == NULL) {
4547 kfree(atsru);
4548 return -ENOMEM;
4549 }
4550 }
4551
4552 list_add_rcu(&atsru->list, &dmar_atsr_units);
4553
4554 return 0;
4555 }
4556
4557 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4558 {
4559 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4560 kfree(atsru);
4561 }
4562
4563 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4564 {
4565 struct acpi_dmar_atsr *atsr;
4566 struct dmar_atsr_unit *atsru;
4567
4568 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4569 atsru = dmar_find_atsr(atsr);
4570 if (atsru) {
4571 list_del_rcu(&atsru->list);
4572 synchronize_rcu();
4573 intel_iommu_free_atsr(atsru);
4574 }
4575
4576 return 0;
4577 }
4578
4579 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4580 {
4581 int i;
4582 struct device *dev;
4583 struct acpi_dmar_atsr *atsr;
4584 struct dmar_atsr_unit *atsru;
4585
4586 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4587 atsru = dmar_find_atsr(atsr);
4588 if (!atsru)
4589 return 0;
4590
4591 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4592 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4593 i, dev)
4594 return -EBUSY;
4595 }
4596
4597 return 0;
4598 }
4599
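/*
 * Bring a hot-added DMAR unit online. The new IOMMU must support the
 * features already committed globally (pass-through, snooping, super
 * pages) or it is rejected with -ENXIO; otherwise it gets domains, a root
 * entry, queued invalidation, an interrupt, and translation is enabled.
 */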
4600 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4601 {
4602 int sp, ret;
4603 struct intel_iommu *iommu = dmaru->iommu;
4604
4605 if (g_iommus[iommu->seq_id])
4606 return 0;
4607
4608 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4609 pr_warn("%s: Doesn't support hardware pass through.\n",
4610 iommu->name);
4611 return -ENXIO;
4612 }
4613 if (!ecap_sc_support(iommu->ecap) &&
4614 domain_update_iommu_snooping(iommu)) {
4615 pr_warn("%s: Doesn't support snooping.\n",
4616 iommu->name);
4617 return -ENXIO;
4618 }
4619 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4620 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4621 pr_warn("%s: Doesn't support large page.\n",
4622 iommu->name);
4623 return -ENXIO;
4624 }
4625
4626 /*
4627 * Disable translation if already enabled prior to OS handover.
4628 */
4629 if (iommu->gcmd & DMA_GCMD_TE)
4630 iommu_disable_translation(iommu);
4631
4632 g_iommus[iommu->seq_id] = iommu;
4633 ret = iommu_init_domains(iommu);
4634 if (ret == 0)
4635 ret = iommu_alloc_root_entry(iommu);
4636 if (ret)
4637 goto out;
4638
4639 intel_svm_check(iommu);
4640
4641 if (dmaru->ignored) {
4642 /*
4643 * we always have to disable PMRs or DMA may fail on this device
4644 */
4645 if (force_on)
4646 iommu_disable_protect_mem_regions(iommu);
4647 return 0;
4648 }
4649
4650 intel_iommu_init_qi(iommu);
4651 iommu_flush_write_buffer(iommu);
4652
4653 #ifdef CONFIG_INTEL_IOMMU_SVM
4654 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4655 ret = intel_svm_enable_prq(iommu);
4656 if (ret)
4657 goto disable_iommu;
4658 }
4659 #endif
4660 ret = dmar_set_interrupt(iommu);
4661 if (ret)
4662 goto disable_iommu;
4663
4664 iommu_set_root_entry(iommu);
4665 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4666 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4667 iommu_enable_translation(iommu);
4668
4669 iommu_disable_protect_mem_regions(iommu);
4670 return 0;
4671
4672 disable_iommu:
4673 disable_dmar_iommu(iommu);
4674 out:
4675 free_dmar_iommu(iommu);
4676 return ret;
4677 }
4678
4679 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4680 {
4681 int ret = 0;
4682 struct intel_iommu *iommu = dmaru->iommu;
4683
4684 if (!intel_iommu_enabled)
4685 return 0;
4686 if (iommu == NULL)
4687 return -EINVAL;
4688
4689 if (insert) {
4690 ret = intel_iommu_add(dmaru);
4691 } else {
4692 disable_dmar_iommu(iommu);
4693 free_dmar_iommu(iommu);
4694 }
4695
4696 return ret;
4697 }
4698
4699 static void intel_iommu_free_dmars(void)
4700 {
4701 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4702 struct dmar_atsr_unit *atsru, *atsr_n;
4703
4704 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4705 list_del(&rmrru->list);
4706 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4707 kfree(rmrru);
4708 }
4709
4710 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4711 list_del(&atsru->list);
4712 intel_iommu_free_atsr(atsru);
4713 }
4714 }
4715
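/*
 * Decide whether ATS may be used for @dev: walk up to the PCIe root port.
 * Root-bus (integrated) devices always allow ATS, paths through non-PCIe
 * bridges never do, and anything else requires the root port to be listed
 * in an ATSR for the device's segment (or an include_all ATSR).
 */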
4716 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4717 {
4718 int i, ret = 1;
4719 struct pci_bus *bus;
4720 struct pci_dev *bridge = NULL;
4721 struct device *tmp;
4722 struct acpi_dmar_atsr *atsr;
4723 struct dmar_atsr_unit *atsru;
4724
4725 dev = pci_physfn(dev);
4726 for (bus = dev->bus; bus; bus = bus->parent) {
4727 bridge = bus->self;
4728 /* If it's an integrated device, allow ATS */
4729 if (!bridge)
4730 return 1;
4731 /* Connected via non-PCIe: no ATS */
4732 if (!pci_is_pcie(bridge) ||
4733 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4734 return 0;
4735 /* If we found the root port, look it up in the ATSR */
4736 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4737 break;
4738 }
4739
4740 rcu_read_lock();
4741 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4742 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4743 if (atsr->segment != pci_domain_nr(dev->bus))
4744 continue;
4745
4746 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4747 if (tmp == &bridge->dev)
4748 goto out;
4749
4750 if (atsru->include_all)
4751 goto out;
4752 }
4753 ret = 0;
4754 out:
4755 rcu_read_unlock();
4756
4757 return ret;
4758 }
4759
4760 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4761 {
4762 int ret;
4763 struct dmar_rmrr_unit *rmrru;
4764 struct dmar_atsr_unit *atsru;
4765 struct acpi_dmar_atsr *atsr;
4766 struct acpi_dmar_reserved_memory *rmrr;
4767
4768 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4769 return 0;
4770
4771 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4772 rmrr = container_of(rmrru->hdr,
4773 struct acpi_dmar_reserved_memory, header);
4774 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4775 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4776 ((void *)rmrr) + rmrr->header.length,
4777 rmrr->segment, rmrru->devices,
4778 rmrru->devices_cnt);
4779 if (ret < 0)
4780 return ret;
4781 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4782 dmar_remove_dev_scope(info, rmrr->segment,
4783 rmrru->devices, rmrru->devices_cnt);
4784 }
4785 }
4786
4787 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4788 if (atsru->include_all)
4789 continue;
4790
4791 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4792 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4793 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4794 (void *)atsr + atsr->header.length,
4795 atsr->segment, atsru->devices,
4796 atsru->devices_cnt);
4797 if (ret > 0)
4798 break;
4799 else if (ret < 0)
4800 return ret;
4801 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4802 if (dmar_remove_dev_scope(info, atsr->segment,
4803 atsru->devices, atsru->devices_cnt))
4804 break;
4805 }
4806 }
4807
4808 return 0;
4809 }
4810
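/*
 * Keep the static identity (si_domain) map in sync with memory hotplug:
 * ranges going online are identity mapped, and offlined ranges have their
 * IOVAs, page tables and IOTLB entries torn down.
 */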
4811 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4812 unsigned long val, void *v)
4813 {
4814 struct memory_notify *mhp = v;
4815 unsigned long long start, end;
4816 unsigned long start_vpfn, last_vpfn;
4817
4818 switch (val) {
4819 case MEM_GOING_ONLINE:
4820 start = mhp->start_pfn << PAGE_SHIFT;
4821 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4822 if (iommu_domain_identity_map(si_domain, start, end)) {
4823 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4824 start, end);
4825 return NOTIFY_BAD;
4826 }
4827 break;
4828
4829 case MEM_OFFLINE:
4830 case MEM_CANCEL_ONLINE:
4831 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4832 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4833 while (start_vpfn <= last_vpfn) {
4834 struct iova *iova;
4835 struct dmar_drhd_unit *drhd;
4836 struct intel_iommu *iommu;
4837 struct page *freelist;
4838
4839 iova = find_iova(&si_domain->iovad, start_vpfn);
4840 if (iova == NULL) {
4841 pr_debug("Failed get IOVA for PFN %lx\n",
4842 start_vpfn);
4843 break;
4844 }
4845
4846 iova = split_and_remove_iova(&si_domain->iovad, iova,
4847 start_vpfn, last_vpfn);
4848 if (iova == NULL) {
4849 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4850 start_vpfn, last_vpfn);
4851 return NOTIFY_BAD;
4852 }
4853
4854 freelist = domain_unmap(si_domain, iova->pfn_lo,
4855 iova->pfn_hi);
4856
4857 rcu_read_lock();
4858 for_each_active_iommu(iommu, drhd)
4859 iommu_flush_iotlb_psi(iommu, si_domain,
4860 iova->pfn_lo, iova_size(iova),
4861 !freelist, 0);
4862 rcu_read_unlock();
4863 dma_free_pagelist(freelist);
4864
4865 start_vpfn = iova->pfn_hi + 1;
4866 free_iova_mem(iova);
4867 }
4868 break;
4869 }
4870
4871 return NOTIFY_OK;
4872 }
4873
4874 static struct notifier_block intel_iommu_memory_nb = {
4875 .notifier_call = intel_iommu_memory_notifier,
4876 .priority = 0
4877 };
4878
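/*
 * CPU hotplug teardown: when a CPU dies, return its per-CPU cached IOVAs
 * for every domain to the global IOVA pool. Registered via
 * cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, ...) in intel_iommu_init().
 */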
4879 static void free_all_cpu_cached_iovas(unsigned int cpu)
4880 {
4881 int i;
4882
4883 for (i = 0; i < g_num_of_iommus; i++) {
4884 struct intel_iommu *iommu = g_iommus[i];
4885 struct dmar_domain *domain;
4886 int did;
4887
4888 if (!iommu)
4889 continue;
4890
4891 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4892 domain = get_iommu_domain(iommu, (u16)did);
4893
4894 if (!domain)
4895 continue;
4896 free_cpu_cached_iovas(cpu, &domain->iovad);
4897 }
4898 }
4899 }
4900
4901 static int intel_iommu_cpu_dead(unsigned int cpu)
4902 {
4903 free_all_cpu_cached_iovas(cpu);
4904 return 0;
4905 }
4906
4907 static void intel_disable_iommus(void)
4908 {
4909 struct intel_iommu *iommu = NULL;
4910 struct dmar_drhd_unit *drhd;
4911
4912 for_each_iommu(iommu, drhd)
4913 iommu_disable_translation(iommu);
4914 }
4915
4916 void intel_iommu_shutdown(void)
4917 {
4918 struct dmar_drhd_unit *drhd;
4919 struct intel_iommu *iommu = NULL;
4920
4921 if (no_iommu || dmar_disabled)
4922 return;
4923
4924 down_write(&dmar_global_lock);
4925
4926 /* Disable PMRs explicitly here. */
4927 for_each_iommu(iommu, drhd)
4928 iommu_disable_protect_mem_regions(iommu);
4929
4930 /* Make sure the IOMMUs are switched off */
4931 intel_disable_iommus();
4932
4933 up_write(&dmar_global_lock);
4934 }
4935
4936 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4937 {
4938 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4939
4940 return container_of(iommu_dev, struct intel_iommu, iommu);
4941 }
4942
4943 static ssize_t intel_iommu_show_version(struct device *dev,
4944 struct device_attribute *attr,
4945 char *buf)
4946 {
4947 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4948 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4949 return sprintf(buf, "%d:%d\n",
4950 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4951 }
4952 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4953
4954 static ssize_t intel_iommu_show_address(struct device *dev,
4955 struct device_attribute *attr,
4956 char *buf)
4957 {
4958 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4959 return sprintf(buf, "%llx\n", iommu->reg_phys);
4960 }
4961 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4962
4963 static ssize_t intel_iommu_show_cap(struct device *dev,
4964 struct device_attribute *attr,
4965 char *buf)
4966 {
4967 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4968 return sprintf(buf, "%llx\n", iommu->cap);
4969 }
4970 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4971
4972 static ssize_t intel_iommu_show_ecap(struct device *dev,
4973 struct device_attribute *attr,
4974 char *buf)
4975 {
4976 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4977 return sprintf(buf, "%llx\n", iommu->ecap);
4978 }
4979 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4980
4981 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4982 struct device_attribute *attr,
4983 char *buf)
4984 {
4985 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4986 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4987 }
4988 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4989
4990 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4991 struct device_attribute *attr,
4992 char *buf)
4993 {
4994 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4995 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4996 cap_ndoms(iommu->cap)));
4997 }
4998 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4999
5000 static struct attribute *intel_iommu_attrs[] = {
5001 &dev_attr_version.attr,
5002 &dev_attr_address.attr,
5003 &dev_attr_cap.attr,
5004 &dev_attr_ecap.attr,
5005 &dev_attr_domains_supported.attr,
5006 &dev_attr_domains_used.attr,
5007 NULL,
5008 };
5009
5010 static struct attribute_group intel_iommu_group = {
5011 .name = "intel-iommu",
5012 .attrs = intel_iommu_attrs,
5013 };
5014
5015 const struct attribute_group *intel_iommu_groups[] = {
5016 &intel_iommu_group,
5017 NULL,
5018 };
5019
5020 static inline bool has_untrusted_dev(void)
5021 {
5022 struct pci_dev *pdev = NULL;
5023
5024 for_each_pci_dev(pdev)
5025 if (pdev->untrusted)
5026 return true;
5027
5028 return false;
5029 }
5030
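/*
 * Honour the DMAR platform opt-in flag: if the firmware requests DMA
 * protection and an untrusted device (typically one behind an external
 * facing port) is present, force-enable the IOMMU even if it was disabled
 * on the command line. If remapping had been disabled, the default domain
 * type is switched to passthrough so that only untrusted devices end up
 * translated.
 */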
5031 static int __init platform_optin_force_iommu(void)
5032 {
5033 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5034 return 0;
5035
5036 if (no_iommu || dmar_disabled)
5037 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5038
5039 /*
5040 * If Intel-IOMMU is disabled by default, we will apply identity
5041 * map for all devices except those marked as being untrusted.
5042 */
5043 if (dmar_disabled)
5044 iommu_set_default_passthrough(false);
5045
5046 dmar_disabled = 0;
5047 no_iommu = 0;
5048
5049 return 1;
5050 }
5051
5052 static int __init probe_acpi_namespace_devices(void)
5053 {
5054 struct dmar_drhd_unit *drhd;
5055 /* To avoid a -Wunused-but-set-variable warning. */
5056 struct intel_iommu *iommu __maybe_unused;
5057 struct device *dev;
5058 int i, ret = 0;
5059
5060 for_each_active_iommu(iommu, drhd) {
5061 for_each_active_dev_scope(drhd->devices,
5062 drhd->devices_cnt, i, dev) {
5063 struct acpi_device_physical_node *pn;
5064 struct iommu_group *group;
5065 struct acpi_device *adev;
5066
5067 if (dev->bus != &acpi_bus_type)
5068 continue;
5069
5070 adev = to_acpi_device(dev);
5071 mutex_lock(&adev->physical_node_lock);
5072 list_for_each_entry(pn,
5073 &adev->physical_node_list, node) {
5074 group = iommu_group_get(pn->dev);
5075 if (group) {
5076 iommu_group_put(group);
5077 continue;
5078 }
5079
5080 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5081 ret = iommu_probe_device(pn->dev);
5082 if (ret)
5083 break;
5084 }
5085 mutex_unlock(&adev->physical_node_lock);
5086
5087 if (ret)
5088 return ret;
5089 }
5090 }
5091
5092 return 0;
5093 }
5094
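/*
 * Main initialization entry point: parse the DMAR table and device scopes,
 * bail out early (after switching off any translation a previous kernel or
 * firmware left enabled) when remapping is disabled, otherwise initialize
 * all DMAR units, install intel_dma_ops, register sysfs entries and the
 * memory/CPU hotplug notifiers, and finally enable translation on every
 * unit that was not already pre-enabled.
 */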
5095 int __init intel_iommu_init(void)
5096 {
5097 int ret = -ENODEV;
5098 struct dmar_drhd_unit *drhd;
5099 struct intel_iommu *iommu;
5100
5101 /*
5102 * Intel IOMMU is required for a TXT/tboot launch or platform
5103 * opt in, so enforce that.
5104 */
5105 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5106
5107 if (iommu_init_mempool()) {
5108 if (force_on)
5109 panic("tboot: Failed to initialize iommu memory\n");
5110 return -ENOMEM;
5111 }
5112
5113 down_write(&dmar_global_lock);
5114 if (dmar_table_init()) {
5115 if (force_on)
5116 panic("tboot: Failed to initialize DMAR table\n");
5117 goto out_free_dmar;
5118 }
5119
5120 if (dmar_dev_scope_init() < 0) {
5121 if (force_on)
5122 panic("tboot: Failed to initialize DMAR device scope\n");
5123 goto out_free_dmar;
5124 }
5125
5126 up_write(&dmar_global_lock);
5127
5128 /*
5129 * The bus notifier takes the dmar_global_lock, so lockdep will
5130 * complain later when we register it under the lock.
5131 */
5132 dmar_register_bus_notifier();
5133
5134 down_write(&dmar_global_lock);
5135
5136 if (!no_iommu)
5137 intel_iommu_debugfs_init();
5138
5139 if (no_iommu || dmar_disabled) {
5140 /*
5141 * We exit the function here to ensure IOMMU's remapping and
5142 * mempool aren't set up, which means that the IOMMU's PMRs
5143 * won't be disabled via the call to init_dmars(). So disable
5144 * it explicitly here. The PMRs were set up by tboot prior to
5145 * calling SENTER, but the kernel is expected to reset/tear
5146 * down the PMRs.
5147 */
5148 if (intel_iommu_tboot_noforce) {
5149 for_each_iommu(iommu, drhd)
5150 iommu_disable_protect_mem_regions(iommu);
5151 }
5152
5153 /*
5154 * Make sure the IOMMUs are switched off, even when we
5155 * boot into a kexec kernel and the previous kernel left
5156 * them enabled
5157 */
5158 intel_disable_iommus();
5159 goto out_free_dmar;
5160 }
5161
5162 if (list_empty(&dmar_rmrr_units))
5163 pr_info("No RMRR found\n");
5164
5165 if (list_empty(&dmar_atsr_units))
5166 pr_info("No ATSR found\n");
5167
5168 if (dmar_init_reserved_ranges()) {
5169 if (force_on)
5170 panic("tboot: Failed to reserve iommu ranges\n");
5171 goto out_free_reserved_range;
5172 }
5173
5174 if (dmar_map_gfx)
5175 intel_iommu_gfx_mapped = 1;
5176
5177 init_no_remapping_devices();
5178
5179 ret = init_dmars();
5180 if (ret) {
5181 if (force_on)
5182 panic("tboot: Failed to initialize DMARs\n");
5183 pr_err("Initialization failed\n");
5184 goto out_free_reserved_range;
5185 }
5186 up_write(&dmar_global_lock);
5187
5188 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5189 /*
5190 * If the system has no untrusted device or the user has decided
5191 * to disable the bounce page mechanisms, we don't need swiotlb.
5192 * Mark this and the pre-allocated bounce pages will be released
5193 * later.
5194 */
5195 if (!has_untrusted_dev() || intel_no_bounce)
5196 swiotlb = 0;
5197 #endif
5198 dma_ops = &intel_dma_ops;
5199
5200 init_iommu_pm_ops();
5201
5202 down_read(&dmar_global_lock);
5203 for_each_active_iommu(iommu, drhd) {
5204 iommu_device_sysfs_add(&iommu->iommu, NULL,
5205 intel_iommu_groups,
5206 "%s", iommu->name);
5207 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5208 iommu_device_register(&iommu->iommu);
5209 }
5210 up_read(&dmar_global_lock);
5211
5212 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5213 if (si_domain && !hw_pass_through)
5214 register_memory_notifier(&intel_iommu_memory_nb);
5215 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5216 intel_iommu_cpu_dead);
5217
5218 down_read(&dmar_global_lock);
5219 if (probe_acpi_namespace_devices())
5220 pr_warn("ACPI name space devices didn't probe correctly\n");
5221
5222 /* Finally, we enable the DMA remapping hardware. */
5223 for_each_iommu(iommu, drhd) {
5224 if (!drhd->ignored && !translation_pre_enabled(iommu))
5225 iommu_enable_translation(iommu);
5226
5227 iommu_disable_protect_mem_regions(iommu);
5228 }
5229 up_read(&dmar_global_lock);
5230
5231 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5232
5233 intel_iommu_enabled = 1;
5234
5235 return 0;
5236
5237 out_free_reserved_range:
5238 put_iova_domain(&reserved_iova_list);
5239 out_free_dmar:
5240 intel_iommu_free_dmars();
5241 up_write(&dmar_global_lock);
5242 iommu_exit_mempool();
5243 return ret;
5244 }
5245
5246 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5247 {
5248 struct intel_iommu *iommu = opaque;
5249
5250 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5251 return 0;
5252 }
5253
5254 /*
5255 * NB - intel-iommu lacks any sort of reference counting for the users of
5256 * dependent devices. If multiple endpoints have intersecting dependent
5257 * devices, unbinding the driver from any one of them will possibly leave
5258 * the others unable to operate.
5259 */
5260 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5261 {
5262 if (!iommu || !dev || !dev_is_pci(dev))
5263 return;
5264
5265 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5266 }
5267
5268 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5269 {
5270 struct dmar_domain *domain;
5271 struct intel_iommu *iommu;
5272 unsigned long flags;
5273
5274 assert_spin_locked(&device_domain_lock);
5275
5276 if (WARN_ON(!info))
5277 return;
5278
5279 iommu = info->iommu;
5280 domain = info->domain;
5281
5282 if (info->dev) {
5283 if (dev_is_pci(info->dev) && sm_supported(iommu))
5284 intel_pasid_tear_down_entry(iommu, info->dev,
5285 PASID_RID2PASID);
5286
5287 iommu_disable_dev_iotlb(info);
5288 domain_context_clear(iommu, info->dev);
5289 intel_pasid_free_table(info->dev);
5290 }
5291
5292 unlink_domain_info(info);
5293
5294 spin_lock_irqsave(&iommu->lock, flags);
5295 domain_detach_iommu(domain, iommu);
5296 spin_unlock_irqrestore(&iommu->lock, flags);
5297
5298 /* free the private domain */
5299 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5300 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5301 list_empty(&domain->devices))
5302 domain_exit(info->domain);
5303
5304 free_devinfo_mem(info);
5305 }
5306
5307 static void dmar_remove_one_dev_info(struct device *dev)
5308 {
5309 struct device_domain_info *info;
5310 unsigned long flags;
5311
5312 spin_lock_irqsave(&device_domain_lock, flags);
5313 info = dev->archdata.iommu;
5314 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5315 && info != DUMMY_DEVICE_DOMAIN_INFO)
5316 __dmar_remove_one_dev_info(info);
5317 spin_unlock_irqrestore(&device_domain_lock, flags);
5318 }
5319
5320 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5321 {
5322 int adjust_width;
5323
5324 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5325 domain_reserve_special_ranges(domain);
5326
5327 /* calculate AGAW */
5328 domain->gaw = guest_width;
5329 adjust_width = guestwidth_to_adjustwidth(guest_width);
5330 domain->agaw = width_to_agaw(adjust_width);
5331
5332 domain->iommu_coherency = 0;
5333 domain->iommu_snooping = 0;
5334 domain->iommu_superpage = 0;
5335 domain->max_addr = 0;
5336
5337 /* always allocate the top pgd */
5338 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5339 if (!domain->pgd)
5340 return -ENOMEM;
5341 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5342 return 0;
5343 }
5344
5345 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5346 {
5347 struct dmar_domain *dmar_domain;
5348 struct iommu_domain *domain;
5349 int ret;
5350
5351 switch (type) {
5352 case IOMMU_DOMAIN_DMA:
5353 /* fallthrough */
5354 case IOMMU_DOMAIN_UNMANAGED:
5355 dmar_domain = alloc_domain(0);
5356 if (!dmar_domain) {
5357 pr_err("Can't allocate dmar_domain\n");
5358 return NULL;
5359 }
5360 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5361 pr_err("Domain initialization failed\n");
5362 domain_exit(dmar_domain);
5363 return NULL;
5364 }
5365
5366 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5367 ret = init_iova_flush_queue(&dmar_domain->iovad,
5368 iommu_flush_iova,
5369 iova_entry_free);
5370 if (ret)
5371 pr_info("iova flush queue initialization failed\n");
5372 }
5373
5374 domain_update_iommu_cap(dmar_domain);
5375
5376 domain = &dmar_domain->domain;
5377 domain->geometry.aperture_start = 0;
5378 domain->geometry.aperture_end =
5379 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5380 domain->geometry.force_aperture = true;
5381
5382 return domain;
5383 case IOMMU_DOMAIN_IDENTITY:
5384 return &si_domain->domain;
5385 default:
5386 return NULL;
5387 }
5388
5389 return NULL;
5390 }
5391
5392 static void intel_iommu_domain_free(struct iommu_domain *domain)
5393 {
5394 if (domain != &si_domain->domain)
5395 domain_exit(to_dmar_domain(domain));
5396 }
5397
5398 /*
5399 * Check whether a @domain could be attached to the @dev through the
5400 * aux-domain attach/detach APIs.
5401 */
5402 static inline bool
5403 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5404 {
5405 struct device_domain_info *info = dev->archdata.iommu;
5406
5407 return info && info->auxd_enabled &&
5408 domain->type == IOMMU_DOMAIN_UNMANAGED;
5409 }
5410
5411 static void auxiliary_link_device(struct dmar_domain *domain,
5412 struct device *dev)
5413 {
5414 struct device_domain_info *info = dev->archdata.iommu;
5415
5416 assert_spin_locked(&device_domain_lock);
5417 if (WARN_ON(!info))
5418 return;
5419
5420 domain->auxd_refcnt++;
5421 list_add(&domain->auxd, &info->auxiliary_domains);
5422 }
5423
5424 static void auxiliary_unlink_device(struct dmar_domain *domain,
5425 struct device *dev)
5426 {
5427 struct device_domain_info *info = dev->archdata.iommu;
5428
5429 assert_spin_locked(&device_domain_lock);
5430 if (WARN_ON(!info))
5431 return;
5432
5433 list_del(&domain->auxd);
5434 domain->auxd_refcnt--;
5435
5436 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5437 ioasid_free(domain->default_pasid);
5438 }
5439
5440 static int aux_domain_add_dev(struct dmar_domain *domain,
5441 struct device *dev)
5442 {
5443 int ret;
5444 u8 bus, devfn;
5445 unsigned long flags;
5446 struct intel_iommu *iommu;
5447
5448 iommu = device_to_iommu(dev, &bus, &devfn);
5449 if (!iommu)
5450 return -ENODEV;
5451
5452 if (domain->default_pasid <= 0) {
5453 int pasid;
5454
5455 /* No private data needed for the default pasid */
5456 pasid = ioasid_alloc(NULL, PASID_MIN,
5457 pci_max_pasids(to_pci_dev(dev)) - 1,
5458 NULL);
5459 if (pasid == INVALID_IOASID) {
5460 pr_err("Can't allocate default pasid\n");
5461 return -ENODEV;
5462 }
5463 domain->default_pasid = pasid;
5464 }
5465
5466 spin_lock_irqsave(&device_domain_lock, flags);
5467 /*
5468 * iommu->lock must be held to attach domain to iommu and setup the
5469 * pasid entry for second level translation.
5470 */
5471 spin_lock(&iommu->lock);
5472 ret = domain_attach_iommu(domain, iommu);
5473 if (ret)
5474 goto attach_failed;
5475
5476 /* Setup the PASID entry for mediated devices: */
5477 if (domain_use_first_level(domain))
5478 ret = domain_setup_first_level(iommu, domain, dev,
5479 domain->default_pasid);
5480 else
5481 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5482 domain->default_pasid);
5483 if (ret)
5484 goto table_failed;
5485 spin_unlock(&iommu->lock);
5486
5487 auxiliary_link_device(domain, dev);
5488
5489 spin_unlock_irqrestore(&device_domain_lock, flags);
5490
5491 return 0;
5492
5493 table_failed:
5494 domain_detach_iommu(domain, iommu);
5495 attach_failed:
5496 spin_unlock(&iommu->lock);
5497 spin_unlock_irqrestore(&device_domain_lock, flags);
5498 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5499 ioasid_free(domain->default_pasid);
5500
5501 return ret;
5502 }
5503
5504 static void aux_domain_remove_dev(struct dmar_domain *domain,
5505 struct device *dev)
5506 {
5507 struct device_domain_info *info;
5508 struct intel_iommu *iommu;
5509 unsigned long flags;
5510
5511 if (!is_aux_domain(dev, &domain->domain))
5512 return;
5513
5514 spin_lock_irqsave(&device_domain_lock, flags);
5515 info = dev->archdata.iommu;
5516 iommu = info->iommu;
5517
5518 auxiliary_unlink_device(domain, dev);
5519
5520 spin_lock(&iommu->lock);
5521 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5522 domain_detach_iommu(domain, iommu);
5523 spin_unlock(&iommu->lock);
5524
5525 spin_unlock_irqrestore(&device_domain_lock, flags);
5526 }
5527
5528 static int prepare_domain_attach_device(struct iommu_domain *domain,
5529 struct device *dev)
5530 {
5531 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5532 struct intel_iommu *iommu;
5533 int addr_width;
5534 u8 bus, devfn;
5535
5536 iommu = device_to_iommu(dev, &bus, &devfn);
5537 if (!iommu)
5538 return -ENODEV;
5539
5540 /* check if this iommu agaw is sufficient for max mapped address */
5541 addr_width = agaw_to_width(iommu->agaw);
5542 if (addr_width > cap_mgaw(iommu->cap))
5543 addr_width = cap_mgaw(iommu->cap);
5544
5545 if (dmar_domain->max_addr > (1LL << addr_width)) {
5546 dev_err(dev, "%s: iommu width (%d) is not "
5547 "sufficient for the mapped address (%llx)\n",
5548 __func__, addr_width, dmar_domain->max_addr);
5549 return -EFAULT;
5550 }
5551 dmar_domain->gaw = addr_width;
5552
5553 /*
5554 * Knock out extra levels of page tables if necessary
5555 */
5556 while (iommu->agaw < dmar_domain->agaw) {
5557 struct dma_pte *pte;
5558
5559 pte = dmar_domain->pgd;
5560 if (dma_pte_present(pte)) {
5561 dmar_domain->pgd = (struct dma_pte *)
5562 phys_to_virt(dma_pte_addr(pte));
5563 free_pgtable_page(pte);
5564 }
5565 dmar_domain->agaw--;
5566 }
5567
5568 return 0;
5569 }
5570
5571 static int intel_iommu_attach_device(struct iommu_domain *domain,
5572 struct device *dev)
5573 {
5574 int ret;
5575
5576 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5577 device_is_rmrr_locked(dev)) {
5578 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5579 return -EPERM;
5580 }
5581
5582 if (is_aux_domain(dev, domain))
5583 return -EPERM;
5584
5585 /* normally dev is not mapped */
5586 if (unlikely(domain_context_mapped(dev))) {
5587 struct dmar_domain *old_domain;
5588
5589 old_domain = find_domain(dev);
5590 if (old_domain)
5591 dmar_remove_one_dev_info(dev);
5592 }
5593
5594 ret = prepare_domain_attach_device(domain, dev);
5595 if (ret)
5596 return ret;
5597
5598 return domain_add_dev_info(to_dmar_domain(domain), dev);
5599 }
5600
5601 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5602 struct device *dev)
5603 {
5604 int ret;
5605
5606 if (!is_aux_domain(dev, domain))
5607 return -EPERM;
5608
5609 ret = prepare_domain_attach_device(domain, dev);
5610 if (ret)
5611 return ret;
5612
5613 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5614 }
5615
5616 static void intel_iommu_detach_device(struct iommu_domain *domain,
5617 struct device *dev)
5618 {
5619 dmar_remove_one_dev_info(dev);
5620 }
5621
5622 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5623 struct device *dev)
5624 {
5625 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5626 }
5627
5628 static int intel_iommu_map(struct iommu_domain *domain,
5629 unsigned long iova, phys_addr_t hpa,
5630 size_t size, int iommu_prot, gfp_t gfp)
5631 {
5632 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5633 u64 max_addr;
5634 int prot = 0;
5635 int ret;
5636
5637 if (iommu_prot & IOMMU_READ)
5638 prot |= DMA_PTE_READ;
5639 if (iommu_prot & IOMMU_WRITE)
5640 prot |= DMA_PTE_WRITE;
5641 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5642 prot |= DMA_PTE_SNP;
5643
5644 max_addr = iova + size;
5645 if (dmar_domain->max_addr < max_addr) {
5646 u64 end;
5647
5648 /* check if minimum agaw is sufficient for mapped address */
5649 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5650 if (end < max_addr) {
5651 pr_err("%s: iommu width (%d) is not "
5652 "sufficient for the mapped address (%llx)\n",
5653 __func__, dmar_domain->gaw, max_addr);
5654 return -EFAULT;
5655 }
5656 dmar_domain->max_addr = max_addr;
5657 }
5658 /* Round up size to next multiple of PAGE_SIZE, if it and
5659 the low bits of hpa would take us onto the next page */
5660 size = aligned_nrpages(hpa, size);
5661 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5662 hpa >> VTD_PAGE_SHIFT, size, prot);
5663 return ret;
5664 }
5665
5666 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5667 unsigned long iova, size_t size,
5668 struct iommu_iotlb_gather *gather)
5669 {
5670 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5671 struct page *freelist = NULL;
5672 unsigned long start_pfn, last_pfn;
5673 unsigned int npages;
5674 int iommu_id, level = 0;
5675
5676 /* Cope with horrid API which requires us to unmap more than the
5677 size argument if it happens to be a large-page mapping. */
5678 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5679
5680 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5681 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5682
5683 start_pfn = iova >> VTD_PAGE_SHIFT;
5684 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5685
5686 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5687
5688 npages = last_pfn - start_pfn + 1;
5689
5690 for_each_domain_iommu(iommu_id, dmar_domain)
5691 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5692 start_pfn, npages, !freelist, 0);
5693
5694 dma_free_pagelist(freelist);
5695
5696 if (dmar_domain->max_addr == iova + size)
5697 dmar_domain->max_addr = iova;
5698
5699 return size;
5700 }
5701
5702 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5703 dma_addr_t iova)
5704 {
5705 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5706 struct dma_pte *pte;
5707 int level = 0;
5708 u64 phys = 0;
5709
5710 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5711 if (pte && dma_pte_present(pte))
5712 phys = dma_pte_addr(pte) +
5713 (iova & (BIT_MASK(level_to_offset_bits(level) +
5714 VTD_PAGE_SHIFT) - 1));
5715
5716 return phys;
5717 }
5718
5719 static inline bool scalable_mode_support(void)
5720 {
5721 struct dmar_drhd_unit *drhd;
5722 struct intel_iommu *iommu;
5723 bool ret = true;
5724
5725 rcu_read_lock();
5726 for_each_active_iommu(iommu, drhd) {
5727 if (!sm_supported(iommu)) {
5728 ret = false;
5729 break;
5730 }
5731 }
5732 rcu_read_unlock();
5733
5734 return ret;
5735 }
5736
5737 static inline bool iommu_pasid_support(void)
5738 {
5739 struct dmar_drhd_unit *drhd;
5740 struct intel_iommu *iommu;
5741 bool ret = true;
5742
5743 rcu_read_lock();
5744 for_each_active_iommu(iommu, drhd) {
5745 if (!pasid_supported(iommu)) {
5746 ret = false;
5747 break;
5748 }
5749 }
5750 rcu_read_unlock();
5751
5752 return ret;
5753 }
5754
5755 static inline bool nested_mode_support(void)
5756 {
5757 struct dmar_drhd_unit *drhd;
5758 struct intel_iommu *iommu;
5759 bool ret = true;
5760
5761 rcu_read_lock();
5762 for_each_active_iommu(iommu, drhd) {
5763 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5764 ret = false;
5765 break;
5766 }
5767 }
5768 rcu_read_unlock();
5769
5770 return ret;
5771 }
5772
5773 static bool intel_iommu_capable(enum iommu_cap cap)
5774 {
5775 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5776 return domain_update_iommu_snooping(NULL) == 1;
5777 if (cap == IOMMU_CAP_INTR_REMAP)
5778 return irq_remapping_enabled == 1;
5779
5780 return false;
5781 }
5782
5783 static int intel_iommu_add_device(struct device *dev)
5784 {
5785 struct dmar_domain *dmar_domain;
5786 struct iommu_domain *domain;
5787 struct intel_iommu *iommu;
5788 struct iommu_group *group;
5789 u8 bus, devfn;
5790 int ret;
5791
5792 iommu = device_to_iommu(dev, &bus, &devfn);
5793 if (!iommu)
5794 return -ENODEV;
5795
5796 iommu_device_link(&iommu->iommu, dev);
5797
5798 if (translation_pre_enabled(iommu))
5799 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5800
5801 group = iommu_group_get_for_dev(dev);
5802
5803 if (IS_ERR(group)) {
5804 ret = PTR_ERR(group);
5805 goto unlink;
5806 }
5807
5808 iommu_group_put(group);
5809
5810 domain = iommu_get_domain_for_dev(dev);
5811 dmar_domain = to_dmar_domain(domain);
5812 if (domain->type == IOMMU_DOMAIN_DMA) {
5813 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5814 ret = iommu_request_dm_for_dev(dev);
5815 if (ret) {
5816 dmar_remove_one_dev_info(dev);
5817 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5818 domain_add_dev_info(si_domain, dev);
5819 dev_info(dev,
5820 "Device uses a private identity domain.\n");
5821 }
5822 }
5823 } else {
5824 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5825 ret = iommu_request_dma_domain_for_dev(dev);
5826 if (ret) {
5827 dmar_remove_one_dev_info(dev);
5828 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5829 if (!get_private_domain_for_dev(dev)) {
5830 dev_warn(dev,
5831 "Failed to get a private domain.\n");
5832 ret = -ENOMEM;
5833 goto unlink;
5834 }
5835
5836 dev_info(dev,
5837 "Device uses a private dma domain.\n");
5838 }
5839 }
5840 }
5841
5842 if (device_needs_bounce(dev)) {
5843 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5844 set_dma_ops(dev, &bounce_dma_ops);
5845 }
5846
5847 return 0;
5848
5849 unlink:
5850 iommu_device_unlink(&iommu->iommu, dev);
5851 return ret;
5852 }
5853
5854 static void intel_iommu_remove_device(struct device *dev)
5855 {
5856 struct intel_iommu *iommu;
5857 u8 bus, devfn;
5858
5859 iommu = device_to_iommu(dev, &bus, &devfn);
5860 if (!iommu)
5861 return;
5862
5863 dmar_remove_one_dev_info(dev);
5864
5865 iommu_group_remove_device(dev);
5866
5867 iommu_device_unlink(&iommu->iommu, dev);
5868
5869 if (device_needs_bounce(dev))
5870 set_dma_ops(dev, NULL);
5871 }
5872
5873 static void intel_iommu_get_resv_regions(struct device *device,
5874 struct list_head *head)
5875 {
5876 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5877 struct iommu_resv_region *reg;
5878 struct dmar_rmrr_unit *rmrr;
5879 struct device *i_dev;
5880 int i;
5881
5882 down_read(&dmar_global_lock);
5883 for_each_rmrr_units(rmrr) {
5884 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5885 i, i_dev) {
5886 struct iommu_resv_region *resv;
5887 enum iommu_resv_type type;
5888 size_t length;
5889
5890 if (i_dev != device &&
5891 !is_downstream_to_pci_bridge(device, i_dev))
5892 continue;
5893
5894 length = rmrr->end_address - rmrr->base_address + 1;
5895
5896 type = device_rmrr_is_relaxable(device) ?
5897 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5898
5899 resv = iommu_alloc_resv_region(rmrr->base_address,
5900 length, prot, type);
5901 if (!resv)
5902 break;
5903
5904 list_add_tail(&resv->list, head);
5905 }
5906 }
5907 up_read(&dmar_global_lock);
5908
5909 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5910 if (dev_is_pci(device)) {
5911 struct pci_dev *pdev = to_pci_dev(device);
5912
5913 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5914 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5915 IOMMU_RESV_DIRECT_RELAXABLE);
5916 if (reg)
5917 list_add_tail(&reg->list, head);
5918 }
5919 }
5920 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5921
5922 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5923 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5924 0, IOMMU_RESV_MSI);
5925 if (!reg)
5926 return;
5927 list_add_tail(&reg->list, head);
5928 }
5929
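/*
 * Enable PASID support for @dev: set the PASID-enable bit in the device's
 * context entry (flushing the context cache when the bit changes) and
 * turn on the endpoint's PASID/device-IOTLB features if needed.
 */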
5930 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5931 {
5932 struct device_domain_info *info;
5933 struct context_entry *context;
5934 struct dmar_domain *domain;
5935 unsigned long flags;
5936 u64 ctx_lo;
5937 int ret;
5938
5939 domain = find_domain(dev);
5940 if (!domain)
5941 return -EINVAL;
5942
5943 spin_lock_irqsave(&device_domain_lock, flags);
5944 spin_lock(&iommu->lock);
5945
5946 ret = -EINVAL;
5947 info = dev->archdata.iommu;
5948 if (!info || !info->pasid_supported)
5949 goto out;
5950
5951 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5952 if (WARN_ON(!context))
5953 goto out;
5954
5955 ctx_lo = context[0].lo;
5956
5957 if (!(ctx_lo & CONTEXT_PASIDE)) {
5958 ctx_lo |= CONTEXT_PASIDE;
5959 context[0].lo = ctx_lo;
5960 wmb();
5961 iommu->flush.flush_context(iommu,
5962 domain->iommu_did[iommu->seq_id],
5963 PCI_DEVID(info->bus, info->devfn),
5964 DMA_CCMD_MASK_NOBIT,
5965 DMA_CCMD_DEVICE_INVL);
5966 }
5967
5968 /* Enable PASID support in the device, if it wasn't already */
5969 if (!info->pasid_enabled)
5970 iommu_enable_dev_iotlb(info);
5971
5972 ret = 0;
5973
5974 out:
5975 spin_unlock(&iommu->lock);
5976 spin_unlock_irqrestore(&device_domain_lock, flags);
5977
5978 return ret;
5979 }
5980
5981 static void intel_iommu_apply_resv_region(struct device *dev,
5982 struct iommu_domain *domain,
5983 struct iommu_resv_region *region)
5984 {
5985 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5986 unsigned long start, end;
5987
5988 start = IOVA_PFN(region->start);
5989 end = IOVA_PFN(region->start + region->length - 1);
5990
5991 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5992 }
5993
5994 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5995 {
5996 if (dev_is_pci(dev))
5997 return pci_device_group(dev);
5998 return generic_device_group(dev);
5999 }
6000
6001 #ifdef CONFIG_INTEL_IOMMU_SVM
6002 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
6003 {
6004 struct intel_iommu *iommu;
6005 u8 bus, devfn;
6006
6007 if (iommu_dummy(dev)) {
6008 dev_warn(dev,
6009 "No IOMMU translation for device; cannot enable SVM\n");
6010 return NULL;
6011 }
6012
6013 iommu = device_to_iommu(dev, &bus, &devfn);
6014 if (!iommu) {

6015 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6016 return NULL;
6017 }
6018
6019 return iommu;
6020 }
6021 #endif /* CONFIG_INTEL_IOMMU_SVM */
6022
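/*
* Auxiliary domains are only available in scalable mode, where a device
* can be attached to extra domains isolated by PASID; hence PASID must be
* enabled on both the IOMMU and the device before auxd is switched on.
*/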
6023 static int intel_iommu_enable_auxd(struct device *dev)
6024 {
6025 struct device_domain_info *info;
6026 struct intel_iommu *iommu;
6027 unsigned long flags;
6028 u8 bus, devfn;
6029 int ret;
6030
6031 iommu = device_to_iommu(dev, &bus, &devfn);
6032 if (!iommu || dmar_disabled)
6033 return -EINVAL;
6034
6035 if (!sm_supported(iommu) || !pasid_supported(iommu))
6036 return -EINVAL;
6037
6038 ret = intel_iommu_enable_pasid(iommu, dev);
6039 if (ret)
6040 return -ENODEV;
6041
6042 spin_lock_irqsave(&device_domain_lock, flags);
6043 info = dev->archdata.iommu;
6044 info->auxd_enabled = 1;
6045 spin_unlock_irqrestore(&device_domain_lock, flags);
6046
6047 return 0;
6048 }
6049
6050 static int intel_iommu_disable_auxd(struct device *dev)
6051 {
6052 struct device_domain_info *info;
6053 unsigned long flags;
6054
6055 spin_lock_irqsave(&device_domain_lock, flags);
6056 info = dev->archdata.iommu;
6057 if (!WARN_ON(!info))
6058 info->auxd_enabled = 0;
6059 spin_unlock_irqrestore(&device_domain_lock, flags);
6060
6061 return 0;
6062 }
6063
6064 /*
6065 * A PCI Express Designated Vendor-Specific Extended Capability is defined
6066 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
6067 * so that system software and tools can detect endpoint devices that
6068 * support Intel Scalable I/O Virtualization without a host driver dependency.
6069 *
6070 * Returns the configuration space offset of the matching extended
6071 * capability structure, or 0 if the device does not support Intel
6072 * Scalable I/O Virtualization.
6073 */
6074 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6075 {
6076 int pos;
6077 u16 vendor, id;
6078
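/*
* Walk the DVSEC extended capabilities (capability ID 0x23): DVSEC
* header 1 at offset 4 carries the vendor ID and DVSEC header 2 at
* offset 8 carries the DVSEC ID; the SIOV spec assigns DVSEC ID 5.
*/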
6079 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6080 while (pos) {
6081 pci_read_config_word(pdev, pos + 4, &vendor);
6082 pci_read_config_word(pdev, pos + 8, &id);
6083 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6084 return pos;
6085
6086 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6087 }
6088
6089 return 0;
6090 }
6091
6092 static bool
6093 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6094 {
6095 if (feat == IOMMU_DEV_FEAT_AUX) {
6096 int ret;
6097
6098 if (!dev_is_pci(dev) || dmar_disabled ||
6099 !scalable_mode_support() || !iommu_pasid_support())
6100 return false;
6101
6102 ret = pci_pasid_features(to_pci_dev(dev));
6103 if (ret < 0)
6104 return false;
6105
6106 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6107 }
6108
6109 return false;
6110 }
6111
6112 static int
6113 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6114 {
6115 if (feat == IOMMU_DEV_FEAT_AUX)
6116 return intel_iommu_enable_auxd(dev);
6117
6118 return -ENODEV;
6119 }
6120
6121 static int
6122 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6123 {
6124 if (feat == IOMMU_DEV_FEAT_AUX)
6125 return intel_iommu_disable_auxd(dev);
6126
6127 return -ENODEV;
6128 }
6129
6130 static bool
6131 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6132 {
6133 struct device_domain_info *info = dev->archdata.iommu;
6134
6135 if (feat == IOMMU_DEV_FEAT_AUX)
6136 return scalable_mode_support() && info && info->auxd_enabled;
6137
6138 return false;
6139 }
6140
6141 static int
6142 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6143 {
6144 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6145
6146 return dmar_domain->default_pasid > 0 ?
6147 dmar_domain->default_pasid : -EINVAL;
6148 }
6149
6150 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6151 struct device *dev)
6152 {
6153 return attach_deferred(dev);
6154 }
6155
6156 static int
6157 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6158 enum iommu_attr attr, void *data)
6159 {
6160 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6161 unsigned long flags;
6162 int ret = 0;
6163
6164 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6165 return -EINVAL;
6166
6167 switch (attr) {
6168 case DOMAIN_ATTR_NESTING:
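/*
* Nesting can only be enabled while no devices are attached, since the
* flag decides how the domain is programmed at attach time.
*/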
6169 spin_lock_irqsave(&device_domain_lock, flags);
6170 if (nested_mode_support() &&
6171 list_empty(&dmar_domain->devices)) {
6172 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6173 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6174 } else {
6175 ret = -ENODEV;
6176 }
6177 spin_unlock_irqrestore(&device_domain_lock, flags);
6178 break;
6179 default:
6180 ret = -EINVAL;
6181 break;
6182 }
6183
6184 return ret;
6185 }
6186
6187 const struct iommu_ops intel_iommu_ops = {
6188 .capable = intel_iommu_capable,
6189 .domain_alloc = intel_iommu_domain_alloc,
6190 .domain_free = intel_iommu_domain_free,
6191 .domain_set_attr = intel_iommu_domain_set_attr,
6192 .attach_dev = intel_iommu_attach_device,
6193 .detach_dev = intel_iommu_detach_device,
6194 .aux_attach_dev = intel_iommu_aux_attach_device,
6195 .aux_detach_dev = intel_iommu_aux_detach_device,
6196 .aux_get_pasid = intel_iommu_aux_get_pasid,
6197 .map = intel_iommu_map,
6198 .unmap = intel_iommu_unmap,
6199 .iova_to_phys = intel_iommu_iova_to_phys,
6200 .add_device = intel_iommu_add_device,
6201 .remove_device = intel_iommu_remove_device,
6202 .get_resv_regions = intel_iommu_get_resv_regions,
6203 .put_resv_regions = generic_iommu_put_resv_regions,
6204 .apply_resv_region = intel_iommu_apply_resv_region,
6205 .device_group = intel_iommu_device_group,
6206 .dev_has_feat = intel_iommu_dev_has_feat,
6207 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6208 .dev_enable_feat = intel_iommu_dev_enable_feat,
6209 .dev_disable_feat = intel_iommu_dev_disable_feat,
6210 .is_attach_deferred = intel_iommu_is_attach_deferred,
6211 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6212 };
6213
6214 static void quirk_iommu_igfx(struct pci_dev *dev)
6215 {
6216 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6217 dmar_map_gfx = 0;
6218 }
6219
6220 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6228
6229 /* Broadwell igfx malfunctions with dmar */
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6254
6255 static void quirk_iommu_rwbf(struct pci_dev *dev)
6256 {
6257 /*
6258 * Mobile 4 Series Chipset neglects to set RWBF capability,
6259 * but needs it. Same seems to hold for the desktop versions.
6260 */
6261 pci_info(dev, "Forcing write-buffer flush capability\n");
6262 rwbf_quirk = 1;
6263 }
6264
6265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6272
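/*
* GGC is the graphics control register in the host bridge config space
* (offset 0x52); the values below describe how much stolen memory the
* BIOS set aside for the graphics translation table, and whether any of
* it is reserved for VT-d (shadow GTT) use.
*/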
6273 #define GGC 0x52
6274 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6275 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6276 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6277 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6278 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6279 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6280 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6281 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6282
6283 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6284 {
6285 unsigned short ggc;
6286
6287 if (pci_read_config_word(dev, GGC, &ggc))
6288 return;
6289
6290 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6291 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6292 dmar_map_gfx = 0;
6293 } else if (dmar_map_gfx) {
6294 /* we have to ensure the gfx device is idle before we flush */
6295 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6296 intel_iommu_strict = 1;
6297 }
6298 }
6299 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6302 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6303
6304 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6305 * ISOCH DMAR unit for the Azalia sound device, but not give it any
6306 * TLB entries, which causes it to deadlock. Check for that. We do
6307 * this in a function called from init_dmars(), instead of in a PCI
6308 * quirk, because we don't want to print the obnoxious "BIOS broken"
6309 * message if VT-d is actually disabled.
6310 */
6311 static void __init check_tylersburg_isoch(void)
6312 {
6313 struct pci_dev *pdev;
6314 uint32_t vtisochctrl;
6315
6316 /* If there's no Azalia in the system anyway, forget it. */
6317 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6318 if (!pdev)
6319 return;
6320 pci_dev_put(pdev);
6321
6322 /* System Management Registers. Might be hidden, in which case
6323 * we can't do the sanity check. But that's OK, because the
6324 * known-broken BIOSes _don't_ actually hide it, so far. */
6325 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6326 if (!pdev)
6327 return;
6328
6329 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6330 pci_dev_put(pdev);
6331 return;
6332 }
6333
6334 pci_dev_put(pdev);
6335
6336 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6337 if (vtisochctrl & 1)
6338 return;
6339
6340 /* Drop all bits other than the number of TLB entries */
6341 vtisochctrl &= 0x1c;
6342
6343 /* If we have the recommended number of TLB entries (16), fine. */
6344 if (vtisochctrl == 0x10)
6345 return;
6346
6347 /* Zero TLB entries? Warn and fall back to identity mapping for Azalia. */
6348 if (!vtisochctrl) {
6349 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6350 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6351 dmi_get_system_info(DMI_BIOS_VENDOR),
6352 dmi_get_system_info(DMI_BIOS_VERSION),
6353 dmi_get_system_info(DMI_PRODUCT_VERSION));
6354 iommu_identity_mapping |= IDENTMAP_AZALIA;
6355 return;
6356 }
6357
6358 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6359 vtisochctrl);
6360 }