/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)     "DMAR: " fmt

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
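/*
 * Editor's illustration (not part of the driver): with the mask above,
 * every power-of-two size of at least 4KiB tests as supported, e.g.
 *
 *	INTEL_IOMMU_PGSIZES & (1UL << 12)	-> nonzero (4KiB)
 *	INTEL_IOMMU_PGSIZES & (1UL << 21)	-> nonzero (2MiB)
 *	INTEL_IOMMU_PGSIZES & (1UL << 30)	-> nonzero (1GiB)
 *
 * because every bit above bit 11 is set in ~0xFFFUL.
 */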
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}
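/*
 * Worked example (editor's illustration): for a 48-bit address width,
 *
 *	width_to_agaw(48) == DIV_ROUND_UP(48 - 30, 9) == 2
 *	agaw_to_level(2)  == 4	(a four-level page table)
 *	agaw_to_width(2)  == min(30 + 2 * 9, 64) == 48
 */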
static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
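/*
 * Worked example (editor's illustration): a level-2 superpage entry spans
 *
 *	lvl_to_nr_pages(2) == 1 << min((2 - 1) * 9, 52) == 512
 *
 * 4KiB VT-d pages, i.e. 2MiB; a level-3 entry spans 512 * 512 pages (1GiB).
 */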
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
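/*
 * Worked example (editor's illustration): with 4KiB MM pages, PAGE_SHIFT
 * and VTD_PAGE_SHIFT are both 12, so the conversions are identity. With
 * 16KiB MM pages (PAGE_SHIFT == 14), one MM pfn covers four VT-d pfns:
 *
 *	mm_to_dma_pfn(5)  == 5 << 2  == 20
 *	dma_to_mm_pfn(23) == 23 >> 2 == 5
 */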
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}
static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}
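/*
 * Worked example (editor's illustration): context_set_domain_id(c, 42)
 * places 42 in bits 8-23 of the high quadword:
 *
 *	c->hi |= (42 & 0xffff) << 8;	-> bits 8-23 of c->hi hold 42
 *	context_domain_id(c)		-> (c->hi >> 8) & 0xffff == 42
 */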
static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
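/*
 * Usage sketch (editor's illustration): the macro visits only iommus that
 * currently hold a reference on the domain, e.g.
 *
 *	int idx;
 *	for_each_domain_iommu(idx, domain)
 *		pr_debug("domain attached to iommu %d\n", idx);
 */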
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
	struct iommu_resv_region *resv;	/* reserved region handle */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
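/*
 * Usage sketch (editor's illustration): walking all RMRR units, e.g. to
 * report their reserved ranges:
 *
 *	struct dmar_rmrr_unit *rmrr;
 *	for_each_rmrr_units(rmrr)
 *		pr_debug("RMRR: [0x%llx-0x%llx]\n",
 *			 rmrr->base_address, rmrr->end_address);
 */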
/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
				 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu);
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int intel_iommu_sm = 1;
static int iommu_identity_mapping;

#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

#define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu)	(sm_supported(iommu) &&			\
				 ecap_pasid((iommu)->ecap))

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
/* Convert a generic struct iommu_domain to the private struct dmar_domain */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_off", 6)) {
			pr_info("Intel-IOMMU: disable scalable mode support\n");
			intel_iommu_sm = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			pr_info(
				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}
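/*
 * Worked example (editor's illustration): domain ids live in a two-level
 * table of 256-entry pages, so did 0x1234 resolves as
 *
 *	idx  = 0x1234 >> 8   == 0x12	(which 256-entry page)
 *	slot = 0x1234 & 0xff == 0x34	(slot within that page)
 */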
static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;

	domains[did & 0xff] = domain;
}
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
				DOMAIN_FLAG_STATIC_IDENTITY);
}
static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default one.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
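/*
 * Worked example (editor's illustration): for the default 57-bit width,
 * width_to_agaw(57) == 3 (a five-level table). If the hardware's SAGAW
 * capability only advertises bit 2 (four-level support), the loop above
 * steps down from agaw 3 to agaw 2 and returns that instead.
 */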
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain_type_is_vm_or_si(domain));
	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}
static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}
static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0xf;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			mask &= cap_super_page_val(iommu->cap);
			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;

		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *ptmp, *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = to_pci_dev(dev);

		/* VMD child devices currently cannot be handled individually */
		if (is_vmd(pdev->bus))
			return NULL;

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			if (!pdev || !dev_is_pci(tmp))
				continue;

			ptmp = to_pci_dev(tmp);
			if (ptmp->subordinate &&
			    ptmp->subordinate->number <= pdev->bus->number &&
			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}
static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);
			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist = NULL;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}
static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}
static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;

		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (pdev->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;
	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
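/*
 * Worked example (editor's illustration): flushing 5 pages rounds up to
 * 8, so mask == ilog2(8) == 3, and the hardware invalidates the naturally
 * aligned 8-page (32KiB) window containing the base address.
 */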
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/* It's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}
static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains    = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

again:
	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		struct dmar_domain *domain;

		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		domain = info->domain;

		__dmar_remove_one_dev_info(info);

		if (!domain_type_is_vm_or_si(domain)) {
			/*
			 * The domain_exit() function can't be called under
			 * device_domain_lock, as it takes this lock itself.
			 * So release the lock here and re-run the loop
			 * afterwards.
			 */
			spin_unlock_irqrestore(&device_domain_lock, flags);
			domain_exit(domain);
			goto again;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}
static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
#endif
}
static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->flags = flags;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);

	return domain;
}
/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num      = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid			 = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count = INT_MAX;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pr_err("Reserve iova failed\n");
				return -ENODEV;
			}
		}
	}
	return 0;
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
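/*
 * Worked example (editor's illustration): guest widths snap up to the
 * nearest value of the form 12 + 9*n (a whole number of table levels):
 *
 *	guestwidth_to_adjustwidth(48) == 48	(r == 0, already aligned)
 *	guestwidth_to_adjustwidth(40) == 48	(r == 1, rounded up by 8)
 */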
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
		       int guest_width)
{
	int adjust_width, agaw;
	unsigned long sagaw;
	int err;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);

	err = init_iova_flush_queue(&domain->iovad,
				    iommu_flush_iova, iova_entry_free);
	if (err)
		return err;

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("Hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
static void domain_exit(struct dmar_domain *domain)
{
	struct page *freelist = NULL;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	/* Remove associated devices and clear attached or cached domains */
	rcu_read_lock();
	domain_remove_dev_info(domain);
	rcu_read_unlock();

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}
/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	unsigned long pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}
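/*
 * Worked example (editor's illustration): with max_pasid == 0x10000
 * (2^16 PASIDs) and PASID_PDE_SHIFT == 6, max_pde == 0x400; the first set
 * bit is bit 10, so pds == 10 - 7 == 3, encoding a directory of
 * 2^(3 + 7) == 1024 entries.
 */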
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
	context->hi |= (1 << 20);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}
/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
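/*
 * Worked example (editor's illustration): continuing the example above,
 * pds == 3 is encoded as context_pdts(3) == (3 & 0x7) << 9 == 0x600,
 * i.e. bits 9-11 of the context entry's low quadword.
 */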
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);
		context_set_translation_type(context, translation);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}
	}

	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}
static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
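/*
 * Worked example (editor's illustration): with 4KiB pages, mapping 8KiB
 * starting at host_addr 0x1ff0 keeps only the sub-page offset 0xff0, and
 * PAGE_ALIGN(0xff0 + 0x2000) >> 12 == 0x3000 >> 12 == 3 VT-d pages.
 */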
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
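/*
 * Worked example (editor's illustration): with iommu_superpage == 2
 * (2MiB and 1GiB supported), iov_pfn == 0x200, phy_pfn == 0x400 and
 * pages == 0x200: pfnmerge == 0x600 has its low 9 bits clear, so one
 * 512-page stride fits and the function returns level 2 (a 2MiB page);
 * the next iteration stops because pfnmerge >> 9 == 3 is misaligned.
 */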
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
								phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is a large page */
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			  struct scatterlist *sg, unsigned long phys_pfn,
			  unsigned long nr_pages, int prot)
{
	struct intel_iommu *iommu;
	int ret;

	/* Do the real mapping first */
	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
	if (ret)
		return ret;

	/* Notify about the new mapping */
	if (domain_type_is_vm(domain)) {
		/* VM typed domains can have more than one IOMMU */
		int iommu_id;

		for_each_domain_iommu(iommu_id, domain) {
			iommu = g_iommus[iommu_id];
			__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
		}
	} else {
		/* General domains only have one IOMMU */
		iommu = domain_get_iommu(domain);
		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
	}

	return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	unsigned long flags;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return;
	}
	did_old = context_domain_id(context);
	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock_irqrestore(&iommu->lock, flags);
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);
	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);
}
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
/*
 * find_domain
 * Note: we use struct device->archdata.iommu to store the iommu-related info.
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (likely(info))
		return info->domain;
	return NULL;
}

static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;
	info->pasid_table = NULL;

	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (!pci_ats_disabled() &&
		    ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (sm_supported(iommu)) {
			if (pasid_supported(iommu)) {
				int features = pci_pasid_features(pdev);

				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	if (!found) {
		struct device_domain_info *info2;

		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	/* PASID table is mandatory for a PCI device in scalable mode. */
	if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
		ret = intel_pasid_alloc_table(dev);
		if (ret) {
			pr_err("PASID table allocation for %s failed\n",
			       dev_name(dev));
			dmar_remove_one_dev_info(domain, dev);
			return NULL;
		}

		/* Setup the PASID entry for requests without PASID: */
		spin_lock(&iommu->lock);
		if (hw_pass_through && domain_type_is_si(domain))
			ret = intel_pasid_setup_pass_through(iommu, domain,
					dev, PASID_RID2PASID);
		else
			ret = intel_pasid_setup_second_level(iommu, domain,
					dev, PASID_RID2PASID);
		spin_unlock(&iommu->lock);
		if (ret) {
			pr_err("Setup RID2PASID for %s failed\n",
			       dev_name(dev));
			dmar_remove_one_dev_info(domain, dev);
			return NULL;
		}
	}

	if (dev && domain_context_mapping(domain, dev)) {
		pr_err("Domain context map for %s failed\n", dev_name(dev));
		dmar_remove_one_dev_info(domain, dev);
		return NULL;
	}

	return domain;
}
static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
{
	*(u16 *)opaque = alias;
	return 0;
}

static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
{
	struct device_domain_info *info = NULL;
	struct dmar_domain *domain = NULL;
	struct intel_iommu *iommu;
	u16 dma_alias;
	unsigned long flags;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
						      PCI_BUS_NUM(dma_alias),
						      dma_alias & 0xff);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);

		/* DMA alias already has a domain, use it */
		if (info)
			goto out;
	}

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(0);
	if (!domain)
		return NULL;
	if (domain_init(domain, iommu, gaw)) {
		domain_exit(domain);
		return NULL;
	}

out:
	return domain;
}
static struct dmar_domain *set_domain_for_dev(struct device *dev,
					      struct dmar_domain *domain)
{
	struct intel_iommu *iommu;
	struct dmar_domain *tmp;
	u16 req_id, dma_alias;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	req_id = ((u16)bus << 8) | devfn;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		/* register PCI DMA alias device */
		if (req_id != dma_alias) {
			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
					dma_alias & 0xff, NULL, domain);

			if (!tmp || tmp != domain)
				return tmp;
		}
	}

	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (!tmp || tmp != domain)
		return tmp;

	return domain;
}

static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
{
	struct dmar_domain *domain, *tmp;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, gaw);
	if (!domain)
		goto out;

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:
	return domain;
}
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		pr_err("Reserving iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn, NULL,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE);
}
static int domain_prepare_identity_map(struct device *dev,
				       struct dmar_domain *domain,
				       unsigned long long start,
				       unsigned long long end)
{
	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, so which didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
			dev_name(dev), start, end);
		return 0;
	}

	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		dev_name(dev), start, end);

	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			dmi_get_system_info(DMI_BIOS_VENDOR),
			dmi_get_system_info(DMI_BIOS_VERSION),
			dmi_get_system_info(DMI_PRODUCT_VERSION));
		return -EIO;
	}

	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		return -EIO;
	}

	return iommu_domain_identity_map(domain, start, end);
}
static int iommu_prepare_identity_map(struct device *dev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	ret = domain_prepare_identity_map(dev, domain, start, end);
	if (ret)
		domain_exit(domain);

	return ret;
}

static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
					 struct device *dev)
{
	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(dev, rmrr->base_address,
					  rmrr->end_address);
}
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);

	if (ret)
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
static int md_domain_init(struct dmar_domain *domain, int guest_width);

static int __init si_domain_init(int hw)
{
	int nid, ret = 0;

	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	pr_debug("Identity mapping domain allocated\n");

	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}
static int identity_mapping(struct device *dev)
{
	struct device_domain_info *info;

	if (likely(!iommu_identity_mapping))
		return 0;

	info = dev->archdata.iommu;
	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
		return (info->domain == si_domain);

	return 0;
}

static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
{
	struct dmar_domain *ndomain;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (ndomain != domain)
		return -EBUSY;

	return 0;
}
static bool device_has_rmrr(struct device *dev)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *tmp;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		/*
		 * Return TRUE if this RMRR contains the device that
		 * is passed in.
		 */
		for_each_active_dev_scope(rmrr->devices,
					  rmrr->devices_cnt, i, tmp)
			if (tmp == dev) {
				rcu_read_unlock();
				return true;
			}
	}
	rcu_read_unlock();
	return false;
}

/*
 * There are a couple cases where we need to restrict the functionality of
 * devices associated with RMRRs.  The first is when evaluating a device for
 * identity mapping because problems exist when devices are moved in and out
 * of domains and their respective RMRR information is lost.  This means that
 * a device with associated RMRRs will never be in a "passthrough" domain.
 * The second is use of the device through the IOMMU API.  This interface
 * expects to have full control of the IOVA space for the device.  We cannot
 * satisfy both the requirement that RMRR access is maintained and have an
 * unencumbered IOVA space.  We also have no ability to quiesce the device's
 * use of the RMRR space or even inform the IOMMU API user of the restriction.
 * We therefore prevent devices associated with an RMRR from participating in
 * the IOMMU API, which eliminates them from device assignment.
 *
 * In both cases we assume that PCI USB devices with RMRRs have them largely
 * for historical reasons and that the RMRR space is not actively used post
 * boot.  This exclusion may change if vendors begin to abuse it.
 *
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 */
static bool device_is_rmrr_locked(struct device *dev)
{
	if (!device_has_rmrr(dev))
		return false;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
			return false;
	}

	return true;
}
static int iommu_should_identity_map(struct device *dev, int startup)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (device_is_rmrr_locked(dev))
			return 0;

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return 1;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return 1;

		if (!(iommu_identity_mapping & IDENTMAP_ALL))
			return 0;

		/*
		 * We want to start off with all devices in the 1:1 domain, and
		 * take them out later if we find they can't access all of memory.
		 *
		 * However, we can't do this for PCI devices behind bridges,
		 * because all PCI devices behind the same bridge will end up
		 * with the same source-id on their transactions.
		 *
		 * Practically speaking, we can't change things around for these
		 * devices at run-time, because we can't be sure there'll be no
		 * DMA transactions in flight for any of their siblings.
		 *
		 * So PCI devices (unless they're on the root bus) as well as
		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
		 * the 1:1 domain, just in _case_ one of their siblings turns out
		 * not to be able to map all of memory.
		 */
		if (!pci_is_pcie(pdev)) {
			if (!pci_is_root_bus(pdev->bus))
				return 0;
			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				return 0;
		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
	} else {
		if (device_has_rmrr(dev))
			return 0;
	}

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will; if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = *dev->dma_mask;

		if (dev->coherent_dma_mask &&
		    dev->coherent_dma_mask < dma_mask)
			dma_mask = dev->coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(dev);
	}

	return 1;
}
static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
{
	int ret;

	if (!iommu_should_identity_map(dev, 1))
		return 0;

	ret = domain_add_dev_info(si_domain, dev);
	if (!ret)
		pr_info("%s identity mapping for device %s\n",
			hw ? "Hardware" : "Software", dev_name(dev));
	else if (ret == -ENODEV)
		/* device not associated with an iommu */
		ret = 0;

	return ret;
}

static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	struct device *dev;
	int i;
	int ret = 0;

	for_each_pci_dev(pdev) {
		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
		if (ret)
			return ret;
	}

	for_each_active_iommu(iommu, drhd)
		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn, &adev->physical_node_list, node) {
				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);
			if (ret)
				return ret;
		}

	return 0;
}
static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued Invalidate not enabled, use Register Based Invalidate
		 */
		iommu->flush.flush_context = __iommu_flush_context;
		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
		pr_info("%s: Using Register based invalidation\n",
			iommu->name);
	} else {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
	}
}
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					  MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	unsigned long flags;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
	new_ext    = !!ecap_ecs(iommu->ecap);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock_irqsave(&iommu->lock, flags);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock_irqrestore(&iommu->lock, flags);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	bool copied_tables = false;
	struct device *dev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path; all other accesses are
		 * read only
		 */
		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
	}

	/* Preallocate enough resources for IOMMU hot-addition */
	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
		g_num_of_iommus = DMAR_UNITS_SUPPORTED;

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			GFP_KERNEL);
	if (!g_iommus) {
		pr_err("Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	for_each_active_iommu(iommu, drhd) {
		/*
		 * Find the max pasid size of all IOMMUs in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		g_iommus[iommu->seq_id] = iommu;

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
				copied_tables = true;
			}
		}

		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu))
			intel_svm_init(iommu);
#endif
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	}

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	check_tylersburg_isoch();

	if (iommu_identity_mapping) {
		ret = si_domain_init(hw_pass_through);
		if (ret)
			goto free_iommu;
	}

	/*
	 * If we copied translations from a previous kernel in the kdump
	 * case, we can not assign the devices to domains now, as that
	 * would eliminate the old mappings. So skip this part and defer
	 * the assignment to device driver initialization time.
	 */
	if (copied_tables)
		goto domains_done;

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa, and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			pr_crit("Failed to setup IOMMU pass-through\n");
			goto free_iommu;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	pr_info("Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		/* some BIOS lists non-existent devices in DMAR table. */
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			ret = iommu_prepare_rmrr_dev(rmrr, dev);
			if (ret)
				pr_err("Mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();

domains_done:

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			ret = intel_svm_enable_prq(iommu);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;

		if (!translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	kfree(g_iommus);

error:
	return ret;
}
/* This takes a number of _MM_ pages, not VTD pages */
static unsigned long intel_alloc_iova(struct device *dev,
				      struct dmar_domain *domain,
				      unsigned long nrpages, uint64_t dma_mask)
{
	unsigned long iova_pfn = 0;

	/* Restrict dma_mask to the width that the iommu can handle */
	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
	/* Ensure we reserve the whole size-aligned region */
	nrpages = __roundup_pow_of_two(nrpages);

	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_BIT_MASK(32) and if that fails then try allocating
		 * from higher range
		 */
		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
					   IOVA_PFN(DMA_BIT_MASK(32)), false);
		if (iova_pfn)
			return iova_pfn;
	}
	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
				   IOVA_PFN(dma_mask), true);
	if (unlikely(!iova_pfn)) {
		pr_err("Allocating %ld-page iova for %s failed",
		       nrpages, dev_name(dev));
		return 0;
	}

	return iova_pfn;
}
struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
{
	struct dmar_domain *domain, *tmp;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i, ret;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		goto out;

	/* We have a new domain - setup possible RMRRs for the device */
	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != dev)
				continue;

			ret = domain_prepare_identity_map(dev, domain,
							  rmrr->base_address,
							  rmrr->end_address);
			if (ret)
				dev_err(dev, "Mapping reserved region failed\n");
		}
	}
	rcu_read_unlock();

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:
	if (!domain)
		pr_err("Allocating domain for %s failed\n", dev_name(dev));

	return domain;
}
/* Check if the dev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	int found;

	if (iommu_dummy(dev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(dev);
	if (found) {
		if (iommu_should_identity_map(dev, 0))
			return 1;
		/*
		 * The 32-bit device is removed from si_domain and falls
		 * back to non-identity mapping.
		 */
		dmar_remove_one_dev_info(si_domain, dev);
		pr_info("32bit %s uses non-identity mapping\n",
			dev_name(dev));
		return 0;
	}

	/*
	 * In case of a 64-bit DMA device detached from a vm, the device
	 * is put into si_domain for identity mapping.
	 */
	if (iommu_should_identity_map(dev, 0)) {
		int ret;

		ret = domain_add_dev_info(si_domain, dev);
		if (!ret) {
			pr_info("64bit %s uses identity mapping\n",
				dev_name(dev));
			return 1;
		}
	}

	return 0;
}
static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	unsigned long iova_pfn;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(dev))
		return paddr;

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_nrpages(paddr, size);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova_pfn)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr to (paddr + size) might span a partial page; map the whole
	 * page. Note: if two parts of one page are mapped separately, we
	 * might have two guest addresses mapping to the same host paddr,
	 * but this is not a big problem.
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova_pfn)
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
		dev_name(dev), size, (unsigned long long)paddr, dir);
	return 0;
}

static dma_addr_t intel_map_page(struct device *dev, struct page *page,
				 unsigned long offset, size_t size,
				 enum dma_data_direction dir,
				 unsigned long attrs)
{
	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				  dir, *dev->dma_mask);
}
static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
{
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	unsigned long nrpages;
	unsigned long iova_pfn;
	struct intel_iommu *iommu;
	struct page *freelist;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(dev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova_pfn = IOVA_PFN(dev_addr);

	nrpages = aligned_nrpages(dev_addr, size);
	start_pfn = mm_to_dma_pfn(iova_pfn);
	last_pfn = start_pfn + nrpages - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 dev_name(dev), start_pfn, last_pfn);

	freelist = domain_unmap(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
				      nrpages, !freelist, 0);
		/* free iova */
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
		dma_free_pagelist(freelist);
	} else {
		queue_iova(&domain->iovad, iova_pfn, nrpages,
			   (unsigned long)freelist);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     unsigned long attrs)
{
	intel_unmap(dev, dev_addr, size);
}
static void *intel_alloc_coherent(struct device *dev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  unsigned long attrs)
{
	struct page *page = NULL;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(dev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
		if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	if (gfpflags_allow_blocking(flags)) {
		unsigned int count = size >> PAGE_SHIFT;

		page = dma_alloc_from_contiguous(dev, count, order,
						 flags & __GFP_NOWARN);
		if (page && iommu_no_mapping(dev) &&
		    page_to_phys(page) + size > dev->coherent_dma_mask) {
			dma_release_from_contiguous(dev, page, count);
			page = NULL;
		}
	}

	if (!page)
		page = alloc_pages(flags, order);
	if (!page)
		return NULL;
	memset(page_address(page), 0, size);

	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
					 DMA_BIDIRECTIONAL,
					 dev->coherent_dma_mask);
	if (*dma_handle)
		return page_address(page);
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, order);

	return NULL;
}

static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
				dma_addr_t dma_handle, unsigned long attrs)
{
	int order;
	struct page *page = virt_to_page(vaddr);

	size = PAGE_ALIGN(size);
	order = get_order(size);

	intel_unmap(dev, dma_handle, size);
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, order);
}
static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   unsigned long attrs)
{
	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
	unsigned long nrpages = 0;
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i) {
		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
	}

	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
}

static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = sg_phys(sg);
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, unsigned long attrs)
{
	int i;
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	unsigned long iova_pfn;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(dev))
		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
				*dev->dma_mask);
	if (!iova_pfn) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova_pfn);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1,
				       agaw_to_level(domain->agaw) + 1);
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
		return 0;
	}

	return nelems;
}

static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return !dma_addr;
}
static const struct dma_map_ops intel_dma_ops = {
	.alloc = intel_alloc_coherent,
	.free = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
	.dma_supported = dma_direct_supported,
};
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					 sizeof(struct dmar_domain),
					 0,
					 SLAB_HWCACHE_ALIGN,

					 NULL);
	if (!iommu_domain_cache) {
		pr_err("Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
					 sizeof(struct device_domain_info),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_devinfo_cache) {
		pr_err("Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iova_cache_get();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	iova_cache_put();

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	iova_cache_put();
}
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
#ifdef CONFIG_SUSPEND
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}

static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
						 GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}

static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}

static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};

static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_PM */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	int prot = DMA_PTE_READ|DMA_PTE_WRITE;
	struct dmar_rmrr_unit *rmrru;
	size_t length;

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;
	rmrr = (struct acpi_dmar_reserved_memory *)header;
	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	length = rmrr->end_address - rmrr->base_address + 1;
	rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
					      IOMMU_RESV_DIRECT);
	if (!rmrru->resv)
		goto free_rmrru;

	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_all;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_all:
	kfree(rmrru->resv);
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}
static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
{
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *tmp;

	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
		if (atsr->segment != tmp->segment)
			continue;
		if (atsr->header.length != tmp->header.length)
			continue;
		if (memcmp(atsr, tmp, atsr->header.length) == 0)
			return atsru;
	}

	return NULL;
}

int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}

int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}

int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	if (g_iommus[iommu->seq_id])
		return 0;

	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	sp = domain_update_iommu_superpage(iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu))
		intel_svm_init(iommu);
#endif

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}

int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
	int ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	if (!intel_iommu_enabled)
		return 0;
	if (iommu == NULL)
		return -EINVAL;

	if (insert) {
		ret = intel_iommu_add(dmaru);
	} else {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}
static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru->resv);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
}
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	dev = pci_physfn(dev);
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret = 0;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}

	return 0;
}
/*
 * Here we only respond to the action of unbinding a device from its driver.
 *
 * An added device is not attached to its DMAR domain here yet. That will
 * happen when mapping the device to an iova.
 */
static int device_notifier(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct device *dev = data;
	struct dmar_domain *domain;

	if (iommu_dummy(dev))
		return 0;

	if (action != BUS_NOTIFY_REMOVED_DEVICE)
		return 0;

	domain = find_domain(dev);
	if (!domain)
		return 0;

	dmar_remove_one_dev_info(domain, dev);
	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
		domain_exit(domain);

	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long long start, end;
	unsigned long start_vpfn, last_vpfn;

	switch (val) {
	case MEM_GOING_ONLINE:
		start = mhp->start_pfn << PAGE_SHIFT;
		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
		if (iommu_domain_identity_map(si_domain, start, end)) {
			pr_warn("Failed to build identity map for [%llx-%llx]\n",
				start, end);
			return NOTIFY_BAD;
		}
		break;

	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
		while (start_vpfn <= last_vpfn) {
			struct iova *iova;
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			struct page *freelist;

			iova = find_iova(&si_domain->iovad, start_vpfn);
			if (iova == NULL) {
				pr_debug("Failed get IOVA for PFN %lx\n",
					 start_vpfn);
				break;
			}

			iova = split_and_remove_iova(&si_domain->iovad, iova,
						     start_vpfn, last_vpfn);
			if (iova == NULL) {
				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
					start_vpfn, last_vpfn);
				return NOTIFY_BAD;
			}

			freelist = domain_unmap(si_domain, iova->pfn_lo,
					       iova->pfn_hi);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					iova->pfn_lo, iova_size(iova),
					!freelist, 0);
			rcu_read_unlock();
			dma_free_pagelist(freelist);

			start_vpfn = iova->pfn_hi + 1;
			free_iova_mem(iova);
		}
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
static void free_all_cpu_cached_iovas(unsigned int cpu)
{
	int i;

	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		struct dmar_domain *domain;
		int did;

		if (!iommu)
			continue;

		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
			domain = get_iommu_domain(iommu, (u16)did);

			if (!domain)
				continue;
			free_cpu_cached_iovas(cpu, &domain->iovad);
		}
	}
}

static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);
	return 0;
}

static void intel_disable_iommus(void)
{
	struct intel_iommu *iommu = NULL;
	struct dmar_drhd_unit *drhd;

	for_each_iommu(iommu, drhd)
		iommu_disable_translation(iommu);
}

static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
{
	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);

	return container_of(iommu_dev, struct intel_iommu, iommu);
}
static ssize_t intel_iommu_show_version(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	u32 ver = readl(iommu->reg + DMAR_VER_REG);
	return sprintf(buf, "%d:%d\n",
		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
}
static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);

static ssize_t intel_iommu_show_address(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->reg_phys);
}
static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);

static ssize_t intel_iommu_show_cap(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->cap);
}
static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);

static ssize_t intel_iommu_show_ecap(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->ecap);
}
static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);

static ssize_t intel_iommu_show_ndoms(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
}
static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);

static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
						  cap_ndoms(iommu->cap)));
}
static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);

static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};

static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};

const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};

int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);
	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	bus_register_notifier(&pci_bus_type, &device_nb);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);
	intel_iommu_enabled = 1;
	intel_iommu_debugfs_init();

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
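
/*
 * Note: the no_iommu, dmar_disabled and intel_iommu_tboot_noforce flags
 * tested in intel_iommu_init() are set by early parameter parsing, e.g.
 * booting with "intel_iommu=off" sets dmar_disabled and
 * "intel_iommu=tboot_noforce" sets intel_iommu_tboot_noforce.
 */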

static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}

/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}

static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;

	if (info->dev) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
						    PASID_RID2PASID);

		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(info->domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}

static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}

static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}

static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
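
/*
 * The "knock out extra levels" loop above works top-down: each pass
 * discards the current top-level table, makes the table its first entry
 * points to the new pgd, and decrements agaw, until the domain's
 * page-table depth matches what this IOMMU can walk.
 */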

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}

static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
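
/*
 * Illustrative sketch, not part of the original driver: how a caller
 * would reach intel_iommu_map() through the generic IOMMU API. The
 * function name and the addresses used are hypothetical.
 */
static int __maybe_unused example_map_one_page(struct device *dev)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);	/* -> intel_iommu_domain_alloc() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);		/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	/* Map one 4KiB page: IOVA 0x1000 -> physical 0x2000, read/write */
	ret = iommu_map(domain, 0x1000, 0x2000, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);	/* -> intel_iommu_map() */

	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}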

static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
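
/*
 * Worked example of the behaviour noted above: if the IOVA resolves to
 * a 2MiB superpage PTE (level 2, so VTD_PAGE_SIZE << 9 = 2MiB), a
 * request to unmap only 4KiB is widened to the full 2MiB, and the
 * widened size is returned so the caller's accounting stays correct.
 */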

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}

static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}

static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}

static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}

static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}

#ifdef CONFIG_INTEL_IOMMU_SVM
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}

struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */

const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
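
/*
 * After bus_set_iommu(&pci_bus_type, &intel_iommu_ops) runs in
 * intel_iommu_init(), the generic IOMMU API dispatches to these
 * callbacks; e.g. iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY)
 * lands in intel_iommu_capable() above.
 */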

static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
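
/*
 * Illustrative sketch, not part of the original driver: decoding the
 * GGC bits defined above. The helper name is hypothetical; GGC itself
 * is the config-space offset used by quirk_calpella_no_shadow_gtt()
 * below.
 */
static bool __maybe_unused example_ggc_vt_enabled(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return false;

	/* Bit 11 set means the BIOS reserved stolen memory for VT mode */
	return !!(ggc & GGC_MEMORY_VT_ENABLED);
}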

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",