/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)     "DMAR: " fmt
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/dma-direct.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/crash_dump.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
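/*
 * Worked example (added, not in the original source): with a 48-bit guest
 * address width, __DOMAIN_MAX_PFN(48) is 2^36 - 1.  On 64-bit kernels the
 * min_t() clamp is a no-op; on 32-bit kernels it caps the result at
 * ULONG_MAX so PFN arithmetic can safely stay in 'unsigned long'.
 */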
/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
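/*
 * Note (added for clarity): ~0xFFFUL advertises every power-of-two size
 * from 4KiB upward to the IOMMU core; the mapping code below still only
 * uses real 2MiB/1GiB superpages where hardware capability and alignment
 * allow (see hardware_largepage_caps()).
 */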
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
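/*
 * Illustration (added): each page-table level resolves LEVEL_STRIDE (9)
 * bits of the DMA PFN, so a 48-bit address width needs (48 - 12) / 9 = 4
 * levels; correspondingly width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
 */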
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
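/*
 * Note (added for clarity): on x86 PAGE_SHIFT equals VTD_PAGE_SHIFT (12),
 * so these conversions are no-ops; they only shift when the CPU page size
 * is larger than the 4KiB VT-d page, which is why VT-d pages must never
 * be bigger than MM pages.
 */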
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
/*
 * Root entry layout:
 * 12-63: Context Ptr (12 - (haw-1))
 */
struct root_entry {
	u64	lo;
	u64	hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	return re->hi & VTD_PAGE_MASK;
}
/*
 * Context entry, low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

static inline bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
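/*
 * Example (added for illustration, with hypothetical values): programming
 * a context entry for a 4-level (agaw 2) table rooted at pgd_phys in
 * domain 42 would look roughly like:
 *
 *	context_clear_entry(ce);
 *	context_set_domain_id(ce, 42);
 *	context_set_address_root(ce, pgd_phys);
 *	context_set_address_width(ce, 2);
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * which mirrors the sequence used later in domain_context_mapping_one().
 */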
/*
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
	struct iommu_resv_region *resv;	/* reserved region handle */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
				 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int intel_iommu_ecs = 1;
static int iommu_identity_mapping;

#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

#define ecs_enabled(iommu)	(intel_iommu_ecs && ecap_ecs(iommu->ecap))
#define pasid_enabled(iommu)	(ecs_enabled(iommu) && ecap_pasid(iommu->ecap))

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

const struct iommu_ops intel_iommu_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
/* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "ecs_off", 7)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable extended context table support\n");
			intel_iommu_ecs = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			printk(KERN_INFO
				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
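/*
 * Usage (added for illustration): options are comma separated, e.g.
 * booting with "intel_iommu=on,strict" enables the IOMMU and disables
 * batched IOTLB flushing; each token is matched by the strncmp() chain
 * above and the parser then skips past the next comma.
 */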
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}
static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
				DOMAIN_FLAG_STATIC_IDENTITY);
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
					unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain_type_is_vm_or_si(domain));
	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
661 static void domain_update_iommu_coherency(struct dmar_domain
*domain
)
663 struct dmar_drhd_unit
*drhd
;
664 struct intel_iommu
*iommu
;
668 domain
->iommu_coherency
= 1;
670 for_each_domain_iommu(i
, domain
) {
672 if (!ecap_coherent(g_iommus
[i
]->ecap
)) {
673 domain
->iommu_coherency
= 0;
680 /* No hardware attached; use lowest common denominator */
682 for_each_active_iommu(iommu
, drhd
) {
683 if (!ecap_coherent(iommu
->ecap
)) {
684 domain
->iommu_coherency
= 0;
691 static int domain_update_iommu_snooping(struct intel_iommu
*skip
)
693 struct dmar_drhd_unit
*drhd
;
694 struct intel_iommu
*iommu
;
698 for_each_active_iommu(iommu
, drhd
) {
700 if (!ecap_sc_support(iommu
->ecap
)) {
711 static int domain_update_iommu_superpage(struct intel_iommu
*skip
)
713 struct dmar_drhd_unit
*drhd
;
714 struct intel_iommu
*iommu
;
717 if (!intel_iommu_superpage
) {
721 /* set iommu_superpage to the smallest common denominator */
723 for_each_active_iommu(iommu
, drhd
) {
725 mask
&= cap_super_page_val(iommu
->cap
);
735 /* Some capabilities may be different across iommus */
736 static void domain_update_iommu_cap(struct dmar_domain
*domain
)
738 domain_update_iommu_coherency(domain
);
739 domain
->iommu_snooping
= domain_update_iommu_snooping(NULL
);
740 domain
->iommu_superpage
= domain_update_iommu_superpage(NULL
);
743 static inline struct context_entry
*iommu_context_addr(struct intel_iommu
*iommu
,
744 u8 bus
, u8 devfn
, int alloc
)
746 struct root_entry
*root
= &iommu
->root_entry
[bus
];
747 struct context_entry
*context
;
751 if (ecs_enabled(iommu
)) {
759 context
= phys_to_virt(*entry
& VTD_PAGE_MASK
);
761 unsigned long phy_addr
;
765 context
= alloc_pgtable_page(iommu
->node
);
769 __iommu_flush_cache(iommu
, (void *)context
, CONTEXT_SIZE
);
770 phy_addr
= virt_to_phys((void *)context
);
771 *entry
= phy_addr
| 1;
772 __iommu_flush_cache(iommu
, entry
, sizeof(*entry
));
774 return &context
[devfn
];
777 static int iommu_dummy(struct device
*dev
)
779 return dev
->archdata
.iommu
== DUMMY_DEVICE_DOMAIN_INFO
;
782 static struct intel_iommu
*device_to_iommu(struct device
*dev
, u8
*bus
, u8
*devfn
)
784 struct dmar_drhd_unit
*drhd
= NULL
;
785 struct intel_iommu
*iommu
;
787 struct pci_dev
*ptmp
, *pdev
= NULL
;
791 if (iommu_dummy(dev
))
794 if (dev_is_pci(dev
)) {
795 struct pci_dev
*pf_pdev
;
797 pdev
= to_pci_dev(dev
);
800 /* VMD child devices currently cannot be handled individually */
801 if (is_vmd(pdev
->bus
))
805 /* VFs aren't listed in scope tables; we need to look up
806 * the PF instead to find the IOMMU. */
807 pf_pdev
= pci_physfn(pdev
);
809 segment
= pci_domain_nr(pdev
->bus
);
810 } else if (has_acpi_companion(dev
))
811 dev
= &ACPI_COMPANION(dev
)->dev
;
814 for_each_active_iommu(iommu
, drhd
) {
815 if (pdev
&& segment
!= drhd
->segment
)
818 for_each_active_dev_scope(drhd
->devices
,
819 drhd
->devices_cnt
, i
, tmp
) {
821 /* For a VF use its original BDF# not that of the PF
822 * which we used for the IOMMU lookup. Strictly speaking
823 * we could do this for all PCI devices; we only need to
824 * get the BDF# from the scope table for ACPI matches. */
825 if (pdev
&& pdev
->is_virtfn
)
828 *bus
= drhd
->devices
[i
].bus
;
829 *devfn
= drhd
->devices
[i
].devfn
;
833 if (!pdev
|| !dev_is_pci(tmp
))
836 ptmp
= to_pci_dev(tmp
);
837 if (ptmp
->subordinate
&&
838 ptmp
->subordinate
->number
<= pdev
->bus
->number
&&
839 ptmp
->subordinate
->busn_res
.end
>= pdev
->bus
->number
)
843 if (pdev
&& drhd
->include_all
) {
845 *bus
= pdev
->bus
->number
;
846 *devfn
= pdev
->devfn
;
857 static void domain_flush_cache(struct dmar_domain
*domain
,
858 void *addr
, int size
)
860 if (!domain
->iommu_coherency
)
861 clflush_cache_range(addr
, size
);
864 static int device_context_mapped(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
866 struct context_entry
*context
;
870 spin_lock_irqsave(&iommu
->lock
, flags
);
871 context
= iommu_context_addr(iommu
, bus
, devfn
, 0);
873 ret
= context_present(context
);
874 spin_unlock_irqrestore(&iommu
->lock
, flags
);
878 static void free_context_table(struct intel_iommu
*iommu
)
882 struct context_entry
*context
;
884 spin_lock_irqsave(&iommu
->lock
, flags
);
885 if (!iommu
->root_entry
) {
888 for (i
= 0; i
< ROOT_ENTRY_NR
; i
++) {
889 context
= iommu_context_addr(iommu
, i
, 0, 0);
891 free_pgtable_page(context
);
893 if (!ecs_enabled(iommu
))
896 context
= iommu_context_addr(iommu
, i
, 0x80, 0);
898 free_pgtable_page(context
);
901 free_pgtable_page(iommu
->root_entry
);
902 iommu
->root_entry
= NULL
;
904 spin_unlock_irqrestore(&iommu
->lock
, flags
);
907 static struct dma_pte
*pfn_to_dma_pte(struct dmar_domain
*domain
,
908 unsigned long pfn
, int *target_level
)
910 struct dma_pte
*parent
, *pte
= NULL
;
911 int level
= agaw_to_level(domain
->agaw
);
914 BUG_ON(!domain
->pgd
);
916 if (!domain_pfn_supported(domain
, pfn
))
917 /* Address beyond IOMMU's addressing capabilities. */
920 parent
= domain
->pgd
;
925 offset
= pfn_level_offset(pfn
, level
);
926 pte
= &parent
[offset
];
927 if (!*target_level
&& (dma_pte_superpage(pte
) || !dma_pte_present(pte
)))
929 if (level
== *target_level
)
932 if (!dma_pte_present(pte
)) {
935 tmp_page
= alloc_pgtable_page(domain
->nid
);
940 domain_flush_cache(domain
, tmp_page
, VTD_PAGE_SIZE
);
941 pteval
= ((uint64_t)virt_to_dma_pfn(tmp_page
) << VTD_PAGE_SHIFT
) | DMA_PTE_READ
| DMA_PTE_WRITE
;
942 if (cmpxchg64(&pte
->val
, 0ULL, pteval
))
943 /* Someone else set it while we were thinking; use theirs. */
944 free_pgtable_page(tmp_page
);
946 domain_flush_cache(domain
, pte
, sizeof(*pte
));
951 parent
= phys_to_virt(dma_pte_addr(pte
));
956 *target_level
= level
;
962 /* return address's pte at specific level */
963 static struct dma_pte
*dma_pfn_level_pte(struct dmar_domain
*domain
,
965 int level
, int *large_page
)
967 struct dma_pte
*parent
, *pte
= NULL
;
968 int total
= agaw_to_level(domain
->agaw
);
971 parent
= domain
->pgd
;
972 while (level
<= total
) {
973 offset
= pfn_level_offset(pfn
, total
);
974 pte
= &parent
[offset
];
978 if (!dma_pte_present(pte
)) {
983 if (dma_pte_superpage(pte
)) {
988 parent
= phys_to_virt(dma_pte_addr(pte
));
/* clear last level pte; a TLB flush should follow */
995 static void dma_pte_clear_range(struct dmar_domain
*domain
,
996 unsigned long start_pfn
,
997 unsigned long last_pfn
)
999 unsigned int large_page
= 1;
1000 struct dma_pte
*first_pte
, *pte
;
1002 BUG_ON(!domain_pfn_supported(domain
, start_pfn
));
1003 BUG_ON(!domain_pfn_supported(domain
, last_pfn
));
1004 BUG_ON(start_pfn
> last_pfn
);
1006 /* we don't need lock here; nobody else touches the iova range */
1009 first_pte
= pte
= dma_pfn_level_pte(domain
, start_pfn
, 1, &large_page
);
1011 start_pfn
= align_to_level(start_pfn
+ 1, large_page
+ 1);
1016 start_pfn
+= lvl_to_nr_pages(large_page
);
1018 } while (start_pfn
<= last_pfn
&& !first_pte_in_page(pte
));
1020 domain_flush_cache(domain
, first_pte
,
1021 (void *)pte
- (void *)first_pte
);
1023 } while (start_pfn
&& start_pfn
<= last_pfn
);
1026 static void dma_pte_free_level(struct dmar_domain
*domain
, int level
,
1027 int retain_level
, struct dma_pte
*pte
,
1028 unsigned long pfn
, unsigned long start_pfn
,
1029 unsigned long last_pfn
)
1031 pfn
= max(start_pfn
, pfn
);
1032 pte
= &pte
[pfn_level_offset(pfn
, level
)];
1035 unsigned long level_pfn
;
1036 struct dma_pte
*level_pte
;
1038 if (!dma_pte_present(pte
) || dma_pte_superpage(pte
))
1041 level_pfn
= pfn
& level_mask(level
);
1042 level_pte
= phys_to_virt(dma_pte_addr(pte
));
1045 dma_pte_free_level(domain
, level
- 1, retain_level
,
1046 level_pte
, level_pfn
, start_pfn
,
1051 * Free the page table if we're below the level we want to
1052 * retain and the range covers the entire table.
1054 if (level
< retain_level
&& !(start_pfn
> level_pfn
||
1055 last_pfn
< level_pfn
+ level_size(level
) - 1)) {
1057 domain_flush_cache(domain
, pte
, sizeof(*pte
));
1058 free_pgtable_page(level_pte
);
1061 pfn
+= level_size(level
);
1062 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
1066 * clear last level (leaf) ptes and free page table pages below the
1067 * level we wish to keep intact.
1069 static void dma_pte_free_pagetable(struct dmar_domain
*domain
,
1070 unsigned long start_pfn
,
1071 unsigned long last_pfn
,
1074 BUG_ON(!domain_pfn_supported(domain
, start_pfn
));
1075 BUG_ON(!domain_pfn_supported(domain
, last_pfn
));
1076 BUG_ON(start_pfn
> last_pfn
);
1078 dma_pte_clear_range(domain
, start_pfn
, last_pfn
);
1080 /* We don't need lock here; nobody else touches the iova range */
1081 dma_pte_free_level(domain
, agaw_to_level(domain
->agaw
), retain_level
,
1082 domain
->pgd
, 0, start_pfn
, last_pfn
);
1085 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
1086 free_pgtable_page(domain
->pgd
);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1097 static struct page
*dma_pte_list_pagetables(struct dmar_domain
*domain
,
1098 int level
, struct dma_pte
*pte
,
1099 struct page
*freelist
)
1103 pg
= pfn_to_page(dma_pte_addr(pte
) >> PAGE_SHIFT
);
1104 pg
->freelist
= freelist
;
1110 pte
= page_address(pg
);
1112 if (dma_pte_present(pte
) && !dma_pte_superpage(pte
))
1113 freelist
= dma_pte_list_pagetables(domain
, level
- 1,
1116 } while (!first_pte_in_page(pte
));
1121 static struct page
*dma_pte_clear_level(struct dmar_domain
*domain
, int level
,
1122 struct dma_pte
*pte
, unsigned long pfn
,
1123 unsigned long start_pfn
,
1124 unsigned long last_pfn
,
1125 struct page
*freelist
)
1127 struct dma_pte
*first_pte
= NULL
, *last_pte
= NULL
;
1129 pfn
= max(start_pfn
, pfn
);
1130 pte
= &pte
[pfn_level_offset(pfn
, level
)];
1133 unsigned long level_pfn
;
1135 if (!dma_pte_present(pte
))
1138 level_pfn
= pfn
& level_mask(level
);
1140 /* If range covers entire pagetable, free it */
1141 if (start_pfn
<= level_pfn
&&
1142 last_pfn
>= level_pfn
+ level_size(level
) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1145 if (level
> 1 && !dma_pte_superpage(pte
))
1146 freelist
= dma_pte_list_pagetables(domain
, level
- 1, pte
, freelist
);
1152 } else if (level
> 1) {
1153 /* Recurse down into a level that isn't *entirely* obsolete */
1154 freelist
= dma_pte_clear_level(domain
, level
- 1,
1155 phys_to_virt(dma_pte_addr(pte
)),
1156 level_pfn
, start_pfn
, last_pfn
,
1160 pfn
+= level_size(level
);
1161 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
1164 domain_flush_cache(domain
, first_pte
,
1165 (void *)++last_pte
- (void *)first_pte
);
1170 /* We can't just free the pages because the IOMMU may still be walking
1171 the page tables, and may have cached the intermediate levels. The
1172 pages can only be freed after the IOTLB flush has been done. */
1173 static struct page
*domain_unmap(struct dmar_domain
*domain
,
1174 unsigned long start_pfn
,
1175 unsigned long last_pfn
)
1177 struct page
*freelist
= NULL
;
1179 BUG_ON(!domain_pfn_supported(domain
, start_pfn
));
1180 BUG_ON(!domain_pfn_supported(domain
, last_pfn
));
1181 BUG_ON(start_pfn
> last_pfn
);
1183 /* we don't need lock here; nobody else touches the iova range */
1184 freelist
= dma_pte_clear_level(domain
, agaw_to_level(domain
->agaw
),
1185 domain
->pgd
, 0, start_pfn
, last_pfn
, NULL
);
1188 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
1189 struct page
*pgd_page
= virt_to_page(domain
->pgd
);
1190 pgd_page
->freelist
= freelist
;
1191 freelist
= pgd_page
;
1199 static void dma_free_pagelist(struct page
*freelist
)
1203 while ((pg
= freelist
)) {
1204 freelist
= pg
->freelist
;
1205 free_pgtable_page(page_address(pg
));
1209 static void iova_entry_free(unsigned long data
)
1211 struct page
*freelist
= (struct page
*)data
;
1213 dma_free_pagelist(freelist
);
1216 /* iommu handling */
1217 static int iommu_alloc_root_entry(struct intel_iommu
*iommu
)
1219 struct root_entry
*root
;
1220 unsigned long flags
;
1222 root
= (struct root_entry
*)alloc_pgtable_page(iommu
->node
);
1224 pr_err("Allocating root entry for %s failed\n",
1229 __iommu_flush_cache(iommu
, root
, ROOT_SIZE
);
1231 spin_lock_irqsave(&iommu
->lock
, flags
);
1232 iommu
->root_entry
= root
;
1233 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1238 static void iommu_set_root_entry(struct intel_iommu
*iommu
)
1244 addr
= virt_to_phys(iommu
->root_entry
);
1245 if (ecs_enabled(iommu
))
1246 addr
|= DMA_RTADDR_RTT
;
1248 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1249 dmar_writeq(iommu
->reg
+ DMAR_RTADDR_REG
, addr
);
1251 writel(iommu
->gcmd
| DMA_GCMD_SRTP
, iommu
->reg
+ DMAR_GCMD_REG
);
1253 /* Make sure hardware complete it */
1254 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1255 readl
, (sts
& DMA_GSTS_RTPS
), sts
);
1257 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1260 static void iommu_flush_write_buffer(struct intel_iommu
*iommu
)
1265 if (!rwbf_quirk
&& !cap_rwbf(iommu
->cap
))
1268 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1269 writel(iommu
->gcmd
| DMA_GCMD_WBF
, iommu
->reg
+ DMAR_GCMD_REG
);
1271 /* Make sure hardware complete it */
1272 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1273 readl
, (!(val
& DMA_GSTS_WBFS
)), val
);
1275 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
1279 static void __iommu_flush_context(struct intel_iommu
*iommu
,
1280 u16 did
, u16 source_id
, u8 function_mask
,
1287 case DMA_CCMD_GLOBAL_INVL
:
1288 val
= DMA_CCMD_GLOBAL_INVL
;
1290 case DMA_CCMD_DOMAIN_INVL
:
1291 val
= DMA_CCMD_DOMAIN_INVL
|DMA_CCMD_DID(did
);
1293 case DMA_CCMD_DEVICE_INVL
:
1294 val
= DMA_CCMD_DEVICE_INVL
|DMA_CCMD_DID(did
)
1295 | DMA_CCMD_SID(source_id
) | DMA_CCMD_FM(function_mask
);
1300 val
|= DMA_CCMD_ICC
;
1302 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1303 dmar_writeq(iommu
->reg
+ DMAR_CCMD_REG
, val
);
1305 /* Make sure hardware complete it */
1306 IOMMU_WAIT_OP(iommu
, DMAR_CCMD_REG
,
1307 dmar_readq
, (!(val
& DMA_CCMD_ICC
)), val
);
1309 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
1313 static void __iommu_flush_iotlb(struct intel_iommu
*iommu
, u16 did
,
1314 u64 addr
, unsigned int size_order
, u64 type
)
1316 int tlb_offset
= ecap_iotlb_offset(iommu
->ecap
);
1317 u64 val
= 0, val_iva
= 0;
1321 case DMA_TLB_GLOBAL_FLUSH
:
		/* global flush doesn't need to set IVA_REG */
1323 val
= DMA_TLB_GLOBAL_FLUSH
|DMA_TLB_IVT
;
1325 case DMA_TLB_DSI_FLUSH
:
1326 val
= DMA_TLB_DSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1328 case DMA_TLB_PSI_FLUSH
:
1329 val
= DMA_TLB_PSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1330 /* IH bit is passed in as part of address */
1331 val_iva
= size_order
| addr
;
1336 /* Note: set drain read/write */
1339 * This is probably to be super secure.. Looks like we can
1340 * ignore it without any impact.
1342 if (cap_read_drain(iommu
->cap
))
1343 val
|= DMA_TLB_READ_DRAIN
;
1345 if (cap_write_drain(iommu
->cap
))
1346 val
|= DMA_TLB_WRITE_DRAIN
;
1348 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1349 /* Note: Only uses first TLB reg currently */
1351 dmar_writeq(iommu
->reg
+ tlb_offset
, val_iva
);
1352 dmar_writeq(iommu
->reg
+ tlb_offset
+ 8, val
);
1354 /* Make sure hardware complete it */
1355 IOMMU_WAIT_OP(iommu
, tlb_offset
+ 8,
1356 dmar_readq
, (!(val
& DMA_TLB_IVT
)), val
);
1358 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1360 /* check IOTLB invalidation granularity */
1361 if (DMA_TLB_IAIG(val
) == 0)
1362 pr_err("Flush IOTLB failed\n");
1363 if (DMA_TLB_IAIG(val
) != DMA_TLB_IIRG(type
))
1364 pr_debug("TLB flush request %Lx, actual %Lx\n",
1365 (unsigned long long)DMA_TLB_IIRG(type
),
1366 (unsigned long long)DMA_TLB_IAIG(val
));
1369 static struct device_domain_info
*
1370 iommu_support_dev_iotlb (struct dmar_domain
*domain
, struct intel_iommu
*iommu
,
1373 struct device_domain_info
*info
;
1375 assert_spin_locked(&device_domain_lock
);
1380 list_for_each_entry(info
, &domain
->devices
, link
)
1381 if (info
->iommu
== iommu
&& info
->bus
== bus
&&
1382 info
->devfn
== devfn
) {
1383 if (info
->ats_supported
&& info
->dev
)
1391 static void domain_update_iotlb(struct dmar_domain
*domain
)
1393 struct device_domain_info
*info
;
1394 bool has_iotlb_device
= false;
1396 assert_spin_locked(&device_domain_lock
);
1398 list_for_each_entry(info
, &domain
->devices
, link
) {
1399 struct pci_dev
*pdev
;
1401 if (!info
->dev
|| !dev_is_pci(info
->dev
))
1404 pdev
= to_pci_dev(info
->dev
);
1405 if (pdev
->ats_enabled
) {
1406 has_iotlb_device
= true;
1411 domain
->has_iotlb_device
= has_iotlb_device
;
1414 static void iommu_enable_dev_iotlb(struct device_domain_info
*info
)
1416 struct pci_dev
*pdev
;
1418 assert_spin_locked(&device_domain_lock
);
1420 if (!info
|| !dev_is_pci(info
->dev
))
1423 pdev
= to_pci_dev(info
->dev
);
1424 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1425 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1426 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1427 * reserved, which should be set to 0.
1429 if (!ecap_dit(info
->iommu
->ecap
))
1432 struct pci_dev
*pf_pdev
;
1434 /* pdev will be returned if device is not a vf */
1435 pf_pdev
= pci_physfn(pdev
);
1436 info
->pfsid
= PCI_DEVID(pf_pdev
->bus
->number
, pf_pdev
->devfn
);
1439 #ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to use it. */
1445 if (info
->pasid_supported
&& !pci_enable_pasid(pdev
, info
->pasid_supported
& ~1))
1446 info
->pasid_enabled
= 1;
1448 if (info
->pri_supported
&& !pci_reset_pri(pdev
) && !pci_enable_pri(pdev
, 32))
1449 info
->pri_enabled
= 1;
1451 if (info
->ats_supported
&& !pci_enable_ats(pdev
, VTD_PAGE_SHIFT
)) {
1452 info
->ats_enabled
= 1;
1453 domain_update_iotlb(info
->domain
);
1454 info
->ats_qdep
= pci_ats_queue_depth(pdev
);
1458 static void iommu_disable_dev_iotlb(struct device_domain_info
*info
)
1460 struct pci_dev
*pdev
;
1462 assert_spin_locked(&device_domain_lock
);
1464 if (!dev_is_pci(info
->dev
))
1467 pdev
= to_pci_dev(info
->dev
);
1469 if (info
->ats_enabled
) {
1470 pci_disable_ats(pdev
);
1471 info
->ats_enabled
= 0;
1472 domain_update_iotlb(info
->domain
);
1474 #ifdef CONFIG_INTEL_IOMMU_SVM
1475 if (info
->pri_enabled
) {
1476 pci_disable_pri(pdev
);
1477 info
->pri_enabled
= 0;
1479 if (info
->pasid_enabled
) {
1480 pci_disable_pasid(pdev
);
1481 info
->pasid_enabled
= 0;
1486 static void iommu_flush_dev_iotlb(struct dmar_domain
*domain
,
1487 u64 addr
, unsigned mask
)
1490 unsigned long flags
;
1491 struct device_domain_info
*info
;
1493 if (!domain
->has_iotlb_device
)
1496 spin_lock_irqsave(&device_domain_lock
, flags
);
1497 list_for_each_entry(info
, &domain
->devices
, link
) {
1498 if (!info
->ats_enabled
)
1501 sid
= info
->bus
<< 8 | info
->devfn
;
1502 qdep
= info
->ats_qdep
;
1503 qi_flush_dev_iotlb(info
->iommu
, sid
, info
->pfsid
,
1506 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1509 static void iommu_flush_iotlb_psi(struct intel_iommu
*iommu
,
1510 struct dmar_domain
*domain
,
1511 unsigned long pfn
, unsigned int pages
,
1514 unsigned int mask
= ilog2(__roundup_pow_of_two(pages
));
1515 uint64_t addr
= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
1516 u16 did
= domain
->iommu_did
[iommu
->seq_id
];
	 * Fallback to domain selective flush if no PSI support or the size
	 * is too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size.
1528 if (!cap_pgsel_inv(iommu
->cap
) || mask
> cap_max_amask_val(iommu
->cap
))
1529 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1532 iommu
->flush
.flush_iotlb(iommu
, did
, addr
| ih
, mask
,
1536 * In caching mode, changes of pages from non-present to present require
1537 * flush. However, device IOTLB doesn't need to be flushed in this case.
1539 if (!cap_caching_mode(iommu
->cap
) || !map
)
1540 iommu_flush_dev_iotlb(domain
, addr
, mask
);
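/*
 * Worked example (added): the invalidation size is rounded up to a power
 * of two, e.g. flushing 5 pages gives mask = ilog2(__roundup_pow_of_two(5))
 * = 3, i.e. an 8-page (32KiB) PSI whose base address must be 32KiB aligned,
 * as the comment above requires.
 */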
1543 /* Notification for newly created mappings */
1544 static inline void __mapping_notify_one(struct intel_iommu
*iommu
,
1545 struct dmar_domain
*domain
,
1546 unsigned long pfn
, unsigned int pages
)
1548 /* It's a non-present to present mapping. Only flush if caching mode */
1549 if (cap_caching_mode(iommu
->cap
))
1550 iommu_flush_iotlb_psi(iommu
, domain
, pfn
, pages
, 0, 1);
1552 iommu_flush_write_buffer(iommu
);
1555 static void iommu_flush_iova(struct iova_domain
*iovad
)
1557 struct dmar_domain
*domain
;
1560 domain
= container_of(iovad
, struct dmar_domain
, iovad
);
1562 for_each_domain_iommu(idx
, domain
) {
1563 struct intel_iommu
*iommu
= g_iommus
[idx
];
1564 u16 did
= domain
->iommu_did
[iommu
->seq_id
];
1566 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0, DMA_TLB_DSI_FLUSH
);
1568 if (!cap_caching_mode(iommu
->cap
))
1569 iommu_flush_dev_iotlb(get_iommu_domain(iommu
, did
),
1570 0, MAX_AGAW_PFN_WIDTH
);
1574 static void iommu_disable_protect_mem_regions(struct intel_iommu
*iommu
)
1577 unsigned long flags
;
1579 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1580 pmen
= readl(iommu
->reg
+ DMAR_PMEN_REG
);
1581 pmen
&= ~DMA_PMEN_EPM
;
1582 writel(pmen
, iommu
->reg
+ DMAR_PMEN_REG
);
1584 /* wait for the protected region status bit to clear */
1585 IOMMU_WAIT_OP(iommu
, DMAR_PMEN_REG
,
1586 readl
, !(pmen
& DMA_PMEN_PRS
), pmen
);
1588 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1591 static void iommu_enable_translation(struct intel_iommu
*iommu
)
1594 unsigned long flags
;
1596 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1597 iommu
->gcmd
|= DMA_GCMD_TE
;
1598 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1600 /* Make sure hardware complete it */
1601 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1602 readl
, (sts
& DMA_GSTS_TES
), sts
);
1604 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1607 static void iommu_disable_translation(struct intel_iommu
*iommu
)
1612 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1613 iommu
->gcmd
&= ~DMA_GCMD_TE
;
1614 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1616 /* Make sure hardware complete it */
1617 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1618 readl
, (!(sts
& DMA_GSTS_TES
)), sts
);
1620 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1624 static int iommu_init_domains(struct intel_iommu
*iommu
)
1626 u32 ndomains
, nlongs
;
1629 ndomains
= cap_ndoms(iommu
->cap
);
1630 pr_debug("%s: Number of Domains supported <%d>\n",
1631 iommu
->name
, ndomains
);
1632 nlongs
= BITS_TO_LONGS(ndomains
);
1634 spin_lock_init(&iommu
->lock
);
1636 iommu
->domain_ids
= kcalloc(nlongs
, sizeof(unsigned long), GFP_KERNEL
);
1637 if (!iommu
->domain_ids
) {
1638 pr_err("%s: Allocating domain id array failed\n",
1643 size
= (ALIGN(ndomains
, 256) >> 8) * sizeof(struct dmar_domain
**);
1644 iommu
->domains
= kzalloc(size
, GFP_KERNEL
);
1646 if (iommu
->domains
) {
1647 size
= 256 * sizeof(struct dmar_domain
*);
1648 iommu
->domains
[0] = kzalloc(size
, GFP_KERNEL
);
1651 if (!iommu
->domains
|| !iommu
->domains
[0]) {
1652 pr_err("%s: Allocating domain array failed\n",
1654 kfree(iommu
->domain_ids
);
1655 kfree(iommu
->domains
);
1656 iommu
->domain_ids
= NULL
;
1657 iommu
->domains
= NULL
;
1664 * If Caching mode is set, then invalid translations are tagged
1665 * with domain-id 0, hence we need to pre-allocate it. We also
1666 * use domain-id 0 as a marker for non-allocated domain-id, so
1667 * make sure it is not used for a real domain.
1669 set_bit(0, iommu
->domain_ids
);
1674 static void disable_dmar_iommu(struct intel_iommu
*iommu
)
1676 struct device_domain_info
*info
, *tmp
;
1677 unsigned long flags
;
1679 if (!iommu
->domains
|| !iommu
->domain_ids
)
1683 spin_lock_irqsave(&device_domain_lock
, flags
);
1684 list_for_each_entry_safe(info
, tmp
, &device_domain_list
, global
) {
1685 struct dmar_domain
*domain
;
1687 if (info
->iommu
!= iommu
)
1690 if (!info
->dev
|| !info
->domain
)
1693 domain
= info
->domain
;
1695 __dmar_remove_one_dev_info(info
);
1697 if (!domain_type_is_vm_or_si(domain
)) {
1699 * The domain_exit() function can't be called under
1700 * device_domain_lock, as it takes this lock itself.
			 * So release the lock here and re-run the loop later.
1704 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1705 domain_exit(domain
);
1709 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1711 if (iommu
->gcmd
& DMA_GCMD_TE
)
1712 iommu_disable_translation(iommu
);
1715 static void free_dmar_iommu(struct intel_iommu
*iommu
)
1717 if ((iommu
->domains
) && (iommu
->domain_ids
)) {
1718 int elems
= ALIGN(cap_ndoms(iommu
->cap
), 256) >> 8;
1721 for (i
= 0; i
< elems
; i
++)
1722 kfree(iommu
->domains
[i
]);
1723 kfree(iommu
->domains
);
1724 kfree(iommu
->domain_ids
);
1725 iommu
->domains
= NULL
;
1726 iommu
->domain_ids
= NULL
;
1729 g_iommus
[iommu
->seq_id
] = NULL
;
1731 /* free context mapping */
1732 free_context_table(iommu
);
1734 #ifdef CONFIG_INTEL_IOMMU_SVM
1735 if (pasid_enabled(iommu
)) {
1736 if (ecap_prs(iommu
->ecap
))
1737 intel_svm_finish_prq(iommu
);
1738 intel_svm_free_pasid_tables(iommu
);
1743 static struct dmar_domain
*alloc_domain(int flags
)
1745 struct dmar_domain
*domain
;
1747 domain
= alloc_domain_mem();
1751 memset(domain
, 0, sizeof(*domain
));
1753 domain
->flags
= flags
;
1754 domain
->has_iotlb_device
= false;
1755 INIT_LIST_HEAD(&domain
->devices
);
1760 /* Must be called with iommu->lock */
1761 static int domain_attach_iommu(struct dmar_domain
*domain
,
1762 struct intel_iommu
*iommu
)
1764 unsigned long ndomains
;
1767 assert_spin_locked(&device_domain_lock
);
1768 assert_spin_locked(&iommu
->lock
);
1770 domain
->iommu_refcnt
[iommu
->seq_id
] += 1;
1771 domain
->iommu_count
+= 1;
1772 if (domain
->iommu_refcnt
[iommu
->seq_id
] == 1) {
1773 ndomains
= cap_ndoms(iommu
->cap
);
1774 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1776 if (num
>= ndomains
) {
1777 pr_err("%s: No free domain ids\n", iommu
->name
);
1778 domain
->iommu_refcnt
[iommu
->seq_id
] -= 1;
1779 domain
->iommu_count
-= 1;
1783 set_bit(num
, iommu
->domain_ids
);
1784 set_iommu_domain(iommu
, num
, domain
);
1786 domain
->iommu_did
[iommu
->seq_id
] = num
;
1787 domain
->nid
= iommu
->node
;
1789 domain_update_iommu_cap(domain
);
1795 static int domain_detach_iommu(struct dmar_domain
*domain
,
1796 struct intel_iommu
*iommu
)
1798 int num
, count
= INT_MAX
;
1800 assert_spin_locked(&device_domain_lock
);
1801 assert_spin_locked(&iommu
->lock
);
1803 domain
->iommu_refcnt
[iommu
->seq_id
] -= 1;
1804 count
= --domain
->iommu_count
;
1805 if (domain
->iommu_refcnt
[iommu
->seq_id
] == 0) {
1806 num
= domain
->iommu_did
[iommu
->seq_id
];
1807 clear_bit(num
, iommu
->domain_ids
);
1808 set_iommu_domain(iommu
, num
, NULL
);
1810 domain_update_iommu_cap(domain
);
1811 domain
->iommu_did
[iommu
->seq_id
] = 0;
1817 static struct iova_domain reserved_iova_list
;
1818 static struct lock_class_key reserved_rbtree_key
;
1820 static int dmar_init_reserved_ranges(void)
1822 struct pci_dev
*pdev
= NULL
;
1826 init_iova_domain(&reserved_iova_list
, VTD_PAGE_SIZE
, IOVA_START_PFN
);
1828 lockdep_set_class(&reserved_iova_list
.iova_rbtree_lock
,
1829 &reserved_rbtree_key
);
1831 /* IOAPIC ranges shouldn't be accessed by DMA */
1832 iova
= reserve_iova(&reserved_iova_list
, IOVA_PFN(IOAPIC_RANGE_START
),
1833 IOVA_PFN(IOAPIC_RANGE_END
));
1835 pr_err("Reserve IOAPIC range failed\n");
1839 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1840 for_each_pci_dev(pdev
) {
1843 for (i
= 0; i
< PCI_NUM_RESOURCES
; i
++) {
1844 r
= &pdev
->resource
[i
];
1845 if (!r
->flags
|| !(r
->flags
& IORESOURCE_MEM
))
1847 iova
= reserve_iova(&reserved_iova_list
,
1851 pr_err("Reserve iova failed\n");
1859 static void domain_reserve_special_ranges(struct dmar_domain
*domain
)
1861 copy_reserved_iova(&reserved_iova_list
, &domain
->iovad
);
1864 static inline int guestwidth_to_adjustwidth(int gaw
)
1867 int r
= (gaw
- 12) % 9;
1878 static int domain_init(struct dmar_domain
*domain
, struct intel_iommu
*iommu
,
1881 int adjust_width
, agaw
;
1882 unsigned long sagaw
;
1885 init_iova_domain(&domain
->iovad
, VTD_PAGE_SIZE
, IOVA_START_PFN
);
1887 err
= init_iova_flush_queue(&domain
->iovad
,
1888 iommu_flush_iova
, iova_entry_free
);
1892 domain_reserve_special_ranges(domain
);
1894 /* calculate AGAW */
1895 if (guest_width
> cap_mgaw(iommu
->cap
))
1896 guest_width
= cap_mgaw(iommu
->cap
);
1897 domain
->gaw
= guest_width
;
1898 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
1899 agaw
= width_to_agaw(adjust_width
);
1900 sagaw
= cap_sagaw(iommu
->cap
);
1901 if (!test_bit(agaw
, &sagaw
)) {
1902 /* hardware doesn't support it, choose a bigger one */
1903 pr_debug("Hardware doesn't support agaw %d\n", agaw
);
1904 agaw
= find_next_bit(&sagaw
, 5, agaw
);
1908 domain
->agaw
= agaw
;
1910 if (ecap_coherent(iommu
->ecap
))
1911 domain
->iommu_coherency
= 1;
1913 domain
->iommu_coherency
= 0;
1915 if (ecap_sc_support(iommu
->ecap
))
1916 domain
->iommu_snooping
= 1;
1918 domain
->iommu_snooping
= 0;
1920 if (intel_iommu_superpage
)
1921 domain
->iommu_superpage
= fls(cap_super_page_val(iommu
->cap
));
1923 domain
->iommu_superpage
= 0;
1925 domain
->nid
= iommu
->node
;
1927 /* always allocate the top pgd */
1928 domain
->pgd
= (struct dma_pte
*)alloc_pgtable_page(domain
->nid
);
1931 __iommu_flush_cache(iommu
, domain
->pgd
, PAGE_SIZE
);
1935 static void domain_exit(struct dmar_domain
*domain
)
1937 struct page
*freelist
= NULL
;
	/* Domain 0 is reserved, so don't process it */
1943 /* Remove associated devices and clear attached or cached domains */
1945 domain_remove_dev_info(domain
);
1949 put_iova_domain(&domain
->iovad
);
1951 freelist
= domain_unmap(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1953 dma_free_pagelist(freelist
);
1955 free_domain_mem(domain
);
1958 static int domain_context_mapping_one(struct dmar_domain
*domain
,
1959 struct intel_iommu
*iommu
,
1962 u16 did
= domain
->iommu_did
[iommu
->seq_id
];
1963 int translation
= CONTEXT_TT_MULTI_LEVEL
;
1964 struct device_domain_info
*info
= NULL
;
1965 struct context_entry
*context
;
1966 unsigned long flags
;
1967 struct dma_pte
*pgd
;
1972 if (hw_pass_through
&& domain_type_is_si(domain
))
1973 translation
= CONTEXT_TT_PASS_THROUGH
;
1975 pr_debug("Set context mapping for %02x:%02x.%d\n",
1976 bus
, PCI_SLOT(devfn
), PCI_FUNC(devfn
));
1978 BUG_ON(!domain
->pgd
);
1980 spin_lock_irqsave(&device_domain_lock
, flags
);
1981 spin_lock(&iommu
->lock
);
1984 context
= iommu_context_addr(iommu
, bus
, devfn
, 1);
1989 if (context_present(context
))
1993 * For kdump cases, old valid entries may be cached due to the
1994 * in-flight DMA and copied pgtable, but there is no unmapping
1995 * behaviour for them, thus we need an explicit cache flush for
1996 * the newly-mapped device. For kdump, at this point, the device
1997 * is supposed to finish reset at its driver probe stage, so no
1998 * in-flight DMA will exist, and we don't need to worry anymore
2001 if (context_copied(context
)) {
2002 u16 did_old
= context_domain_id(context
);
2004 if (did_old
< cap_ndoms(iommu
->cap
)) {
2005 iommu
->flush
.flush_context(iommu
, did_old
,
2006 (((u16
)bus
) << 8) | devfn
,
2007 DMA_CCMD_MASK_NOBIT
,
2008 DMA_CCMD_DEVICE_INVL
);
2009 iommu
->flush
.flush_iotlb(iommu
, did_old
, 0, 0,
2016 context_clear_entry(context
);
2017 context_set_domain_id(context
, did
);
	 * Skip top levels of page tables for an iommu which has less agaw
	 * than the default. Unnecessary for PT mode.
2023 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
2024 for (agaw
= domain
->agaw
; agaw
!= iommu
->agaw
; agaw
--) {
2026 pgd
= phys_to_virt(dma_pte_addr(pgd
));
2027 if (!dma_pte_present(pgd
))
2031 info
= iommu_support_dev_iotlb(domain
, iommu
, bus
, devfn
);
2032 if (info
&& info
->ats_supported
)
2033 translation
= CONTEXT_TT_DEV_IOTLB
;
2035 translation
= CONTEXT_TT_MULTI_LEVEL
;
2037 context_set_address_root(context
, virt_to_phys(pgd
));
2038 context_set_address_width(context
, iommu
->agaw
);
2041 * In pass through mode, AW must be programmed to
2042 * indicate the largest AGAW value supported by
2043 * hardware. And ASR is ignored by hardware.
2045 context_set_address_width(context
, iommu
->msagaw
);
2048 context_set_translation_type(context
, translation
);
2049 context_set_fault_enable(context
);
2050 context_set_present(context
);
2051 domain_flush_cache(domain
, context
, sizeof(*context
));
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
2059 if (cap_caching_mode(iommu
->cap
)) {
2060 iommu
->flush
.flush_context(iommu
, 0,
2061 (((u16
)bus
) << 8) | devfn
,
2062 DMA_CCMD_MASK_NOBIT
,
2063 DMA_CCMD_DEVICE_INVL
);
2064 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0, DMA_TLB_DSI_FLUSH
);
2066 iommu_flush_write_buffer(iommu
);
2068 iommu_enable_dev_iotlb(info
);
2073 spin_unlock(&iommu
->lock
);
2074 spin_unlock_irqrestore(&device_domain_lock
, flags
);
2079 struct domain_context_mapping_data
{
2080 struct dmar_domain
*domain
;
2081 struct intel_iommu
*iommu
;
2084 static int domain_context_mapping_cb(struct pci_dev
*pdev
,
2085 u16 alias
, void *opaque
)
2087 struct domain_context_mapping_data
*data
= opaque
;
2089 return domain_context_mapping_one(data
->domain
, data
->iommu
,
2090 PCI_BUS_NUM(alias
), alias
& 0xff);
2094 domain_context_mapping(struct dmar_domain
*domain
, struct device
*dev
)
2096 struct intel_iommu
*iommu
;
2098 struct domain_context_mapping_data data
;
2100 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
2104 if (!dev_is_pci(dev
))
2105 return domain_context_mapping_one(domain
, iommu
, bus
, devfn
);
2107 data
.domain
= domain
;
2110 return pci_for_each_dma_alias(to_pci_dev(dev
),
2111 &domain_context_mapping_cb
, &data
);
2114 static int domain_context_mapped_cb(struct pci_dev
*pdev
,
2115 u16 alias
, void *opaque
)
2117 struct intel_iommu
*iommu
= opaque
;
2119 return !device_context_mapped(iommu
, PCI_BUS_NUM(alias
), alias
& 0xff);
2122 static int domain_context_mapped(struct device
*dev
)
2124 struct intel_iommu
*iommu
;
2127 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
2131 if (!dev_is_pci(dev
))
2132 return device_context_mapped(iommu
, bus
, devfn
);
2134 return !pci_for_each_dma_alias(to_pci_dev(dev
),
2135 domain_context_mapped_cb
, iommu
);
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
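/*
 * Worked example (added): with 4KiB pages, mapping 6000 bytes at a host
 * address whose sub-page offset is 0xff0 gives PAGE_ALIGN(0xff0 + 6000)
 * = 12288, so the helper returns 3 VT-d pages, covering the partial first
 * and last pages of the buffer.
 */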
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
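/*
 * Worked example (added): if iov_pfn and phy_pfn are both 2MiB aligned
 * (low 9 bits of the merged PFN clear) and at least 512 pages remain to
 * be mapped, this returns level 2, so __domain_mapping() can use one
 * 2MiB superpage PTE instead of 512 4KiB PTEs.
 */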
2174 static int __domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2175 struct scatterlist
*sg
, unsigned long phys_pfn
,
2176 unsigned long nr_pages
, int prot
)
2178 struct dma_pte
*first_pte
= NULL
, *pte
= NULL
;
2179 phys_addr_t
uninitialized_var(pteval
);
2180 unsigned long sg_res
= 0;
2181 unsigned int largepage_lvl
= 0;
2182 unsigned long lvl_pages
= 0;
2184 BUG_ON(!domain_pfn_supported(domain
, iov_pfn
+ nr_pages
- 1));
2186 if ((prot
& (DMA_PTE_READ
|DMA_PTE_WRITE
)) == 0)
2189 prot
&= DMA_PTE_READ
| DMA_PTE_WRITE
| DMA_PTE_SNP
;
2193 pteval
= ((phys_addr_t
)phys_pfn
<< VTD_PAGE_SHIFT
) | prot
;
2196 while (nr_pages
> 0) {
2200 unsigned int pgoff
= sg
->offset
& ~PAGE_MASK
;
2202 sg_res
= aligned_nrpages(sg
->offset
, sg
->length
);
2203 sg
->dma_address
= ((dma_addr_t
)iov_pfn
<< VTD_PAGE_SHIFT
) + pgoff
;
2204 sg
->dma_length
= sg
->length
;
2205 pteval
= (sg_phys(sg
) - pgoff
) | prot
;
2206 phys_pfn
= pteval
>> VTD_PAGE_SHIFT
;
2210 largepage_lvl
= hardware_largepage_caps(domain
, iov_pfn
, phys_pfn
, sg_res
);
2212 first_pte
= pte
= pfn_to_dma_pte(domain
, iov_pfn
, &largepage_lvl
);
		/* It is a large page */
2216 if (largepage_lvl
> 1) {
2217 unsigned long nr_superpages
, end_pfn
;
2219 pteval
|= DMA_PTE_LARGE_PAGE
;
2220 lvl_pages
= lvl_to_nr_pages(largepage_lvl
);
2222 nr_superpages
= sg_res
/ lvl_pages
;
2223 end_pfn
= iov_pfn
+ nr_superpages
* lvl_pages
- 1;
2226 * Ensure that old small page tables are
2227 * removed to make room for superpage(s).
2228 * We're adding new large pages, so make sure
2229 * we don't remove their parent tables.
2231 dma_pte_free_pagetable(domain
, iov_pfn
, end_pfn
,
2234 pteval
&= ~(uint64_t)DMA_PTE_LARGE_PAGE
;
2238 /* We don't need lock here, nobody else
2239 * touches the iova range
2241 tmp
= cmpxchg64_local(&pte
->val
, 0ULL, pteval
);
2243 static int dumps
= 5;
2244 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2245 iov_pfn
, tmp
, (unsigned long long)pteval
);
2248 debug_dma_dump_mappings(NULL
);
2253 lvl_pages
= lvl_to_nr_pages(largepage_lvl
);
2255 BUG_ON(nr_pages
< lvl_pages
);
2256 BUG_ON(sg_res
< lvl_pages
);
2258 nr_pages
-= lvl_pages
;
2259 iov_pfn
+= lvl_pages
;
2260 phys_pfn
+= lvl_pages
;
2261 pteval
+= lvl_pages
* VTD_PAGE_SIZE
;
2262 sg_res
-= lvl_pages
;
2264 /* If the next PTE would be the first in a new page, then we
2265 need to flush the cache on the entries we've just written.
2266 And then we'll need to recalculate 'pte', so clear it and
2267 let it get set again in the if (!pte) block above.
2269 If we're done (!nr_pages) we need to flush the cache too.
2271 Also if we've been setting superpages, we may need to
2272 recalculate 'pte' and switch back to smaller pages for the
2273 end of the mapping, if the trailing size is not enough to
2274 use another superpage (i.e. sg_res < lvl_pages). */
2276 if (!nr_pages
|| first_pte_in_page(pte
) ||
2277 (largepage_lvl
> 1 && sg_res
< lvl_pages
)) {
2278 domain_flush_cache(domain
, first_pte
,
2279 (void *)pte
- (void *)first_pte
);
2283 if (!sg_res
&& nr_pages
)
2289 static int domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2290 struct scatterlist
*sg
, unsigned long phys_pfn
,
2291 unsigned long nr_pages
, int prot
)
2294 struct intel_iommu
*iommu
;
2296 /* Do the real mapping first */
2297 ret
= __domain_mapping(domain
, iov_pfn
, sg
, phys_pfn
, nr_pages
, prot
);
2301 /* Notify about the new mapping */
2302 if (domain_type_is_vm(domain
)) {
		/* VM typed domains can have more than one IOMMU */
2305 for_each_domain_iommu(iommu_id
, domain
) {
2306 iommu
= g_iommus
[iommu_id
];
2307 __mapping_notify_one(iommu
, domain
, iov_pfn
, nr_pages
);
2310 /* General domains only have one IOMMU */
2311 iommu
= domain_get_iommu(domain
);
2312 __mapping_notify_one(iommu
, domain
, iov_pfn
, nr_pages
);
2318 static inline int domain_sg_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2319 struct scatterlist
*sg
, unsigned long nr_pages
,
2322 return domain_mapping(domain
, iov_pfn
, sg
, 0, nr_pages
, prot
);
2325 static inline int domain_pfn_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2326 unsigned long phys_pfn
, unsigned long nr_pages
,
2329 return domain_mapping(domain
, iov_pfn
, NULL
, phys_pfn
, nr_pages
, prot
);
static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	unsigned long flags;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return;
	}
	did_old = context_domain_id(context);
	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock_irqrestore(&iommu->lock, flags);
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);
	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);
}
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

/*
 * Note: the device_domain_info is stored in struct device->archdata.iommu.
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (likely(info))
		return info->domain;
	return NULL;
}

static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    u8 bus, u8 devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;

	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (!pci_ats_disabled() &&
		    ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (ecs_enabled(iommu)) {
			if (pasid_enabled(iommu)) {
				int features = pci_pasid_features(pdev);

				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	if (!found) {
		struct device_domain_info *info2;

		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2) {
			found = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (dev && domain_context_mapping(domain, dev)) {
		pr_err("Domain context map for %s failed\n", dev_name(dev));
		dmar_remove_one_dev_info(domain, dev);
		return NULL;
	}

	return domain;
}

static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
{
	*(u16 *)opaque = alias;
	return 0;
}
static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
{
	struct device_domain_info *info = NULL;
	struct dmar_domain *domain = NULL;
	struct intel_iommu *iommu;
	u16 dma_alias;
	unsigned long flags;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
						      PCI_BUS_NUM(dma_alias),
						      dma_alias & 0xff);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);

		/* DMA alias already has a domain, use it */
		if (info)
			goto out;
	}

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(0);
	if (!domain)
		return NULL;
	if (domain_init(domain, iommu, gaw)) {
		domain_exit(domain);
		return NULL;
	}

out:
	return domain;
}

static struct dmar_domain *set_domain_for_dev(struct device *dev,
					      struct dmar_domain *domain)
{
	struct intel_iommu *iommu;
	struct dmar_domain *tmp;
	u16 req_id, dma_alias;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	req_id = ((u16)bus << 8) | devfn;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		/* register PCI DMA alias device */
		if (req_id != dma_alias) {
			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
						       dma_alias & 0xff, NULL, domain);
			if (!tmp || tmp != domain)
				return tmp;
		}
	}

	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (!tmp || tmp != domain)
		return tmp;

	return domain;
}

static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
{
	struct dmar_domain *domain, *tmp;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, gaw);
	if (!domain)
		goto out;

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:
	return domain;
}
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		pr_err("Reserving iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn, NULL,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE);
}

static int domain_prepare_identity_map(struct device *dev,
				       struct dmar_domain *domain,
				       unsigned long long start,
				       unsigned long long end)
{
	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, so which didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
			dev_name(dev), start, end);
		return 0;
	}

	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		dev_name(dev), start, end);

	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		return -EIO;
	}

	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		return -EIO;
	}

	return iommu_domain_identity_map(domain, start, end);
}
static int iommu_prepare_identity_map(struct device *dev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	ret = domain_prepare_identity_map(dev, domain, start, end);
	if (ret)
		domain_exit(domain);

	return ret;
}

static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
					 struct device *dev)
{
	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(dev, rmrr->base_address,
					  rmrr->end_address);
}

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
	if (ret)
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
static int md_domain_init(struct dmar_domain *domain, int guest_width);

static int __init si_domain_init(int hw)
{
	int nid, ret;

	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	pr_debug("Identity mapping domain allocated\n");

	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int identity_mapping(struct device *dev)
{
	struct device_domain_info *info;

	if (likely(!iommu_identity_mapping))
		return 0;

	info = dev->archdata.iommu;
	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
		return (info->domain == si_domain);

	return 0;
}
static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
{
	struct dmar_domain *ndomain;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (ndomain != domain)
		return -EBUSY;

	return 0;
}

static bool device_has_rmrr(struct device *dev)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *tmp;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		/*
		 * Return TRUE if this RMRR contains the device that
		 * is passed in.
		 */
		for_each_active_dev_scope(rmrr->devices,
					  rmrr->devices_cnt, i, tmp)
			if (tmp == dev) {
				rcu_read_unlock();
				return true;
			}
	}
	rcu_read_unlock();
	return false;
}
/*
 * There are a couple cases where we need to restrict the functionality of
 * devices associated with RMRRs.  The first is when evaluating a device for
 * identity mapping because problems exist when devices are moved in and out
 * of domains and their respective RMRR information is lost.  This means that
 * a device with associated RMRRs will never be in a "passthrough" domain.
 * The second is use of the device through the IOMMU API.  This interface
 * expects to have full control of the IOVA space for the device.  We cannot
 * satisfy both the requirement that RMRR access is maintained and have an
 * unencumbered IOVA space.  We also have no ability to quiesce the device's
 * use of the RMRR space or even inform the IOMMU API user of the restriction.
 * We therefore prevent devices associated with an RMRR from participating in
 * the IOMMU API, which eliminates them from device assignment.
 *
 * In both cases we assume that PCI USB devices with RMRRs have them largely
 * for historical reasons and that the RMRR space is not actively used post
 * boot.  This exclusion may change if vendors begin to abuse it.
 *
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 */
static bool device_is_rmrr_locked(struct device *dev)
{
	if (!device_has_rmrr(dev))
		return false;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
			return false;
	}

	return true;
}
static int iommu_should_identity_map(struct device *dev, int startup)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (device_is_rmrr_locked(dev))
			return 0;

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return 1;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return 1;

		if (!(iommu_identity_mapping & IDENTMAP_ALL))
			return 0;

		/*
		 * We want to start off with all devices in the 1:1 domain, and
		 * take them out later if we find they can't access all of memory.
		 *
		 * However, we can't do this for PCI devices behind bridges,
		 * because all PCI devices behind the same bridge will end up
		 * with the same source-id on their transactions.
		 *
		 * Practically speaking, we can't change things around for these
		 * devices at run-time, because we can't be sure there'll be no
		 * DMA transactions in flight for any of their siblings.
		 *
		 * So PCI devices (unless they're on the root bus) as well as
		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
		 * the 1:1 domain, just in _case_ one of their siblings turns out
		 * not to be able to map all of memory.
		 */
		if (!pci_is_pcie(pdev)) {
			if (!pci_is_root_bus(pdev->bus))
				return 0;
			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				return 0;
		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
	} else {
		if (device_has_rmrr(dev))
			return 0;
	}

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = *dev->dma_mask;

		if (dev->coherent_dma_mask &&
		    dev->coherent_dma_mask < dma_mask)
			dma_mask = dev->coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(dev);
	}

	return 1;
}
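/*
 * Illustrative example (added): a device that advertises only a 32-bit
 * dma_mask on a machine with more than 4GiB of RAM fails the
 * dma_get_required_mask() test above and is therefore kept out of the
 * static 1:1 (identity) domain.
 */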
static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
{
	int ret;

	if (!iommu_should_identity_map(dev, 1))
		return 0;

	ret = domain_add_dev_info(si_domain, dev);
	if (!ret)
		pr_info("%s identity mapping for device %s\n",
			hw ? "Hardware" : "Software", dev_name(dev));
	else if (ret == -ENODEV)
		/* device not associated with an iommu */
		ret = 0;

	return ret;
}

static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	struct device *dev;
	int i;
	int ret = 0;

	for_each_pci_dev(pdev) {
		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
		if (ret)
			return ret;
	}

	for_each_active_iommu(iommu, drhd)
		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn, &adev->physical_node_list, node) {
				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);
			if (ret)
				return ret;
		}

	return 0;
}
static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued Invalidate not enabled, use Register Based Invalidate
		 */
		iommu->flush.flush_context = __iommu_flush_context;
		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
		pr_info("%s: Using Register based invalidation\n",
			iommu->name);
	} else {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
	}
}
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					  MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
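/*
 * Clarifying note (added): with extended context entries ('ext'), every bus
 * owns two context tables and every devfn two entries, hence the bus * 2
 * and devfn * 2 indexing above; e.g. bus 3 ends up in tbl[6] and tbl[7].
 */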
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	unsigned long flags;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
	new_ext    = !!ecap_ecs(iommu->ecap);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
			       iommu->name, bus);
			continue;
		}
	}

	spin_lock_irqsave(&iommu->lock, flags);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock_irqrestore(&iommu->lock, flags);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	bool copied_tables = false;
	struct device *dev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path all other access are read
		 * only
		 */
		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
	}

	/* Preallocate enough resources for IOMMU hot-addition */
	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
		g_num_of_iommus = DMAR_UNITS_SUPPORTED;

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			   GFP_KERNEL);
	if (!g_iommus) {
		pr_err("Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	for_each_active_iommu(iommu, drhd) {
		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_enabled(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		g_iommus[iommu->seq_id] = iommu;

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
				copied_tables = true;
			}
		}

		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_enabled(iommu))
			intel_svm_alloc_pasid_tables(iommu);
#endif
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	}

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	check_tylersburg_isoch();

	if (iommu_identity_mapping) {
		ret = si_domain_init(hw_pass_through);
		if (ret)
			goto free_iommu;
	}

	/*
	 * If we copied translations from a previous kernel in the kdump
	 * case, we can not assign the devices to domains now, as that
	 * would eliminate the old mappings. So skip this part and defer
	 * the assignment to device driver initialization time.
	 */
	if (copied_tables)
		goto domains_done;

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			pr_crit("Failed to setup IOMMU pass-through\n");
			goto free_iommu;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	pr_info("Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		/* some BIOS lists non-exist devices in DMAR table. */
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			ret = iommu_prepare_rmrr_dev(rmrr, dev);
			if (ret)
				pr_err("Mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();

domains_done:

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
			ret = intel_svm_enable_prq(iommu);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;

		if (!translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	kfree(g_iommus);

error:
	return ret;
}
/* This takes a number of _MM_ pages, not VTD pages */
static unsigned long intel_alloc_iova(struct device *dev,
				      struct dmar_domain *domain,
				      unsigned long nrpages, uint64_t dma_mask)
{
	unsigned long iova_pfn = 0;

	/* Restrict dma_mask to the width that the iommu can handle */
	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
	/* Ensure we reserve the whole size-aligned region */
	nrpages = __roundup_pow_of_two(nrpages);

	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_BIT_MASK(32) and if that fails then try allocating
		 * from higher range
		 */
		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
					   IOVA_PFN(DMA_BIT_MASK(32)), false);
		if (iova_pfn)
			return iova_pfn;
	}
	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
				   IOVA_PFN(dma_mask), true);
	if (unlikely(!iova_pfn)) {
		pr_err("Allocating %ld-page iova for %s failed",
		       nrpages, dev_name(dev));
		return 0;
	}

	return iova_pfn;
}
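/*
 * Illustrative note (added): for a 64-bit capable device the first
 * alloc_iova_fast() call above is constrained to IOVA_PFN(DMA_BIT_MASK(32)),
 * i.e. below 4GiB; only if that attempt fails does the second call allocate
 * from the full, IOMMU-limited dma_mask range.
 */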
struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
{
	struct dmar_domain *domain, *tmp;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i, ret;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		goto out;

	/* We have a new domain - setup possible RMRRs for the device */
	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != dev)
				continue;

			ret = domain_prepare_identity_map(dev, domain,
							  rmrr->base_address,
							  rmrr->end_address);
			if (ret)
				dev_err(dev, "Mapping reserved region failed\n");
		}
	}
	rcu_read_unlock();

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:
	if (!domain)
		pr_err("Allocating domain for %s failed\n", dev_name(dev));

	return domain;
}

/* Check if the dev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	int found;

	if (iommu_dummy(dev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(dev);
	if (found) {
		if (iommu_should_identity_map(dev, 0))
			return 1;

		/*
		 * 32 bit DMA is removed from si_domain and fall back
		 * to non-identity mapping.
		 */
		dmar_remove_one_dev_info(si_domain, dev);
		pr_info("32bit %s uses non-identity mapping\n",
			dev_name(dev));
		return 0;
	}

	/*
	 * In case of a detached 64 bit DMA device from vm, the device
	 * is put into si_domain for identity mapping.
	 */
	if (iommu_should_identity_map(dev, 0)) {
		int ret;

		ret = domain_add_dev_info(si_domain, dev);
		if (!ret) {
			pr_info("64bit %s uses identity mapping\n",
				dev_name(dev));
			return 1;
		}
	}

	return 0;
}
static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	unsigned long iova_pfn;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(dev))
		return paddr;

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_nrpages(paddr, size);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova_pfn)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr through (paddr + size) may cover only part of a page, but we
	 * must map the whole page.  Note: if two parts of one page are mapped
	 * separately, we may end up with two guest addresses mapping to the
	 * same host paddr, but this is not a big problem.
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova_pfn)
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
	       dev_name(dev), size, (unsigned long long)paddr, dir);
	return 0;
}

static dma_addr_t intel_map_page(struct device *dev, struct page *page,
				 unsigned long offset, size_t size,
				 enum dma_data_direction dir,
				 unsigned long attrs)
{
	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				  dir, *dev->dma_mask);
}
static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
{
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	unsigned long nrpages;
	unsigned long iova_pfn;
	struct intel_iommu *iommu;
	struct page *freelist;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(dev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova_pfn = IOVA_PFN(dev_addr);

	nrpages = aligned_nrpages(dev_addr, size);
	start_pfn = mm_to_dma_pfn(iova_pfn);
	last_pfn = start_pfn + nrpages - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 dev_name(dev), start_pfn, last_pfn);

	freelist = domain_unmap(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
				      nrpages, !freelist, 0);
		/* free iova */
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
		dma_free_pagelist(freelist);
	} else {
		queue_iova(&domain->iovad, iova_pfn, nrpages,
			   (unsigned long)freelist);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}

static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     unsigned long attrs)
{
	intel_unmap(dev, dev_addr, size);
}
static void *intel_alloc_coherent(struct device *dev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  unsigned long attrs)
{
	void *vaddr;

	vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
	if (iommu_no_mapping(dev) || !vaddr)
		return vaddr;

	*dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
			PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
			dev->coherent_dma_mask);
	if (!*dma_handle)
		goto out_free_pages;
	return vaddr;

out_free_pages:
	dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
	return NULL;
}

static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
				dma_addr_t dma_handle, unsigned long attrs)
{
	if (!iommu_no_mapping(dev))
		intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
	dma_direct_free(dev, size, vaddr, dma_handle, attrs);
}

static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   unsigned long attrs)
{
	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
	unsigned long nrpages = 0;
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i) {
		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
	}

	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
}

static int intel_nontranslate_map_sg(struct device *hddev,
				     struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = sg_phys(sg);
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, unsigned long attrs)
{
	int i;
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	unsigned long iova_pfn;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(dev))
		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
				    *dev->dma_mask);
	if (!iova_pfn) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova_pfn);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1,
				       agaw_to_level(domain->agaw) + 1);
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
		return 0;
	}

	return nelems;
}

static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return !dma_addr;
}

const struct dma_map_ops intel_dma_ops = {
	.alloc = intel_alloc_coherent,
	.free = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
	.dma_supported = dma_direct_supported,
};
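/*
 * Clarifying note (added): this dma_map_ops table is installed as the global
 * 'dma_ops' from intel_iommu_init() below once initialization succeeds, so
 * the intel_* callbacks here back the streaming and coherent DMA API.
 */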
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					       sizeof(struct dmar_domain),
					       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		pr_err("Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
						sizeof(struct device_domain_info),
						0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		pr_err("Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iova_cache_get();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	iova_cache_put();

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	iova_cache_put();
}
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);

static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
#ifdef CONFIG_SUSPEND
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}

static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}

static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
					     GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}

static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
		       iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
		       iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
		       iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
		       iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}

static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};

static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_PM */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	int prot = DMA_PTE_READ|DMA_PTE_WRITE;
	struct dmar_rmrr_unit *rmrru;
	size_t length;

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;
	rmrr = (struct acpi_dmar_reserved_memory *)header;
	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	length = rmrr->end_address - rmrr->base_address + 1;
	rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
					      IOMMU_RESV_DIRECT);
	if (!rmrru->resv)
		goto free_rmrru;

	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_all;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_all:
	kfree(rmrru->resv);
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}

static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
{
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *tmp;

	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
		if (atsr->segment != tmp->segment)
			continue;
		if (atsr->header.length != tmp->header.length)
			continue;
		if (memcmp(atsr, tmp, atsr->header.length) == 0)
			return atsru;
	}

	return NULL;
}

int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}

static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}

int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}

int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	if (g_iommus[iommu->seq_id])
		return 0;

	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	sp = domain_update_iommu_superpage(iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_enabled(iommu))
		intel_svm_alloc_pasid_tables(iommu);
#endif

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}

int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
	int ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	if (!intel_iommu_enabled)
		return 0;
	if (iommu == NULL)
		return -EINVAL;

	if (insert) {
		ret = intel_iommu_add(dmaru);
	} else {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}

static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
}
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	dev = pci_physfn(dev);
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}

int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}

	return 0;
}
/*
 * Here we only respond to action of unbound device from driver.
 *
 * Added device is not attached to its DMAR domain here yet. That will happen
 * when mapping the device to iova.
 */
static int device_notifier(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct device *dev = data;
	struct dmar_domain *domain;

	if (iommu_dummy(dev))
		return 0;

	if (action != BUS_NOTIFY_REMOVED_DEVICE)
		return 0;

	domain = find_domain(dev);
	if (!domain)
		return 0;

	dmar_remove_one_dev_info(domain, dev);
	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
		domain_exit(domain);

	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};

static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long long start, end;
	unsigned long start_vpfn, last_vpfn;

	switch (val) {
	case MEM_GOING_ONLINE:
		start = mhp->start_pfn << PAGE_SHIFT;
		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
		if (iommu_domain_identity_map(si_domain, start, end)) {
			pr_warn("Failed to build identity map for [%llx-%llx]\n",
				start, end);
			return NOTIFY_BAD;
		}
		break;

	case MEM_CANCEL_ONLINE:
		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
		while (start_vpfn <= last_vpfn) {
			struct iova *iova;
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			struct page *freelist;

			iova = find_iova(&si_domain->iovad, start_vpfn);
			if (iova == NULL) {
				pr_debug("Failed get IOVA for PFN %lx\n",
					 start_vpfn);
				break;
			}

			iova = split_and_remove_iova(&si_domain->iovad, iova,
						     start_vpfn, last_vpfn);
			if (iova == NULL) {
				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
					start_vpfn, last_vpfn);
				return NOTIFY_BAD;
			}

			freelist = domain_unmap(si_domain, iova->pfn_lo,
						iova->pfn_hi);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					iova->pfn_lo, iova_size(iova),
					!freelist, 0);
			rcu_read_unlock();
			dma_free_pagelist(freelist);

			start_vpfn = iova->pfn_hi + 1;
			free_iova_mem(iova);
		}
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
static void free_all_cpu_cached_iovas(unsigned int cpu)
{
	int i;

	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		struct dmar_domain *domain;
		int did;

		if (!iommu)
			continue;

		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
			domain = get_iommu_domain(iommu, (u16)did);

			if (!domain)
				continue;
			free_cpu_cached_iovas(cpu, &domain->iovad);
		}
	}
}

static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);
	return 0;
}

static void intel_disable_iommus(void)
{
	struct intel_iommu *iommu = NULL;
	struct dmar_drhd_unit *drhd;

	for_each_iommu(iommu, drhd)
		iommu_disable_translation(iommu);
}

static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
{
	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);

	return container_of(iommu_dev, struct intel_iommu, iommu);
}
static ssize_t intel_iommu_show_version(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	u32 ver = readl(iommu->reg + DMAR_VER_REG);
	return sprintf(buf, "%d:%d\n",
		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
}
static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);

static ssize_t intel_iommu_show_address(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->reg_phys);
}
static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);

static ssize_t intel_iommu_show_cap(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->cap);
}
static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);

static ssize_t intel_iommu_show_ecap(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->ecap);
}
static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);

static ssize_t intel_iommu_show_ndoms(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
}
static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);

static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
						  cap_ndoms(iommu->cap)));
}
static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);

static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};

static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};

const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);
	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	bus_register_notifier(&pci_bus_type, &device_nb);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);
	intel_iommu_enabled = 1;

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}

/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices.  If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}

static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;

	if (info->dev) {
		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(info->domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}

static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
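
/*
 * Worked example (illustrative, not part of the original file, assuming 4KiB
 * VT-d pages): mapping hpa = 0x12345ffc with size = 0x8 crosses a page
 * boundary, so aligned_nrpages() above yields 2 pages even though the
 * requested size is smaller than one page.  The caller's IOMMU_READ and
 * IOMMU_WRITE bits become DMA_PTE_READ/DMA_PTE_WRITE in the hardware PTEs,
 * and DMA_PTE_SNP is added only when IOMMU_CACHE was requested and the
 * domain supports snoop control.
 */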
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
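
/*
 * Illustrative example (not from the original source): if the caller asks to
 * unmap a single 4KiB page that was actually installed as part of a 2MiB
 * superpage, level_to_offset_bits() reports the superpage level and the size
 * is rounded up to the full 2MiB above, so the value returned can exceed the
 * size the caller passed in.  The IOMMU core is expected to cope with that.
 */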
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);

	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
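
/*
 * Illustrative note (not part of the original file): for a typical device
 * this reports any RMRR ranges the firmware associated with it as reserved
 * regions, plus the IOAPIC/MSI window (IOAPIC_RANGE_START..IOAPIC_RANGE_END)
 * as an IOMMU_RESV_MSI region, so the IOMMU core keeps those IOVAs out of
 * its allocator for this device.
 */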
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/*
	 * Convert ecap_pss to extend context entry pts encoding, also
	 * respect the soft pasid_max value set by the iommu.
	 * - number of PASID bits = ecap_pss + 1
	 * - number of PASID table entries = 2^(pts + 5)
	 * Therefore, pts = ecap_pss - 4
	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
	 */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as actual number of entries not the bits */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			MAX_NR_PASID_BITS) - 5;
}
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked. Which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}

		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;

		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
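
/*
 * Illustrative usage (not part of the original file): callers never invoke
 * these handlers directly; they go through the generic IOMMU API, which
 * dispatches through this ops table, e.g.:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, size, IOMMU_READ | IOMMU_WRITE);
 *
 * which end up in intel_iommu_domain_alloc(), intel_iommu_attach_device()
 * and intel_iommu_map() respectively.
 */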
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",