/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
#define DMA_32BIT_PFN		IOVA_PFN(DMA_32BIT_MASK)
#define DMA_64BIT_PFN		IOVA_PFN(DMA_64BIT_MASK)

/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static int rwbf_quirk;
/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	val;
	u64	rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}

static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}

static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root) ? phys_to_virt(
		root->val & VTD_PAGE_MASK) :
		NULL);
}
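/*
 * A note on the lookup structure as used below: the root table has
 * ROOT_ENTRY_NR (= 4096/16 = 256) entries, one per PCI bus number.  A
 * present root entry points at a context table, which
 * device_to_context_entry() indexes by devfn to reach the per-device
 * context entry; that entry in turn carries the domain id and the base of
 * the domain's DMA page table.
 */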
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

#define CONTEXT_TT_MULTI_LEVEL 0

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-11: available
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_WRITE;
}

static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
	pte->val = (pte->val & ~3) | (prot & 3);
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
	return (pte->val & VTD_PAGE_MASK);
}

static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
{
	pte->val |= (addr & VTD_PAGE_MASK);
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}
/* devices under the same p2p bridge are owned in one domain */
#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)

/* domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)

struct dmar_domain {
	int	id;			/* domain id */
	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses */

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	spinlock_t	mapping_lock;	/* page table lock */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature */
	int		iommu_count;	/* reference count of iommu */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};

/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev;	/* it's NULL for PCIE-to-PCI bridge */
	struct dmar_domain *domain; /* pointer to domain */
};
static void flush_unmaps_timeout(unsigned long data);

DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);

#ifdef CONFIG_DMAR_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_DMAR_DEFAULT_ON*/

static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
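/*
 * Example (kernel boot command line): "intel_iommu=on,strict,igfx_off"
 * enables the IOMMU, disables batched IOTLB flushing and skips the
 * identity mapping of graphics devices.  The parser above walks the
 * comma-separated option list with strcspn().
 */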
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void *alloc_pgtable_page(void)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_devinfo_cache);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
static inline int width_to_agaw(int width);

/* calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use the default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}
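/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48, so the loop starts at
 * agaw = (48 - 30) / 9 = 2 (a 4-level page table).  If bit 2 of the SAGAW
 * capability field is clear, it falls back to agaw 1 (39-bit, 3 levels) and
 * then agaw 0 (30-bit, 2 levels).
 */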
/* in native case, each domain is related to only one iommu */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);

	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}

static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	int i;

	domain->iommu_coherency = 1;

	i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	for (; i < g_num_of_iommus; ) {
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
		i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
	}
}

static void domain_update_iommu_snooping(struct dmar_domain *domain)
{
	int i;

	domain->iommu_snooping = 1;

	i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	for (; i < g_num_of_iommus; ) {
		if (!ecap_sc_support(g_iommus[i]->ecap)) {
			domain->iommu_snooping = 0;
			break;
		}
		i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
	}
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
}
static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	int i;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    drhd->devices[i]->bus->number == bus &&
			    drhd->devices[i]->devfn == devfn)
				return drhd->iommu;

		if (drhd->include_all)
			return drhd->iommu;
	}

	return NULL;
}

static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}
/* Gets context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		ret = 0;
		goto out;
	}
	ret = context_present(&context[devfn]);
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(&context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn], \
			sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;

	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
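/*
 * Worked example of the level arithmetic above, assuming agaw 2 (48-bit
 * address width): agaw_to_level() gives 4 levels, each decoding
 * LEVEL_STRIDE = 9 address bits above the 12-bit page offset.  Level 1
 * (the leaf) covers bits 12-20 and level 4 covers bits 39-47, so
 * level_size(1) is 4KB while level_size(4) is 512GB.
 */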
static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;

	BUG_ON(!domain->pgd);

	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;

	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;

		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(pte)) {
			tmp_page = alloc_pgtable_page();
			if (!tmp_page) {
				spin_unlock_irqrestore(&domain->mapping_lock,
					flags);
				return NULL;
			}
			domain_flush_cache(domain, tmp_page, PAGE_SIZE);
			dma_set_pte_addr(pte, virt_to_phys(tmp_page));
			/*
			 * non-leaf tables always grant read/write; the last
			 * level page table controls read/write
			 */
			dma_set_pte_readable(pte);
			dma_set_pte_writable(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	spin_unlock_irqrestore(&domain->mapping_lock, flags);
	return pte;
}
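/*
 * The walk above descends from the top level towards level 1, allocating a
 * zeroed page-table page for any non-present intermediate entry it crosses,
 * so a single call both looks up and (if necessary) builds the path for one
 * 4KB mapping.  domain_flush_cache() keeps the tables visible to hardware
 * when the IOMMU is not cache coherent.
 */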
/* return address's pte at specific level */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte))
			break;
		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}

/* clear one page's page table */
static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
{
	struct dma_pte *pte = NULL;

	/* get last level pte */
	pte = dma_addr_level_pte(domain, addr, 1);

	if (pte) {
		dma_clear_pte(pte);
		domain_flush_cache(domain, pte, sizeof(*pte));
	}
}

/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* in case it's partial page */
	start = PAGE_ALIGN(start);
	end &= PAGE_MASK;

	/* we don't need lock here, nobody else touches the iova range */
	while (start < end) {
		dma_pte_clear_one(domain, start);
		start += VTD_PAGE_SIZE;
	}
}

/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	u64 tmp;

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;

	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			return;

		while (tmp < end) {
			pte = dma_addr_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(pte)));
				dma_clear_pte(pte);
				domain_flush_cache(domain, pte, sizeof(*pte));
			}
			tmp += level_size(level);
		}
		level++;
	}
	/* free pgd */
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page();
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 cmd, sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;
	val = iommu->gcmd | DMA_GCMD_WBF;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines whether we need a write buffer flush */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
{
	u64 val = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case: if hardware doesn't cache
	 * non-present entries we do nothing; if it does, we flush entries of
	 * domain 0 (the domain id used to cache any non-present entries).
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* flushing a context entry will implicitly flush the write buffer */
	return 0;
}
/* return value determines whether we need a write buffer flush */
static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int size_order, u64 type,
	int non_present_entry_flush)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case: if hardware doesn't cache
	 * non-present entries we do nothing; if it does, we flush entries of
	 * domain 0 (the domain id used to cache any non-present entries).
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
	/* flushing an iotlb entry will implicitly flush the write buffer */
	return 0;
}
static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
	u64 addr, unsigned int pages, int non_present_entry_flush)
{
	unsigned int mask;

	BUG_ON(addr & (~VTD_PAGE_MASK));
	BUG_ON(pages == 0);

	/* Fall back to domain selective flush if no PSI support */
	if (!cap_pgsel_inv(iommu->cap))
		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH,
						non_present_entry_flush);

	/*
	 * PSI requires page size to be 2 ^ x, and the base address to be
	 * naturally aligned to the size.
	 */
	mask = ilog2(__roundup_pow_of_two(pages));
	/* Fall back to domain selective flush if size is too big */
	if (mask > cap_max_amask_val(iommu->cap))
		return iommu->flush.flush_iotlb(iommu, did, 0, 0,
			DMA_TLB_DSI_FLUSH, non_present_entry_flush);

	return iommu->flush.flush_iotlb(iommu, did, addr, mask,
					DMA_TLB_PSI_FLUSH,
					non_present_entry_flush);
}
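/*
 * Example of the PSI mask computation above: invalidating 5 pages rounds up
 * to 8, so mask = ilog2(8) = 3 and the hardware invalidates a naturally
 * aligned 8-page (32KB) region containing addr.  If mask exceeds the
 * capability's maximum address mask value, the whole domain is flushed
 * instead.
 */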
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	spin_lock_irqsave(&iommu->register_lock, flags);
	writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_TES), sts);

	iommu->gcmd |= DMA_GCMD_TE;
	spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}

static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (!(sts & DMA_GSTS_TES)), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
/* iommu interrupt handling. Most of it is MSI-like. */

static const char *fault_reason_strings[] =
{
	"Software",
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
};
#define MAX_FAULT_REASON_IDX	(ARRAY_SIZE(fault_reason_strings) - 1)

const char *dmar_get_fault_reason(u8 fault_reason)
{
	if (fault_reason > MAX_FAULT_REASON_IDX)
		return "Unknown";
	else
		return fault_reason_strings[fault_reason];
}
void dmar_msi_unmask(unsigned int irq)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	/* unmask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(0, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_mask(unsigned int irq)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	/* mask it */
	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
	/* Read a reg to force flush the post write */
	readl(iommu->reg + DMAR_FECTL_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_write(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
	writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
	writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

void dmar_msi_read(int irq, struct msi_msg *msg)
{
	struct intel_iommu *iommu = get_irq_data(irq);
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
	msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
	msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
		u8 fault_reason, u16 source_id, unsigned long long addr)
{
	const char *reason;

	reason = dmar_get_fault_reason(fault_reason);

	printk(KERN_ERR
		"DMAR:[%s] Request device [%02x:%02x.%d] "
		"fault addr %llx \n"
		"DMAR:[fault reason %02d] %s\n",
		(type ? "DMA Read" : "DMA Write"),
		(source_id >> 8), PCI_SLOT(source_id & 0xFF),
		PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
	return 0;
}
#define PRIMARY_FAULT_REG_LEN (16)
static irqreturn_t iommu_page_fault(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;
	int reg, fault_index;
	u32 fault_status;
	unsigned long flag;

	spin_lock_irqsave(&iommu->register_lock, flag);
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);

	/* TBD: ignore advanced fault log currently */
	if (!(fault_status & DMA_FSTS_PPF))
		goto clear_overflow;

	fault_index = dma_fsts_fault_record_index(fault_status);
	reg = cap_fault_reg_offset(iommu->cap);
	while (1) {
		u8 fault_reason;
		u16 source_id;
		u64 guest_addr;
		int type;
		u32 data;

		/* highest 32 bits */
		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 12);
		if (!(data & DMA_FRCD_F))
			break;

		fault_reason = dma_frcd_fault_reason(data);
		type = dma_frcd_type(data);

		data = readl(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN + 8);
		source_id = dma_frcd_source_id(data);

		guest_addr = dmar_readq(iommu->reg + reg +
				fault_index * PRIMARY_FAULT_REG_LEN);
		guest_addr = dma_frcd_page_addr(guest_addr);
		/* clear the fault */
		writel(DMA_FRCD_F, iommu->reg + reg +
			fault_index * PRIMARY_FAULT_REG_LEN + 12);

		spin_unlock_irqrestore(&iommu->register_lock, flag);

		iommu_page_fault_do_one(iommu, type, fault_reason,
				source_id, guest_addr);

		fault_index++;
		if (fault_index > cap_num_fault_regs(iommu->cap))
			fault_index = 0;
		spin_lock_irqsave(&iommu->register_lock, flag);
	}
clear_overflow:
	/* clear primary fault overflow */
	fault_status = readl(iommu->reg + DMAR_FSTS_REG);
	if (fault_status & DMA_FSTS_PFO)
		writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
	return IRQ_HANDLED;
}
int dmar_set_interrupt(struct intel_iommu *iommu)
{
	int irq, ret;

	irq = create_irq();
	if (!irq) {
		printk(KERN_ERR "IOMMU: no free vectors\n");
		return -EINVAL;
	}

	set_irq_data(irq, iommu);
	iommu->irq = irq;

	ret = arch_setup_dmar_msi(irq);
	if (ret) {
		set_irq_data(irq, NULL);
		iommu->irq = 0;
		destroy_irq(irq);
		return 0;
	}

	/* Force fault register is cleared */
	iommu_page_fault(irq, iommu);

	ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
	if (ret)
		printk(KERN_ERR "IOMMU: can't request irq\n");
	return ret;
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	unsigned long ndomains;
	unsigned long nlongs;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("Number of Domains supported <%ld>\n", ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	/* TBD: there might be 64K domains,
	 * consider other allocation for future chip
	 */
	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		printk(KERN_ERR "Allocating domain id array failed\n");
		return -ENOMEM;
	}
	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
			GFP_KERNEL);
	if (!iommu->domains) {
		printk(KERN_ERR "Allocating domain array failed\n");
		kfree(iommu->domain_ids);
		return -ENOMEM;
	}

	spin_lock_init(&iommu->lock);

	/*
	 * if Caching mode is set, then invalid translations are tagged
	 * with domain id 0. Hence we need to pre-allocate it.
	 */
	if (cap_caching_mode(iommu->cap))
		set_bit(0, iommu->domain_ids);
	return 0;
}
static void domain_exit(struct dmar_domain *domain);
static void vm_domain_exit(struct dmar_domain *domain);

void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;
	unsigned long flags;

	i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
	for (; i < cap_ndoms(iommu->cap); ) {
		domain = iommu->domains[i];
		clear_bit(i, iommu->domain_ids);

		spin_lock_irqsave(&domain->iommu_lock, flags);
		if (--domain->iommu_count == 0) {
			if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
				vm_domain_exit(domain);
			else
				domain_exit(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags);

		i = find_next_bit(iommu->domain_ids,
			cap_ndoms(iommu->cap), i+1);
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		set_irq_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	g_iommus[iommu->seq_id] = NULL;

	/* if all iommus are freed, free g_iommus */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (g_iommus[i])
			break;
	}
	if (i == g_num_of_iommus)
		kfree(g_iommus);

	/* free context mapping */
	free_context_table(iommu);
}
static struct dmar_domain *iommu_alloc_domain(struct intel_iommu *iommu)
{
	unsigned long num;
	unsigned long ndomains;
	struct dmar_domain *domain;
	unsigned long flags;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);
	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		free_domain_mem(domain);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return NULL;
	}

	set_bit(num, iommu->domain_ids);
	domain->id = num;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	set_bit(iommu->seq_id, &domain->iommu_bmp);
	domain->flags = 0;
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return domain;
}
static void iommu_free_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = domain_get_iommu(domain);

	spin_lock_irqsave(&iommu->lock, flags);
	clear_bit(domain->id, iommu->domain_ids);
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_alloc_key;
static struct lock_class_key reserved_rbtree_key;

static void dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;
	u64 addr, size;

	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

	lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
		&reserved_alloc_key);
	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova)
		printk(KERN_ERR "Reserve IOAPIC range failed\n");

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			addr = r->start;
			addr &= PAGE_MASK;
			size = r->end - addr;
			size = PAGE_ALIGN(size);
			iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
				IOVA_PFN(size + addr) - 1);
			if (!iova)
				printk(KERN_ERR "Reserve iova failed\n");
		}
	}
}
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	domain->iommu_count = 1;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
static void domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~PAGE_MASK);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_domain(domain);
	free_domain_mem(domain);
}
static int domain_context_mapping_one(struct dmar_domain *domain,
		u8 bus, u8 devfn)
{
	struct context_entry *context;
	unsigned long flags;
	struct intel_iommu *iommu;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	BUG_ON(!domain->pgd);

	iommu = device_to_iommu(bus, devfn);
	if (!iommu)
		return -ENODEV;

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		num = find_first_bit(iommu->domain_ids, ndomains);
		for (; num < ndomains; ) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
			num = find_next_bit(iommu->domain_ids,
					    cap_ndoms(iommu->cap), num+1);
		}

		if (found == 0) {
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 */
		for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
			pgd = phys_to_virt(dma_pte_addr(pgd));
			if (!dma_pte_present(pgd)) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				return -ENOMEM;
			}
		}
	}

	context_set_domain_id(context, id);
	context_set_address_width(context, iommu->agaw);
	context_set_address_root(context, virt_to_phys(pgd));
	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/* it's a non-present to present mapping */
	if (iommu->flush.flush_context(iommu, domain->id,
		(((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
		DMA_CCMD_DEVICE_INVL, 1))
		iommu_flush_write_buffer(iommu);
	else
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);

	spin_unlock_irqrestore(&iommu->lock, flags);

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
		domain->iommu_count++;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pdev->bus->number,
		pdev->devfn);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain, parent->bus->number,
			parent->devfn);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->subordinate->number, 0);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
			tmp->bus->number, tmp->devfn);
}
static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu,
		pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
			parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (tmp->is_pcie)
		return device_context_mapped(iommu,
			tmp->subordinate->number, 0);
	else
		return device_context_mapped(iommu,
			tmp->bus->number, tmp->devfn);
}
static int
domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
			u64 hpa, size_t size, int prot)
{
	u64 start_pfn, end_pfn;
	struct dma_pte *pte;
	int index;
	int addr_width = agaw_to_width(domain->agaw);

	hpa &= (((u64)1) << addr_width) - 1;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;
	iova &= PAGE_MASK;
	start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
	end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
	index = 0;
	while (start_pfn < end_pfn) {
		pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
		if (!pte)
			return -ENOMEM;
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		BUG_ON(dma_pte_addr(pte));
		dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
		dma_set_pte_prot(pte, prot);
		domain_flush_cache(domain, pte, sizeof(*pte));
		start_pfn++;
		index++;
	}
	return 0;
}
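/*
 * Note on the mapping loop above: it writes one leaf PTE per 4KB page, so
 * mapping a 2MB buffer takes 512 iterations of addr_to_dma_pte().  The
 * BUG_ON catches attempts to overwrite an already-present leaf entry;
 * callers must clear the range (dma_pte_clear_range) before remapping it.
 */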
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH, 0);
}
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu = device_to_iommu(info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
/*
 * find_domain
 * Note: struct pci_dev->dev.archdata.iommu stores the per-device info.
 */
static struct dmar_domain *
find_domain(struct pci_dev *pdev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = pdev->dev.archdata.iommu;
	if (info)
		return info->domain;
	return NULL;
}
/* domain is initialized */
static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
{
	struct dmar_domain *domain, *found = NULL;
	struct intel_iommu *iommu;
	struct dmar_drhd_unit *drhd;
	struct device_domain_info *info, *tmp;
	struct pci_dev *dev_tmp;
	unsigned long flags;
	int bus = 0, devfn = 0;

	domain = find_domain(pdev);
	if (domain)
		return domain;

	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
	if (dev_tmp) {
		if (dev_tmp->is_pcie) {
			bus = dev_tmp->subordinate->number;
			devfn = 0;
		} else {
			bus = dev_tmp->bus->number;
			devfn = dev_tmp->devfn;
		}
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(info, &device_domain_list, global) {
			if (info->bus == bus && info->devfn == devfn) {
				found = info->domain;
				break;
			}
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		/* pcie-pci bridge already has a domain, use it */
		if (found) {
			domain = found;
			goto found_domain;
		}
	}

	/* Allocate a new domain for the device */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (!drhd) {
		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
			pci_name(pdev));
		return NULL;
	}
	iommu = drhd->iommu;

	domain = iommu_alloc_domain(iommu);
	if (!domain)
		goto error;

	if (domain_init(domain, gaw)) {
		domain_exit(domain);
		goto error;
	}

	/* register pcie-to-pci device */
	if (dev_tmp) {
		info = alloc_devinfo_mem();
		if (!info) {
			domain_exit(domain);
			goto error;
		}
		info->bus = bus;
		info->devfn = devfn;
		info->dev = NULL;
		info->domain = domain;
		/* This domain is shared by devices under p2p bridge */
		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;

		/* pcie-to-pci bridge already has a domain, use it */
		found = NULL;
		spin_lock_irqsave(&device_domain_lock, flags);
		list_for_each_entry(tmp, &device_domain_list, global) {
			if (tmp->bus == bus && tmp->devfn == devfn) {
				found = tmp->domain;
				break;
			}
		}
		if (found) {
			free_devinfo_mem(info);
			domain_exit(domain);
			domain = found;
		} else {
			list_add(&info->link, &domain->devices);
			list_add(&info->global, &device_domain_list);
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
	}

found_domain:
	info = alloc_devinfo_mem();
	if (!info)
		goto error;
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;
	spin_lock_irqsave(&device_domain_lock, flags);
	/* somebody is fast */
	found = find_domain(pdev);
	if (found != NULL) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		if (found != domain) {
			domain_exit(domain);
			domain = found;
		}
		free_devinfo_mem(info);
		return domain;
	}
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
	return domain;
error:
	/* recheck it here, maybe others set it */
	return find_domain(pdev);
}
static int iommu_prepare_identity_map(struct pci_dev *pdev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	unsigned long size;
	unsigned long long base;
	int ret;

	printk(KERN_INFO
		"IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		pci_name(pdev), start, end);
	/* page table init */
	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* The address might not be aligned */
	base = start & PAGE_MASK;
	size = end - base;
	size = PAGE_ALIGN(size);
	if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
			IOVA_PFN(base + size) - 1)) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		ret = -ENOMEM;
		goto error;
	}

	pr_debug("Mapping reserved region %lx@%llx for %s\n",
		size, base, pci_name(pdev));
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, base, base + size);

	ret = domain_page_mapping(domain, base, base, size,
		DMA_PTE_READ|DMA_PTE_WRITE);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev);
	if (!ret)
		return 0;
error:
	domain_exit(domain);
	return ret;
}
static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
{
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address + 1);
}
#ifdef CONFIG_DMAR_GFX_WA
struct iommu_prepare_data {
	struct pci_dev *pdev;
	int ret;
};

static int __init iommu_prepare_work_fn(unsigned long start_pfn,
					 unsigned long end_pfn, void *datax)
{
	struct iommu_prepare_data *data;

	data = (struct iommu_prepare_data *)datax;
	data->ret = iommu_prepare_identity_map(data->pdev,
			     start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
	return data->ret;
}

static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
{
	int nid;
	struct iommu_prepare_data data;

	data.pdev = pdev;
	data.ret = 0;

	for_each_online_node(nid) {
		work_with_active_regions(nid, iommu_prepare_work_fn, &data);
		if (data.ret)
			return data.ret;
	}
	return data.ret;
}

static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	int ret;

	for_each_pci_dev(pdev) {
		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
				!IS_GFX_DEVICE(pdev))
			continue;
		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));
		ret = iommu_prepare_with_active_regions(pdev);
		if (ret)
			printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
	}
}
#else /* !CONFIG_DMAR_GFX_WA */
static inline void iommu_prepare_gfx_mapping(void)
{
	return;
}
#endif
#ifdef CONFIG_DMAR_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk("IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	struct pci_dev *pdev;
	struct intel_iommu *iommu;
	int i, ret, unit = 0;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		g_num_of_iommus++;
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path; all other access is read
		 * only
		 */
	}

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			GFP_KERNEL);
	if (!g_iommus) {
		printk(KERN_ERR "Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		kfree(g_iommus);
		ret = -ENOMEM;
		goto error;
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;
		g_iommus[iommu->seq_id] = iommu;

		ret = iommu_init_domains(iommu);
		if (ret)
			goto error;

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Need to split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret) {
			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
			goto error;
		}
	}

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;

		iommu = drhd->iommu;
		if (dmar_enable_qi(iommu)) {
			/*
			 * Queued Invalidate not enabled, use Register Based
			 * Invalidate
			 */
			iommu->flush.flush_context = __iommu_flush_context;
			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
			       "invalidation\n",
			       (unsigned long long)drhd->reg_base_addr);
		} else {
			iommu->flush.flush_context = qi_flush_context;
			iommu->flush.flush_iotlb = qi_flush_iotlb;
			printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
			       "invalidation\n",
			       (unsigned long long)drhd->reg_base_addr);
		}
	}

	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *   endfor
	 * endfor
	 */
	for_each_rmrr_units(rmrr) {
		for (i = 0; i < rmrr->devices_cnt; i++) {
			pdev = rmrr->devices[i];
			/* some BIOSes list non-existent devices in the DMAR table */
			if (!pdev)
				continue;
			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
			if (ret)
				printk(KERN_ERR
				 "IOMMU: mapping reserved region failed\n");
		}
	}

	iommu_prepare_gfx_mapping();

	iommu_prepare_isa();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		sprintf(iommu->name, "dmar%d", unit++);

		iommu_flush_write_buffer(iommu);

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto error;

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
					   0);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
					 0);
		iommu_disable_protect_mem_regions(iommu);

		ret = iommu_enable_translation(iommu);
		if (ret)
			goto error;
	}

	return 0;
error:
	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;
		free_iommu(iommu);
	}
	kfree(g_iommus);
	return ret;
}
static inline u64 aligned_size(u64 host_addr, size_t size)
{
	u64 addr;
	addr = (host_addr & (~PAGE_MASK)) + size;
	return PAGE_ALIGN(addr);
}
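/*
 * Example: host_addr = 0x1234 and size = 0x100 gives
 * aligned_size() = PAGE_ALIGN(0x334) = 0x1000, i.e. the number of bytes of
 * IOVA space needed so the whole buffer is covered by whole pages.
 */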
static struct iova *
iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
{
	struct iova *piova;

	/* Make sure it's in range */
	end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
	if (!size || (IOVA_START_ADDR + size > end))
		return NULL;

	piova = alloc_iova(&domain->iovad,
			size >> PAGE_SHIFT, IOVA_PFN(end), 1);
	return piova;
}
static struct iova *
__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
		   size_t size, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct iova *iova = NULL;

	if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
		iova = iommu_alloc_iova(domain, size, dma_mask);
	else {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_32BIT_MASK and if that fails then try allocating
		 * from higher range
		 */
		iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
		if (!iova)
			iova = iommu_alloc_iova(domain, size, dma_mask);
	}

	if (!iova) {
		printk(KERN_ERR "Allocating iova for %s failed", pci_name(pdev));
		return NULL;
	}

	return iova;
}
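/*
 * Allocation policy above: even for devices whose dma_mask allows 64-bit
 * addressing, an IOVA below 4GB (DMA_32BIT_MASK) is tried first and the
 * full mask is only used as a fallback; booting with "intel_iommu=forcedac"
 * skips the low attempt and allocates from the device's full mask directly.
 */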
static struct dmar_domain *
get_valid_domain_for_dev(struct pci_dev *pdev)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev,
			DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		printk(KERN_ERR
			"Allocating domain for %s failed", pci_name(pdev));
		return NULL;
	}

	/* make sure context mapping is ok */
	if (unlikely(!domain_context_mapped(pdev))) {
		ret = domain_context_mapping(domain, pdev);
		if (ret) {
			printk(KERN_ERR
				"Domain context map for %s failed",
				pci_name(pdev));
			return NULL;
		}
	}

	return domain;
}
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_size((u64)paddr, size);

	iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
	if (!iova)
		goto error;

	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be a partial page, so map the whole
	 * page.  Note: if two parts of one page are separately mapped, we
	 * might have two guest_addrs mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_page_mapping(domain, start_paddr,
		((u64)paddr) & PAGE_MASK, size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping */
	ret = iommu_flush_iotlb_psi(iommu, domain->id,
			start_paddr, size >> VTD_PAGE_SHIFT, 1);
	if (ret)
		iommu_flush_write_buffer(iommu);

	return start_paddr + ((u64)paddr & (~PAGE_MASK));

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
intel_map_single(struct device
*hwdev
, phys_addr_t paddr
,
2316 size_t size
, int dir
)
2318 return __intel_map_single(hwdev
, paddr
, size
, dir
,
2319 to_pci_dev(hwdev
)->dma_mask
);
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (deferred_flush[i].next) {
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
						 DMA_TLB_GLOBAL_FLUSH, 0);
			for (j = 0; j < deferred_flush[i].next; j++) {
				__free_iova(&deferred_flush[i].domain[j]->iovad,
						deferred_flush[i].iova[j]);
			}
			deferred_flush[i].next = 0;
		}
	}

	list_size = 0;
}
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
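/*
 * Deferred-unmap summary: instead of issuing one IOTLB invalidation per
 * unmap, add_unmap() parks the IOVA in a per-IOMMU table and arms a 10ms
 * timer; flush_unmaps() then does a single global IOTLB flush per IOMMU and
 * releases all queued IOVAs.  HIGH_WATER_MARK (250) bounds the queue, and
 * "intel_iommu=strict" bypasses the batching entirely (see
 * intel_unmap_single below).
 */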
void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
			int dir)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;
	struct intel_iommu *iommu;

	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return;
	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (!iova)
		return;

	start_addr = iova->pfn_lo << PAGE_SHIFT;
	size = aligned_size((u64)dev_addr, size);

	pr_debug("Device %s unmapping: %lx@%llx\n",
		pci_name(pdev), size, (unsigned long long)start_addr);

	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);
	if (intel_iommu_strict) {
		if (iommu_flush_iotlb_psi(iommu,
			domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
			iommu_flush_write_buffer(iommu);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
void *intel_alloc_coherent(struct device *hwdev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);
	flags &= ~(GFP_DMA | GFP_DMA32);

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
*hwdev
, size_t size
, void *vaddr
,
2453 dma_addr_t dma_handle
)
2457 size
= PAGE_ALIGN(size
);
2458 order
= get_order(size
);
2460 intel_unmap_single(hwdev
, dma_handle
, size
, DMA_BIDIRECTIONAL
);
2461 free_pages((unsigned long)vaddr
, order
);
#define SG_ENT_VIRT_ADDRESS(sg)	(sg_virt((sg)))
void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
		    int nelems, int dir)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_addr;
	struct iova *iova;
	size_t size = 0;
	void *addr;
	struct scatterlist *sg;
	struct intel_iommu *iommu;

	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (!iova)
		return;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		size += aligned_size((u64)addr, sg->length);
	}

	start_addr = iova->pfn_lo << PAGE_SHIFT;

	/* clear the whole page */
	dma_pte_clear_range(domain, start_addr, start_addr + size);
	/* free page tables */
	dma_pte_free_pagetable(domain, start_addr, start_addr + size);

	if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
			size >> VTD_PAGE_SHIFT, 0))
		iommu_flush_write_buffer(iommu);

	/* free iova */
	__free_iova(&domain->iovad, iova);
}
static int intel_nontranslate_map_sg(struct device *hddev,
	struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
		sg->dma_length = sg->length;
	}
	return nelems;
}
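
/*
 * intel_map_sg() works in two passes: first the aligned sizes of all
 * segments are summed so that a single IOVA range can be allocated for
 * the whole scatterlist, then each segment is mapped back-to-back into
 * that range and its dma_address/dma_length fields are filled in.
 */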
int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
		 int dir)
{
	void *addr;
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_addr;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size += aligned_size((u64)addr, sg->length);
	}

	iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_addr = iova->pfn_lo << PAGE_SHIFT;
	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
			((u64)addr) & PAGE_MASK,
			size, prot);
		if (ret) {
			/* clear the page */
			dma_pte_clear_range(domain, start_addr,
				  start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
				  start_addr + offset);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
		}
		sg->dma_address = start_addr + offset +
				((u64)addr & (~PAGE_MASK));
		sg->dma_length = sg->length;
		offset += size;
	}

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(iommu, domain->id,
			start_addr, offset >> VTD_PAGE_SHIFT, 1))
		iommu_flush_write_buffer(iommu);
	return nelems;
}
static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					 sizeof(struct dmar_domain),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
					 sizeof(struct device_domain_info),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					 sizeof(struct iova),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}

	return ret;
}
static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
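
/*
 * Mark DRHD units that can be ignored: units whose device scope
 * contains no PCI devices, and (when graphics mapping is disabled)
 * units that cover nothing but graphics devices.  Devices behind an
 * ignored unit get DUMMY_DEVICE_DOMAIN_INFO so the DMA ops bypass
 * translation for them.
 */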
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
int __init intel_iommu_init(void)
{
	int ret = 0;

	if (dmar_table_init())
		return -ENODEV;

	if (dmar_dev_scope_init())
		return -ENODEV;

	/*
	 * Check the need for DMA-remapping initialization now.
	 * Above initialization will also be used by Interrupt-remapping.
	 */
	if (no_iommu || swiotlb || dmar_disabled)
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
	dma_ops = &intel_dma_ops;

	register_iommu(&intel_iommu_ops);

	return 0;
}
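
/*
 * The vm_domain_* helpers below manage domains created through the
 * generic iommu_ops interface registered above via register_iommu()
 * (typically used for virtual machine device assignment), as opposed
 * to the per-device DMA-API domains set up earlier in this file.
 */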
static int vm_domain_add_dev_info(struct dmar_domain *domain,
				  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	unsigned long flags;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
					  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		if (info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_detach_dev(iommu, info->bus, info->devfn);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/* if there is no other devices under the same iommu
		 * owned by this domain, clear this iommu in iommu_bmp
		 * update iommu count and coherency
		 */
		if (device_to_iommu(info->bus, info->devfn) == iommu)
			found = 1;
	}

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, &domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu = device_to_iommu(info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and capabilities
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       &domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_cap(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}
/* domain id for virtual machine, it won't be set in context */
static unsigned long vm_domid;
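
/*
 * A VM domain may span several IOMMUs; the usable guest address width
 * is limited by the smallest AGAW among them, which this helper finds
 * by walking the domain's iommu_bmp.
 */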
static int vm_domain_min_agaw(struct dmar_domain *domain)
{
	int i;
	int min_agaw = domain->agaw;

	i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	for (; i < g_num_of_iommus; ) {
		if (min_agaw > g_iommus[i]->agaw)
			min_agaw = g_iommus[i]->agaw;

		i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
	}

	return min_agaw;
}
static struct dmar_domain *iommu_alloc_vm_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->id = vm_domid++;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;

	return domain;
}
static int vm_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		i = find_first_bit(iommu->domain_ids, ndomains);
		for (; i < ndomains; ) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
			i = find_next_bit(iommu->domain_ids, ndomains, i+1);
		}
	}
}
static void vm_domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~VTD_PAGE_MASK);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}
static int intel_iommu_domain_init(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain;

	dmar_domain = iommu_alloc_vm_domain();
	if (!dmar_domain) {
		printk(KERN_ERR
			"intel_iommu_domain_init: dmar_domain == NULL\n");
		return -ENOMEM;
	}
	if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_init() failed\n");
		vm_domain_exit(dmar_domain);
		return -ENOMEM;
	}
	domain->priv = dmar_domain;

	return 0;
}
static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	vm_domain_exit(dmar_domain);
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;
	u64 end;
	int ret;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
				vm_domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	end = DOMAIN_MAX_ADDR(addr_width);
	end = end & VTD_PAGE_MASK;
	if (end < dmar_domain->max_addr) {
		printk(KERN_ERR "%s: iommu agaw (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, iommu->agaw, dmar_domain->max_addr);
		return -EFAULT;
	}

	ret = domain_context_mapping(dmar_domain, pdev);
	if (ret)
		return ret;

	ret = vm_domain_add_dev_info(dmar_domain, pdev);
	return ret;
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	vm_domain_remove_one_dev_info(dmar_domain, pdev);
}
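
/*
 * Map a host physical range into the domain at the given IOVA.  If the
 * mapping grows the domain's highest mapped address, the new maximum is
 * first checked against the minimum AGAW of all attached IOMMUs.
 */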
static int intel_iommu_map_range(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t hpa,
				 size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int addr_width;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;

	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
	if (dmar_domain->max_addr < max_addr) {
		int min_agaw;
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		min_agaw = vm_domain_min_agaw(dmar_domain);
		addr_width = agaw_to_width(min_agaw);
		end = DOMAIN_MAX_ADDR(addr_width);
		end = end & VTD_PAGE_MASK;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu agaw (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, min_agaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}

	ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
	return ret;
}
static void intel_iommu_unmap_range(struct iommu_domain *domain,
				    unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	dma_addr_t base;

	/* The address might not be aligned */
	base = iova & VTD_PAGE_MASK;
	size = VTD_PAGE_ALIGN(size);
	dma_pte_clear_range(dmar_domain, base, base + size);

	if (dmar_domain->max_addr == base + size)
		dmar_domain->max_addr = base;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = addr_to_dma_pte(dmar_domain, iova);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;

	return 0;
}
static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map_range,
	.unmap		= intel_iommu_unmap_range,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};
static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);