]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - drivers/pci/intel-iommu.c
KVM: x86 emulator: Use DstAcc for 'and'
[mirror_ubuntu-jammy-kernel.git] / drivers / pci / intel-iommu.c
CommitLineData
ba395927
KA
1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
98bcef56 17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
ba395927
KA
21 */
22
23#include <linux/init.h>
24#include <linux/bitmap.h>
5e0d2a6f 25#include <linux/debugfs.h>
ba395927
KA
26#include <linux/slab.h>
27#include <linux/irq.h>
28#include <linux/interrupt.h>
29#include <linux/sysdev.h>
30#include <linux/spinlock.h>
31#include <linux/pci.h>
32#include <linux/dmar.h>
33#include <linux/dma-mapping.h>
34#include <linux/mempool.h>
5e0d2a6f 35#include <linux/timer.h>
ba395927
KA
36#include "iova.h"
37#include "intel-iommu.h"
38#include <asm/proto.h> /* force_iommu in this header in x86-64*/
39#include <asm/cacheflush.h>
46a7fa27 40#include <asm/iommu.h>
ba395927
KA
41#include "pci.h"
42
/*
 * Classify a struct pci_dev by its class code.  The argument is
 * parenthesized so that any pointer expression can be passed safely.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

/*
 * Largest address expressible with @gaw address bits.  @gaw is parenthesized
 * so that low-precedence argument expressions (e.g. a ?: expression) are
 * shifted as a whole.
 */
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << (gaw)) - 1)
53
5e0d2a6f 54
55static void flush_unmaps_timeout(unsigned long data);
56
57DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
58
/* Number of slots in each array of struct deferred_flush_tables. */
#define HIGH_WATER_MARK 250

/*
 * Fixed-capacity table holding parallel arrays of iova and dmar_domain
 * pointers.
 * NOTE(review): presumably @next is the first unused slot and the table
 * batches unmaps awaiting an IOTLB flush -- confirm against the users of
 * deferred_flush.
 */
struct deferred_flush_tables {
	int next;
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};
65
66static struct deferred_flush_tables *deferred_flush;
67
5e0d2a6f 68/* bitmap for indexing intel_iommus */
5e0d2a6f 69static int g_num_of_iommus;
70
71static DEFINE_SPINLOCK(async_umap_flush_lock);
72static LIST_HEAD(unmaps_to_do);
73
74static int timer_on;
75static long list_size;
5e0d2a6f 76
ba395927
KA
77static void domain_remove_dev_info(struct dmar_domain *domain);
78
2ae21010 79int dmar_disabled;
ba395927 80static int __initdata dmar_map_gfx = 1;
7d3b03ce 81static int dmar_forcedac;
5e0d2a6f 82static int intel_iommu_strict;
ba395927
KA
83
84#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
85static DEFINE_SPINLOCK(device_domain_lock);
86static LIST_HEAD(device_domain_list);
87
88static int __init intel_iommu_setup(char *str)
89{
90 if (!str)
91 return -EINVAL;
92 while (*str) {
93 if (!strncmp(str, "off", 3)) {
94 dmar_disabled = 1;
95 printk(KERN_INFO"Intel-IOMMU: disabled\n");
96 } else if (!strncmp(str, "igfx_off", 8)) {
97 dmar_map_gfx = 0;
98 printk(KERN_INFO
99 "Intel-IOMMU: disable GFX device mapping\n");
7d3b03ce 100 } else if (!strncmp(str, "forcedac", 8)) {
5e0d2a6f 101 printk(KERN_INFO
7d3b03ce
KA
102 "Intel-IOMMU: Forcing DAC for PCI devices\n");
103 dmar_forcedac = 1;
5e0d2a6f 104 } else if (!strncmp(str, "strict", 6)) {
105 printk(KERN_INFO
106 "Intel-IOMMU: disable batched IOTLB flush\n");
107 intel_iommu_strict = 1;
ba395927
KA
108 }
109
110 str += strcspn(str, ",");
111 while (*str == ',')
112 str++;
113 }
114 return 0;
115}
116__setup("intel_iommu=", intel_iommu_setup);
117
118static struct kmem_cache *iommu_domain_cache;
119static struct kmem_cache *iommu_devinfo_cache;
120static struct kmem_cache *iommu_iova_cache;
121
eb3fa7cb
KA
122static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
123{
124 unsigned int flags;
125 void *vaddr;
126
127 /* trying to avoid low memory issues */
128 flags = current->flags & PF_MEMALLOC;
129 current->flags |= PF_MEMALLOC;
130 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
131 current->flags &= (~PF_MEMALLOC | flags);
132 return vaddr;
133}
134
135
ba395927
KA
136static inline void *alloc_pgtable_page(void)
137{
eb3fa7cb
KA
138 unsigned int flags;
139 void *vaddr;
140
141 /* trying to avoid low memory issues */
142 flags = current->flags & PF_MEMALLOC;
143 current->flags |= PF_MEMALLOC;
144 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
145 current->flags &= (~PF_MEMALLOC | flags);
146 return vaddr;
ba395927
KA
147}
148
149static inline void free_pgtable_page(void *vaddr)
150{
151 free_page((unsigned long)vaddr);
152}
153
154static inline void *alloc_domain_mem(void)
155{
eb3fa7cb 156 return iommu_kmem_cache_alloc(iommu_domain_cache);
ba395927
KA
157}
158
159static inline void free_domain_mem(void *vaddr)
160{
161 kmem_cache_free(iommu_domain_cache, vaddr);
162}
163
164static inline void * alloc_devinfo_mem(void)
165{
eb3fa7cb 166 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
ba395927
KA
167}
168
169static inline void free_devinfo_mem(void *vaddr)
170{
171 kmem_cache_free(iommu_devinfo_cache, vaddr);
172}
173
174struct iova *alloc_iova_mem(void)
175{
eb3fa7cb 176 return iommu_kmem_cache_alloc(iommu_iova_cache);
ba395927
KA
177}
178
179void free_iova_mem(struct iova *iova)
180{
181 kmem_cache_free(iommu_iova_cache, iova);
182}
183
ba395927
KA
184/* Gets context entry for a given bus and devfn */
185static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
186 u8 bus, u8 devfn)
187{
188 struct root_entry *root;
189 struct context_entry *context;
190 unsigned long phy_addr;
191 unsigned long flags;
192
193 spin_lock_irqsave(&iommu->lock, flags);
194 root = &iommu->root_entry[bus];
195 context = get_context_addr_from_root(root);
196 if (!context) {
197 context = (struct context_entry *)alloc_pgtable_page();
198 if (!context) {
199 spin_unlock_irqrestore(&iommu->lock, flags);
200 return NULL;
201 }
202 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
203 phy_addr = virt_to_phys((void *)context);
204 set_root_value(root, phy_addr);
205 set_root_present(root);
206 __iommu_flush_cache(iommu, root, sizeof(*root));
207 }
208 spin_unlock_irqrestore(&iommu->lock, flags);
209 return &context[devfn];
210}
211
212static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
213{
214 struct root_entry *root;
215 struct context_entry *context;
216 int ret;
217 unsigned long flags;
218
219 spin_lock_irqsave(&iommu->lock, flags);
220 root = &iommu->root_entry[bus];
221 context = get_context_addr_from_root(root);
222 if (!context) {
223 ret = 0;
224 goto out;
225 }
226 ret = context_present(context[devfn]);
227out:
228 spin_unlock_irqrestore(&iommu->lock, flags);
229 return ret;
230}
231
232static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
233{
234 struct root_entry *root;
235 struct context_entry *context;
236 unsigned long flags;
237
238 spin_lock_irqsave(&iommu->lock, flags);
239 root = &iommu->root_entry[bus];
240 context = get_context_addr_from_root(root);
241 if (context) {
242 context_clear_entry(context[devfn]);
243 __iommu_flush_cache(iommu, &context[devfn], \
244 sizeof(*context));
245 }
246 spin_unlock_irqrestore(&iommu->lock, flags);
247}
248
249static void free_context_table(struct intel_iommu *iommu)
250{
251 struct root_entry *root;
252 int i;
253 unsigned long flags;
254 struct context_entry *context;
255
256 spin_lock_irqsave(&iommu->lock, flags);
257 if (!iommu->root_entry) {
258 goto out;
259 }
260 for (i = 0; i < ROOT_ENTRY_NR; i++) {
261 root = &iommu->root_entry[i];
262 context = get_context_addr_from_root(root);
263 if (context)
264 free_pgtable_page(context);
265 }
266 free_pgtable_page(iommu->root_entry);
267 iommu->root_entry = NULL;
268out:
269 spin_unlock_irqrestore(&iommu->lock, flags);
270}
271
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/* Number of page-table levels for @agaw: agaw 0 maps to 2 levels. */
static inline int agaw_to_level(int agaw)
{
	return 2 + agaw;
}

/* Address width in bits for @agaw: 30 bits plus LEVEL_STRIDE per step. */
static inline int agaw_to_width(int agaw)
{
	return agaw * LEVEL_STRIDE + 30;
}

/* Inverse of agaw_to_width(); the division truncates. */
static inline int width_to_agaw(int width)
{
	int above_base = width - 30;

	return above_base / LEVEL_STRIDE;
}

/* Bit position at which the index for @level starts (12 for level 1). */
static inline unsigned int level_to_offset_bits(int level)
{
	int bits = (level - 1) * LEVEL_STRIDE + 12;

	return bits;
}
296
297static inline int address_level_offset(u64 addr, int level)
298{
299 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
300}
301
302static inline u64 level_mask(int level)
303{
304 return ((u64)-1 << level_to_offset_bits(level));
305}
306
307static inline u64 level_size(int level)
308{
309 return ((u64)1 << level_to_offset_bits(level));
310}
311
312static inline u64 align_to_level(u64 addr, int level)
313{
314 return ((addr + level_size(level) - 1) & level_mask(level));
315}
316
317static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
318{
319 int addr_width = agaw_to_width(domain->agaw);
320 struct dma_pte *parent, *pte = NULL;
321 int level = agaw_to_level(domain->agaw);
322 int offset;
323 unsigned long flags;
324
325 BUG_ON(!domain->pgd);
326
327 addr &= (((u64)1) << addr_width) - 1;
328 parent = domain->pgd;
329
330 spin_lock_irqsave(&domain->mapping_lock, flags);
331 while (level > 0) {
332 void *tmp_page;
333
334 offset = address_level_offset(addr, level);
335 pte = &parent[offset];
336 if (level == 1)
337 break;
338
339 if (!dma_pte_present(*pte)) {
340 tmp_page = alloc_pgtable_page();
341
342 if (!tmp_page) {
343 spin_unlock_irqrestore(&domain->mapping_lock,
344 flags);
345 return NULL;
346 }
347 __iommu_flush_cache(domain->iommu, tmp_page,
348 PAGE_SIZE_4K);
349 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
350 /*
351 * high level table always sets r/w, last level page
352 * table control read/write
353 */
354 dma_set_pte_readable(*pte);
355 dma_set_pte_writable(*pte);
356 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
357 }
358 parent = phys_to_virt(dma_pte_addr(*pte));
359 level--;
360 }
361
362 spin_unlock_irqrestore(&domain->mapping_lock, flags);
363 return pte;
364}
365
366/* return address's pte at specific level */
367static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
368 int level)
369{
370 struct dma_pte *parent, *pte = NULL;
371 int total = agaw_to_level(domain->agaw);
372 int offset;
373
374 parent = domain->pgd;
375 while (level <= total) {
376 offset = address_level_offset(addr, total);
377 pte = &parent[offset];
378 if (level == total)
379 return pte;
380
381 if (!dma_pte_present(*pte))
382 break;
383 parent = phys_to_virt(dma_pte_addr(*pte));
384 total--;
385 }
386 return NULL;
387}
388
389/* clear one page's page table */
390static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
391{
392 struct dma_pte *pte = NULL;
393
394 /* get last level pte */
395 pte = dma_addr_level_pte(domain, addr, 1);
396
397 if (pte) {
398 dma_clear_pte(*pte);
399 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
400 }
401}
402
403/* clear last level pte, a tlb flush should be followed */
404static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
405{
406 int addr_width = agaw_to_width(domain->agaw);
407
408 start &= (((u64)1) << addr_width) - 1;
409 end &= (((u64)1) << addr_width) - 1;
410 /* in case it's partial page */
411 start = PAGE_ALIGN_4K(start);
412 end &= PAGE_MASK_4K;
413
414 /* we don't need lock here, nobody else touches the iova range */
415 while (start < end) {
416 dma_pte_clear_one(domain, start);
417 start += PAGE_SIZE_4K;
418 }
419}
420
421/* free page table pages. last level pte should already be cleared */
422static void dma_pte_free_pagetable(struct dmar_domain *domain,
423 u64 start, u64 end)
424{
425 int addr_width = agaw_to_width(domain->agaw);
426 struct dma_pte *pte;
427 int total = agaw_to_level(domain->agaw);
428 int level;
429 u64 tmp;
430
431 start &= (((u64)1) << addr_width) - 1;
432 end &= (((u64)1) << addr_width) - 1;
433
434 /* we don't need lock here, nobody else touches the iova range */
435 level = 2;
436 while (level <= total) {
437 tmp = align_to_level(start, level);
438 if (tmp >= end || (tmp + level_size(level) > end))
439 return;
440
441 while (tmp < end) {
442 pte = dma_addr_level_pte(domain, tmp, level);
443 if (pte) {
444 free_pgtable_page(
445 phys_to_virt(dma_pte_addr(*pte)));
446 dma_clear_pte(*pte);
447 __iommu_flush_cache(domain->iommu,
448 pte, sizeof(*pte));
449 }
450 tmp += level_size(level);
451 }
452 level++;
453 }
454 /* free pgd */
455 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
456 free_pgtable_page(domain->pgd);
457 domain->pgd = NULL;
458 }
459}
460
461/* iommu handling */
462static int iommu_alloc_root_entry(struct intel_iommu *iommu)
463{
464 struct root_entry *root;
465 unsigned long flags;
466
467 root = (struct root_entry *)alloc_pgtable_page();
468 if (!root)
469 return -ENOMEM;
470
471 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
472
473 spin_lock_irqsave(&iommu->lock, flags);
474 iommu->root_entry = root;
475 spin_unlock_irqrestore(&iommu->lock, flags);
476
477 return 0;
478}
479
ba395927
KA
480static void iommu_set_root_entry(struct intel_iommu *iommu)
481{
482 void *addr;
483 u32 cmd, sts;
484 unsigned long flag;
485
486 addr = iommu->root_entry;
487
488 spin_lock_irqsave(&iommu->register_lock, flag);
489 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
490
491 cmd = iommu->gcmd | DMA_GCMD_SRTP;
492 writel(cmd, iommu->reg + DMAR_GCMD_REG);
493
494 /* Make sure hardware complete it */
495 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
496 readl, (sts & DMA_GSTS_RTPS), sts);
497
498 spin_unlock_irqrestore(&iommu->register_lock, flag);
499}
500
501static void iommu_flush_write_buffer(struct intel_iommu *iommu)
502{
503 u32 val;
504 unsigned long flag;
505
506 if (!cap_rwbf(iommu->cap))
507 return;
508 val = iommu->gcmd | DMA_GCMD_WBF;
509
510 spin_lock_irqsave(&iommu->register_lock, flag);
511 writel(val, iommu->reg + DMAR_GCMD_REG);
512
513 /* Make sure hardware complete it */
514 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
515 readl, (!(val & DMA_GSTS_WBFS)), val);
516
517 spin_unlock_irqrestore(&iommu->register_lock, flag);
518}
519
520/* return value determine if we need a write buffer flush */
521static int __iommu_flush_context(struct intel_iommu *iommu,
522 u16 did, u16 source_id, u8 function_mask, u64 type,
523 int non_present_entry_flush)
524{
525 u64 val = 0;
526 unsigned long flag;
527
528 /*
529 * In the non-present entry flush case, if hardware doesn't cache
530 * non-present entry we do nothing and if hardware cache non-present
531 * entry, we flush entries of domain 0 (the domain id is used to cache
532 * any non-present entries)
533 */
534 if (non_present_entry_flush) {
535 if (!cap_caching_mode(iommu->cap))
536 return 1;
537 else
538 did = 0;
539 }
540
541 switch (type) {
542 case DMA_CCMD_GLOBAL_INVL:
543 val = DMA_CCMD_GLOBAL_INVL;
544 break;
545 case DMA_CCMD_DOMAIN_INVL:
546 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
547 break;
548 case DMA_CCMD_DEVICE_INVL:
549 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
550 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
551 break;
552 default:
553 BUG();
554 }
555 val |= DMA_CCMD_ICC;
556
557 spin_lock_irqsave(&iommu->register_lock, flag);
558 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
559
560 /* Make sure hardware complete it */
561 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
562 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
563
564 spin_unlock_irqrestore(&iommu->register_lock, flag);
565
566 /* flush context entry will implictly flush write buffer */
567 return 0;
568}
569
570static int inline iommu_flush_context_global(struct intel_iommu *iommu,
571 int non_present_entry_flush)
572{
573 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
574 non_present_entry_flush);
575}
576
577static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
578 int non_present_entry_flush)
579{
580 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
581 non_present_entry_flush);
582}
583
584static int inline iommu_flush_context_device(struct intel_iommu *iommu,
585 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
586{
587 return __iommu_flush_context(iommu, did, source_id, function_mask,
588 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
589}
590
591/* return value determine if we need a write buffer flush */
592static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
593 u64 addr, unsigned int size_order, u64 type,
594 int non_present_entry_flush)
595{
596 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
597 u64 val = 0, val_iva = 0;
598 unsigned long flag;
599
600 /*
601 * In the non-present entry flush case, if hardware doesn't cache
602 * non-present entry we do nothing and if hardware cache non-present
603 * entry, we flush entries of domain 0 (the domain id is used to cache
604 * any non-present entries)
605 */
606 if (non_present_entry_flush) {
607 if (!cap_caching_mode(iommu->cap))
608 return 1;
609 else
610 did = 0;
611 }
612
613 switch (type) {
614 case DMA_TLB_GLOBAL_FLUSH:
615 /* global flush doesn't need set IVA_REG */
616 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
617 break;
618 case DMA_TLB_DSI_FLUSH:
619 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
620 break;
621 case DMA_TLB_PSI_FLUSH:
622 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
623 /* Note: always flush non-leaf currently */
624 val_iva = size_order | addr;
625 break;
626 default:
627 BUG();
628 }
629 /* Note: set drain read/write */
630#if 0
631 /*
632 * This is probably to be super secure.. Looks like we can
633 * ignore it without any impact.
634 */
635 if (cap_read_drain(iommu->cap))
636 val |= DMA_TLB_READ_DRAIN;
637#endif
638 if (cap_write_drain(iommu->cap))
639 val |= DMA_TLB_WRITE_DRAIN;
640
641 spin_lock_irqsave(&iommu->register_lock, flag);
642 /* Note: Only uses first TLB reg currently */
643 if (val_iva)
644 dmar_writeq(iommu->reg + tlb_offset, val_iva);
645 dmar_writeq(iommu->reg + tlb_offset + 8, val);
646
647 /* Make sure hardware complete it */
648 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
649 dmar_readq, (!(val & DMA_TLB_IVT)), val);
650
651 spin_unlock_irqrestore(&iommu->register_lock, flag);
652
653 /* check IOTLB invalidation granularity */
654 if (DMA_TLB_IAIG(val) == 0)
655 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
656 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
657 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
658 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
659 /* flush context entry will implictly flush write buffer */
660 return 0;
661}
662
663static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
664 int non_present_entry_flush)
665{
666 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
667 non_present_entry_flush);
668}
669
670static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
671 int non_present_entry_flush)
672{
673 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
674 non_present_entry_flush);
675}
676
ba395927
KA
677static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
678 u64 addr, unsigned int pages, int non_present_entry_flush)
679{
f76aec76 680 unsigned int mask;
ba395927
KA
681
682 BUG_ON(addr & (~PAGE_MASK_4K));
683 BUG_ON(pages == 0);
684
685 /* Fallback to domain selective flush if no PSI support */
686 if (!cap_pgsel_inv(iommu->cap))
687 return iommu_flush_iotlb_dsi(iommu, did,
688 non_present_entry_flush);
689
690 /*
691 * PSI requires page size to be 2 ^ x, and the base address is naturally
692 * aligned to the size
693 */
f76aec76 694 mask = ilog2(__roundup_pow_of_two(pages));
ba395927 695 /* Fallback to domain selective flush if size is too big */
f76aec76 696 if (mask > cap_max_amask_val(iommu->cap))
ba395927
KA
697 return iommu_flush_iotlb_dsi(iommu, did,
698 non_present_entry_flush);
699
f76aec76 700 return __iommu_flush_iotlb(iommu, did, addr, mask,
ba395927
KA
701 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
702}
703
f8bab735 704static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
705{
706 u32 pmen;
707 unsigned long flags;
708
709 spin_lock_irqsave(&iommu->register_lock, flags);
710 pmen = readl(iommu->reg + DMAR_PMEN_REG);
711 pmen &= ~DMA_PMEN_EPM;
712 writel(pmen, iommu->reg + DMAR_PMEN_REG);
713
714 /* wait for the protected region status bit to clear */
715 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
716 readl, !(pmen & DMA_PMEN_PRS), pmen);
717
718 spin_unlock_irqrestore(&iommu->register_lock, flags);
719}
720
ba395927
KA
721static int iommu_enable_translation(struct intel_iommu *iommu)
722{
723 u32 sts;
724 unsigned long flags;
725
726 spin_lock_irqsave(&iommu->register_lock, flags);
727 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
728
729 /* Make sure hardware complete it */
730 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
731 readl, (sts & DMA_GSTS_TES), sts);
732
733 iommu->gcmd |= DMA_GCMD_TE;
734 spin_unlock_irqrestore(&iommu->register_lock, flags);
735 return 0;
736}
737
738static int iommu_disable_translation(struct intel_iommu *iommu)
739{
740 u32 sts;
741 unsigned long flag;
742
743 spin_lock_irqsave(&iommu->register_lock, flag);
744 iommu->gcmd &= ~DMA_GCMD_TE;
745 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
746
747 /* Make sure hardware complete it */
748 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
749 readl, (!(sts & DMA_GSTS_TES)), sts);
750
751 spin_unlock_irqrestore(&iommu->register_lock, flag);
752 return 0;
753}
754
3460a6d9
KA
755/* iommu interrupt handling. Most stuff are MSI-like. */
756
d94afc6c 757static const char *fault_reason_strings[] =
3460a6d9
KA
758{
759 "Software",
760 "Present bit in root entry is clear",
761 "Present bit in context entry is clear",
762 "Invalid context entry",
763 "Access beyond MGAW",
764 "PTE Write access is not set",
765 "PTE Read access is not set",
766 "Next page table ptr is invalid",
767 "Root table address invalid",
768 "Context table ptr is invalid",
769 "non-zero reserved fields in RTP",
770 "non-zero reserved fields in CTP",
771 "non-zero reserved fields in PTE",
3460a6d9 772};
f8bab735 773#define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
3460a6d9 774
d94afc6c 775const char *dmar_get_fault_reason(u8 fault_reason)
3460a6d9 776{
d94afc6c 777 if (fault_reason > MAX_FAULT_REASON_IDX)
778 return "Unknown";
3460a6d9
KA
779 else
780 return fault_reason_strings[fault_reason];
781}
782
783void dmar_msi_unmask(unsigned int irq)
784{
785 struct intel_iommu *iommu = get_irq_data(irq);
786 unsigned long flag;
787
788 /* unmask it */
789 spin_lock_irqsave(&iommu->register_lock, flag);
790 writel(0, iommu->reg + DMAR_FECTL_REG);
791 /* Read a reg to force flush the post write */
792 readl(iommu->reg + DMAR_FECTL_REG);
793 spin_unlock_irqrestore(&iommu->register_lock, flag);
794}
795
796void dmar_msi_mask(unsigned int irq)
797{
798 unsigned long flag;
799 struct intel_iommu *iommu = get_irq_data(irq);
800
801 /* mask it */
802 spin_lock_irqsave(&iommu->register_lock, flag);
803 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
804 /* Read a reg to force flush the post write */
805 readl(iommu->reg + DMAR_FECTL_REG);
806 spin_unlock_irqrestore(&iommu->register_lock, flag);
807}
808
809void dmar_msi_write(int irq, struct msi_msg *msg)
810{
811 struct intel_iommu *iommu = get_irq_data(irq);
812 unsigned long flag;
813
814 spin_lock_irqsave(&iommu->register_lock, flag);
815 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
816 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
817 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
818 spin_unlock_irqrestore(&iommu->register_lock, flag);
819}
820
821void dmar_msi_read(int irq, struct msi_msg *msg)
822{
823 struct intel_iommu *iommu = get_irq_data(irq);
824 unsigned long flag;
825
826 spin_lock_irqsave(&iommu->register_lock, flag);
827 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
828 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
829 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
830 spin_unlock_irqrestore(&iommu->register_lock, flag);
831}
832
833static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
834 u8 fault_reason, u16 source_id, u64 addr)
835{
d94afc6c 836 const char *reason;
3460a6d9
KA
837
838 reason = dmar_get_fault_reason(fault_reason);
839
840 printk(KERN_ERR
841 "DMAR:[%s] Request device [%02x:%02x.%d] "
842 "fault addr %llx \n"
843 "DMAR:[fault reason %02d] %s\n",
844 (type ? "DMA Read" : "DMA Write"),
845 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
846 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
847 return 0;
848}
849
850#define PRIMARY_FAULT_REG_LEN (16)
851static irqreturn_t iommu_page_fault(int irq, void *dev_id)
852{
853 struct intel_iommu *iommu = dev_id;
854 int reg, fault_index;
855 u32 fault_status;
856 unsigned long flag;
857
858 spin_lock_irqsave(&iommu->register_lock, flag);
859 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
860
861 /* TBD: ignore advanced fault log currently */
862 if (!(fault_status & DMA_FSTS_PPF))
863 goto clear_overflow;
864
865 fault_index = dma_fsts_fault_record_index(fault_status);
866 reg = cap_fault_reg_offset(iommu->cap);
867 while (1) {
868 u8 fault_reason;
869 u16 source_id;
870 u64 guest_addr;
871 int type;
872 u32 data;
873
874 /* highest 32 bits */
875 data = readl(iommu->reg + reg +
876 fault_index * PRIMARY_FAULT_REG_LEN + 12);
877 if (!(data & DMA_FRCD_F))
878 break;
879
880 fault_reason = dma_frcd_fault_reason(data);
881 type = dma_frcd_type(data);
882
883 data = readl(iommu->reg + reg +
884 fault_index * PRIMARY_FAULT_REG_LEN + 8);
885 source_id = dma_frcd_source_id(data);
886
887 guest_addr = dmar_readq(iommu->reg + reg +
888 fault_index * PRIMARY_FAULT_REG_LEN);
889 guest_addr = dma_frcd_page_addr(guest_addr);
890 /* clear the fault */
891 writel(DMA_FRCD_F, iommu->reg + reg +
892 fault_index * PRIMARY_FAULT_REG_LEN + 12);
893
894 spin_unlock_irqrestore(&iommu->register_lock, flag);
895
896 iommu_page_fault_do_one(iommu, type, fault_reason,
897 source_id, guest_addr);
898
899 fault_index++;
900 if (fault_index > cap_num_fault_regs(iommu->cap))
901 fault_index = 0;
902 spin_lock_irqsave(&iommu->register_lock, flag);
903 }
904clear_overflow:
905 /* clear primary fault overflow */
906 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
907 if (fault_status & DMA_FSTS_PFO)
908 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
909
910 spin_unlock_irqrestore(&iommu->register_lock, flag);
911 return IRQ_HANDLED;
912}
913
914int dmar_set_interrupt(struct intel_iommu *iommu)
915{
916 int irq, ret;
917
918 irq = create_irq();
919 if (!irq) {
920 printk(KERN_ERR "IOMMU: no free vectors\n");
921 return -EINVAL;
922 }
923
924 set_irq_data(irq, iommu);
925 iommu->irq = irq;
926
927 ret = arch_setup_dmar_msi(irq);
928 if (ret) {
929 set_irq_data(irq, NULL);
930 iommu->irq = 0;
931 destroy_irq(irq);
932 return 0;
933 }
934
935 /* Force fault register is cleared */
936 iommu_page_fault(irq, iommu);
937
938 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
939 if (ret)
940 printk(KERN_ERR "IOMMU: can't request irq\n");
941 return ret;
942}
943
ba395927
KA
944static int iommu_init_domains(struct intel_iommu *iommu)
945{
946 unsigned long ndomains;
947 unsigned long nlongs;
948
949 ndomains = cap_ndoms(iommu->cap);
950 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
951 nlongs = BITS_TO_LONGS(ndomains);
952
953 /* TBD: there might be 64K domains,
954 * consider other allocation for future chip
955 */
956 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
957 if (!iommu->domain_ids) {
958 printk(KERN_ERR "Allocating domain id array failed\n");
959 return -ENOMEM;
960 }
961 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
962 GFP_KERNEL);
963 if (!iommu->domains) {
964 printk(KERN_ERR "Allocating domain array failed\n");
965 kfree(iommu->domain_ids);
966 return -ENOMEM;
967 }
968
e61d98d8
SS
969 spin_lock_init(&iommu->lock);
970
ba395927
KA
971 /*
972 * if Caching mode is set, then invalid translations are tagged
973 * with domainid 0. Hence we need to pre-allocate it.
974 */
975 if (cap_caching_mode(iommu->cap))
976 set_bit(0, iommu->domain_ids);
977 return 0;
978}
ba395927 979
ba395927
KA
980
981static void domain_exit(struct dmar_domain *domain);
e61d98d8
SS
982
983void free_dmar_iommu(struct intel_iommu *iommu)
ba395927
KA
984{
985 struct dmar_domain *domain;
986 int i;
987
ba395927
KA
988 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
989 for (; i < cap_ndoms(iommu->cap); ) {
990 domain = iommu->domains[i];
991 clear_bit(i, iommu->domain_ids);
992 domain_exit(domain);
993 i = find_next_bit(iommu->domain_ids,
994 cap_ndoms(iommu->cap), i+1);
995 }
996
997 if (iommu->gcmd & DMA_GCMD_TE)
998 iommu_disable_translation(iommu);
999
1000 if (iommu->irq) {
1001 set_irq_data(iommu->irq, NULL);
1002 /* This will mask the irq */
1003 free_irq(iommu->irq, iommu);
1004 destroy_irq(iommu->irq);
1005 }
1006
1007 kfree(iommu->domains);
1008 kfree(iommu->domain_ids);
1009
1010 /* free context mapping */
1011 free_context_table(iommu);
ba395927
KA
1012}
1013
1014static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1015{
1016 unsigned long num;
1017 unsigned long ndomains;
1018 struct dmar_domain *domain;
1019 unsigned long flags;
1020
1021 domain = alloc_domain_mem();
1022 if (!domain)
1023 return NULL;
1024
1025 ndomains = cap_ndoms(iommu->cap);
1026
1027 spin_lock_irqsave(&iommu->lock, flags);
1028 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1029 if (num >= ndomains) {
1030 spin_unlock_irqrestore(&iommu->lock, flags);
1031 free_domain_mem(domain);
1032 printk(KERN_ERR "IOMMU: no free domain ids\n");
1033 return NULL;
1034 }
1035
1036 set_bit(num, iommu->domain_ids);
1037 domain->id = num;
1038 domain->iommu = iommu;
1039 iommu->domains[num] = domain;
1040 spin_unlock_irqrestore(&iommu->lock, flags);
1041
1042 return domain;
1043}
1044
1045static void iommu_free_domain(struct dmar_domain *domain)
1046{
1047 unsigned long flags;
1048
1049 spin_lock_irqsave(&domain->iommu->lock, flags);
1050 clear_bit(domain->id, domain->iommu->domain_ids);
1051 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1052}
1053
1054static struct iova_domain reserved_iova_list;
8a443df4
MG
1055static struct lock_class_key reserved_alloc_key;
1056static struct lock_class_key reserved_rbtree_key;
ba395927
KA
1057
1058static void dmar_init_reserved_ranges(void)
1059{
1060 struct pci_dev *pdev = NULL;
1061 struct iova *iova;
1062 int i;
1063 u64 addr, size;
1064
f661197e 1065 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
ba395927 1066
8a443df4
MG
1067 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1068 &reserved_alloc_key);
1069 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1070 &reserved_rbtree_key);
1071
ba395927
KA
1072 /* IOAPIC ranges shouldn't be accessed by DMA */
1073 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1074 IOVA_PFN(IOAPIC_RANGE_END));
1075 if (!iova)
1076 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1077
1078 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1079 for_each_pci_dev(pdev) {
1080 struct resource *r;
1081
1082 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1083 r = &pdev->resource[i];
1084 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1085 continue;
1086 addr = r->start;
1087 addr &= PAGE_MASK_4K;
1088 size = r->end - addr;
1089 size = PAGE_ALIGN_4K(size);
1090 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1091 IOVA_PFN(size + addr) - 1);
1092 if (!iova)
1093 printk(KERN_ERR "Reserve iova failed\n");
1094 }
1095 }
1096
1097}
1098
1099static void domain_reserve_special_ranges(struct dmar_domain *domain)
1100{
1101 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1102}
1103
/*
 * Round a guest address width up to the next value of the form 12 + 9 * n
 * and cap the result at 64.  A width that already has that form is returned
 * unchanged.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int excess = (gaw - 12) % 9;
	int width = excess ? gaw + 9 - excess : gaw;

	return width > 64 ? 64 : width;
}
1117
1118static int domain_init(struct dmar_domain *domain, int guest_width)
1119{
1120 struct intel_iommu *iommu;
1121 int adjust_width, agaw;
1122 unsigned long sagaw;
1123
f661197e 1124 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
ba395927
KA
1125 spin_lock_init(&domain->mapping_lock);
1126
1127 domain_reserve_special_ranges(domain);
1128
1129 /* calculate AGAW */
1130 iommu = domain->iommu;
1131 if (guest_width > cap_mgaw(iommu->cap))
1132 guest_width = cap_mgaw(iommu->cap);
1133 domain->gaw = guest_width;
1134 adjust_width = guestwidth_to_adjustwidth(guest_width);
1135 agaw = width_to_agaw(adjust_width);
1136 sagaw = cap_sagaw(iommu->cap);
1137 if (!test_bit(agaw, &sagaw)) {
1138 /* hardware doesn't support it, choose a bigger one */
1139 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1140 agaw = find_next_bit(&sagaw, 5, agaw);
1141 if (agaw >= 5)
1142 return -ENODEV;
1143 }
1144 domain->agaw = agaw;
1145 INIT_LIST_HEAD(&domain->devices);
1146
1147 /* always allocate the top pgd */
1148 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1149 if (!domain->pgd)
1150 return -ENOMEM;
1151 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1152 return 0;
1153}
1154
1155static void domain_exit(struct dmar_domain *domain)
1156{
1157 u64 end;
1158
1159 /* Domain 0 is reserved, so dont process it */
1160 if (!domain)
1161 return;
1162
1163 domain_remove_dev_info(domain);
1164 /* destroy iovas */
1165 put_iova_domain(&domain->iovad);
1166 end = DOMAIN_MAX_ADDR(domain->gaw);
1167 end = end & (~PAGE_MASK_4K);
1168
1169 /* clear ptes */
1170 dma_pte_clear_range(domain, 0, end);
1171
1172 /* free page tables */
1173 dma_pte_free_pagetable(domain, 0, end);
1174
1175 iommu_free_domain(domain);
1176 free_domain_mem(domain);
1177}
1178
1179static int domain_context_mapping_one(struct dmar_domain *domain,
1180 u8 bus, u8 devfn)
1181{
1182 struct context_entry *context;
1183 struct intel_iommu *iommu = domain->iommu;
1184 unsigned long flags;
1185
1186 pr_debug("Set context mapping for %02x:%02x.%d\n",
1187 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1188 BUG_ON(!domain->pgd);
1189 context = device_to_context_entry(iommu, bus, devfn);
1190 if (!context)
1191 return -ENOMEM;
1192 spin_lock_irqsave(&iommu->lock, flags);
1193 if (context_present(*context)) {
1194 spin_unlock_irqrestore(&iommu->lock, flags);
1195 return 0;
1196 }
1197
1198 context_set_domain_id(*context, domain->id);
1199 context_set_address_width(*context, domain->agaw);
1200 context_set_address_root(*context, virt_to_phys(domain->pgd));
1201 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1202 context_set_fault_enable(*context);
1203 context_set_present(*context);
1204 __iommu_flush_cache(iommu, context, sizeof(*context));
1205
1206 /* it's a non-present to present mapping */
1207 if (iommu_flush_context_device(iommu, domain->id,
1208 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1209 iommu_flush_write_buffer(iommu);
1210 else
1211 iommu_flush_iotlb_dsi(iommu, 0, 0);
1212 spin_unlock_irqrestore(&iommu->lock, flags);
1213 return 0;
1214}
1215
1216static int
1217domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1218{
1219 int ret;
1220 struct pci_dev *tmp, *parent;
1221
1222 ret = domain_context_mapping_one(domain, pdev->bus->number,
1223 pdev->devfn);
1224 if (ret)
1225 return ret;
1226
1227 /* dependent device mapping */
1228 tmp = pci_find_upstream_pcie_bridge(pdev);
1229 if (!tmp)
1230 return 0;
1231 /* Secondary interface's bus number and devfn 0 */
1232 parent = pdev->bus->self;
1233 while (parent != tmp) {
1234 ret = domain_context_mapping_one(domain, parent->bus->number,
1235 parent->devfn);
1236 if (ret)
1237 return ret;
1238 parent = parent->bus->self;
1239 }
1240 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1241 return domain_context_mapping_one(domain,
1242 tmp->subordinate->number, 0);
1243 else /* this is a legacy PCI bridge */
1244 return domain_context_mapping_one(domain,
1245 tmp->bus->number, tmp->devfn);
1246}
1247
1248static int domain_context_mapped(struct dmar_domain *domain,
1249 struct pci_dev *pdev)
1250{
1251 int ret;
1252 struct pci_dev *tmp, *parent;
1253
1254 ret = device_context_mapped(domain->iommu,
1255 pdev->bus->number, pdev->devfn);
1256 if (!ret)
1257 return ret;
1258 /* dependent device mapping */
1259 tmp = pci_find_upstream_pcie_bridge(pdev);
1260 if (!tmp)
1261 return ret;
1262 /* Secondary interface's bus number and devfn 0 */
1263 parent = pdev->bus->self;
1264 while (parent != tmp) {
1265 ret = device_context_mapped(domain->iommu, parent->bus->number,
1266 parent->devfn);
1267 if (!ret)
1268 return ret;
1269 parent = parent->bus->self;
1270 }
1271 if (tmp->is_pcie)
1272 return device_context_mapped(domain->iommu,
1273 tmp->subordinate->number, 0);
1274 else
1275 return device_context_mapped(domain->iommu,
1276 tmp->bus->number, tmp->devfn);
1277}
1278
1279static int
1280domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1281 u64 hpa, size_t size, int prot)
1282{
1283 u64 start_pfn, end_pfn;
1284 struct dma_pte *pte;
1285 int index;
1286
1287 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1288 return -EINVAL;
1289 iova &= PAGE_MASK_4K;
1290 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1291 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1292 index = 0;
1293 while (start_pfn < end_pfn) {
1294 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1295 if (!pte)
1296 return -ENOMEM;
1297 /* We don't need lock here, nobody else
1298 * touches the iova range
1299 */
1300 BUG_ON(dma_pte_addr(*pte));
1301 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1302 dma_set_pte_prot(*pte, prot);
1303 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1304 start_pfn++;
1305 index++;
1306 }
1307 return 0;
1308}
1309
1310static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1311{
1312 clear_context_table(domain->iommu, bus, devfn);
1313 iommu_flush_context_global(domain->iommu, 0);
1314 iommu_flush_iotlb_global(domain->iommu, 0);
1315}
1316
1317static void domain_remove_dev_info(struct dmar_domain *domain)
1318{
1319 struct device_domain_info *info;
1320 unsigned long flags;
1321
1322 spin_lock_irqsave(&device_domain_lock, flags);
1323 while (!list_empty(&domain->devices)) {
1324 info = list_entry(domain->devices.next,
1325 struct device_domain_info, link);
1326 list_del(&info->link);
1327 list_del(&info->global);
1328 if (info->dev)
358dd8ac 1329 info->dev->dev.archdata.iommu = NULL;
ba395927
KA
1330 spin_unlock_irqrestore(&device_domain_lock, flags);
1331
1332 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1333 free_devinfo_mem(info);
1334
1335 spin_lock_irqsave(&device_domain_lock, flags);
1336 }
1337 spin_unlock_irqrestore(&device_domain_lock, flags);
1338}
1339
1340/*
1341 * find_domain
358dd8ac 1342 * Note: we use struct pci_dev->dev.archdata.iommu stores the info
ba395927
KA
1343 */
1344struct dmar_domain *
1345find_domain(struct pci_dev *pdev)
1346{
1347 struct device_domain_info *info;
1348
1349 /* No lock here, assumes no domain exit in normal case */
358dd8ac 1350 info = pdev->dev.archdata.iommu;
ba395927
KA
1351 if (info)
1352 return info->domain;
1353 return NULL;
1354}
1355
ba395927
KA
1356/* domain is initialized */
1357static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1358{
1359 struct dmar_domain *domain, *found = NULL;
1360 struct intel_iommu *iommu;
1361 struct dmar_drhd_unit *drhd;
1362 struct device_domain_info *info, *tmp;
1363 struct pci_dev *dev_tmp;
1364 unsigned long flags;
1365 int bus = 0, devfn = 0;
1366
1367 domain = find_domain(pdev);
1368 if (domain)
1369 return domain;
1370
1371 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1372 if (dev_tmp) {
1373 if (dev_tmp->is_pcie) {
1374 bus = dev_tmp->subordinate->number;
1375 devfn = 0;
1376 } else {
1377 bus = dev_tmp->bus->number;
1378 devfn = dev_tmp->devfn;
1379 }
1380 spin_lock_irqsave(&device_domain_lock, flags);
1381 list_for_each_entry(info, &device_domain_list, global) {
1382 if (info->bus == bus && info->devfn == devfn) {
1383 found = info->domain;
1384 break;
1385 }
1386 }
1387 spin_unlock_irqrestore(&device_domain_lock, flags);
1388 /* pcie-pci bridge already has a domain, uses it */
1389 if (found) {
1390 domain = found;
1391 goto found_domain;
1392 }
1393 }
1394
1395 /* Allocate new domain for the device */
1396 drhd = dmar_find_matched_drhd_unit(pdev);
1397 if (!drhd) {
1398 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1399 pci_name(pdev));
1400 return NULL;
1401 }
1402 iommu = drhd->iommu;
1403
1404 domain = iommu_alloc_domain(iommu);
1405 if (!domain)
1406 goto error;
1407
1408 if (domain_init(domain, gaw)) {
1409 domain_exit(domain);
1410 goto error;
1411 }
1412
1413 /* register pcie-to-pci device */
1414 if (dev_tmp) {
1415 info = alloc_devinfo_mem();
1416 if (!info) {
1417 domain_exit(domain);
1418 goto error;
1419 }
1420 info->bus = bus;
1421 info->devfn = devfn;
1422 info->dev = NULL;
1423 info->domain = domain;
1424 /* This domain is shared by devices under p2p bridge */
1425 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1426
1427 /* pcie-to-pci bridge already has a domain, uses it */
1428 found = NULL;
1429 spin_lock_irqsave(&device_domain_lock, flags);
1430 list_for_each_entry(tmp, &device_domain_list, global) {
1431 if (tmp->bus == bus && tmp->devfn == devfn) {
1432 found = tmp->domain;
1433 break;
1434 }
1435 }
1436 if (found) {
1437 free_devinfo_mem(info);
1438 domain_exit(domain);
1439 domain = found;
1440 } else {
1441 list_add(&info->link, &domain->devices);
1442 list_add(&info->global, &device_domain_list);
1443 }
1444 spin_unlock_irqrestore(&device_domain_lock, flags);
1445 }
1446
1447found_domain:
1448 info = alloc_devinfo_mem();
1449 if (!info)
1450 goto error;
1451 info->bus = pdev->bus->number;
1452 info->devfn = pdev->devfn;
1453 info->dev = pdev;
1454 info->domain = domain;
1455 spin_lock_irqsave(&device_domain_lock, flags);
1456 /* somebody is fast */
1457 found = find_domain(pdev);
1458 if (found != NULL) {
1459 spin_unlock_irqrestore(&device_domain_lock, flags);
1460 if (found != domain) {
1461 domain_exit(domain);
1462 domain = found;
1463 }
1464 free_devinfo_mem(info);
1465 return domain;
1466 }
1467 list_add(&info->link, &domain->devices);
1468 list_add(&info->global, &device_domain_list);
358dd8ac 1469 pdev->dev.archdata.iommu = info;
ba395927
KA
1470 spin_unlock_irqrestore(&device_domain_lock, flags);
1471 return domain;
1472error:
1473 /* recheck it here, maybe others set it */
1474 return find_domain(pdev);
1475}
1476
1477static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1478{
1479 struct dmar_domain *domain;
1480 unsigned long size;
1481 u64 base;
1482 int ret;
1483
1484 printk(KERN_INFO
1485 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1486 pci_name(pdev), start, end);
1487 /* page table init */
1488 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1489 if (!domain)
1490 return -ENOMEM;
1491
1492 /* The address might not be aligned */
1493 base = start & PAGE_MASK_4K;
1494 size = end - base;
1495 size = PAGE_ALIGN_4K(size);
1496 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1497 IOVA_PFN(base + size) - 1)) {
1498 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1499 ret = -ENOMEM;
1500 goto error;
1501 }
1502
1503 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1504 size, base, pci_name(pdev));
1505 /*
1506 * RMRR range might have overlap with physical memory range,
1507 * clear it first
1508 */
1509 dma_pte_clear_range(domain, base, base + size);
1510
1511 ret = domain_page_mapping(domain, base, base, size,
1512 DMA_PTE_READ|DMA_PTE_WRITE);
1513 if (ret)
1514 goto error;
1515
1516 /* context entry init */
1517 ret = domain_context_mapping(domain, pdev);
1518 if (!ret)
1519 return 0;
1520error:
1521 domain_exit(domain);
1522 return ret;
1523
1524}
1525
1526static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1527 struct pci_dev *pdev)
1528{
358dd8ac 1529 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
ba395927
KA
1530 return 0;
1531 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1532 rmrr->end_address + 1);
1533}
1534
e820482c 1535#ifdef CONFIG_DMAR_GFX_WA
d52d53b8
YL
/*
 * Cookie handed to iommu_prepare_work_fn() through
 * work_with_active_regions().
 */
struct iommu_prepare_data {
	struct pci_dev *pdev;	/* device being identity-mapped */
	int ret;		/* result of the most recent mapping call */
};
1540
1541static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1542 unsigned long end_pfn, void *datax)
1543{
1544 struct iommu_prepare_data *data;
1545
1546 data = (struct iommu_prepare_data *)datax;
1547
1548 data->ret = iommu_prepare_identity_map(data->pdev,
1549 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1550 return data->ret;
1551
1552}
1553
1554static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1555{
1556 int nid;
1557 struct iommu_prepare_data data;
1558
1559 data.pdev = pdev;
1560 data.ret = 0;
1561
1562 for_each_online_node(nid) {
1563 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1564 if (data.ret)
1565 return data.ret;
1566 }
1567 return data.ret;
1568}
1569
e820482c
KA
1570static void __init iommu_prepare_gfx_mapping(void)
1571{
1572 struct pci_dev *pdev = NULL;
e820482c
KA
1573 int ret;
1574
1575 for_each_pci_dev(pdev) {
358dd8ac 1576 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
e820482c
KA
1577 !IS_GFX_DEVICE(pdev))
1578 continue;
1579 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1580 pci_name(pdev));
d52d53b8
YL
1581 ret = iommu_prepare_with_active_regions(pdev);
1582 if (ret)
1583 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
e820482c
KA
1584 }
1585}
1586#endif
1587
49a0429e
KA
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MB for the ISA (LPC) bridge,
 * if the system has one.  A failure is logged and otherwise ignored.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	/* report the range that was actually requested above */
	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");

}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
1612
ba395927
KA
1613int __init init_dmars(void)
1614{
1615 struct dmar_drhd_unit *drhd;
1616 struct dmar_rmrr_unit *rmrr;
1617 struct pci_dev *pdev;
1618 struct intel_iommu *iommu;
80b20dd8 1619 int i, ret, unit = 0;
ba395927
KA
1620
1621 /*
1622 * for each drhd
1623 * allocate root
1624 * initialize and program root entry to not present
1625 * endfor
1626 */
1627 for_each_drhd_unit(drhd) {
5e0d2a6f 1628 g_num_of_iommus++;
1629 /*
1630 * lock not needed as this is only incremented in the single
1631 * threaded kernel __init code path all other access are read
1632 * only
1633 */
1634 }
1635
80b20dd8 1636 deferred_flush = kzalloc(g_num_of_iommus *
1637 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1638 if (!deferred_flush) {
5e0d2a6f 1639 ret = -ENOMEM;
1640 goto error;
1641 }
1642
5e0d2a6f 1643 for_each_drhd_unit(drhd) {
1644 if (drhd->ignored)
1645 continue;
1886e8a9
SS
1646
1647 iommu = drhd->iommu;
ba395927 1648
e61d98d8
SS
1649 ret = iommu_init_domains(iommu);
1650 if (ret)
1651 goto error;
1652
ba395927
KA
1653 /*
1654 * TBD:
1655 * we could share the same root & context tables
1656 * amoung all IOMMU's. Need to Split it later.
1657 */
1658 ret = iommu_alloc_root_entry(iommu);
1659 if (ret) {
1660 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1661 goto error;
1662 }
1663 }
1664
1665 /*
1666 * For each rmrr
1667 * for each dev attached to rmrr
1668 * do
1669 * locate drhd for dev, alloc domain for dev
1670 * allocate free domain
1671 * allocate page table entries for rmrr
1672 * if context not allocated for bus
1673 * allocate and init context
1674 * set present in root table for this bus
1675 * init context with domain, translation etc
1676 * endfor
1677 * endfor
1678 */
1679 for_each_rmrr_units(rmrr) {
ba395927
KA
1680 for (i = 0; i < rmrr->devices_cnt; i++) {
1681 pdev = rmrr->devices[i];
1682 /* some BIOS lists non-exist devices in DMAR table */
1683 if (!pdev)
1684 continue;
1685 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1686 if (ret)
1687 printk(KERN_ERR
1688 "IOMMU: mapping reserved region failed\n");
1689 }
1690 }
1691
e820482c
KA
1692 iommu_prepare_gfx_mapping();
1693
49a0429e
KA
1694 iommu_prepare_isa();
1695
ba395927
KA
1696 /*
1697 * for each drhd
1698 * enable fault log
1699 * global invalidate context cache
1700 * global invalidate iotlb
1701 * enable translation
1702 */
1703 for_each_drhd_unit(drhd) {
1704 if (drhd->ignored)
1705 continue;
1706 iommu = drhd->iommu;
1707 sprintf (iommu->name, "dmar%d", unit++);
1708
1709 iommu_flush_write_buffer(iommu);
1710
3460a6d9
KA
1711 ret = dmar_set_interrupt(iommu);
1712 if (ret)
1713 goto error;
1714
ba395927
KA
1715 iommu_set_root_entry(iommu);
1716
1717 iommu_flush_context_global(iommu, 0);
1718 iommu_flush_iotlb_global(iommu, 0);
1719
f8bab735 1720 iommu_disable_protect_mem_regions(iommu);
1721
ba395927
KA
1722 ret = iommu_enable_translation(iommu);
1723 if (ret)
1724 goto error;
1725 }
1726
1727 return 0;
1728error:
1729 for_each_drhd_unit(drhd) {
1730 if (drhd->ignored)
1731 continue;
1732 iommu = drhd->iommu;
1733 free_iommu(iommu);
1734 }
1735 return ret;
1736}
1737
1738static inline u64 aligned_size(u64 host_addr, size_t size)
1739{
1740 u64 addr;
1741 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1742 return PAGE_ALIGN_4K(addr);
1743}
1744
1745struct iova *
f76aec76 1746iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
ba395927 1747{
ba395927
KA
1748 struct iova *piova;
1749
1750 /* Make sure it's in range */
ba395927 1751 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
f76aec76 1752 if (!size || (IOVA_START_ADDR + size > end))
ba395927
KA
1753 return NULL;
1754
1755 piova = alloc_iova(&domain->iovad,
f76aec76 1756 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
ba395927
KA
1757 return piova;
1758}
1759
f76aec76
KA
1760static struct iova *
1761__intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1762 size_t size)
ba395927 1763{
ba395927 1764 struct pci_dev *pdev = to_pci_dev(dev);
ba395927 1765 struct iova *iova = NULL;
ba395927 1766
7d3b03ce 1767 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
f76aec76 1768 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
ba395927
KA
1769 } else {
1770 /*
1771 * First try to allocate an io virtual address in
1772 * DMA_32BIT_MASK and if that fails then try allocating
3609801e 1773 * from higher range
ba395927 1774 */
f76aec76 1775 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
ba395927 1776 if (!iova)
f76aec76 1777 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
ba395927
KA
1778 }
1779
1780 if (!iova) {
1781 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
f76aec76
KA
1782 return NULL;
1783 }
1784
1785 return iova;
1786}
1787
1788static struct dmar_domain *
1789get_valid_domain_for_dev(struct pci_dev *pdev)
1790{
1791 struct dmar_domain *domain;
1792 int ret;
1793
1794 domain = get_domain_for_dev(pdev,
1795 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1796 if (!domain) {
1797 printk(KERN_ERR
1798 "Allocating domain for %s failed", pci_name(pdev));
4fe05bbc 1799 return NULL;
ba395927
KA
1800 }
1801
1802 /* make sure context mapping is ok */
1803 if (unlikely(!domain_context_mapped(domain, pdev))) {
1804 ret = domain_context_mapping(domain, pdev);
f76aec76
KA
1805 if (ret) {
1806 printk(KERN_ERR
1807 "Domain context map for %s failed",
1808 pci_name(pdev));
4fe05bbc 1809 return NULL;
f76aec76 1810 }
ba395927
KA
1811 }
1812
f76aec76
KA
1813 return domain;
1814}
1815
6865f0d1
IM
1816static dma_addr_t
1817intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
f76aec76
KA
1818{
1819 struct pci_dev *pdev = to_pci_dev(hwdev);
f76aec76 1820 struct dmar_domain *domain;
6865f0d1 1821 unsigned long start_paddr;
f76aec76
KA
1822 struct iova *iova;
1823 int prot = 0;
6865f0d1 1824 int ret;
f76aec76
KA
1825
1826 BUG_ON(dir == DMA_NONE);
358dd8ac 1827 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
6865f0d1 1828 return paddr;
f76aec76
KA
1829
1830 domain = get_valid_domain_for_dev(pdev);
1831 if (!domain)
1832 return 0;
1833
6865f0d1 1834 size = aligned_size((u64)paddr, size);
f76aec76
KA
1835
1836 iova = __intel_alloc_iova(hwdev, domain, size);
1837 if (!iova)
1838 goto error;
1839
6865f0d1 1840 start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
f76aec76 1841
ba395927
KA
1842 /*
1843 * Check if DMAR supports zero-length reads on write only
1844 * mappings..
1845 */
1846 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1847 !cap_zlr(domain->iommu->cap))
1848 prot |= DMA_PTE_READ;
1849 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1850 prot |= DMA_PTE_WRITE;
1851 /*
6865f0d1 1852 * paddr - (paddr + size) might be partial page, we should map the whole
ba395927 1853 * page. Note: if two part of one page are separately mapped, we
6865f0d1 1854 * might have two guest_addr mapping to the same host paddr, but this
ba395927
KA
1855 * is not a big problem
1856 */
6865f0d1
IM
1857 ret = domain_page_mapping(domain, start_paddr,
1858 ((u64)paddr) & PAGE_MASK_4K, size, prot);
ba395927
KA
1859 if (ret)
1860 goto error;
1861
1862 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
6865f0d1
IM
1863 pci_name(pdev), size, (u64)paddr,
1864 size, (u64)start_paddr, dir);
f76aec76
KA
1865
1866 /* it's a non-present to present mapping */
1867 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
6865f0d1 1868 start_paddr, size >> PAGE_SHIFT_4K, 1);
f76aec76
KA
1869 if (ret)
1870 iommu_flush_write_buffer(domain->iommu);
1871
6865f0d1 1872 return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
ba395927 1873
ba395927 1874error:
f76aec76
KA
1875 if (iova)
1876 __free_iova(&domain->iovad, iova);
ba395927 1877 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
6865f0d1 1878 pci_name(pdev), size, (u64)paddr, dir);
ba395927
KA
1879 return 0;
1880}
1881
5e0d2a6f 1882static void flush_unmaps(void)
1883{
80b20dd8 1884 int i, j;
5e0d2a6f 1885
5e0d2a6f 1886 timer_on = 0;
1887
1888 /* just flush them all */
1889 for (i = 0; i < g_num_of_iommus; i++) {
80b20dd8 1890 if (deferred_flush[i].next) {
c42d9f32
SS
1891 struct intel_iommu *iommu =
1892 deferred_flush[i].domain[0]->iommu;
1893
1894 iommu_flush_iotlb_global(iommu, 0);
80b20dd8 1895 for (j = 0; j < deferred_flush[i].next; j++) {
1896 __free_iova(&deferred_flush[i].domain[j]->iovad,
1897 deferred_flush[i].iova[j]);
1898 }
1899 deferred_flush[i].next = 0;
1900 }
5e0d2a6f 1901 }
1902
5e0d2a6f 1903 list_size = 0;
5e0d2a6f 1904}
1905
1906static void flush_unmaps_timeout(unsigned long data)
1907{
80b20dd8 1908 unsigned long flags;
1909
1910 spin_lock_irqsave(&async_umap_flush_lock, flags);
5e0d2a6f 1911 flush_unmaps();
80b20dd8 1912 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
5e0d2a6f 1913}
1914
1915static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1916{
1917 unsigned long flags;
80b20dd8 1918 int next, iommu_id;
5e0d2a6f 1919
1920 spin_lock_irqsave(&async_umap_flush_lock, flags);
80b20dd8 1921 if (list_size == HIGH_WATER_MARK)
1922 flush_unmaps();
1923
c42d9f32
SS
1924 iommu_id = dom->iommu->seq_id;
1925
80b20dd8 1926 next = deferred_flush[iommu_id].next;
1927 deferred_flush[iommu_id].domain[next] = dom;
1928 deferred_flush[iommu_id].iova[next] = iova;
1929 deferred_flush[iommu_id].next++;
5e0d2a6f 1930
1931 if (!timer_on) {
1932 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1933 timer_on = 1;
1934 }
1935 list_size++;
1936 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1937}
1938
f76aec76 1939static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
ba395927
KA
1940 size_t size, int dir)
1941{
ba395927 1942 struct pci_dev *pdev = to_pci_dev(dev);
f76aec76
KA
1943 struct dmar_domain *domain;
1944 unsigned long start_addr;
ba395927
KA
1945 struct iova *iova;
1946
358dd8ac 1947 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
f76aec76 1948 return;
ba395927
KA
1949 domain = find_domain(pdev);
1950 BUG_ON(!domain);
1951
1952 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
f76aec76 1953 if (!iova)
ba395927 1954 return;
ba395927 1955
f76aec76
KA
1956 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1957 size = aligned_size((u64)dev_addr, size);
ba395927 1958
f76aec76
KA
1959 pr_debug("Device %s unmapping: %lx@%llx\n",
1960 pci_name(pdev), size, (u64)start_addr);
ba395927 1961
f76aec76
KA
1962 /* clear the whole page */
1963 dma_pte_clear_range(domain, start_addr, start_addr + size);
1964 /* free page tables */
1965 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
5e0d2a6f 1966 if (intel_iommu_strict) {
1967 if (iommu_flush_iotlb_psi(domain->iommu,
1968 domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1969 iommu_flush_write_buffer(domain->iommu);
1970 /* free iova */
1971 __free_iova(&domain->iovad, iova);
1972 } else {
1973 add_unmap(domain, iova);
1974 /*
1975 * queue up the release of the unmap to save the 1/6th of the
1976 * cpu used up by the iotlb flush operation...
1977 */
5e0d2a6f 1978 }
ba395927
KA
1979}
1980
1981static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1982 dma_addr_t *dma_handle, gfp_t flags)
1983{
1984 void *vaddr;
1985 int order;
1986
1987 size = PAGE_ALIGN_4K(size);
1988 order = get_order(size);
1989 flags &= ~(GFP_DMA | GFP_DMA32);
1990
1991 vaddr = (void *)__get_free_pages(flags, order);
1992 if (!vaddr)
1993 return NULL;
1994 memset(vaddr, 0, size);
1995
6865f0d1 1996 *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
ba395927
KA
1997 if (*dma_handle)
1998 return vaddr;
1999 free_pages((unsigned long)vaddr, order);
2000 return NULL;
2001}
2002
2003static void intel_free_coherent(struct device *hwdev, size_t size,
2004 void *vaddr, dma_addr_t dma_handle)
2005{
2006 int order;
2007
2008 size = PAGE_ALIGN_4K(size);
2009 order = get_order(size);
2010
2011 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2012 free_pages((unsigned long)vaddr, order);
2013}
2014
12d4d40e 2015#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
c03ab37c 2016static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
ba395927
KA
2017 int nelems, int dir)
2018{
2019 int i;
2020 struct pci_dev *pdev = to_pci_dev(hwdev);
2021 struct dmar_domain *domain;
f76aec76
KA
2022 unsigned long start_addr;
2023 struct iova *iova;
2024 size_t size = 0;
2025 void *addr;
c03ab37c 2026 struct scatterlist *sg;
ba395927 2027
358dd8ac 2028 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
ba395927
KA
2029 return;
2030
2031 domain = find_domain(pdev);
ba395927 2032
c03ab37c 2033 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
f76aec76
KA
2034 if (!iova)
2035 return;
c03ab37c 2036 for_each_sg(sglist, sg, nelems, i) {
f76aec76
KA
2037 addr = SG_ENT_VIRT_ADDRESS(sg);
2038 size += aligned_size((u64)addr, sg->length);
2039 }
2040
2041 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2042
2043 /* clear the whole page */
2044 dma_pte_clear_range(domain, start_addr, start_addr + size);
2045 /* free page tables */
2046 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2047
2048 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2049 size >> PAGE_SHIFT_4K, 0))
ba395927 2050 iommu_flush_write_buffer(domain->iommu);
f76aec76
KA
2051
2052 /* free iova */
2053 __free_iova(&domain->iovad, iova);
ba395927
KA
2054}
2055
ba395927 2056static int intel_nontranslate_map_sg(struct device *hddev,
c03ab37c 2057 struct scatterlist *sglist, int nelems, int dir)
ba395927
KA
2058{
2059 int i;
c03ab37c 2060 struct scatterlist *sg;
ba395927 2061
c03ab37c 2062 for_each_sg(sglist, sg, nelems, i) {
12d4d40e 2063 BUG_ON(!sg_page(sg));
c03ab37c
FT
2064 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2065 sg->dma_length = sg->length;
ba395927
KA
2066 }
2067 return nelems;
2068}
2069
c03ab37c
FT
2070static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2071 int nelems, int dir)
ba395927
KA
2072{
2073 void *addr;
2074 int i;
ba395927
KA
2075 struct pci_dev *pdev = to_pci_dev(hwdev);
2076 struct dmar_domain *domain;
f76aec76
KA
2077 size_t size = 0;
2078 int prot = 0;
2079 size_t offset = 0;
2080 struct iova *iova = NULL;
2081 int ret;
c03ab37c 2082 struct scatterlist *sg;
f76aec76 2083 unsigned long start_addr;
ba395927
KA
2084
2085 BUG_ON(dir == DMA_NONE);
358dd8ac 2086 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
c03ab37c 2087 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
ba395927 2088
f76aec76
KA
2089 domain = get_valid_domain_for_dev(pdev);
2090 if (!domain)
2091 return 0;
2092
c03ab37c 2093 for_each_sg(sglist, sg, nelems, i) {
ba395927 2094 addr = SG_ENT_VIRT_ADDRESS(sg);
f76aec76
KA
2095 addr = (void *)virt_to_phys(addr);
2096 size += aligned_size((u64)addr, sg->length);
2097 }
2098
2099 iova = __intel_alloc_iova(hwdev, domain, size);
2100 if (!iova) {
c03ab37c 2101 sglist->dma_length = 0;
f76aec76
KA
2102 return 0;
2103 }
2104
2105 /*
2106 * Check if DMAR supports zero-length reads on write only
2107 * mappings..
2108 */
2109 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2110 !cap_zlr(domain->iommu->cap))
2111 prot |= DMA_PTE_READ;
2112 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2113 prot |= DMA_PTE_WRITE;
2114
2115 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2116 offset = 0;
c03ab37c 2117 for_each_sg(sglist, sg, nelems, i) {
f76aec76
KA
2118 addr = SG_ENT_VIRT_ADDRESS(sg);
2119 addr = (void *)virt_to_phys(addr);
2120 size = aligned_size((u64)addr, sg->length);
2121 ret = domain_page_mapping(domain, start_addr + offset,
2122 ((u64)addr) & PAGE_MASK_4K,
2123 size, prot);
2124 if (ret) {
2125 /* clear the page */
2126 dma_pte_clear_range(domain, start_addr,
2127 start_addr + offset);
2128 /* free page tables */
2129 dma_pte_free_pagetable(domain, start_addr,
2130 start_addr + offset);
2131 /* free iova */
2132 __free_iova(&domain->iovad, iova);
ba395927
KA
2133 return 0;
2134 }
f76aec76
KA
2135 sg->dma_address = start_addr + offset +
2136 ((u64)addr & (~PAGE_MASK_4K));
ba395927 2137 sg->dma_length = sg->length;
f76aec76 2138 offset += size;
ba395927
KA
2139 }
2140
ba395927 2141 /* it's a non-present to present mapping */
f76aec76
KA
2142 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2143 start_addr, offset >> PAGE_SHIFT_4K, 1))
ba395927
KA
2144 iommu_flush_write_buffer(domain->iommu);
2145 return nelems;
2146}
2147
2148static struct dma_mapping_ops intel_dma_ops = {
2149 .alloc_coherent = intel_alloc_coherent,
2150 .free_coherent = intel_free_coherent,
2151 .map_single = intel_map_single,
2152 .unmap_single = intel_unmap_single,
2153 .map_sg = intel_map_sg,
2154 .unmap_sg = intel_unmap_sg,
2155};
2156
2157static inline int iommu_domain_cache_init(void)
2158{
2159 int ret = 0;
2160
2161 iommu_domain_cache = kmem_cache_create("iommu_domain",
2162 sizeof(struct dmar_domain),
2163 0,
2164 SLAB_HWCACHE_ALIGN,
2165
2166 NULL);
2167 if (!iommu_domain_cache) {
2168 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2169 ret = -ENOMEM;
2170 }
2171
2172 return ret;
2173}
2174
2175static inline int iommu_devinfo_cache_init(void)
2176{
2177 int ret = 0;
2178
2179 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2180 sizeof(struct device_domain_info),
2181 0,
2182 SLAB_HWCACHE_ALIGN,
2183
2184 NULL);
2185 if (!iommu_devinfo_cache) {
2186 printk(KERN_ERR "Couldn't create devinfo cache\n");
2187 ret = -ENOMEM;
2188 }
2189
2190 return ret;
2191}
2192
2193static inline int iommu_iova_cache_init(void)
2194{
2195 int ret = 0;
2196
2197 iommu_iova_cache = kmem_cache_create("iommu_iova",
2198 sizeof(struct iova),
2199 0,
2200 SLAB_HWCACHE_ALIGN,
2201
2202 NULL);
2203 if (!iommu_iova_cache) {
2204 printk(KERN_ERR "Couldn't create iova cache\n");
2205 ret = -ENOMEM;
2206 }
2207
2208 return ret;
2209}
2210
2211static int __init iommu_init_mempool(void)
2212{
2213 int ret;
2214 ret = iommu_iova_cache_init();
2215 if (ret)
2216 return ret;
2217
2218 ret = iommu_domain_cache_init();
2219 if (ret)
2220 goto domain_error;
2221
2222 ret = iommu_devinfo_cache_init();
2223 if (!ret)
2224 return ret;
2225
2226 kmem_cache_destroy(iommu_domain_cache);
2227domain_error:
2228 kmem_cache_destroy(iommu_iova_cache);
2229
2230 return -ENOMEM;
2231}
2232
2233static void __init iommu_exit_mempool(void)
2234{
2235 kmem_cache_destroy(iommu_devinfo_cache);
2236 kmem_cache_destroy(iommu_domain_cache);
2237 kmem_cache_destroy(iommu_iova_cache);
2238
2239}
2240
ba395927
KA
2241static void __init init_no_remapping_devices(void)
2242{
2243 struct dmar_drhd_unit *drhd;
2244
2245 for_each_drhd_unit(drhd) {
2246 if (!drhd->include_all) {
2247 int i;
2248 for (i = 0; i < drhd->devices_cnt; i++)
2249 if (drhd->devices[i] != NULL)
2250 break;
2251 /* ignore DMAR unit if no pci devices exist */
2252 if (i == drhd->devices_cnt)
2253 drhd->ignored = 1;
2254 }
2255 }
2256
2257 if (dmar_map_gfx)
2258 return;
2259
2260 for_each_drhd_unit(drhd) {
2261 int i;
2262 if (drhd->ignored || drhd->include_all)
2263 continue;
2264
2265 for (i = 0; i < drhd->devices_cnt; i++)
2266 if (drhd->devices[i] &&
2267 !IS_GFX_DEVICE(drhd->devices[i]))
2268 break;
2269
2270 if (i < drhd->devices_cnt)
2271 continue;
2272
2273 /* bypass IOMMU if it is just for gfx devices */
2274 drhd->ignored = 1;
2275 for (i = 0; i < drhd->devices_cnt; i++) {
2276 if (!drhd->devices[i])
2277 continue;
358dd8ac 2278 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
ba395927
KA
2279 }
2280 }
2281}
2282
2283int __init intel_iommu_init(void)
2284{
2285 int ret = 0;
2286
ba395927
KA
2287 if (dmar_table_init())
2288 return -ENODEV;
2289
1886e8a9
SS
2290 if (dmar_dev_scope_init())
2291 return -ENODEV;
2292
2ae21010
SS
2293 /*
2294 * Check the need for DMA-remapping initialization now.
2295 * Above initialization will also be used by Interrupt-remapping.
2296 */
2297 if (no_iommu || swiotlb || dmar_disabled)
2298 return -ENODEV;
2299
ba395927
KA
2300 iommu_init_mempool();
2301 dmar_init_reserved_ranges();
2302
2303 init_no_remapping_devices();
2304
2305 ret = init_dmars();
2306 if (ret) {
2307 printk(KERN_ERR "IOMMU: dmar init failed\n");
2308 put_iova_domain(&reserved_iova_list);
2309 iommu_exit_mempool();
2310 return ret;
2311 }
2312 printk(KERN_INFO
2313 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2314
5e0d2a6f 2315 init_timer(&unmap_timer);
ba395927
KA
2316 force_iommu = 1;
2317 dma_ops = &intel_dma_ops;
2318 return 0;
2319}
e820482c 2320