[mirror_ubuntu-bionic-kernel.git] drivers/pci/intel-iommu.c (commit: "Add domain flag DOMAIN_FLAG_VIRTUAL_MACHINE")
1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE VTD_PAGE_SIZE
43 #define CONTEXT_SIZE VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START (0xfee00000)
49 #define IOAPIC_RANGE_END (0xfeefffff)
50 #define IOVA_START_ADDR (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64 * 0: Present
65 * 1-11: Reserved
66 * 12-63: Context Ptr (12 - (haw-1))
67 * 64-127: Reserved
68 */
69 struct root_entry {
70 u64 val;
71 u64 rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
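/*
 * Example: with a 4KiB VTD_PAGE_SIZE and 16-byte root entries this gives
 * ROOT_ENTRY_NR = 256, one root entry per PCI bus number.  A request from
 * bus 0x1a (an arbitrary example value) is looked up via root_entry[0x1a];
 * when the present bit is set, bits 12-63 hold that bus's context-table
 * pointer.
 */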
74 static inline bool root_present(struct root_entry *root)
75 {
76 return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80 root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84 root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90 return (struct context_entry *)
91 (root_present(root)?phys_to_virt(
92 root->val & VTD_PAGE_MASK) :
93 NULL);
94 }
95
96 /*
97 * low 64 bits:
98 * 0: present
99 * 1: fault processing disable
100 * 2-3: translation type
101 * 12-63: address space root
102 * high 64 bits:
103 * 0-2: address width
104 * 3-6: aval
105 * 8-23: domain id
106 */
107 struct context_entry {
108 u64 lo;
109 u64 hi;
110 };
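/*
 * Example encoding (illustrative values): a present context entry for
 * domain id 5, with a 3-level (agaw 1) page table rooted at physical
 * address 0x12340000, multi-level translation and fault processing
 * enabled, holds lo = 0x12340001 and hi = (5 << 8) | 1 = 0x501.
 */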
111
112 static inline bool context_present(struct context_entry *context)
113 {
114 return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118 context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123 context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129 unsigned long value)
130 {
131 context->lo &= (((u64)-1) << 4) | 3;
132 context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136 unsigned long value)
137 {
138 context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142 unsigned long value)
143 {
144 context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148 unsigned long value)
149 {
150 context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155 context->lo = 0;
156 context->hi = 0;
157 }
158
159 /*
160 * 0: readable
161 * 1: writable
162 * 2-6: reserved
163 * 7: super page
164 * 8-11: available
165 * 12-63: Host physical address
166 */
167 struct dma_pte {
168 u64 val;
169 };
170
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173 pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178 pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183 pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188 pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193 return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198 pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203 return (pte->val & 3) != 0;
204 }
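/*
 * Example (illustrative address): a PTE mapping host page 0x12345000 for
 * both read and write holds val = 0x12345000 | DMA_PTE_READ | DMA_PTE_WRITE
 * = 0x12345003, so dma_pte_present() sees bits 0-1 set and dma_pte_addr()
 * masks them off again to recover 0x12345000.
 */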
205
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 /* domain represents a virtual machine; more than one device
210 * across iommus may be owned by one domain, e.g. a kvm guest.
211 */
212 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
213
214 struct dmar_domain {
215 int id; /* domain id */
216 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
217
218 struct list_head devices; /* all devices' list */
219 struct iova_domain iovad; /* iova's that belong to this domain */
220
221 struct dma_pte *pgd; /* virtual address */
222 spinlock_t mapping_lock; /* page table lock */
223 int gaw; /* max guest address width */
224
225 /* adjusted guest address width, 0 is level 2 30-bit */
226 int agaw;
227
228 int flags; /* flags to find out type of domain */
229
230 int iommu_coherency;/* indicate coherency of iommu access */
231 };
232
233 /* PCI domain-device relationship */
234 struct device_domain_info {
235 struct list_head link; /* link to domain siblings */
236 struct list_head global; /* link to global list */
237 u8 bus; /* PCI bus number */
238 u8 devfn; /* PCI devfn number */
239 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
240 struct dmar_domain *domain; /* pointer to domain */
241 };
242
243 static void flush_unmaps_timeout(unsigned long data);
244
245 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
246
247 #define HIGH_WATER_MARK 250
248 struct deferred_flush_tables {
249 int next;
250 struct iova *iova[HIGH_WATER_MARK];
251 struct dmar_domain *domain[HIGH_WATER_MARK];
252 };
253
254 static struct deferred_flush_tables *deferred_flush;
255
256 /* bitmap for indexing intel_iommus */
257 static int g_num_of_iommus;
258
259 static DEFINE_SPINLOCK(async_umap_flush_lock);
260 static LIST_HEAD(unmaps_to_do);
261
262 static int timer_on;
263 static long list_size;
264
265 static void domain_remove_dev_info(struct dmar_domain *domain);
266
267 int dmar_disabled;
268 static int __initdata dmar_map_gfx = 1;
269 static int dmar_forcedac;
270 static int intel_iommu_strict;
271
272 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
273 static DEFINE_SPINLOCK(device_domain_lock);
274 static LIST_HEAD(device_domain_list);
275
276 static int __init intel_iommu_setup(char *str)
277 {
278 if (!str)
279 return -EINVAL;
280 while (*str) {
281 if (!strncmp(str, "off", 3)) {
282 dmar_disabled = 1;
283 printk(KERN_INFO"Intel-IOMMU: disabled\n");
284 } else if (!strncmp(str, "igfx_off", 8)) {
285 dmar_map_gfx = 0;
286 printk(KERN_INFO
287 "Intel-IOMMU: disable GFX device mapping\n");
288 } else if (!strncmp(str, "forcedac", 8)) {
289 printk(KERN_INFO
290 "Intel-IOMMU: Forcing DAC for PCI devices\n");
291 dmar_forcedac = 1;
292 } else if (!strncmp(str, "strict", 6)) {
293 printk(KERN_INFO
294 "Intel-IOMMU: disable batched IOTLB flush\n");
295 intel_iommu_strict = 1;
296 }
297
298 str += strcspn(str, ",");
299 while (*str == ',')
300 str++;
301 }
302 return 0;
303 }
304 __setup("intel_iommu=", intel_iommu_setup);
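/*
 * Example boot-time usage of the parser above (keywords may be combined,
 * comma separated):
 *
 *	intel_iommu=off			disable Intel IOMMU support
 *	intel_iommu=igfx_off,strict	disable GFX device mapping and
 *					batched IOTLB flushing
 */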
305
306 static struct kmem_cache *iommu_domain_cache;
307 static struct kmem_cache *iommu_devinfo_cache;
308 static struct kmem_cache *iommu_iova_cache;
309
310 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
311 {
312 unsigned int flags;
313 void *vaddr;
314
315 /* trying to avoid low memory issues */
316 flags = current->flags & PF_MEMALLOC;
317 current->flags |= PF_MEMALLOC;
318 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
319 current->flags &= (~PF_MEMALLOC | flags);
320 return vaddr;
321 }
322
323
324 static inline void *alloc_pgtable_page(void)
325 {
326 unsigned int flags;
327 void *vaddr;
328
329 /* trying to avoid low memory issues */
330 flags = current->flags & PF_MEMALLOC;
331 current->flags |= PF_MEMALLOC;
332 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
333 current->flags &= (~PF_MEMALLOC | flags);
334 return vaddr;
335 }
336
337 static inline void free_pgtable_page(void *vaddr)
338 {
339 free_page((unsigned long)vaddr);
340 }
341
342 static inline void *alloc_domain_mem(void)
343 {
344 return iommu_kmem_cache_alloc(iommu_domain_cache);
345 }
346
347 static void free_domain_mem(void *vaddr)
348 {
349 kmem_cache_free(iommu_domain_cache, vaddr);
350 }
351
352 static inline void * alloc_devinfo_mem(void)
353 {
354 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
355 }
356
357 static inline void free_devinfo_mem(void *vaddr)
358 {
359 kmem_cache_free(iommu_devinfo_cache, vaddr);
360 }
361
362 struct iova *alloc_iova_mem(void)
363 {
364 return iommu_kmem_cache_alloc(iommu_iova_cache);
365 }
366
367 void free_iova_mem(struct iova *iova)
368 {
369 kmem_cache_free(iommu_iova_cache, iova);
370 }
371
372
373 static inline int width_to_agaw(int width);
374
375 /* calculate agaw for each iommu.
376 * "SAGAW" may be different across iommus; use the default agaw and
377 * fall back to a smaller supported agaw for iommus that don't support it.
378 */
379 int iommu_calculate_agaw(struct intel_iommu *iommu)
380 {
381 unsigned long sagaw;
382 int agaw = -1;
383
384 sagaw = cap_sagaw(iommu->cap);
385 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
386 agaw >= 0; agaw--) {
387 if (test_bit(agaw, &sagaw))
388 break;
389 }
390
391 return agaw;
392 }
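/*
 * Example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48, so the loop starts at
 * width_to_agaw(48) = (48 - 30) / 9 = 2 (a 4-level table).  If bit 2 of
 * this iommu's SAGAW field is clear, it falls back to agaw 1 (39-bit) and
 * then agaw 0 (30-bit), returning -1 if none of them is supported.
 */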
393
394 /* in native case, each domain is related to only one iommu */
395 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
396 {
397 int iommu_id;
398
399 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
400
401 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
402 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
403 return NULL;
404
405 return g_iommus[iommu_id];
406 }
407
408 /* "Coherency" capability may be different across iommus */
409 static void domain_update_iommu_coherency(struct dmar_domain *domain)
410 {
411 int i;
412
413 domain->iommu_coherency = 1;
414
415 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
416 for (; i < g_num_of_iommus; ) {
417 if (!ecap_coherent(g_iommus[i]->ecap)) {
418 domain->iommu_coherency = 0;
419 break;
420 }
421 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
422 }
423 }
424
425 /* Gets context entry for a given bus and devfn */
426 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
427 u8 bus, u8 devfn)
428 {
429 struct root_entry *root;
430 struct context_entry *context;
431 unsigned long phy_addr;
432 unsigned long flags;
433
434 spin_lock_irqsave(&iommu->lock, flags);
435 root = &iommu->root_entry[bus];
436 context = get_context_addr_from_root(root);
437 if (!context) {
438 context = (struct context_entry *)alloc_pgtable_page();
439 if (!context) {
440 spin_unlock_irqrestore(&iommu->lock, flags);
441 return NULL;
442 }
443 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
444 phy_addr = virt_to_phys((void *)context);
445 set_root_value(root, phy_addr);
446 set_root_present(root);
447 __iommu_flush_cache(iommu, root, sizeof(*root));
448 }
449 spin_unlock_irqrestore(&iommu->lock, flags);
450 return &context[devfn];
451 }
452
453 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
454 {
455 struct root_entry *root;
456 struct context_entry *context;
457 int ret;
458 unsigned long flags;
459
460 spin_lock_irqsave(&iommu->lock, flags);
461 root = &iommu->root_entry[bus];
462 context = get_context_addr_from_root(root);
463 if (!context) {
464 ret = 0;
465 goto out;
466 }
467 ret = context_present(&context[devfn]);
468 out:
469 spin_unlock_irqrestore(&iommu->lock, flags);
470 return ret;
471 }
472
473 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
474 {
475 struct root_entry *root;
476 struct context_entry *context;
477 unsigned long flags;
478
479 spin_lock_irqsave(&iommu->lock, flags);
480 root = &iommu->root_entry[bus];
481 context = get_context_addr_from_root(root);
482 if (context) {
483 context_clear_entry(&context[devfn]);
484 __iommu_flush_cache(iommu, &context[devfn], \
485 sizeof(*context));
486 }
487 spin_unlock_irqrestore(&iommu->lock, flags);
488 }
489
490 static void free_context_table(struct intel_iommu *iommu)
491 {
492 struct root_entry *root;
493 int i;
494 unsigned long flags;
495 struct context_entry *context;
496
497 spin_lock_irqsave(&iommu->lock, flags);
498 if (!iommu->root_entry) {
499 goto out;
500 }
501 for (i = 0; i < ROOT_ENTRY_NR; i++) {
502 root = &iommu->root_entry[i];
503 context = get_context_addr_from_root(root);
504 if (context)
505 free_pgtable_page(context);
506 }
507 free_pgtable_page(iommu->root_entry);
508 iommu->root_entry = NULL;
509 out:
510 spin_unlock_irqrestore(&iommu->lock, flags);
511 }
512
513 /* page table handling */
514 #define LEVEL_STRIDE (9)
515 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
516
517 static inline int agaw_to_level(int agaw)
518 {
519 return agaw + 2;
520 }
521
522 static inline int agaw_to_width(int agaw)
523 {
524 return 30 + agaw * LEVEL_STRIDE;
525
526 }
527
528 static inline int width_to_agaw(int width)
529 {
530 return (width - 30) / LEVEL_STRIDE;
531 }
532
533 static inline unsigned int level_to_offset_bits(int level)
534 {
535 return (12 + (level - 1) * LEVEL_STRIDE);
536 }
537
538 static inline int address_level_offset(u64 addr, int level)
539 {
540 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
541 }
542
543 static inline u64 level_mask(int level)
544 {
545 return ((u64)-1 << level_to_offset_bits(level));
546 }
547
548 static inline u64 level_size(int level)
549 {
550 return ((u64)1 << level_to_offset_bits(level));
551 }
552
553 static inline u64 align_to_level(u64 addr, int level)
554 {
555 return ((addr + level_size(level) - 1) & level_mask(level));
556 }
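/*
 * Worked example for the helpers above: level 1 indexes address bits
 * 12-20 (level_size = 4KiB), level 2 bits 21-29 (2MiB), level 3 bits
 * 30-38 (1GiB) and level 4 bits 39-47 (512GiB).  A domain with agaw 2
 * therefore walks agaw_to_level(2) = 4 levels to cover its
 * agaw_to_width(2) = 48-bit address space.
 */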
557
558 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
559 {
560 int addr_width = agaw_to_width(domain->agaw);
561 struct dma_pte *parent, *pte = NULL;
562 int level = agaw_to_level(domain->agaw);
563 int offset;
564 unsigned long flags;
565 struct intel_iommu *iommu = domain_get_iommu(domain);
566
567 BUG_ON(!domain->pgd);
568
569 addr &= (((u64)1) << addr_width) - 1;
570 parent = domain->pgd;
571
572 spin_lock_irqsave(&domain->mapping_lock, flags);
573 while (level > 0) {
574 void *tmp_page;
575
576 offset = address_level_offset(addr, level);
577 pte = &parent[offset];
578 if (level == 1)
579 break;
580
581 if (!dma_pte_present(pte)) {
582 tmp_page = alloc_pgtable_page();
583
584 if (!tmp_page) {
585 spin_unlock_irqrestore(&domain->mapping_lock,
586 flags);
587 return NULL;
588 }
589 __iommu_flush_cache(iommu, tmp_page,
590 PAGE_SIZE);
591 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
592 /*
593 * high level table always sets r/w, last level page
594 * table control read/write
595 */
596 dma_set_pte_readable(pte);
597 dma_set_pte_writable(pte);
598 __iommu_flush_cache(iommu, pte, sizeof(*pte));
599 }
600 parent = phys_to_virt(dma_pte_addr(pte));
601 level--;
602 }
603
604 spin_unlock_irqrestore(&domain->mapping_lock, flags);
605 return pte;
606 }
607
608 /* return address's pte at specific level */
609 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
610 int level)
611 {
612 struct dma_pte *parent, *pte = NULL;
613 int total = agaw_to_level(domain->agaw);
614 int offset;
615
616 parent = domain->pgd;
617 while (level <= total) {
618 offset = address_level_offset(addr, total);
619 pte = &parent[offset];
620 if (level == total)
621 return pte;
622
623 if (!dma_pte_present(pte))
624 break;
625 parent = phys_to_virt(dma_pte_addr(pte));
626 total--;
627 }
628 return NULL;
629 }
630
631 /* clear one page's page table */
632 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
633 {
634 struct dma_pte *pte = NULL;
635 struct intel_iommu *iommu = domain_get_iommu(domain);
636
637 /* get last level pte */
638 pte = dma_addr_level_pte(domain, addr, 1);
639
640 if (pte) {
641 dma_clear_pte(pte);
642 __iommu_flush_cache(iommu, pte, sizeof(*pte));
643 }
644 }
645
646 /* clear last level pte, a tlb flush should be followed */
647 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
648 {
649 int addr_width = agaw_to_width(domain->agaw);
650
651 start &= (((u64)1) << addr_width) - 1;
652 end &= (((u64)1) << addr_width) - 1;
653 /* in case it's a partial page */
654 start = PAGE_ALIGN(start);
655 end &= PAGE_MASK;
656
657 /* we don't need lock here, nobody else touches the iova range */
658 while (start < end) {
659 dma_pte_clear_one(domain, start);
660 start += VTD_PAGE_SIZE;
661 }
662 }
663
664 /* free page table pages. last level pte should already be cleared */
665 static void dma_pte_free_pagetable(struct dmar_domain *domain,
666 u64 start, u64 end)
667 {
668 int addr_width = agaw_to_width(domain->agaw);
669 struct dma_pte *pte;
670 int total = agaw_to_level(domain->agaw);
671 int level;
672 u64 tmp;
673 struct intel_iommu *iommu = domain_get_iommu(domain);
674
675 start &= (((u64)1) << addr_width) - 1;
676 end &= (((u64)1) << addr_width) - 1;
677
678 /* we don't need lock here, nobody else touches the iova range */
679 level = 2;
680 while (level <= total) {
681 tmp = align_to_level(start, level);
682 if (tmp >= end || (tmp + level_size(level) > end))
683 return;
684
685 while (tmp < end) {
686 pte = dma_addr_level_pte(domain, tmp, level);
687 if (pte) {
688 free_pgtable_page(
689 phys_to_virt(dma_pte_addr(pte)));
690 dma_clear_pte(pte);
691 __iommu_flush_cache(iommu,
692 pte, sizeof(*pte));
693 }
694 tmp += level_size(level);
695 }
696 level++;
697 }
698 /* free pgd */
699 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
700 free_pgtable_page(domain->pgd);
701 domain->pgd = NULL;
702 }
703 }
704
705 /* iommu handling */
706 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
707 {
708 struct root_entry *root;
709 unsigned long flags;
710
711 root = (struct root_entry *)alloc_pgtable_page();
712 if (!root)
713 return -ENOMEM;
714
715 __iommu_flush_cache(iommu, root, ROOT_SIZE);
716
717 spin_lock_irqsave(&iommu->lock, flags);
718 iommu->root_entry = root;
719 spin_unlock_irqrestore(&iommu->lock, flags);
720
721 return 0;
722 }
723
724 static void iommu_set_root_entry(struct intel_iommu *iommu)
725 {
726 void *addr;
727 u32 cmd, sts;
728 unsigned long flag;
729
730 addr = iommu->root_entry;
731
732 spin_lock_irqsave(&iommu->register_lock, flag);
733 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
734
735 cmd = iommu->gcmd | DMA_GCMD_SRTP;
736 writel(cmd, iommu->reg + DMAR_GCMD_REG);
737
738 /* Make sure hardware complete it */
739 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
740 readl, (sts & DMA_GSTS_RTPS), sts);
741
742 spin_unlock_irqrestore(&iommu->register_lock, flag);
743 }
744
745 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
746 {
747 u32 val;
748 unsigned long flag;
749
750 if (!cap_rwbf(iommu->cap))
751 return;
752 val = iommu->gcmd | DMA_GCMD_WBF;
753
754 spin_lock_irqsave(&iommu->register_lock, flag);
755 writel(val, iommu->reg + DMAR_GCMD_REG);
756
757 /* Make sure hardware complete it */
758 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
759 readl, (!(val & DMA_GSTS_WBFS)), val);
760
761 spin_unlock_irqrestore(&iommu->register_lock, flag);
762 }
763
764 /* return value determines whether we need a write buffer flush */
765 static int __iommu_flush_context(struct intel_iommu *iommu,
766 u16 did, u16 source_id, u8 function_mask, u64 type,
767 int non_present_entry_flush)
768 {
769 u64 val = 0;
770 unsigned long flag;
771
772 /*
773 * In the non-present entry flush case, if hardware doesn't cache
774 * non-present entries we do nothing, and if hardware does cache non-present
775 * entries, we flush entries of domain 0 (the domain id is used to cache
776 * any non-present entries)
777 */
778 if (non_present_entry_flush) {
779 if (!cap_caching_mode(iommu->cap))
780 return 1;
781 else
782 did = 0;
783 }
784
785 switch (type) {
786 case DMA_CCMD_GLOBAL_INVL:
787 val = DMA_CCMD_GLOBAL_INVL;
788 break;
789 case DMA_CCMD_DOMAIN_INVL:
790 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
791 break;
792 case DMA_CCMD_DEVICE_INVL:
793 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
794 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
795 break;
796 default:
797 BUG();
798 }
799 val |= DMA_CCMD_ICC;
800
801 spin_lock_irqsave(&iommu->register_lock, flag);
802 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
803
804 /* Make sure hardware complete it */
805 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
806 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
807
808 spin_unlock_irqrestore(&iommu->register_lock, flag);
809
810 /* flush context entry will implicitly flush write buffer */
811 return 0;
812 }
813
814 /* return value determines whether we need a write buffer flush */
815 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
816 u64 addr, unsigned int size_order, u64 type,
817 int non_present_entry_flush)
818 {
819 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
820 u64 val = 0, val_iva = 0;
821 unsigned long flag;
822
823 /*
824 * In the non-present entry flush case, if hardware doesn't cache
825 * non-present entries we do nothing, and if hardware does cache non-present
826 * entries, we flush entries of domain 0 (the domain id is used to cache
827 * any non-present entries)
828 */
829 if (non_present_entry_flush) {
830 if (!cap_caching_mode(iommu->cap))
831 return 1;
832 else
833 did = 0;
834 }
835
836 switch (type) {
837 case DMA_TLB_GLOBAL_FLUSH:
838 /* global flush doesn't need set IVA_REG */
839 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
840 break;
841 case DMA_TLB_DSI_FLUSH:
842 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
843 break;
844 case DMA_TLB_PSI_FLUSH:
845 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
846 /* Note: always flush non-leaf currently */
847 val_iva = size_order | addr;
848 break;
849 default:
850 BUG();
851 }
852 /* Note: set drain read/write */
853 #if 0
854 /*
855 * This is probably to be super secure.. Looks like we can
856 * ignore it without any impact.
857 */
858 if (cap_read_drain(iommu->cap))
859 val |= DMA_TLB_READ_DRAIN;
860 #endif
861 if (cap_write_drain(iommu->cap))
862 val |= DMA_TLB_WRITE_DRAIN;
863
864 spin_lock_irqsave(&iommu->register_lock, flag);
865 /* Note: Only uses first TLB reg currently */
866 if (val_iva)
867 dmar_writeq(iommu->reg + tlb_offset, val_iva);
868 dmar_writeq(iommu->reg + tlb_offset + 8, val);
869
870 /* Make sure hardware complete it */
871 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
872 dmar_readq, (!(val & DMA_TLB_IVT)), val);
873
874 spin_unlock_irqrestore(&iommu->register_lock, flag);
875
876 /* check IOTLB invalidation granularity */
877 if (DMA_TLB_IAIG(val) == 0)
878 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
879 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
880 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
881 (unsigned long long)DMA_TLB_IIRG(type),
882 (unsigned long long)DMA_TLB_IAIG(val));
883 /* flush iotlb entry will implicitly flush write buffer */
884 return 0;
885 }
886
887 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
888 u64 addr, unsigned int pages, int non_present_entry_flush)
889 {
890 unsigned int mask;
891
892 BUG_ON(addr & (~VTD_PAGE_MASK));
893 BUG_ON(pages == 0);
894
895 /* Fallback to domain selective flush if no PSI support */
896 if (!cap_pgsel_inv(iommu->cap))
897 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
898 DMA_TLB_DSI_FLUSH,
899 non_present_entry_flush);
900
901 /*
902 * PSI requires page size to be 2 ^ x, and the base address is naturally
903 * aligned to the size
904 */
905 mask = ilog2(__roundup_pow_of_two(pages));
906 /* Fallback to domain selective flush if size is too big */
907 if (mask > cap_max_amask_val(iommu->cap))
908 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
909 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
910
911 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
912 DMA_TLB_PSI_FLUSH,
913 non_present_entry_flush);
914 }
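/*
 * Example: flushing 5 pages gives mask = ilog2(__roundup_pow_of_two(5)) =
 * ilog2(8) = 3, i.e. the hardware invalidates a naturally aligned 8-page
 * region containing addr.  Were 3 larger than cap_max_amask_val(), the
 * code above would fall back to a domain-selective flush instead.
 */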
915
916 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
917 {
918 u32 pmen;
919 unsigned long flags;
920
921 spin_lock_irqsave(&iommu->register_lock, flags);
922 pmen = readl(iommu->reg + DMAR_PMEN_REG);
923 pmen &= ~DMA_PMEN_EPM;
924 writel(pmen, iommu->reg + DMAR_PMEN_REG);
925
926 /* wait for the protected region status bit to clear */
927 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
928 readl, !(pmen & DMA_PMEN_PRS), pmen);
929
930 spin_unlock_irqrestore(&iommu->register_lock, flags);
931 }
932
933 static int iommu_enable_translation(struct intel_iommu *iommu)
934 {
935 u32 sts;
936 unsigned long flags;
937
938 spin_lock_irqsave(&iommu->register_lock, flags);
939 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
940
941 /* Make sure hardware complete it */
942 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
943 readl, (sts & DMA_GSTS_TES), sts);
944
945 iommu->gcmd |= DMA_GCMD_TE;
946 spin_unlock_irqrestore(&iommu->register_lock, flags);
947 return 0;
948 }
949
950 static int iommu_disable_translation(struct intel_iommu *iommu)
951 {
952 u32 sts;
953 unsigned long flag;
954
955 spin_lock_irqsave(&iommu->register_lock, flag);
956 iommu->gcmd &= ~DMA_GCMD_TE;
957 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
958
959 /* Make sure hardware complete it */
960 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
961 readl, (!(sts & DMA_GSTS_TES)), sts);
962
963 spin_unlock_irqrestore(&iommu->register_lock, flag);
964 return 0;
965 }
966
967 /* iommu interrupt handling. Most stuff are MSI-like. */
968
969 static const char *fault_reason_strings[] =
970 {
971 "Software",
972 "Present bit in root entry is clear",
973 "Present bit in context entry is clear",
974 "Invalid context entry",
975 "Access beyond MGAW",
976 "PTE Write access is not set",
977 "PTE Read access is not set",
978 "Next page table ptr is invalid",
979 "Root table address invalid",
980 "Context table ptr is invalid",
981 "non-zero reserved fields in RTP",
982 "non-zero reserved fields in CTP",
983 "non-zero reserved fields in PTE",
984 };
985 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
986
987 const char *dmar_get_fault_reason(u8 fault_reason)
988 {
989 if (fault_reason > MAX_FAULT_REASON_IDX)
990 return "Unknown";
991 else
992 return fault_reason_strings[fault_reason];
993 }
994
995 void dmar_msi_unmask(unsigned int irq)
996 {
997 struct intel_iommu *iommu = get_irq_data(irq);
998 unsigned long flag;
999
1000 /* unmask it */
1001 spin_lock_irqsave(&iommu->register_lock, flag);
1002 writel(0, iommu->reg + DMAR_FECTL_REG);
1003 /* Read a reg to force flush the post write */
1004 readl(iommu->reg + DMAR_FECTL_REG);
1005 spin_unlock_irqrestore(&iommu->register_lock, flag);
1006 }
1007
1008 void dmar_msi_mask(unsigned int irq)
1009 {
1010 unsigned long flag;
1011 struct intel_iommu *iommu = get_irq_data(irq);
1012
1013 /* mask it */
1014 spin_lock_irqsave(&iommu->register_lock, flag);
1015 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1016 /* Read a reg to force flush the post write */
1017 readl(iommu->reg + DMAR_FECTL_REG);
1018 spin_unlock_irqrestore(&iommu->register_lock, flag);
1019 }
1020
1021 void dmar_msi_write(int irq, struct msi_msg *msg)
1022 {
1023 struct intel_iommu *iommu = get_irq_data(irq);
1024 unsigned long flag;
1025
1026 spin_lock_irqsave(&iommu->register_lock, flag);
1027 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1028 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1029 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1030 spin_unlock_irqrestore(&iommu->register_lock, flag);
1031 }
1032
1033 void dmar_msi_read(int irq, struct msi_msg *msg)
1034 {
1035 struct intel_iommu *iommu = get_irq_data(irq);
1036 unsigned long flag;
1037
1038 spin_lock_irqsave(&iommu->register_lock, flag);
1039 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1040 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1041 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1042 spin_unlock_irqrestore(&iommu->register_lock, flag);
1043 }
1044
1045 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1046 u8 fault_reason, u16 source_id, unsigned long long addr)
1047 {
1048 const char *reason;
1049
1050 reason = dmar_get_fault_reason(fault_reason);
1051
1052 printk(KERN_ERR
1053 "DMAR:[%s] Request device [%02x:%02x.%d] "
1054 "fault addr %llx \n"
1055 "DMAR:[fault reason %02d] %s\n",
1056 (type ? "DMA Read" : "DMA Write"),
1057 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1058 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1059 return 0;
1060 }
1061
1062 #define PRIMARY_FAULT_REG_LEN (16)
1063 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1064 {
1065 struct intel_iommu *iommu = dev_id;
1066 int reg, fault_index;
1067 u32 fault_status;
1068 unsigned long flag;
1069
1070 spin_lock_irqsave(&iommu->register_lock, flag);
1071 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1072
1073 /* TBD: ignore advanced fault log currently */
1074 if (!(fault_status & DMA_FSTS_PPF))
1075 goto clear_overflow;
1076
1077 fault_index = dma_fsts_fault_record_index(fault_status);
1078 reg = cap_fault_reg_offset(iommu->cap);
1079 while (1) {
1080 u8 fault_reason;
1081 u16 source_id;
1082 u64 guest_addr;
1083 int type;
1084 u32 data;
1085
1086 /* highest 32 bits */
1087 data = readl(iommu->reg + reg +
1088 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1089 if (!(data & DMA_FRCD_F))
1090 break;
1091
1092 fault_reason = dma_frcd_fault_reason(data);
1093 type = dma_frcd_type(data);
1094
1095 data = readl(iommu->reg + reg +
1096 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1097 source_id = dma_frcd_source_id(data);
1098
1099 guest_addr = dmar_readq(iommu->reg + reg +
1100 fault_index * PRIMARY_FAULT_REG_LEN);
1101 guest_addr = dma_frcd_page_addr(guest_addr);
1102 /* clear the fault */
1103 writel(DMA_FRCD_F, iommu->reg + reg +
1104 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1105
1106 spin_unlock_irqrestore(&iommu->register_lock, flag);
1107
1108 iommu_page_fault_do_one(iommu, type, fault_reason,
1109 source_id, guest_addr);
1110
1111 fault_index++;
1112 if (fault_index > cap_num_fault_regs(iommu->cap))
1113 fault_index = 0;
1114 spin_lock_irqsave(&iommu->register_lock, flag);
1115 }
1116 clear_overflow:
1117 /* clear primary fault overflow */
1118 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1119 if (fault_status & DMA_FSTS_PFO)
1120 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1121
1122 spin_unlock_irqrestore(&iommu->register_lock, flag);
1123 return IRQ_HANDLED;
1124 }
1125
1126 int dmar_set_interrupt(struct intel_iommu *iommu)
1127 {
1128 int irq, ret;
1129
1130 irq = create_irq();
1131 if (!irq) {
1132 printk(KERN_ERR "IOMMU: no free vectors\n");
1133 return -EINVAL;
1134 }
1135
1136 set_irq_data(irq, iommu);
1137 iommu->irq = irq;
1138
1139 ret = arch_setup_dmar_msi(irq);
1140 if (ret) {
1141 set_irq_data(irq, NULL);
1142 iommu->irq = 0;
1143 destroy_irq(irq);
1144 return 0;
1145 }
1146
1147 /* Force fault register is cleared */
1148 iommu_page_fault(irq, iommu);
1149
1150 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1151 if (ret)
1152 printk(KERN_ERR "IOMMU: can't request irq\n");
1153 return ret;
1154 }
1155
1156 static int iommu_init_domains(struct intel_iommu *iommu)
1157 {
1158 unsigned long ndomains;
1159 unsigned long nlongs;
1160
1161 ndomains = cap_ndoms(iommu->cap);
1162 pr_debug("Number of Domains supported <%ld>\n", ndomains);
1163 nlongs = BITS_TO_LONGS(ndomains);
1164
1165 /* TBD: there might be 64K domains,
1166 * consider other allocation for future chip
1167 */
1168 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1169 if (!iommu->domain_ids) {
1170 printk(KERN_ERR "Allocating domain id array failed\n");
1171 return -ENOMEM;
1172 }
1173 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1174 GFP_KERNEL);
1175 if (!iommu->domains) {
1176 printk(KERN_ERR "Allocating domain array failed\n");
1177 kfree(iommu->domain_ids);
1178 return -ENOMEM;
1179 }
1180
1181 spin_lock_init(&iommu->lock);
1182
1183 /*
1184 * if Caching mode is set, then invalid translations are tagged
1185 * with domainid 0. Hence we need to pre-allocate it.
1186 */
1187 if (cap_caching_mode(iommu->cap))
1188 set_bit(0, iommu->domain_ids);
1189 return 0;
1190 }
1191
1192
1193 static void domain_exit(struct dmar_domain *domain);
1194
1195 void free_dmar_iommu(struct intel_iommu *iommu)
1196 {
1197 struct dmar_domain *domain;
1198 int i;
1199
1200 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1201 for (; i < cap_ndoms(iommu->cap); ) {
1202 domain = iommu->domains[i];
1203 clear_bit(i, iommu->domain_ids);
1204 domain_exit(domain);
1205 i = find_next_bit(iommu->domain_ids,
1206 cap_ndoms(iommu->cap), i+1);
1207 }
1208
1209 if (iommu->gcmd & DMA_GCMD_TE)
1210 iommu_disable_translation(iommu);
1211
1212 if (iommu->irq) {
1213 set_irq_data(iommu->irq, NULL);
1214 /* This will mask the irq */
1215 free_irq(iommu->irq, iommu);
1216 destroy_irq(iommu->irq);
1217 }
1218
1219 kfree(iommu->domains);
1220 kfree(iommu->domain_ids);
1221
1222 g_iommus[iommu->seq_id] = NULL;
1223
1224 /* if all iommus are freed, free g_iommus */
1225 for (i = 0; i < g_num_of_iommus; i++) {
1226 if (g_iommus[i])
1227 break;
1228 }
1229
1230 if (i == g_num_of_iommus)
1231 kfree(g_iommus);
1232
1233 /* free context mapping */
1234 free_context_table(iommu);
1235 }
1236
1237 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1238 {
1239 unsigned long num;
1240 unsigned long ndomains;
1241 struct dmar_domain *domain;
1242 unsigned long flags;
1243
1244 domain = alloc_domain_mem();
1245 if (!domain)
1246 return NULL;
1247
1248 ndomains = cap_ndoms(iommu->cap);
1249
1250 spin_lock_irqsave(&iommu->lock, flags);
1251 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1252 if (num >= ndomains) {
1253 spin_unlock_irqrestore(&iommu->lock, flags);
1254 free_domain_mem(domain);
1255 printk(KERN_ERR "IOMMU: no free domain ids\n");
1256 return NULL;
1257 }
1258
1259 set_bit(num, iommu->domain_ids);
1260 domain->id = num;
1261 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1262 set_bit(iommu->seq_id, &domain->iommu_bmp);
1263 domain->flags = 0;
1264 iommu->domains[num] = domain;
1265 spin_unlock_irqrestore(&iommu->lock, flags);
1266
1267 return domain;
1268 }
1269
1270 static void iommu_free_domain(struct dmar_domain *domain)
1271 {
1272 unsigned long flags;
1273 struct intel_iommu *iommu;
1274
1275 iommu = domain_get_iommu(domain);
1276
1277 spin_lock_irqsave(&iommu->lock, flags);
1278 clear_bit(domain->id, iommu->domain_ids);
1279 spin_unlock_irqrestore(&iommu->lock, flags);
1280 }
1281
1282 static struct iova_domain reserved_iova_list;
1283 static struct lock_class_key reserved_alloc_key;
1284 static struct lock_class_key reserved_rbtree_key;
1285
1286 static void dmar_init_reserved_ranges(void)
1287 {
1288 struct pci_dev *pdev = NULL;
1289 struct iova *iova;
1290 int i;
1291 u64 addr, size;
1292
1293 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1294
1295 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1296 &reserved_alloc_key);
1297 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1298 &reserved_rbtree_key);
1299
1300 /* IOAPIC ranges shouldn't be accessed by DMA */
1301 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1302 IOVA_PFN(IOAPIC_RANGE_END));
1303 if (!iova)
1304 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1305
1306 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1307 for_each_pci_dev(pdev) {
1308 struct resource *r;
1309
1310 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1311 r = &pdev->resource[i];
1312 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1313 continue;
1314 addr = r->start;
1315 addr &= PAGE_MASK;
1316 size = r->end - addr;
1317 size = PAGE_ALIGN(size);
1318 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1319 IOVA_PFN(size + addr) - 1);
1320 if (!iova)
1321 printk(KERN_ERR "Reserve iova failed\n");
1322 }
1323 }
1324
1325 }
1326
1327 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1328 {
1329 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1330 }
1331
1332 static inline int guestwidth_to_adjustwidth(int gaw)
1333 {
1334 int agaw;
1335 int r = (gaw - 12) % 9;
1336
1337 if (r == 0)
1338 agaw = gaw;
1339 else
1340 agaw = gaw + 9 - r;
1341 if (agaw > 64)
1342 agaw = 64;
1343 return agaw;
1344 }
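/*
 * Examples: gaw = 48 gives r = (48 - 12) % 9 = 0, so agaw stays 48;
 * gaw = 40 gives r = 1 and is rounded up to 40 + 9 - 1 = 48; any result
 * above 64 is clamped to 64.
 */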
1345
1346 static int domain_init(struct dmar_domain *domain, int guest_width)
1347 {
1348 struct intel_iommu *iommu;
1349 int adjust_width, agaw;
1350 unsigned long sagaw;
1351
1352 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1353 spin_lock_init(&domain->mapping_lock);
1354
1355 domain_reserve_special_ranges(domain);
1356
1357 /* calculate AGAW */
1358 iommu = domain_get_iommu(domain);
1359 if (guest_width > cap_mgaw(iommu->cap))
1360 guest_width = cap_mgaw(iommu->cap);
1361 domain->gaw = guest_width;
1362 adjust_width = guestwidth_to_adjustwidth(guest_width);
1363 agaw = width_to_agaw(adjust_width);
1364 sagaw = cap_sagaw(iommu->cap);
1365 if (!test_bit(agaw, &sagaw)) {
1366 /* hardware doesn't support it, choose a bigger one */
1367 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1368 agaw = find_next_bit(&sagaw, 5, agaw);
1369 if (agaw >= 5)
1370 return -ENODEV;
1371 }
1372 domain->agaw = agaw;
1373 INIT_LIST_HEAD(&domain->devices);
1374
1375 if (ecap_coherent(iommu->ecap))
1376 domain->iommu_coherency = 1;
1377 else
1378 domain->iommu_coherency = 0;
1379
1380 /* always allocate the top pgd */
1381 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1382 if (!domain->pgd)
1383 return -ENOMEM;
1384 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1385 return 0;
1386 }
1387
1388 static void domain_exit(struct dmar_domain *domain)
1389 {
1390 u64 end;
1391
1392 /* Domain 0 is reserved, so dont process it */
1393 if (!domain)
1394 return;
1395
1396 domain_remove_dev_info(domain);
1397 /* destroy iovas */
1398 put_iova_domain(&domain->iovad);
1399 end = DOMAIN_MAX_ADDR(domain->gaw);
1400 end = end & (~PAGE_MASK);
1401
1402 /* clear ptes */
1403 dma_pte_clear_range(domain, 0, end);
1404
1405 /* free page tables */
1406 dma_pte_free_pagetable(domain, 0, end);
1407
1408 iommu_free_domain(domain);
1409 free_domain_mem(domain);
1410 }
1411
1412 static int domain_context_mapping_one(struct dmar_domain *domain,
1413 u8 bus, u8 devfn)
1414 {
1415 struct context_entry *context;
1416 struct intel_iommu *iommu = domain_get_iommu(domain);
1417 unsigned long flags;
1418
1419 pr_debug("Set context mapping for %02x:%02x.%d\n",
1420 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1421 BUG_ON(!domain->pgd);
1422 context = device_to_context_entry(iommu, bus, devfn);
1423 if (!context)
1424 return -ENOMEM;
1425 spin_lock_irqsave(&iommu->lock, flags);
1426 if (context_present(context)) {
1427 spin_unlock_irqrestore(&iommu->lock, flags);
1428 return 0;
1429 }
1430
1431 context_set_domain_id(context, domain->id);
1432 context_set_address_width(context, domain->agaw);
1433 context_set_address_root(context, virt_to_phys(domain->pgd));
1434 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1435 context_set_fault_enable(context);
1436 context_set_present(context);
1437 __iommu_flush_cache(iommu, context, sizeof(*context));
1438
1439 /* it's a non-present to present mapping */
1440 if (iommu->flush.flush_context(iommu, domain->id,
1441 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1442 DMA_CCMD_DEVICE_INVL, 1))
1443 iommu_flush_write_buffer(iommu);
1444 else
1445 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1446
1447 spin_unlock_irqrestore(&iommu->lock, flags);
1448 return 0;
1449 }
1450
1451 static int
1452 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1453 {
1454 int ret;
1455 struct pci_dev *tmp, *parent;
1456
1457 ret = domain_context_mapping_one(domain, pdev->bus->number,
1458 pdev->devfn);
1459 if (ret)
1460 return ret;
1461
1462 /* dependent device mapping */
1463 tmp = pci_find_upstream_pcie_bridge(pdev);
1464 if (!tmp)
1465 return 0;
1466 /* Secondary interface's bus number and devfn 0 */
1467 parent = pdev->bus->self;
1468 while (parent != tmp) {
1469 ret = domain_context_mapping_one(domain, parent->bus->number,
1470 parent->devfn);
1471 if (ret)
1472 return ret;
1473 parent = parent->bus->self;
1474 }
1475 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1476 return domain_context_mapping_one(domain,
1477 tmp->subordinate->number, 0);
1478 else /* this is a legacy PCI bridge */
1479 return domain_context_mapping_one(domain,
1480 tmp->bus->number, tmp->devfn);
1481 }
1482
1483 static int domain_context_mapped(struct dmar_domain *domain,
1484 struct pci_dev *pdev)
1485 {
1486 int ret;
1487 struct pci_dev *tmp, *parent;
1488 struct intel_iommu *iommu = domain_get_iommu(domain);
1489
1490 ret = device_context_mapped(iommu,
1491 pdev->bus->number, pdev->devfn);
1492 if (!ret)
1493 return ret;
1494 /* dependent device mapping */
1495 tmp = pci_find_upstream_pcie_bridge(pdev);
1496 if (!tmp)
1497 return ret;
1498 /* Secondary interface's bus number and devfn 0 */
1499 parent = pdev->bus->self;
1500 while (parent != tmp) {
1501 ret = device_context_mapped(iommu, parent->bus->number,
1502 parent->devfn);
1503 if (!ret)
1504 return ret;
1505 parent = parent->bus->self;
1506 }
1507 if (tmp->is_pcie)
1508 return device_context_mapped(iommu,
1509 tmp->subordinate->number, 0);
1510 else
1511 return device_context_mapped(iommu,
1512 tmp->bus->number, tmp->devfn);
1513 }
1514
1515 static int
1516 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1517 u64 hpa, size_t size, int prot)
1518 {
1519 u64 start_pfn, end_pfn;
1520 struct dma_pte *pte;
1521 int index;
1522 int addr_width = agaw_to_width(domain->agaw);
1523 struct intel_iommu *iommu = domain_get_iommu(domain);
1524
1525 hpa &= (((u64)1) << addr_width) - 1;
1526
1527 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1528 return -EINVAL;
1529 iova &= PAGE_MASK;
1530 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1531 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1532 index = 0;
1533 while (start_pfn < end_pfn) {
1534 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1535 if (!pte)
1536 return -ENOMEM;
1537 /* We don't need lock here, nobody else
1538 * touches the iova range
1539 */
1540 BUG_ON(dma_pte_addr(pte));
1541 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1542 dma_set_pte_prot(pte, prot);
1543 __iommu_flush_cache(iommu, pte, sizeof(*pte));
1544 start_pfn++;
1545 index++;
1546 }
1547 return 0;
1548 }
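/*
 * Example (illustrative values): mapping hpa = 0x1000 with size = 0x3000
 * gives start_pfn = 1 and end_pfn = VTD_PAGE_ALIGN(0x4000) >> VTD_PAGE_SHIFT
 * = 4, so the loop above writes three PTEs covering iova, iova + 4KiB and
 * iova + 8KiB.
 */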
1549
1550 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1551 {
1552 struct intel_iommu *iommu = domain_get_iommu(domain);
1553
1554 clear_context_table(iommu, bus, devfn);
1555 iommu->flush.flush_context(iommu, 0, 0, 0,
1556 DMA_CCMD_GLOBAL_INVL, 0);
1557 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1558 DMA_TLB_GLOBAL_FLUSH, 0);
1559 }
1560
1561 static void domain_remove_dev_info(struct dmar_domain *domain)
1562 {
1563 struct device_domain_info *info;
1564 unsigned long flags;
1565
1566 spin_lock_irqsave(&device_domain_lock, flags);
1567 while (!list_empty(&domain->devices)) {
1568 info = list_entry(domain->devices.next,
1569 struct device_domain_info, link);
1570 list_del(&info->link);
1571 list_del(&info->global);
1572 if (info->dev)
1573 info->dev->dev.archdata.iommu = NULL;
1574 spin_unlock_irqrestore(&device_domain_lock, flags);
1575
1576 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1577 free_devinfo_mem(info);
1578
1579 spin_lock_irqsave(&device_domain_lock, flags);
1580 }
1581 spin_unlock_irqrestore(&device_domain_lock, flags);
1582 }
1583
1584 /*
1585 * find_domain
1586 * Note: struct pci_dev->dev.archdata.iommu stores the domain info
1587 */
1588 static struct dmar_domain *
1589 find_domain(struct pci_dev *pdev)
1590 {
1591 struct device_domain_info *info;
1592
1593 /* No lock here, assumes no domain exit in normal case */
1594 info = pdev->dev.archdata.iommu;
1595 if (info)
1596 return info->domain;
1597 return NULL;
1598 }
1599
1600 /* domain is initialized */
1601 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1602 {
1603 struct dmar_domain *domain, *found = NULL;
1604 struct intel_iommu *iommu;
1605 struct dmar_drhd_unit *drhd;
1606 struct device_domain_info *info, *tmp;
1607 struct pci_dev *dev_tmp;
1608 unsigned long flags;
1609 int bus = 0, devfn = 0;
1610
1611 domain = find_domain(pdev);
1612 if (domain)
1613 return domain;
1614
1615 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1616 if (dev_tmp) {
1617 if (dev_tmp->is_pcie) {
1618 bus = dev_tmp->subordinate->number;
1619 devfn = 0;
1620 } else {
1621 bus = dev_tmp->bus->number;
1622 devfn = dev_tmp->devfn;
1623 }
1624 spin_lock_irqsave(&device_domain_lock, flags);
1625 list_for_each_entry(info, &device_domain_list, global) {
1626 if (info->bus == bus && info->devfn == devfn) {
1627 found = info->domain;
1628 break;
1629 }
1630 }
1631 spin_unlock_irqrestore(&device_domain_lock, flags);
1632 /* pcie-pci bridge already has a domain, use it */
1633 if (found) {
1634 domain = found;
1635 goto found_domain;
1636 }
1637 }
1638
1639 /* Allocate new domain for the device */
1640 drhd = dmar_find_matched_drhd_unit(pdev);
1641 if (!drhd) {
1642 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1643 pci_name(pdev));
1644 return NULL;
1645 }
1646 iommu = drhd->iommu;
1647
1648 domain = iommu_alloc_domain(iommu);
1649 if (!domain)
1650 goto error;
1651
1652 if (domain_init(domain, gaw)) {
1653 domain_exit(domain);
1654 goto error;
1655 }
1656
1657 /* register pcie-to-pci device */
1658 if (dev_tmp) {
1659 info = alloc_devinfo_mem();
1660 if (!info) {
1661 domain_exit(domain);
1662 goto error;
1663 }
1664 info->bus = bus;
1665 info->devfn = devfn;
1666 info->dev = NULL;
1667 info->domain = domain;
1668 /* This domain is shared by devices under p2p bridge */
1669 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1670
1671 /* pcie-to-pci bridge already has a domain, use it */
1672 found = NULL;
1673 spin_lock_irqsave(&device_domain_lock, flags);
1674 list_for_each_entry(tmp, &device_domain_list, global) {
1675 if (tmp->bus == bus && tmp->devfn == devfn) {
1676 found = tmp->domain;
1677 break;
1678 }
1679 }
1680 if (found) {
1681 free_devinfo_mem(info);
1682 domain_exit(domain);
1683 domain = found;
1684 } else {
1685 list_add(&info->link, &domain->devices);
1686 list_add(&info->global, &device_domain_list);
1687 }
1688 spin_unlock_irqrestore(&device_domain_lock, flags);
1689 }
1690
1691 found_domain:
1692 info = alloc_devinfo_mem();
1693 if (!info)
1694 goto error;
1695 info->bus = pdev->bus->number;
1696 info->devfn = pdev->devfn;
1697 info->dev = pdev;
1698 info->domain = domain;
1699 spin_lock_irqsave(&device_domain_lock, flags);
1700 /* somebody is fast */
1701 found = find_domain(pdev);
1702 if (found != NULL) {
1703 spin_unlock_irqrestore(&device_domain_lock, flags);
1704 if (found != domain) {
1705 domain_exit(domain);
1706 domain = found;
1707 }
1708 free_devinfo_mem(info);
1709 return domain;
1710 }
1711 list_add(&info->link, &domain->devices);
1712 list_add(&info->global, &device_domain_list);
1713 pdev->dev.archdata.iommu = info;
1714 spin_unlock_irqrestore(&device_domain_lock, flags);
1715 return domain;
1716 error:
1717 /* recheck it here, maybe others set it */
1718 return find_domain(pdev);
1719 }
1720
1721 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1722 unsigned long long start,
1723 unsigned long long end)
1724 {
1725 struct dmar_domain *domain;
1726 unsigned long size;
1727 unsigned long long base;
1728 int ret;
1729
1730 printk(KERN_INFO
1731 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1732 pci_name(pdev), start, end);
1733 /* page table init */
1734 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1735 if (!domain)
1736 return -ENOMEM;
1737
1738 /* The address might not be aligned */
1739 base = start & PAGE_MASK;
1740 size = end - base;
1741 size = PAGE_ALIGN(size);
1742 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1743 IOVA_PFN(base + size) - 1)) {
1744 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1745 ret = -ENOMEM;
1746 goto error;
1747 }
1748
1749 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1750 size, base, pci_name(pdev));
1751 /*
1752 * RMRR range might have overlap with physical memory range,
1753 * clear it first
1754 */
1755 dma_pte_clear_range(domain, base, base + size);
1756
1757 ret = domain_page_mapping(domain, base, base, size,
1758 DMA_PTE_READ|DMA_PTE_WRITE);
1759 if (ret)
1760 goto error;
1761
1762 /* context entry init */
1763 ret = domain_context_mapping(domain, pdev);
1764 if (!ret)
1765 return 0;
1766 error:
1767 domain_exit(domain);
1768 return ret;
1769
1770 }
1771
1772 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1773 struct pci_dev *pdev)
1774 {
1775 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1776 return 0;
1777 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1778 rmrr->end_address + 1);
1779 }
1780
1781 #ifdef CONFIG_DMAR_GFX_WA
1782 struct iommu_prepare_data {
1783 struct pci_dev *pdev;
1784 int ret;
1785 };
1786
1787 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1788 unsigned long end_pfn, void *datax)
1789 {
1790 struct iommu_prepare_data *data;
1791
1792 data = (struct iommu_prepare_data *)datax;
1793
1794 data->ret = iommu_prepare_identity_map(data->pdev,
1795 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1796 return data->ret;
1797
1798 }
1799
1800 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1801 {
1802 int nid;
1803 struct iommu_prepare_data data;
1804
1805 data.pdev = pdev;
1806 data.ret = 0;
1807
1808 for_each_online_node(nid) {
1809 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1810 if (data.ret)
1811 return data.ret;
1812 }
1813 return data.ret;
1814 }
1815
1816 static void __init iommu_prepare_gfx_mapping(void)
1817 {
1818 struct pci_dev *pdev = NULL;
1819 int ret;
1820
1821 for_each_pci_dev(pdev) {
1822 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1823 !IS_GFX_DEVICE(pdev))
1824 continue;
1825 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1826 pci_name(pdev));
1827 ret = iommu_prepare_with_active_regions(pdev);
1828 if (ret)
1829 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1830 }
1831 }
1832 #else /* !CONFIG_DMAR_GFX_WA */
1833 static inline void iommu_prepare_gfx_mapping(void)
1834 {
1835 return;
1836 }
1837 #endif
1838
1839 #ifdef CONFIG_DMAR_FLOPPY_WA
1840 static inline void iommu_prepare_isa(void)
1841 {
1842 struct pci_dev *pdev;
1843 int ret;
1844
1845 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1846 if (!pdev)
1847 return;
1848
1849 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1850 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1851
1852 if (ret)
1853 printk("IOMMU: Failed to create 0-16M identity map, "
1854 "floppy might not work\n");
1855
1856 }
1857 #else
1858 static inline void iommu_prepare_isa(void)
1859 {
1860 return;
1861 }
1862 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1863
1864 static int __init init_dmars(void)
1865 {
1866 struct dmar_drhd_unit *drhd;
1867 struct dmar_rmrr_unit *rmrr;
1868 struct pci_dev *pdev;
1869 struct intel_iommu *iommu;
1870 int i, ret, unit = 0;
1871
1872 /*
1873 * for each drhd
1874 * allocate root
1875 * initialize and program root entry to not present
1876 * endfor
1877 */
1878 for_each_drhd_unit(drhd) {
1879 g_num_of_iommus++;
1880 /*
1881 * lock not needed as this is only incremented in the single-
1882 * threaded kernel __init code path; all other accesses are
1883 * read only
1884 */
1885 }
1886
1887 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1888 GFP_KERNEL);
1889 if (!g_iommus) {
1890 printk(KERN_ERR "Allocating global iommu array failed\n");
1891 ret = -ENOMEM;
1892 goto error;
1893 }
1894
1895 deferred_flush = kzalloc(g_num_of_iommus *
1896 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1897 if (!deferred_flush) {
1898 kfree(g_iommus);
1899 ret = -ENOMEM;
1900 goto error;
1901 }
1902
1903 for_each_drhd_unit(drhd) {
1904 if (drhd->ignored)
1905 continue;
1906
1907 iommu = drhd->iommu;
1908 g_iommus[iommu->seq_id] = iommu;
1909
1910 ret = iommu_init_domains(iommu);
1911 if (ret)
1912 goto error;
1913
1914 /*
1915 * TBD:
1916 * we could share the same root & context tables
1917 * among all IOMMUs. Need to split it later.
1918 */
1919 ret = iommu_alloc_root_entry(iommu);
1920 if (ret) {
1921 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1922 goto error;
1923 }
1924 }
1925
1926 for_each_drhd_unit(drhd) {
1927 if (drhd->ignored)
1928 continue;
1929
1930 iommu = drhd->iommu;
1931 if (dmar_enable_qi(iommu)) {
1932 /*
1933 * Queued Invalidate not enabled, use Register Based
1934 * Invalidate
1935 */
1936 iommu->flush.flush_context = __iommu_flush_context;
1937 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1938 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1939 "invalidation\n",
1940 (unsigned long long)drhd->reg_base_addr);
1941 } else {
1942 iommu->flush.flush_context = qi_flush_context;
1943 iommu->flush.flush_iotlb = qi_flush_iotlb;
1944 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1945 "invalidation\n",
1946 (unsigned long long)drhd->reg_base_addr);
1947 }
1948 }
1949
1950 /*
1951 * For each rmrr
1952 * for each dev attached to rmrr
1953 * do
1954 * locate drhd for dev, alloc domain for dev
1955 * allocate free domain
1956 * allocate page table entries for rmrr
1957 * if context not allocated for bus
1958 * allocate and init context
1959 * set present in root table for this bus
1960 * init context with domain, translation etc
1961 * endfor
1962 * endfor
1963 */
1964 for_each_rmrr_units(rmrr) {
1965 for (i = 0; i < rmrr->devices_cnt; i++) {
1966 pdev = rmrr->devices[i];
1967 /* some BIOSes list non-existent devices in the DMAR table */
1968 if (!pdev)
1969 continue;
1970 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1971 if (ret)
1972 printk(KERN_ERR
1973 "IOMMU: mapping reserved region failed\n");
1974 }
1975 }
1976
1977 iommu_prepare_gfx_mapping();
1978
1979 iommu_prepare_isa();
1980
1981 /*
1982 * for each drhd
1983 * enable fault log
1984 * global invalidate context cache
1985 * global invalidate iotlb
1986 * enable translation
1987 */
1988 for_each_drhd_unit(drhd) {
1989 if (drhd->ignored)
1990 continue;
1991 iommu = drhd->iommu;
1992 sprintf (iommu->name, "dmar%d", unit++);
1993
1994 iommu_flush_write_buffer(iommu);
1995
1996 ret = dmar_set_interrupt(iommu);
1997 if (ret)
1998 goto error;
1999
2000 iommu_set_root_entry(iommu);
2001
2002 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2003 0);
2004 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2005 0);
2006 iommu_disable_protect_mem_regions(iommu);
2007
2008 ret = iommu_enable_translation(iommu);
2009 if (ret)
2010 goto error;
2011 }
2012
2013 return 0;
2014 error:
2015 for_each_drhd_unit(drhd) {
2016 if (drhd->ignored)
2017 continue;
2018 iommu = drhd->iommu;
2019 free_iommu(iommu);
2020 }
2021 kfree(g_iommus);
2022 return ret;
2023 }
2024
2025 static inline u64 aligned_size(u64 host_addr, size_t size)
2026 {
2027 u64 addr;
2028 addr = (host_addr & (~PAGE_MASK)) + size;
2029 return PAGE_ALIGN(addr);
2030 }
2031
2032 struct iova *
2033 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2034 {
2035 struct iova *piova;
2036
2037 /* Make sure it's in range */
2038 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2039 if (!size || (IOVA_START_ADDR + size > end))
2040 return NULL;
2041
2042 piova = alloc_iova(&domain->iovad,
2043 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2044 return piova;
2045 }
2046
2047 static struct iova *
2048 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2049 size_t size, u64 dma_mask)
2050 {
2051 struct pci_dev *pdev = to_pci_dev(dev);
2052 struct iova *iova = NULL;
2053
2054 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2055 iova = iommu_alloc_iova(domain, size, dma_mask);
2056 else {
2057 /*
2058 * First try to allocate an I/O virtual address below
2059 * DMA_32BIT_MASK; if that fails, fall back to allocating
2060 * from the full DMA mask range.
2061 */
2062 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2063 if (!iova)
2064 iova = iommu_alloc_iova(domain, size, dma_mask);
2065 }
2066
2067 if (!iova) {
2068 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2069 return NULL;
2070 }
2071
2072 return iova;
2073 }
2074
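/*
 * Find (or create) the DMA-remapping domain for @pdev and make sure a
 * context entry for the device has been programmed.  Returns NULL on
 * failure.
 */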
2075 static struct dmar_domain *
2076 get_valid_domain_for_dev(struct pci_dev *pdev)
2077 {
2078 struct dmar_domain *domain;
2079 int ret;
2080
2081 domain = get_domain_for_dev(pdev,
2082 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2083 if (!domain) {
2084 printk(KERN_ERR
2085 "Allocating domain for %s failed\n", pci_name(pdev));
2086 return NULL;
2087 }
2088
2089 /* make sure context mapping is ok */
2090 if (unlikely(!domain_context_mapped(domain, pdev))) {
2091 ret = domain_context_mapping(domain, pdev);
2092 if (ret) {
2093 printk(KERN_ERR
2094 "Domain context map for %s failed\n",
2095 pci_name(pdev));
2096 return NULL;
2097 }
2098 }
2099
2100 return domain;
2101 }
2102
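/*
 * Map @size bytes at physical address @paddr for DMA by @hwdev:
 * allocate an IOVA range (preferring the 32-bit space unless forcedac
 * is set), install page-table entries with permissions derived from
 * @dir, then flush the IOTLB for the new mapping.  Returns the bus
 * address to hand to the device, or 0 on failure.
 */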
2103 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2104 size_t size, int dir, u64 dma_mask)
2105 {
2106 struct pci_dev *pdev = to_pci_dev(hwdev);
2107 struct dmar_domain *domain;
2108 phys_addr_t start_paddr;
2109 struct iova *iova;
2110 int prot = 0;
2111 int ret;
2112 struct intel_iommu *iommu;
2113
2114 BUG_ON(dir == DMA_NONE);
2115 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2116 return paddr;
2117
2118 domain = get_valid_domain_for_dev(pdev);
2119 if (!domain)
2120 return 0;
2121
2122 iommu = domain_get_iommu(domain);
2123 size = aligned_size((u64)paddr, size);
2124
2125 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2126 if (!iova)
2127 goto error;
2128
2129 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2130
2131 /*
2132 * Check if the DMAR hardware supports zero-length reads on
2133 * write-only mappings.
2134 */
2135 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2136 !cap_zlr(iommu->cap))
2137 prot |= DMA_PTE_READ;
2138 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2139 prot |= DMA_PTE_WRITE;
2140 /*
2141 * The range paddr .. paddr + size might cover only part of a page,
2142 * so map the whole page. Note: if two parts of one page are mapped
2143 * separately, two guest addresses may map to the same host paddr,
2144 * but this is not a big problem.
2145 */
2146 ret = domain_page_mapping(domain, start_paddr,
2147 ((u64)paddr) & PAGE_MASK, size, prot);
2148 if (ret)
2149 goto error;
2150
2151 /* it's a non-present to present mapping */
2152 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2153 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2154 if (ret)
2155 iommu_flush_write_buffer(iommu);
2156
2157 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2158
2159 error:
2160 if (iova)
2161 __free_iova(&domain->iovad, iova);
2162 printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2163 pci_name(pdev), size, (unsigned long long)paddr, dir);
2164 return 0;
2165 }
2166
2167 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2168 size_t size, int dir)
2169 {
2170 return __intel_map_single(hwdev, paddr, size, dir,
2171 to_pci_dev(hwdev)->dma_mask);
2172 }
2173
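/*
 * Flush every IOMMU's IOTLB and release all IOVAs queued by add_unmap().
 * Called with async_umap_flush_lock held.
 */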
2174 static void flush_unmaps(void)
2175 {
2176 int i, j;
2177
2178 timer_on = 0;
2179
2180 /* just flush them all */
2181 for (i = 0; i < g_num_of_iommus; i++) {
2182 struct intel_iommu *iommu = g_iommus[i];
2183 if (!iommu)
2184 continue;
2185
2186 if (deferred_flush[i].next) {
2187 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2188 DMA_TLB_GLOBAL_FLUSH, 0);
2189 for (j = 0; j < deferred_flush[i].next; j++) {
2190 __free_iova(&deferred_flush[i].domain[j]->iovad,
2191 deferred_flush[i].iova[j]);
2192 }
2193 deferred_flush[i].next = 0;
2194 }
2195 }
2196
2197 list_size = 0;
2198 }
2199
2200 static void flush_unmaps_timeout(unsigned long data)
2201 {
2202 unsigned long flags;
2203
2204 spin_lock_irqsave(&async_umap_flush_lock, flags);
2205 flush_unmaps();
2206 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2207 }
2208
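/*
 * Defer freeing of @iova: queue it on the per-IOMMU deferred_flush list
 * and arm a 10ms timer so the IOTLB flushes are batched.  If the queue
 * reaches HIGH_WATER_MARK the backlog is flushed immediately.
 */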
2209 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2210 {
2211 unsigned long flags;
2212 int next, iommu_id;
2213 struct intel_iommu *iommu;
2214
2215 spin_lock_irqsave(&async_umap_flush_lock, flags);
2216 if (list_size == HIGH_WATER_MARK)
2217 flush_unmaps();
2218
2219 iommu = domain_get_iommu(dom);
2220 iommu_id = iommu->seq_id;
2221
2222 next = deferred_flush[iommu_id].next;
2223 deferred_flush[iommu_id].domain[next] = dom;
2224 deferred_flush[iommu_id].iova[next] = iova;
2225 deferred_flush[iommu_id].next++;
2226
2227 if (!timer_on) {
2228 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2229 timer_on = 1;
2230 }
2231 list_size++;
2232 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2233 }
2234
2235 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2236 int dir)
2237 {
2238 struct pci_dev *pdev = to_pci_dev(dev);
2239 struct dmar_domain *domain;
2240 unsigned long start_addr;
2241 struct iova *iova;
2242 struct intel_iommu *iommu;
2243
2244 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2245 return;
2246 domain = find_domain(pdev);
2247 BUG_ON(!domain);
2248
2249 iommu = domain_get_iommu(domain);
2250
2251 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2252 if (!iova)
2253 return;
2254
2255 start_addr = iova->pfn_lo << PAGE_SHIFT;
2256 size = aligned_size((u64)dev_addr, size);
2257
2258 pr_debug("Device %s unmapping: %lx@%llx\n",
2259 pci_name(pdev), size, (unsigned long long)start_addr);
2260
2261 /* clear the whole page */
2262 dma_pte_clear_range(domain, start_addr, start_addr + size);
2263 /* free page tables */
2264 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2265 if (intel_iommu_strict) {
2266 if (iommu_flush_iotlb_psi(iommu,
2267 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2268 iommu_flush_write_buffer(iommu);
2269 /* free iova */
2270 __free_iova(&domain->iovad, iova);
2271 } else {
2272 add_unmap(domain, iova);
2273 /*
2274 * Queue up the release of the unmap; batching the IOTLB flush
2275 * saves roughly 1/6th of the CPU time used by the flush operation.
2276 */
2277 }
2278 }
2279
2280 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2281 dma_addr_t *dma_handle, gfp_t flags)
2282 {
2283 void *vaddr;
2284 int order;
2285
2286 size = PAGE_ALIGN(size);
2287 order = get_order(size);
2288 flags &= ~(GFP_DMA | GFP_DMA32);
2289
2290 vaddr = (void *)__get_free_pages(flags, order);
2291 if (!vaddr)
2292 return NULL;
2293 memset(vaddr, 0, size);
2294
2295 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2296 DMA_BIDIRECTIONAL,
2297 hwdev->coherent_dma_mask);
2298 if (*dma_handle)
2299 return vaddr;
2300 free_pages((unsigned long)vaddr, order);
2301 return NULL;
2302 }
2303
2304 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2305 dma_addr_t dma_handle)
2306 {
2307 int order;
2308
2309 size = PAGE_ALIGN(size);
2310 order = get_order(size);
2311
2312 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2313 free_pages((unsigned long)vaddr, order);
2314 }
2315
2316 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2317
2318 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2319 int nelems, int dir)
2320 {
2321 int i;
2322 struct pci_dev *pdev = to_pci_dev(hwdev);
2323 struct dmar_domain *domain;
2324 unsigned long start_addr;
2325 struct iova *iova;
2326 size_t size = 0;
2327 void *addr;
2328 struct scatterlist *sg;
2329 struct intel_iommu *iommu;
2330
2331 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2332 return;
2333
2334 domain = find_domain(pdev);
2335 BUG_ON(!domain);
2336
2337 iommu = domain_get_iommu(domain);
2338
2339 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2340 if (!iova)
2341 return;
2342 for_each_sg(sglist, sg, nelems, i) {
2343 addr = SG_ENT_VIRT_ADDRESS(sg);
2344 size += aligned_size((u64)addr, sg->length);
2345 }
2346
2347 start_addr = iova->pfn_lo << PAGE_SHIFT;
2348
2349 /* clear the whole page */
2350 dma_pte_clear_range(domain, start_addr, start_addr + size);
2351 /* free page tables */
2352 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2353
2354 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2355 size >> VTD_PAGE_SHIFT, 0))
2356 iommu_flush_write_buffer(iommu);
2357
2358 /* free iova */
2359 __free_iova(&domain->iovad, iova);
2360 }
2361
2362 static int intel_nontranslate_map_sg(struct device *hwdev,
2363 struct scatterlist *sglist, int nelems, int dir)
2364 {
2365 int i;
2366 struct scatterlist *sg;
2367
2368 for_each_sg(sglist, sg, nelems, i) {
2369 BUG_ON(!sg_page(sg));
2370 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2371 sg->dma_length = sg->length;
2372 }
2373 return nelems;
2374 }
2375
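/*
 * Map a scatterlist for DMA: the total aligned length of all entries is
 * allocated as one contiguous IOVA range, each entry is mapped into it
 * in turn, and the IOTLB is flushed for the new mappings.  On any
 * mapping failure the partial mapping is torn down and 0 is returned.
 */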
2376 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2377 int dir)
2378 {
2379 void *addr;
2380 int i;
2381 struct pci_dev *pdev = to_pci_dev(hwdev);
2382 struct dmar_domain *domain;
2383 size_t size = 0;
2384 int prot = 0;
2385 size_t offset = 0;
2386 struct iova *iova = NULL;
2387 int ret;
2388 struct scatterlist *sg;
2389 unsigned long start_addr;
2390 struct intel_iommu *iommu;
2391
2392 BUG_ON(dir == DMA_NONE);
2393 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2394 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2395
2396 domain = get_valid_domain_for_dev(pdev);
2397 if (!domain)
2398 return 0;
2399
2400 iommu = domain_get_iommu(domain);
2401
2402 for_each_sg(sglist, sg, nelems, i) {
2403 addr = SG_ENT_VIRT_ADDRESS(sg);
2404 addr = (void *)virt_to_phys(addr);
2405 size += aligned_size((u64)addr, sg->length);
2406 }
2407
2408 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2409 if (!iova) {
2410 sglist->dma_length = 0;
2411 return 0;
2412 }
2413
2414 /*
2415 * Check if the DMAR hardware supports zero-length reads on
2416 * write-only mappings.
2417 */
2418 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2419 !cap_zlr(iommu->cap))
2420 prot |= DMA_PTE_READ;
2421 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2422 prot |= DMA_PTE_WRITE;
2423
2424 start_addr = iova->pfn_lo << PAGE_SHIFT;
2425 offset = 0;
2426 for_each_sg(sglist, sg, nelems, i) {
2427 addr = SG_ENT_VIRT_ADDRESS(sg);
2428 addr = (void *)virt_to_phys(addr);
2429 size = aligned_size((u64)addr, sg->length);
2430 ret = domain_page_mapping(domain, start_addr + offset,
2431 ((u64)addr) & PAGE_MASK,
2432 size, prot);
2433 if (ret) {
2434 /* clear the page */
2435 dma_pte_clear_range(domain, start_addr,
2436 start_addr + offset);
2437 /* free page tables */
2438 dma_pte_free_pagetable(domain, start_addr,
2439 start_addr + offset);
2440 /* free iova */
2441 __free_iova(&domain->iovad, iova);
2442 return 0;
2443 }
2444 sg->dma_address = start_addr + offset +
2445 ((u64)addr & (~PAGE_MASK));
2446 sg->dma_length = sg->length;
2447 offset += size;
2448 }
2449
2450 /* it's a non-present to present mapping */
2451 if (iommu_flush_iotlb_psi(iommu, domain->id,
2452 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2453 iommu_flush_write_buffer(iommu);
2454 return nelems;
2455 }
2456
2457 static struct dma_mapping_ops intel_dma_ops = {
2458 .alloc_coherent = intel_alloc_coherent,
2459 .free_coherent = intel_free_coherent,
2460 .map_single = intel_map_single,
2461 .unmap_single = intel_unmap_single,
2462 .map_sg = intel_map_sg,
2463 .unmap_sg = intel_unmap_sg,
2464 };
2465
2466 static inline int iommu_domain_cache_init(void)
2467 {
2468 int ret = 0;
2469
2470 iommu_domain_cache = kmem_cache_create("iommu_domain",
2471 sizeof(struct dmar_domain),
2472 0,
2473 SLAB_HWCACHE_ALIGN,
2475 NULL);
2476 if (!iommu_domain_cache) {
2477 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2478 ret = -ENOMEM;
2479 }
2480
2481 return ret;
2482 }
2483
2484 static inline int iommu_devinfo_cache_init(void)
2485 {
2486 int ret = 0;
2487
2488 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2489 sizeof(struct device_domain_info),
2490 0,
2491 SLAB_HWCACHE_ALIGN,
2492 NULL);
2493 if (!iommu_devinfo_cache) {
2494 printk(KERN_ERR "Couldn't create devinfo cache\n");
2495 ret = -ENOMEM;
2496 }
2497
2498 return ret;
2499 }
2500
2501 static inline int iommu_iova_cache_init(void)
2502 {
2503 int ret = 0;
2504
2505 iommu_iova_cache = kmem_cache_create("iommu_iova",
2506 sizeof(struct iova),
2507 0,
2508 SLAB_HWCACHE_ALIGN,
2509 NULL);
2510 if (!iommu_iova_cache) {
2511 printk(KERN_ERR "Couldn't create iova cache\n");
2512 ret = -ENOMEM;
2513 }
2514
2515 return ret;
2516 }
2517
2518 static int __init iommu_init_mempool(void)
2519 {
2520 int ret;
2521 ret = iommu_iova_cache_init();
2522 if (ret)
2523 return ret;
2524
2525 ret = iommu_domain_cache_init();
2526 if (ret)
2527 goto domain_error;
2528
2529 ret = iommu_devinfo_cache_init();
2530 if (!ret)
2531 return ret;
2532
2533 kmem_cache_destroy(iommu_domain_cache);
2534 domain_error:
2535 kmem_cache_destroy(iommu_iova_cache);
2536
2537 return -ENOMEM;
2538 }
2539
2540 static void __init iommu_exit_mempool(void)
2541 {
2542 kmem_cache_destroy(iommu_devinfo_cache);
2543 kmem_cache_destroy(iommu_domain_cache);
2544 kmem_cache_destroy(iommu_iova_cache);
2545
2546 }
2547
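/*
 * Mark DRHD units that can be bypassed: units whose device scope contains
 * no present PCI devices are ignored, and, unless dmar_map_gfx is set,
 * units covering only graphics devices are ignored as well, with their
 * devices tagged DUMMY_DEVICE_DOMAIN_INFO so DMA passes through
 * untranslated.
 */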
2548 static void __init init_no_remapping_devices(void)
2549 {
2550 struct dmar_drhd_unit *drhd;
2551
2552 for_each_drhd_unit(drhd) {
2553 if (!drhd->include_all) {
2554 int i;
2555 for (i = 0; i < drhd->devices_cnt; i++)
2556 if (drhd->devices[i] != NULL)
2557 break;
2558 /* ignore DMAR unit if no PCI devices exist */
2559 if (i == drhd->devices_cnt)
2560 drhd->ignored = 1;
2561 }
2562 }
2563
2564 if (dmar_map_gfx)
2565 return;
2566
2567 for_each_drhd_unit(drhd) {
2568 int i;
2569 if (drhd->ignored || drhd->include_all)
2570 continue;
2571
2572 for (i = 0; i < drhd->devices_cnt; i++)
2573 if (drhd->devices[i] &&
2574 !IS_GFX_DEVICE(drhd->devices[i]))
2575 break;
2576
2577 if (i < drhd->devices_cnt)
2578 continue;
2579
2580 /* bypass IOMMU if it is just for gfx devices */
2581 drhd->ignored = 1;
2582 for (i = 0; i < drhd->devices_cnt; i++) {
2583 if (!drhd->devices[i])
2584 continue;
2585 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2586 }
2587 }
2588 }
2589
2590 int __init intel_iommu_init(void)
2591 {
2592 int ret = 0;
2593
2594 if (dmar_table_init())
2595 return -ENODEV;
2596
2597 if (dmar_dev_scope_init())
2598 return -ENODEV;
2599
2600 /*
2601 * Check the need for DMA-remapping initialization now.
2602 * The initialization above is also used by interrupt remapping.
2603 */
2604 if (no_iommu || swiotlb || dmar_disabled)
2605 return -ENODEV;
2606
2607 iommu_init_mempool();
2608 dmar_init_reserved_ranges();
2609
2610 init_no_remapping_devices();
2611
2612 ret = init_dmars();
2613 if (ret) {
2614 printk(KERN_ERR "IOMMU: dmar init failed\n");
2615 put_iova_domain(&reserved_iova_list);
2616 iommu_exit_mempool();
2617 return ret;
2618 }
2619 printk(KERN_INFO
2620 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2621
2622 init_timer(&unmap_timer);
2623 force_iommu = 1;
2624 dma_ops = &intel_dma_ops;
2625 return 0;
2626 }
2627
2628 void intel_iommu_domain_exit(struct dmar_domain *domain)
2629 {
2630 u64 end;
2631
2632 /* Domain 0 is reserved, so don't process it */
2633 if (!domain)
2634 return;
2635
2636 end = DOMAIN_MAX_ADDR(domain->gaw);
2637 end = end & (~VTD_PAGE_MASK);
2638
2639 /* clear ptes */
2640 dma_pte_clear_range(domain, 0, end);
2641
2642 /* free page tables */
2643 dma_pte_free_pagetable(domain, 0, end);
2644
2645 iommu_free_domain(domain);
2646 free_domain_mem(domain);
2647 }
2648 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2649
2650 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2651 {
2652 struct dmar_drhd_unit *drhd;
2653 struct dmar_domain *domain;
2654 struct intel_iommu *iommu;
2655
2656 drhd = dmar_find_matched_drhd_unit(pdev);
2657 if (!drhd) {
2658 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2659 return NULL;
2660 }
2661
2662 iommu = drhd->iommu;
2663 if (!iommu) {
2664 printk(KERN_ERR
2665 "intel_iommu_domain_alloc: iommu == NULL\n");
2666 return NULL;
2667 }
2668 domain = iommu_alloc_domain(iommu);
2669 if (!domain) {
2670 printk(KERN_ERR
2671 "intel_iommu_domain_alloc: domain == NULL\n");
2672 return NULL;
2673 }
2674 if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2675 printk(KERN_ERR
2676 "intel_iommu_domain_alloc: domain_init() failed\n");
2677 intel_iommu_domain_exit(domain);
2678 return NULL;
2679 }
2680 return domain;
2681 }
2682 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2683
2684 int intel_iommu_context_mapping(
2685 struct dmar_domain *domain, struct pci_dev *pdev)
2686 {
2687 int rc;
2688 rc = domain_context_mapping(domain, pdev);
2689 return rc;
2690 }
2691 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2692
2693 int intel_iommu_page_mapping(
2694 struct dmar_domain *domain, dma_addr_t iova,
2695 u64 hpa, size_t size, int prot)
2696 {
2697 int rc;
2698 rc = domain_page_mapping(domain, iova, hpa, size, prot);
2699 return rc;
2700 }
2701 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2702
2703 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2704 {
2705 detach_domain_for_dev(domain, bus, devfn);
2706 }
2707 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2708
2709 struct dmar_domain *
2710 intel_iommu_find_domain(struct pci_dev *pdev)
2711 {
2712 return find_domain(pdev);
2713 }
2714 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2715
2716 int intel_iommu_found(void)
2717 {
2718 return g_num_of_iommus;
2719 }
2720 EXPORT_SYMBOL_GPL(intel_iommu_found);
2721
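/*
 * Translate @iova within @domain to the host page frame number it maps
 * to; returns 0 if no PTE is present for the address.
 */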
2722 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2723 {
2724 struct dma_pte *pte;
2725 u64 pfn;
2726
2727 pfn = 0;
2728 pte = addr_to_dma_pte(domain, iova);
2729
2730 if (pte)
2731 pfn = dma_pte_addr(pte);
2732
2733 return pfn >> VTD_PAGE_SHIFT;
2734 }
2735 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);