1 /*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
60
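/*
 * Example, assuming 4 KiB pages (PAGE_SHIFT == 12): IOVA_PFN(0x12345678)
 * is 0x12345, and DMA_32BIT_PFN is the page frame number of the highest
 * 32-bit address, i.e. 0xfffff.
 */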
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67 * 0: Present
68 * 1-11: Reserved
69 * 12-63: Context Ptr (12 - (haw-1))
70 * 64-127: Reserved
71 */
72 struct root_entry {
73 u64 val;
74 u64 rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
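/*
 * VTD_PAGE_SIZE / sizeof(struct root_entry) == 4096 / 16 == 256 entries,
 * one per possible PCI bus number; device_to_context_entry() below
 * indexes the root table by bus.
 */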
77 static inline bool root_present(struct root_entry *root)
78 {
79 return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83 root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87 root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93 return (struct context_entry *)
94 (root_present(root)?phys_to_virt(
95 root->val & VTD_PAGE_MASK) :
96 NULL);
97 }
98
99 /*
100 * low 64 bits:
101 * 0: present
102 * 1: fault processing disable
103 * 2-3: translation type
104 * 12-63: address space root
105 * high 64 bits:
106 * 0-2: address width
107 * 3-6: available
108 * 8-23: domain id
109 */
110 struct context_entry {
111 u64 lo;
112 u64 hi;
113 };
114
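/*
 * Illustration of how the helpers below pack these fields: for a 4-level
 * page table the address-width field (bits 0-2 of 'hi') is set to 2, and
 * context_set_domain_id(context, 5) ORs (5 << 8) into 'hi', placing the
 * domain id in bits 8-23.
 */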
115 static inline bool context_present(struct context_entry *context)
116 {
117 return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121 context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126 context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132 unsigned long value)
133 {
134 context->lo &= (((u64)-1) << 4) | 3;
135 context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139 unsigned long value)
140 {
141 context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145 unsigned long value)
146 {
147 context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151 unsigned long value)
152 {
153 context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158 context->lo = 0;
159 context->hi = 0;
160 }
161
162 /*
163 * 0: readable
164 * 1: writable
165 * 2-6: reserved
166 * 7: super page
167 * 8-11: available
168 * 12-63: Host physical address
169 */
170 struct dma_pte {
171 u64 val;
172 };
173
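/*
 * Example: a last-level PTE that maps host page 0x1234000 with both read
 * and write permission ends up with val == 0x1234003 (bit 0 = read,
 * bit 1 = write, bits 12-63 = host page frame address).
 */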
174 static inline void dma_clear_pte(struct dma_pte *pte)
175 {
176 pte->val = 0;
177 }
178
179 static inline void dma_set_pte_readable(struct dma_pte *pte)
180 {
181 pte->val |= DMA_PTE_READ;
182 }
183
184 static inline void dma_set_pte_writable(struct dma_pte *pte)
185 {
186 pte->val |= DMA_PTE_WRITE;
187 }
188
189 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
190 {
191 pte->val = (pte->val & ~3) | (prot & 3);
192 }
193
194 static inline u64 dma_pte_addr(struct dma_pte *pte)
195 {
196 return (pte->val & VTD_PAGE_MASK);
197 }
198
199 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
200 {
201 pte->val |= (addr & VTD_PAGE_MASK);
202 }
203
204 static inline bool dma_pte_present(struct dma_pte *pte)
205 {
206 return (pte->val & 3) != 0;
207 }
208
209 /* devices under the same p2p bridge are owned in one domain */
210 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
211
212 /* This domain represents a virtual machine; devices attached to
213 * different iommus may be owned by one such domain, e.g. a kvm guest.
214 */
215 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
216
217 struct dmar_domain {
218 int id; /* domain id */
219 unsigned long iommu_bmp; /* bitmap of iommus this domain uses */
220
221 struct list_head devices; /* all devices' list */
222 struct iova_domain iovad; /* iova's that belong to this domain */
223
224 struct dma_pte *pgd; /* virtual address */
225 spinlock_t mapping_lock; /* page table lock */
226 int gaw; /* max guest address width */
227
228 /* adjusted guest address width, 0 is level 2 30-bit */
229 int agaw;
230
231 int flags; /* flags to find out type of domain */
232
233 int iommu_coherency;/* indicate coherency of iommu access */
234 int iommu_snooping; /* indicate snooping control feature*/
235 int iommu_count; /* reference count of iommu */
236 spinlock_t iommu_lock; /* protect iommu set in domain */
237 u64 max_addr; /* maximum mapped address */
238 };
239
240 /* PCI domain-device relationship */
241 struct device_domain_info {
242 struct list_head link; /* link to domain siblings */
243 struct list_head global; /* link to global list */
244 u8 bus; /* PCI bus number */
245 u8 devfn; /* PCI devfn number */
246 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
247 struct dmar_domain *domain; /* pointer to domain */
248 };
249
250 static void flush_unmaps_timeout(unsigned long data);
251
252 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
253
254 #define HIGH_WATER_MARK 250
255 struct deferred_flush_tables {
256 int next;
257 struct iova *iova[HIGH_WATER_MARK];
258 struct dmar_domain *domain[HIGH_WATER_MARK];
259 };
260
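/*
 * IOTLB flushes for unmapped IOVAs are normally batched: entries are
 * queued in these per-iommu tables and flushed together once a table
 * fills up (HIGH_WATER_MARK entries) or a timer fires, unless the
 * intel_iommu=strict option disables the batching.
 */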
261 static struct deferred_flush_tables *deferred_flush;
262
263 /* number of iommus; used to size the bitmaps indexing intel_iommus */
264 static int g_num_of_iommus;
265
266 static DEFINE_SPINLOCK(async_umap_flush_lock);
267 static LIST_HEAD(unmaps_to_do);
268
269 static int timer_on;
270 static long list_size;
271
272 static void domain_remove_dev_info(struct dmar_domain *domain);
273
274 #ifdef CONFIG_DMAR_DEFAULT_ON
275 int dmar_disabled = 0;
276 #else
277 int dmar_disabled = 1;
278 #endif /*CONFIG_DMAR_DEFAULT_ON*/
279
280 static int __initdata dmar_map_gfx = 1;
281 static int dmar_forcedac;
282 static int intel_iommu_strict;
283
284 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
285 static DEFINE_SPINLOCK(device_domain_lock);
286 static LIST_HEAD(device_domain_list);
287
288 static struct iommu_ops intel_iommu_ops;
289
290 static int __init intel_iommu_setup(char *str)
291 {
292 if (!str)
293 return -EINVAL;
294 while (*str) {
295 if (!strncmp(str, "on", 2)) {
296 dmar_disabled = 0;
297 printk(KERN_INFO "Intel-IOMMU: enabled\n");
298 } else if (!strncmp(str, "off", 3)) {
299 dmar_disabled = 1;
300 printk(KERN_INFO "Intel-IOMMU: disabled\n");
301 } else if (!strncmp(str, "igfx_off", 8)) {
302 dmar_map_gfx = 0;
303 printk(KERN_INFO
304 "Intel-IOMMU: disable GFX device mapping\n");
305 } else if (!strncmp(str, "forcedac", 8)) {
306 printk(KERN_INFO
307 "Intel-IOMMU: Forcing DAC for PCI devices\n");
308 dmar_forcedac = 1;
309 } else if (!strncmp(str, "strict", 6)) {
310 printk(KERN_INFO
311 "Intel-IOMMU: disable batched IOTLB flush\n");
312 intel_iommu_strict = 1;
313 }
314
315 str += strcspn(str, ",");
316 while (*str == ',')
317 str++;
318 }
319 return 0;
320 }
321 __setup("intel_iommu=", intel_iommu_setup);
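/*
 * Kernel command line example combining several of the options parsed
 * above:
 *
 *	intel_iommu=on,igfx_off,strict
 *
 * enables the IOMMU, disables the GFX device mapping and disables the
 * batched IOTLB flush.
 */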
322
323 static struct kmem_cache *iommu_domain_cache;
324 static struct kmem_cache *iommu_devinfo_cache;
325 static struct kmem_cache *iommu_iova_cache;
326
327 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
328 {
329 unsigned int flags;
330 void *vaddr;
331
332 /* trying to avoid low memory issues */
333 flags = current->flags & PF_MEMALLOC;
334 current->flags |= PF_MEMALLOC;
335 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
336 current->flags &= (~PF_MEMALLOC | flags);
337 return vaddr;
338 }
339
340
341 static inline void *alloc_pgtable_page(void)
342 {
343 unsigned int flags;
344 void *vaddr;
345
346 /* trying to avoid low memory issues */
347 flags = current->flags & PF_MEMALLOC;
348 current->flags |= PF_MEMALLOC;
349 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
350 current->flags &= (~PF_MEMALLOC | flags);
351 return vaddr;
352 }
353
354 static inline void free_pgtable_page(void *vaddr)
355 {
356 free_page((unsigned long)vaddr);
357 }
358
359 static inline void *alloc_domain_mem(void)
360 {
361 return iommu_kmem_cache_alloc(iommu_domain_cache);
362 }
363
364 static void free_domain_mem(void *vaddr)
365 {
366 kmem_cache_free(iommu_domain_cache, vaddr);
367 }
368
369 static inline void * alloc_devinfo_mem(void)
370 {
371 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
372 }
373
374 static inline void free_devinfo_mem(void *vaddr)
375 {
376 kmem_cache_free(iommu_devinfo_cache, vaddr);
377 }
378
379 struct iova *alloc_iova_mem(void)
380 {
381 return iommu_kmem_cache_alloc(iommu_iova_cache);
382 }
383
384 void free_iova_mem(struct iova *iova)
385 {
386 kmem_cache_free(iommu_iova_cache, iova);
387 }
388
389
390 static inline int width_to_agaw(int width);
391
392 /* calculate agaw for each iommu.
393 * "SAGAW" may differ across iommus: start from the default agaw and fall
394 * back to a smaller supported agaw for iommus that don't support the default.
395 */
396 int iommu_calculate_agaw(struct intel_iommu *iommu)
397 {
398 unsigned long sagaw;
399 int agaw = -1;
400
401 sagaw = cap_sagaw(iommu->cap);
402 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
403 agaw >= 0; agaw--) {
404 if (test_bit(agaw, &sagaw))
405 break;
406 }
407
408 return agaw;
409 }
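/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH == 48 the search starts at
 * agaw 2 (4-level tables).  If an IOMMU's SAGAW field only advertises
 * 3-level support (bit 1), the function falls back and returns agaw 1.
 */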
410
411 /* in the native (non-VM) case, each domain is attached to exactly one iommu */
412 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
413 {
414 int iommu_id;
415
416 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
417
418 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
419 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
420 return NULL;
421
422 return g_iommus[iommu_id];
423 }
424
425 static void domain_update_iommu_coherency(struct dmar_domain *domain)
426 {
427 int i;
428
429 domain->iommu_coherency = 1;
430
431 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
432 for (; i < g_num_of_iommus; ) {
433 if (!ecap_coherent(g_iommus[i]->ecap)) {
434 domain->iommu_coherency = 0;
435 break;
436 }
437 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
438 }
439 }
440
441 static void domain_update_iommu_snooping(struct dmar_domain *domain)
442 {
443 int i;
444
445 domain->iommu_snooping = 1;
446
447 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
448 for (; i < g_num_of_iommus; ) {
449 if (!ecap_sc_support(g_iommus[i]->ecap)) {
450 domain->iommu_snooping = 0;
451 break;
452 }
453 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
454 }
455 }
456
457 /* Some capabilities may be different across iommus */
458 static void domain_update_iommu_cap(struct dmar_domain *domain)
459 {
460 domain_update_iommu_coherency(domain);
461 domain_update_iommu_snooping(domain);
462 }
463
464 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
465 {
466 struct dmar_drhd_unit *drhd = NULL;
467 int i;
468
469 for_each_drhd_unit(drhd) {
470 if (drhd->ignored)
471 continue;
472
473 for (i = 0; i < drhd->devices_cnt; i++)
474 if (drhd->devices[i] &&
475 drhd->devices[i]->bus->number == bus &&
476 drhd->devices[i]->devfn == devfn)
477 return drhd->iommu;
478
479 if (drhd->include_all)
480 return drhd->iommu;
481 }
482
483 return NULL;
484 }
485
486 static void domain_flush_cache(struct dmar_domain *domain,
487 void *addr, int size)
488 {
489 if (!domain->iommu_coherency)
490 clflush_cache_range(addr, size);
491 }
492
493 /* Gets context entry for a given bus and devfn */
494 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
495 u8 bus, u8 devfn)
496 {
497 struct root_entry *root;
498 struct context_entry *context;
499 unsigned long phy_addr;
500 unsigned long flags;
501
502 spin_lock_irqsave(&iommu->lock, flags);
503 root = &iommu->root_entry[bus];
504 context = get_context_addr_from_root(root);
505 if (!context) {
506 context = (struct context_entry *)alloc_pgtable_page();
507 if (!context) {
508 spin_unlock_irqrestore(&iommu->lock, flags);
509 return NULL;
510 }
511 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
512 phy_addr = virt_to_phys((void *)context);
513 set_root_value(root, phy_addr);
514 set_root_present(root);
515 __iommu_flush_cache(iommu, root, sizeof(*root));
516 }
517 spin_unlock_irqrestore(&iommu->lock, flags);
518 return &context[devfn];
519 }
520
521 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
522 {
523 struct root_entry *root;
524 struct context_entry *context;
525 int ret;
526 unsigned long flags;
527
528 spin_lock_irqsave(&iommu->lock, flags);
529 root = &iommu->root_entry[bus];
530 context = get_context_addr_from_root(root);
531 if (!context) {
532 ret = 0;
533 goto out;
534 }
535 ret = context_present(&context[devfn]);
536 out:
537 spin_unlock_irqrestore(&iommu->lock, flags);
538 return ret;
539 }
540
541 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
542 {
543 struct root_entry *root;
544 struct context_entry *context;
545 unsigned long flags;
546
547 spin_lock_irqsave(&iommu->lock, flags);
548 root = &iommu->root_entry[bus];
549 context = get_context_addr_from_root(root);
550 if (context) {
551 context_clear_entry(&context[devfn]);
552 __iommu_flush_cache(iommu, &context[devfn],
553 sizeof(*context));
554 }
555 spin_unlock_irqrestore(&iommu->lock, flags);
556 }
557
558 static void free_context_table(struct intel_iommu *iommu)
559 {
560 struct root_entry *root;
561 int i;
562 unsigned long flags;
563 struct context_entry *context;
564
565 spin_lock_irqsave(&iommu->lock, flags);
566 if (!iommu->root_entry) {
567 goto out;
568 }
569 for (i = 0; i < ROOT_ENTRY_NR; i++) {
570 root = &iommu->root_entry[i];
571 context = get_context_addr_from_root(root);
572 if (context)
573 free_pgtable_page(context);
574 }
575 free_pgtable_page(iommu->root_entry);
576 iommu->root_entry = NULL;
577 out:
578 spin_unlock_irqrestore(&iommu->lock, flags);
579 }
580
581 /* page table handling */
582 #define LEVEL_STRIDE (9)
583 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
584
585 static inline int agaw_to_level(int agaw)
586 {
587 return agaw + 2;
588 }
589
590 static inline int agaw_to_width(int agaw)
591 {
592 return 30 + agaw * LEVEL_STRIDE;
593
594 }
595
596 static inline int width_to_agaw(int width)
597 {
598 return (width - 30) / LEVEL_STRIDE;
599 }
600
601 static inline unsigned int level_to_offset_bits(int level)
602 {
603 return (12 + (level - 1) * LEVEL_STRIDE);
604 }
605
606 static inline int address_level_offset(u64 addr, int level)
607 {
608 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
609 }
610
611 static inline u64 level_mask(int level)
612 {
613 return ((u64)-1 << level_to_offset_bits(level));
614 }
615
616 static inline u64 level_size(int level)
617 {
618 return ((u64)1 << level_to_offset_bits(level));
619 }
620
621 static inline u64 align_to_level(u64 addr, int level)
622 {
623 return ((addr + level_size(level) - 1) & level_mask(level));
624 }
625
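/*
 * Worked example for the helpers above, for agaw == 2 (the value
 * width_to_agaw(48) yields for the default 48-bit address width):
 * agaw_to_level(2) == 4 page-table levels, each indexed by
 * LEVEL_STRIDE == 9 address bits; level 1 covers address bits 12-20,
 * level 4 covers bits 39-47, and level_size(1) == 4 KiB.
 */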
626 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
627 {
628 int addr_width = agaw_to_width(domain->agaw);
629 struct dma_pte *parent, *pte = NULL;
630 int level = agaw_to_level(domain->agaw);
631 int offset;
632 unsigned long flags;
633
634 BUG_ON(!domain->pgd);
635
636 addr &= (((u64)1) << addr_width) - 1;
637 parent = domain->pgd;
638
639 spin_lock_irqsave(&domain->mapping_lock, flags);
640 while (level > 0) {
641 void *tmp_page;
642
643 offset = address_level_offset(addr, level);
644 pte = &parent[offset];
645 if (level == 1)
646 break;
647
648 if (!dma_pte_present(pte)) {
649 tmp_page = alloc_pgtable_page();
650
651 if (!tmp_page) {
652 spin_unlock_irqrestore(&domain->mapping_lock,
653 flags);
654 return NULL;
655 }
656 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
657 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
658 /*
659 * high level table always sets r/w, last level page
660 * table control read/write
661 */
662 dma_set_pte_readable(pte);
663 dma_set_pte_writable(pte);
664 domain_flush_cache(domain, pte, sizeof(*pte));
665 }
666 parent = phys_to_virt(dma_pte_addr(pte));
667 level--;
668 }
669
670 spin_unlock_irqrestore(&domain->mapping_lock, flags);
671 return pte;
672 }
673
674 /* return address's pte at specific level */
675 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
676 int level)
677 {
678 struct dma_pte *parent, *pte = NULL;
679 int total = agaw_to_level(domain->agaw);
680 int offset;
681
682 parent = domain->pgd;
683 while (level <= total) {
684 offset = address_level_offset(addr, total);
685 pte = &parent[offset];
686 if (level == total)
687 return pte;
688
689 if (!dma_pte_present(pte))
690 break;
691 parent = phys_to_virt(dma_pte_addr(pte));
692 total--;
693 }
694 return NULL;
695 }
696
697 /* clear one page's page table */
698 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
699 {
700 struct dma_pte *pte = NULL;
701
702 /* get last level pte */
703 pte = dma_addr_level_pte(domain, addr, 1);
704
705 if (pte) {
706 dma_clear_pte(pte);
707 domain_flush_cache(domain, pte, sizeof(*pte));
708 }
709 }
710
711 /* clear last level pte, a tlb flush should be followed */
712 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
713 {
714 int addr_width = agaw_to_width(domain->agaw);
715
716 start &= (((u64)1) << addr_width) - 1;
717 end &= (((u64)1) << addr_width) - 1;
718 /* in case start or end is not page aligned */
719 start = PAGE_ALIGN(start);
720 end &= PAGE_MASK;
721
722 /* we don't need lock here, nobody else touches the iova range */
723 while (start < end) {
724 dma_pte_clear_one(domain, start);
725 start += VTD_PAGE_SIZE;
726 }
727 }
728
729 /* free page table pages. last level pte should already be cleared */
730 static void dma_pte_free_pagetable(struct dmar_domain *domain,
731 u64 start, u64 end)
732 {
733 int addr_width = agaw_to_width(domain->agaw);
734 struct dma_pte *pte;
735 int total = agaw_to_level(domain->agaw);
736 int level;
737 u64 tmp;
738
739 start &= (((u64)1) << addr_width) - 1;
740 end &= (((u64)1) << addr_width) - 1;
741
742 /* we don't need lock here, nobody else touches the iova range */
743 level = 2;
744 while (level <= total) {
745 tmp = align_to_level(start, level);
746 if (tmp >= end || (tmp + level_size(level) > end))
747 return;
748
749 while (tmp < end) {
750 pte = dma_addr_level_pte(domain, tmp, level);
751 if (pte) {
752 free_pgtable_page(
753 phys_to_virt(dma_pte_addr(pte)));
754 dma_clear_pte(pte);
755 domain_flush_cache(domain, pte, sizeof(*pte));
756 }
757 tmp += level_size(level);
758 }
759 level++;
760 }
761 /* free pgd */
762 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
763 free_pgtable_page(domain->pgd);
764 domain->pgd = NULL;
765 }
766 }
767
768 /* iommu handling */
769 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
770 {
771 struct root_entry *root;
772 unsigned long flags;
773
774 root = (struct root_entry *)alloc_pgtable_page();
775 if (!root)
776 return -ENOMEM;
777
778 __iommu_flush_cache(iommu, root, ROOT_SIZE);
779
780 spin_lock_irqsave(&iommu->lock, flags);
781 iommu->root_entry = root;
782 spin_unlock_irqrestore(&iommu->lock, flags);
783
784 return 0;
785 }
786
787 static void iommu_set_root_entry(struct intel_iommu *iommu)
788 {
789 void *addr;
790 u32 cmd, sts;
791 unsigned long flag;
792
793 addr = iommu->root_entry;
794
795 spin_lock_irqsave(&iommu->register_lock, flag);
796 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
797
798 cmd = iommu->gcmd | DMA_GCMD_SRTP;
799 writel(cmd, iommu->reg + DMAR_GCMD_REG);
800
801 /* Make sure hardware complete it */
802 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
803 readl, (sts & DMA_GSTS_RTPS), sts);
804
805 spin_unlock_irqrestore(&iommu->register_lock, flag);
806 }
807
808 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
809 {
810 u32 val;
811 unsigned long flag;
812
813 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
814 return;
815 val = iommu->gcmd | DMA_GCMD_WBF;
816
817 spin_lock_irqsave(&iommu->register_lock, flag);
818 writel(val, iommu->reg + DMAR_GCMD_REG);
819
820 /* Make sure hardware complete it */
821 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
822 readl, (!(val & DMA_GSTS_WBFS)), val);
823
824 spin_unlock_irqrestore(&iommu->register_lock, flag);
825 }
826
827 /* return value determines whether we need a write buffer flush */
828 static int __iommu_flush_context(struct intel_iommu *iommu,
829 u16 did, u16 source_id, u8 function_mask, u64 type,
830 int non_present_entry_flush)
831 {
832 u64 val = 0;
833 unsigned long flag;
834
835 /*
836 * In the non-present entry flush case: if the hardware doesn't cache
837 * non-present entries we do nothing; if it does cache them, we flush
838 * the entries of domain 0 (domain id 0 is used by hardware to cache
839 * any non-present entries).
840 */
841 if (non_present_entry_flush) {
842 if (!cap_caching_mode(iommu->cap))
843 return 1;
844 else
845 did = 0;
846 }
847
848 switch (type) {
849 case DMA_CCMD_GLOBAL_INVL:
850 val = DMA_CCMD_GLOBAL_INVL;
851 break;
852 case DMA_CCMD_DOMAIN_INVL:
853 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
854 break;
855 case DMA_CCMD_DEVICE_INVL:
856 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
857 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
858 break;
859 default:
860 BUG();
861 }
862 val |= DMA_CCMD_ICC;
863
864 spin_lock_irqsave(&iommu->register_lock, flag);
865 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
866
867 /* Make sure hardware complete it */
868 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
869 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
870
871 spin_unlock_irqrestore(&iommu->register_lock, flag);
872
873 /* flush context entry will implicitly flush write buffer */
874 return 0;
875 }
876
877 /* return value determines whether we need a write buffer flush */
878 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
879 u64 addr, unsigned int size_order, u64 type,
880 int non_present_entry_flush)
881 {
882 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
883 u64 val = 0, val_iva = 0;
884 unsigned long flag;
885
886 /*
887 * In the non-present entry flush case: if the hardware doesn't cache
888 * non-present entries we do nothing; if it does cache them, we flush
889 * the entries of domain 0 (domain id 0 is used by hardware to cache
890 * any non-present entries).
891 */
892 if (non_present_entry_flush) {
893 if (!cap_caching_mode(iommu->cap))
894 return 1;
895 else
896 did = 0;
897 }
898
899 switch (type) {
900 case DMA_TLB_GLOBAL_FLUSH:
901 /* a global flush doesn't need to set IVA_REG */
902 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
903 break;
904 case DMA_TLB_DSI_FLUSH:
905 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
906 break;
907 case DMA_TLB_PSI_FLUSH:
908 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
909 /* Note: always flush non-leaf currently */
910 val_iva = size_order | addr;
911 break;
912 default:
913 BUG();
914 }
915 /* Note: set drain read/write */
916 #if 0
917 /*
918 * This is probably only needed to be extra safe; it looks like we
919 * can ignore it without any impact.
920 */
921 if (cap_read_drain(iommu->cap))
922 val |= DMA_TLB_READ_DRAIN;
923 #endif
924 if (cap_write_drain(iommu->cap))
925 val |= DMA_TLB_WRITE_DRAIN;
926
927 spin_lock_irqsave(&iommu->register_lock, flag);
928 /* Note: Only uses first TLB reg currently */
929 if (val_iva)
930 dmar_writeq(iommu->reg + tlb_offset, val_iva);
931 dmar_writeq(iommu->reg + tlb_offset + 8, val);
932
933 /* Make sure hardware complete it */
934 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
935 dmar_readq, (!(val & DMA_TLB_IVT)), val);
936
937 spin_unlock_irqrestore(&iommu->register_lock, flag);
938
939 /* check IOTLB invalidation granularity */
940 if (DMA_TLB_IAIG(val) == 0)
941 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
942 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
943 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
944 (unsigned long long)DMA_TLB_IIRG(type),
945 (unsigned long long)DMA_TLB_IAIG(val));
946 /* flush iotlb entry will implicitly flush write buffer */
947 return 0;
948 }
949
950 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
951 u64 addr, unsigned int pages, int non_present_entry_flush)
952 {
953 unsigned int mask;
954
955 BUG_ON(addr & (~VTD_PAGE_MASK));
956 BUG_ON(pages == 0);
957
958 /* Fall back to domain-selective flush if there is no PSI support */
959 if (!cap_pgsel_inv(iommu->cap))
960 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
961 DMA_TLB_DSI_FLUSH,
962 non_present_entry_flush);
963
964 /*
965 * PSI requires the flush size to be 2 ^ x pages, and the base address
966 * to be naturally aligned to that size.
967 */
968 mask = ilog2(__roundup_pow_of_two(pages));
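/* e.g. pages == 5 rounds up to 8, so mask == 3 (an 8-page invalidation) */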
969 /* Fall back to domain-selective flush if the size is too big */
970 if (mask > cap_max_amask_val(iommu->cap))
971 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
972 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
973
974 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
975 DMA_TLB_PSI_FLUSH,
976 non_present_entry_flush);
977 }
978
979 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
980 {
981 u32 pmen;
982 unsigned long flags;
983
984 spin_lock_irqsave(&iommu->register_lock, flags);
985 pmen = readl(iommu->reg + DMAR_PMEN_REG);
986 pmen &= ~DMA_PMEN_EPM;
987 writel(pmen, iommu->reg + DMAR_PMEN_REG);
988
989 /* wait for the protected region status bit to clear */
990 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
991 readl, !(pmen & DMA_PMEN_PRS), pmen);
992
993 spin_unlock_irqrestore(&iommu->register_lock, flags);
994 }
995
996 static int iommu_enable_translation(struct intel_iommu *iommu)
997 {
998 u32 sts;
999 unsigned long flags;
1000
1001 spin_lock_irqsave(&iommu->register_lock, flags);
1002 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
1003
1004 /* Make sure hardware complete it */
1005 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1006 readl, (sts & DMA_GSTS_TES), sts);
1007
1008 iommu->gcmd |= DMA_GCMD_TE;
1009 spin_unlock_irqrestore(&iommu->register_lock, flags);
1010 return 0;
1011 }
1012
1013 static int iommu_disable_translation(struct intel_iommu *iommu)
1014 {
1015 u32 sts;
1016 unsigned long flag;
1017
1018 spin_lock_irqsave(&iommu->register_lock, flag);
1019 iommu->gcmd &= ~DMA_GCMD_TE;
1020 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1021
1022 /* Make sure hardware complete it */
1023 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1024 readl, (!(sts & DMA_GSTS_TES)), sts);
1025
1026 spin_unlock_irqrestore(&iommu->register_lock, flag);
1027 return 0;
1028 }
1029
1030 /* iommu interrupt handling. Most of it is MSI-like. */
1031
1032 static const char *fault_reason_strings[] =
1033 {
1034 "Software",
1035 "Present bit in root entry is clear",
1036 "Present bit in context entry is clear",
1037 "Invalid context entry",
1038 "Access beyond MGAW",
1039 "PTE Write access is not set",
1040 "PTE Read access is not set",
1041 "Next page table ptr is invalid",
1042 "Root table address invalid",
1043 "Context table ptr is invalid",
1044 "non-zero reserved fields in RTP",
1045 "non-zero reserved fields in CTP",
1046 "non-zero reserved fields in PTE",
1047 };
1048 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1049
1050 const char *dmar_get_fault_reason(u8 fault_reason)
1051 {
1052 if (fault_reason > MAX_FAULT_REASON_IDX)
1053 return "Unknown";
1054 else
1055 return fault_reason_strings[fault_reason];
1056 }
1057
1058 void dmar_msi_unmask(unsigned int irq)
1059 {
1060 struct intel_iommu *iommu = get_irq_data(irq);
1061 unsigned long flag;
1062
1063 /* unmask it */
1064 spin_lock_irqsave(&iommu->register_lock, flag);
1065 writel(0, iommu->reg + DMAR_FECTL_REG);
1066 /* Read a reg to force flush the post write */
1067 readl(iommu->reg + DMAR_FECTL_REG);
1068 spin_unlock_irqrestore(&iommu->register_lock, flag);
1069 }
1070
1071 void dmar_msi_mask(unsigned int irq)
1072 {
1073 unsigned long flag;
1074 struct intel_iommu *iommu = get_irq_data(irq);
1075
1076 /* mask it */
1077 spin_lock_irqsave(&iommu->register_lock, flag);
1078 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1079 /* Read a reg to force flush the post write */
1080 readl(iommu->reg + DMAR_FECTL_REG);
1081 spin_unlock_irqrestore(&iommu->register_lock, flag);
1082 }
1083
1084 void dmar_msi_write(int irq, struct msi_msg *msg)
1085 {
1086 struct intel_iommu *iommu = get_irq_data(irq);
1087 unsigned long flag;
1088
1089 spin_lock_irqsave(&iommu->register_lock, flag);
1090 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1091 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1092 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1093 spin_unlock_irqrestore(&iommu->register_lock, flag);
1094 }
1095
1096 void dmar_msi_read(int irq, struct msi_msg *msg)
1097 {
1098 struct intel_iommu *iommu = get_irq_data(irq);
1099 unsigned long flag;
1100
1101 spin_lock_irqsave(&iommu->register_lock, flag);
1102 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1103 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1104 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1105 spin_unlock_irqrestore(&iommu->register_lock, flag);
1106 }
1107
1108 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1109 u8 fault_reason, u16 source_id, unsigned long long addr)
1110 {
1111 const char *reason;
1112
1113 reason = dmar_get_fault_reason(fault_reason);
1114
1115 printk(KERN_ERR
1116 "DMAR:[%s] Request device [%02x:%02x.%d] "
1117 "fault addr %llx \n"
1118 "DMAR:[fault reason %02d] %s\n",
1119 (type ? "DMA Read" : "DMA Write"),
1120 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1121 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1122 return 0;
1123 }
1124
1125 #define PRIMARY_FAULT_REG_LEN (16)
1126 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1127 {
1128 struct intel_iommu *iommu = dev_id;
1129 int reg, fault_index;
1130 u32 fault_status;
1131 unsigned long flag;
1132
1133 spin_lock_irqsave(&iommu->register_lock, flag);
1134 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1135
1136 /* TBD: ignore advanced fault log currently */
1137 if (!(fault_status & DMA_FSTS_PPF))
1138 goto clear_overflow;
1139
1140 fault_index = dma_fsts_fault_record_index(fault_status);
1141 reg = cap_fault_reg_offset(iommu->cap);
1142 while (1) {
1143 u8 fault_reason;
1144 u16 source_id;
1145 u64 guest_addr;
1146 int type;
1147 u32 data;
1148
1149 /* highest 32 bits */
1150 data = readl(iommu->reg + reg +
1151 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1152 if (!(data & DMA_FRCD_F))
1153 break;
1154
1155 fault_reason = dma_frcd_fault_reason(data);
1156 type = dma_frcd_type(data);
1157
1158 data = readl(iommu->reg + reg +
1159 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1160 source_id = dma_frcd_source_id(data);
1161
1162 guest_addr = dmar_readq(iommu->reg + reg +
1163 fault_index * PRIMARY_FAULT_REG_LEN);
1164 guest_addr = dma_frcd_page_addr(guest_addr);
1165 /* clear the fault */
1166 writel(DMA_FRCD_F, iommu->reg + reg +
1167 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1168
1169 spin_unlock_irqrestore(&iommu->register_lock, flag);
1170
1171 iommu_page_fault_do_one(iommu, type, fault_reason,
1172 source_id, guest_addr);
1173
1174 fault_index++;
1175 if (fault_index > cap_num_fault_regs(iommu->cap))
1176 fault_index = 0;
1177 spin_lock_irqsave(&iommu->register_lock, flag);
1178 }
1179 clear_overflow:
1180 /* clear primary fault overflow */
1181 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1182 if (fault_status & DMA_FSTS_PFO)
1183 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1184
1185 spin_unlock_irqrestore(&iommu->register_lock, flag);
1186 return IRQ_HANDLED;
1187 }
1188
1189 int dmar_set_interrupt(struct intel_iommu *iommu)
1190 {
1191 int irq, ret;
1192
1193 irq = create_irq();
1194 if (!irq) {
1195 printk(KERN_ERR "IOMMU: no free vectors\n");
1196 return -EINVAL;
1197 }
1198
1199 set_irq_data(irq, iommu);
1200 iommu->irq = irq;
1201
1202 ret = arch_setup_dmar_msi(irq);
1203 if (ret) {
1204 set_irq_data(irq, NULL);
1205 iommu->irq = 0;
1206 destroy_irq(irq);
1207 return 0;
1208 }
1209
1210 /* Make sure the fault registers are cleared */
1211 iommu_page_fault(irq, iommu);
1212
1213 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1214 if (ret)
1215 printk(KERN_ERR "IOMMU: can't request irq\n");
1216 return ret;
1217 }
1218
1219 static int iommu_init_domains(struct intel_iommu *iommu)
1220 {
1221 unsigned long ndomains;
1222 unsigned long nlongs;
1223
1224 ndomains = cap_ndoms(iommu->cap);
1225 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1226 nlongs = BITS_TO_LONGS(ndomains);
1227
1228 /* TBD: there might be 64K domains;
1229 * consider a different allocation scheme for future chips.
1230 */
1231 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1232 if (!iommu->domain_ids) {
1233 printk(KERN_ERR "Allocating domain id array failed\n");
1234 return -ENOMEM;
1235 }
1236 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1237 GFP_KERNEL);
1238 if (!iommu->domains) {
1239 printk(KERN_ERR "Allocating domain array failed\n");
1240 kfree(iommu->domain_ids);
1241 return -ENOMEM;
1242 }
1243
1244 spin_lock_init(&iommu->lock);
1245
1246 /*
1247 * if Caching Mode is set, then invalid translations are tagged
1248 * with domain id 0, hence we need to pre-allocate it.
1249 */
1250 if (cap_caching_mode(iommu->cap))
1251 set_bit(0, iommu->domain_ids);
1252 return 0;
1253 }
1254
1255
1256 static void domain_exit(struct dmar_domain *domain);
1257 static void vm_domain_exit(struct dmar_domain *domain);
1258
1259 void free_dmar_iommu(struct intel_iommu *iommu)
1260 {
1261 struct dmar_domain *domain;
1262 int i;
1263 unsigned long flags;
1264
1265 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1266 for (; i < cap_ndoms(iommu->cap); ) {
1267 domain = iommu->domains[i];
1268 clear_bit(i, iommu->domain_ids);
1269
1270 spin_lock_irqsave(&domain->iommu_lock, flags);
1271 if (--domain->iommu_count == 0) {
1272 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1273 vm_domain_exit(domain);
1274 else
1275 domain_exit(domain);
1276 }
1277 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1278
1279 i = find_next_bit(iommu->domain_ids,
1280 cap_ndoms(iommu->cap), i+1);
1281 }
1282
1283 if (iommu->gcmd & DMA_GCMD_TE)
1284 iommu_disable_translation(iommu);
1285
1286 if (iommu->irq) {
1287 set_irq_data(iommu->irq, NULL);
1288 /* This will mask the irq */
1289 free_irq(iommu->irq, iommu);
1290 destroy_irq(iommu->irq);
1291 }
1292
1293 kfree(iommu->domains);
1294 kfree(iommu->domain_ids);
1295
1296 g_iommus[iommu->seq_id] = NULL;
1297
1298 /* if all iommus are freed, free g_iommus */
1299 for (i = 0; i < g_num_of_iommus; i++) {
1300 if (g_iommus[i])
1301 break;
1302 }
1303
1304 if (i == g_num_of_iommus)
1305 kfree(g_iommus);
1306
1307 /* free context mapping */
1308 free_context_table(iommu);
1309 }
1310
1311 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1312 {
1313 unsigned long num;
1314 unsigned long ndomains;
1315 struct dmar_domain *domain;
1316 unsigned long flags;
1317
1318 domain = alloc_domain_mem();
1319 if (!domain)
1320 return NULL;
1321
1322 ndomains = cap_ndoms(iommu->cap);
1323
1324 spin_lock_irqsave(&iommu->lock, flags);
1325 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1326 if (num >= ndomains) {
1327 spin_unlock_irqrestore(&iommu->lock, flags);
1328 free_domain_mem(domain);
1329 printk(KERN_ERR "IOMMU: no free domain ids\n");
1330 return NULL;
1331 }
1332
1333 set_bit(num, iommu->domain_ids);
1334 domain->id = num;
1335 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1336 set_bit(iommu->seq_id, &domain->iommu_bmp);
1337 domain->flags = 0;
1338 iommu->domains[num] = domain;
1339 spin_unlock_irqrestore(&iommu->lock, flags);
1340
1341 return domain;
1342 }
1343
1344 static void iommu_free_domain(struct dmar_domain *domain)
1345 {
1346 unsigned long flags;
1347 struct intel_iommu *iommu;
1348
1349 iommu = domain_get_iommu(domain);
1350
1351 spin_lock_irqsave(&iommu->lock, flags);
1352 clear_bit(domain->id, iommu->domain_ids);
1353 spin_unlock_irqrestore(&iommu->lock, flags);
1354 }
1355
1356 static struct iova_domain reserved_iova_list;
1357 static struct lock_class_key reserved_alloc_key;
1358 static struct lock_class_key reserved_rbtree_key;
1359
1360 static void dmar_init_reserved_ranges(void)
1361 {
1362 struct pci_dev *pdev = NULL;
1363 struct iova *iova;
1364 int i;
1365 u64 addr, size;
1366
1367 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1368
1369 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1370 &reserved_alloc_key);
1371 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1372 &reserved_rbtree_key);
1373
1374 /* IOAPIC ranges shouldn't be accessed by DMA */
1375 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1376 IOVA_PFN(IOAPIC_RANGE_END));
1377 if (!iova)
1378 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1379
1380 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1381 for_each_pci_dev(pdev) {
1382 struct resource *r;
1383
1384 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1385 r = &pdev->resource[i];
1386 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1387 continue;
1388 addr = r->start;
1389 addr &= PAGE_MASK;
1390 size = r->end - addr;
1391 size = PAGE_ALIGN(size);
1392 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1393 IOVA_PFN(size + addr) - 1);
1394 if (!iova)
1395 printk(KERN_ERR "Reserve iova failed\n");
1396 }
1397 }
1398
1399 }
1400
1401 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1402 {
1403 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1404 }
1405
1406 static inline int guestwidth_to_adjustwidth(int gaw)
1407 {
1408 int agaw;
1409 int r = (gaw - 12) % 9;
1410
1411 if (r == 0)
1412 agaw = gaw;
1413 else
1414 agaw = gaw + 9 - r;
1415 if (agaw > 64)
1416 agaw = 64;
1417 return agaw;
1418 }
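/*
 * Examples: guestwidth_to_adjustwidth(48) == 48, since 48 - 12 is already
 * a multiple of the 9-bit level stride, while guestwidth_to_adjustwidth(40)
 * rounds up to 48.
 */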
1419
1420 static int domain_init(struct dmar_domain *domain, int guest_width)
1421 {
1422 struct intel_iommu *iommu;
1423 int adjust_width, agaw;
1424 unsigned long sagaw;
1425
1426 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1427 spin_lock_init(&domain->mapping_lock);
1428 spin_lock_init(&domain->iommu_lock);
1429
1430 domain_reserve_special_ranges(domain);
1431
1432 /* calculate AGAW */
1433 iommu = domain_get_iommu(domain);
1434 if (guest_width > cap_mgaw(iommu->cap))
1435 guest_width = cap_mgaw(iommu->cap);
1436 domain->gaw = guest_width;
1437 adjust_width = guestwidth_to_adjustwidth(guest_width);
1438 agaw = width_to_agaw(adjust_width);
1439 sagaw = cap_sagaw(iommu->cap);
1440 if (!test_bit(agaw, &sagaw)) {
1441 /* hardware doesn't support it, choose a bigger one */
1442 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1443 agaw = find_next_bit(&sagaw, 5, agaw);
1444 if (agaw >= 5)
1445 return -ENODEV;
1446 }
1447 domain->agaw = agaw;
1448 INIT_LIST_HEAD(&domain->devices);
1449
1450 if (ecap_coherent(iommu->ecap))
1451 domain->iommu_coherency = 1;
1452 else
1453 domain->iommu_coherency = 0;
1454
1455 if (ecap_sc_support(iommu->ecap))
1456 domain->iommu_snooping = 1;
1457 else
1458 domain->iommu_snooping = 0;
1459
1460 domain->iommu_count = 1;
1461
1462 /* always allocate the top pgd */
1463 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1464 if (!domain->pgd)
1465 return -ENOMEM;
1466 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1467 return 0;
1468 }
1469
1470 static void domain_exit(struct dmar_domain *domain)
1471 {
1472 u64 end;
1473
1474 /* Domain 0 is reserved, so don't process it */
1475 if (!domain)
1476 return;
1477
1478 domain_remove_dev_info(domain);
1479 /* destroy iovas */
1480 put_iova_domain(&domain->iovad);
1481 end = DOMAIN_MAX_ADDR(domain->gaw);
1482 end = end & (~PAGE_MASK);
1483
1484 /* clear ptes */
1485 dma_pte_clear_range(domain, 0, end);
1486
1487 /* free page tables */
1488 dma_pte_free_pagetable(domain, 0, end);
1489
1490 iommu_free_domain(domain);
1491 free_domain_mem(domain);
1492 }
1493
1494 static int domain_context_mapping_one(struct dmar_domain *domain,
1495 u8 bus, u8 devfn)
1496 {
1497 struct context_entry *context;
1498 unsigned long flags;
1499 struct intel_iommu *iommu;
1500 struct dma_pte *pgd;
1501 unsigned long num;
1502 unsigned long ndomains;
1503 int id;
1504 int agaw;
1505
1506 pr_debug("Set context mapping for %02x:%02x.%d\n",
1507 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1508 BUG_ON(!domain->pgd);
1509
1510 iommu = device_to_iommu(bus, devfn);
1511 if (!iommu)
1512 return -ENODEV;
1513
1514 context = device_to_context_entry(iommu, bus, devfn);
1515 if (!context)
1516 return -ENOMEM;
1517 spin_lock_irqsave(&iommu->lock, flags);
1518 if (context_present(context)) {
1519 spin_unlock_irqrestore(&iommu->lock, flags);
1520 return 0;
1521 }
1522
1523 id = domain->id;
1524 pgd = domain->pgd;
1525
1526 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1527 int found = 0;
1528
1529 /* find an available domain id for this device in iommu */
1530 ndomains = cap_ndoms(iommu->cap);
1531 num = find_first_bit(iommu->domain_ids, ndomains);
1532 for (; num < ndomains; ) {
1533 if (iommu->domains[num] == domain) {
1534 id = num;
1535 found = 1;
1536 break;
1537 }
1538 num = find_next_bit(iommu->domain_ids,
1539 cap_ndoms(iommu->cap), num+1);
1540 }
1541
1542 if (found == 0) {
1543 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1544 if (num >= ndomains) {
1545 spin_unlock_irqrestore(&iommu->lock, flags);
1546 printk(KERN_ERR "IOMMU: no free domain ids\n");
1547 return -EFAULT;
1548 }
1549
1550 set_bit(num, iommu->domain_ids);
1551 iommu->domains[num] = domain;
1552 id = num;
1553 }
1554
1555 /* Skip the top levels of the page tables for
1556 * iommus that have a smaller agaw than the default.
1557 */
1558 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1559 pgd = phys_to_virt(dma_pte_addr(pgd));
1560 if (!dma_pte_present(pgd)) {
1561 spin_unlock_irqrestore(&iommu->lock, flags);
1562 return -ENOMEM;
1563 }
1564 }
1565 }
1566
1567 context_set_domain_id(context, id);
1568 context_set_address_width(context, iommu->agaw);
1569 context_set_address_root(context, virt_to_phys(pgd));
1570 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1571 context_set_fault_enable(context);
1572 context_set_present(context);
1573 domain_flush_cache(domain, context, sizeof(*context));
1574
1575 /* it's a non-present to present mapping */
1576 if (iommu->flush.flush_context(iommu, domain->id,
1577 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1578 DMA_CCMD_DEVICE_INVL, 1))
1579 iommu_flush_write_buffer(iommu);
1580 else
1581 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1582
1583 spin_unlock_irqrestore(&iommu->lock, flags);
1584
1585 spin_lock_irqsave(&domain->iommu_lock, flags);
1586 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1587 domain->iommu_count++;
1588 domain_update_iommu_cap(domain);
1589 }
1590 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1591 return 0;
1592 }
1593
1594 static int
1595 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1596 {
1597 int ret;
1598 struct pci_dev *tmp, *parent;
1599
1600 ret = domain_context_mapping_one(domain, pdev->bus->number,
1601 pdev->devfn);
1602 if (ret)
1603 return ret;
1604
1605 /* dependent device mapping */
1606 tmp = pci_find_upstream_pcie_bridge(pdev);
1607 if (!tmp)
1608 return 0;
1609 /* Secondary interface's bus number and devfn 0 */
1610 parent = pdev->bus->self;
1611 while (parent != tmp) {
1612 ret = domain_context_mapping_one(domain, parent->bus->number,
1613 parent->devfn);
1614 if (ret)
1615 return ret;
1616 parent = parent->bus->self;
1617 }
1618 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1619 return domain_context_mapping_one(domain,
1620 tmp->subordinate->number, 0);
1621 else /* this is a legacy PCI bridge */
1622 return domain_context_mapping_one(domain,
1623 tmp->bus->number, tmp->devfn);
1624 }
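/*
 * Illustration of the walk above with a hypothetical topology: a legacy
 * PCI device 06:01.0 sits below a PCIe-to-PCI bridge whose secondary bus
 * is 6.  The device itself gets a context entry first, then any PCI-to-PCI
 * bridges between it and the PCIe bridge, and finally (bus 6, devfn 0),
 * since requests forwarded by the PCIe-to-PCI bridge carry that source-id.
 */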
1625
1626 static int domain_context_mapped(struct pci_dev *pdev)
1627 {
1628 int ret;
1629 struct pci_dev *tmp, *parent;
1630 struct intel_iommu *iommu;
1631
1632 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1633 if (!iommu)
1634 return -ENODEV;
1635
1636 ret = device_context_mapped(iommu,
1637 pdev->bus->number, pdev->devfn);
1638 if (!ret)
1639 return ret;
1640 /* dependent device mapping */
1641 tmp = pci_find_upstream_pcie_bridge(pdev);
1642 if (!tmp)
1643 return ret;
1644 /* Secondary interface's bus number and devfn 0 */
1645 parent = pdev->bus->self;
1646 while (parent != tmp) {
1647 ret = device_context_mapped(iommu, parent->bus->number,
1648 parent->devfn);
1649 if (!ret)
1650 return ret;
1651 parent = parent->bus->self;
1652 }
1653 if (tmp->is_pcie)
1654 return device_context_mapped(iommu,
1655 tmp->subordinate->number, 0);
1656 else
1657 return device_context_mapped(iommu,
1658 tmp->bus->number, tmp->devfn);
1659 }
1660
1661 static int
1662 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1663 u64 hpa, size_t size, int prot)
1664 {
1665 u64 start_pfn, end_pfn;
1666 struct dma_pte *pte;
1667 int index;
1668 int addr_width = agaw_to_width(domain->agaw);
1669
1670 hpa &= (((u64)1) << addr_width) - 1;
1671
1672 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1673 return -EINVAL;
1674 iova &= PAGE_MASK;
1675 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1676 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1677 index = 0;
1678 while (start_pfn < end_pfn) {
1679 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1680 if (!pte)
1681 return -ENOMEM;
1682 /* We don't need lock here, nobody else
1683 * touches the iova range
1684 */
1685 BUG_ON(dma_pte_addr(pte));
1686 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1687 dma_set_pte_prot(pte, prot);
1688 domain_flush_cache(domain, pte, sizeof(*pte));
1689 start_pfn++;
1690 index++;
1691 }
1692 return 0;
1693 }
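/*
 * Example: domain_page_mapping(domain, 0x100000, 0x7f8000, 0x2000,
 * DMA_PTE_READ | DMA_PTE_WRITE) creates two last-level PTEs, mapping
 * IOVA pages 0x100000 and 0x101000 to host pages 0x7f8000 and 0x7f9000
 * with read/write permission.
 */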
1694
1695 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1696 {
1697 if (!iommu)
1698 return;
1699
1700 clear_context_table(iommu, bus, devfn);
1701 iommu->flush.flush_context(iommu, 0, 0, 0,
1702 DMA_CCMD_GLOBAL_INVL, 0);
1703 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1704 DMA_TLB_GLOBAL_FLUSH, 0);
1705 }
1706
1707 static void domain_remove_dev_info(struct dmar_domain *domain)
1708 {
1709 struct device_domain_info *info;
1710 unsigned long flags;
1711 struct intel_iommu *iommu;
1712
1713 spin_lock_irqsave(&device_domain_lock, flags);
1714 while (!list_empty(&domain->devices)) {
1715 info = list_entry(domain->devices.next,
1716 struct device_domain_info, link);
1717 list_del(&info->link);
1718 list_del(&info->global);
1719 if (info->dev)
1720 info->dev->dev.archdata.iommu = NULL;
1721 spin_unlock_irqrestore(&device_domain_lock, flags);
1722
1723 iommu = device_to_iommu(info->bus, info->devfn);
1724 iommu_detach_dev(iommu, info->bus, info->devfn);
1725 free_devinfo_mem(info);
1726
1727 spin_lock_irqsave(&device_domain_lock, flags);
1728 }
1729 spin_unlock_irqrestore(&device_domain_lock, flags);
1730 }
1731
1732 /*
1733 * find_domain
1734 * Note: struct pci_dev->dev.archdata.iommu stores the device-domain info
1735 */
1736 static struct dmar_domain *
1737 find_domain(struct pci_dev *pdev)
1738 {
1739 struct device_domain_info *info;
1740
1741 /* No lock here, assumes no domain exit in normal case */
1742 info = pdev->dev.archdata.iommu;
1743 if (info)
1744 return info->domain;
1745 return NULL;
1746 }
1747
1748 /* domain is initialized */
1749 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1750 {
1751 struct dmar_domain *domain, *found = NULL;
1752 struct intel_iommu *iommu;
1753 struct dmar_drhd_unit *drhd;
1754 struct device_domain_info *info, *tmp;
1755 struct pci_dev *dev_tmp;
1756 unsigned long flags;
1757 int bus = 0, devfn = 0;
1758
1759 domain = find_domain(pdev);
1760 if (domain)
1761 return domain;
1762
1763 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1764 if (dev_tmp) {
1765 if (dev_tmp->is_pcie) {
1766 bus = dev_tmp->subordinate->number;
1767 devfn = 0;
1768 } else {
1769 bus = dev_tmp->bus->number;
1770 devfn = dev_tmp->devfn;
1771 }
1772 spin_lock_irqsave(&device_domain_lock, flags);
1773 list_for_each_entry(info, &device_domain_list, global) {
1774 if (info->bus == bus && info->devfn == devfn) {
1775 found = info->domain;
1776 break;
1777 }
1778 }
1779 spin_unlock_irqrestore(&device_domain_lock, flags);
1780 /* the pcie-pci bridge already has a domain, use it */
1781 if (found) {
1782 domain = found;
1783 goto found_domain;
1784 }
1785 }
1786
1787 /* Allocate new domain for the device */
1788 drhd = dmar_find_matched_drhd_unit(pdev);
1789 if (!drhd) {
1790 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1791 pci_name(pdev));
1792 return NULL;
1793 }
1794 iommu = drhd->iommu;
1795
1796 domain = iommu_alloc_domain(iommu);
1797 if (!domain)
1798 goto error;
1799
1800 if (domain_init(domain, gaw)) {
1801 domain_exit(domain);
1802 goto error;
1803 }
1804
1805 /* register pcie-to-pci device */
1806 if (dev_tmp) {
1807 info = alloc_devinfo_mem();
1808 if (!info) {
1809 domain_exit(domain);
1810 goto error;
1811 }
1812 info->bus = bus;
1813 info->devfn = devfn;
1814 info->dev = NULL;
1815 info->domain = domain;
1816 /* This domain is shared by devices under p2p bridge */
1817 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1818
1819 /* the pcie-to-pci bridge already has a domain, use it */
1820 found = NULL;
1821 spin_lock_irqsave(&device_domain_lock, flags);
1822 list_for_each_entry(tmp, &device_domain_list, global) {
1823 if (tmp->bus == bus && tmp->devfn == devfn) {
1824 found = tmp->domain;
1825 break;
1826 }
1827 }
1828 if (found) {
1829 free_devinfo_mem(info);
1830 domain_exit(domain);
1831 domain = found;
1832 } else {
1833 list_add(&info->link, &domain->devices);
1834 list_add(&info->global, &device_domain_list);
1835 }
1836 spin_unlock_irqrestore(&device_domain_lock, flags);
1837 }
1838
1839 found_domain:
1840 info = alloc_devinfo_mem();
1841 if (!info)
1842 goto error;
1843 info->bus = pdev->bus->number;
1844 info->devfn = pdev->devfn;
1845 info->dev = pdev;
1846 info->domain = domain;
1847 spin_lock_irqsave(&device_domain_lock, flags);
1848 /* somebody else was faster and already set up the domain */
1849 found = find_domain(pdev);
1850 if (found != NULL) {
1851 spin_unlock_irqrestore(&device_domain_lock, flags);
1852 if (found != domain) {
1853 domain_exit(domain);
1854 domain = found;
1855 }
1856 free_devinfo_mem(info);
1857 return domain;
1858 }
1859 list_add(&info->link, &domain->devices);
1860 list_add(&info->global, &device_domain_list);
1861 pdev->dev.archdata.iommu = info;
1862 spin_unlock_irqrestore(&device_domain_lock, flags);
1863 return domain;
1864 error:
1865 /* recheck it here, maybe others set it */
1866 return find_domain(pdev);
1867 }
1868
1869 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1870 unsigned long long start,
1871 unsigned long long end)
1872 {
1873 struct dmar_domain *domain;
1874 unsigned long size;
1875 unsigned long long base;
1876 int ret;
1877
1878 printk(KERN_INFO
1879 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1880 pci_name(pdev), start, end);
1881 /* page table init */
1882 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1883 if (!domain)
1884 return -ENOMEM;
1885
1886 /* The address might not be aligned */
1887 base = start & PAGE_MASK;
1888 size = end - base;
1889 size = PAGE_ALIGN(size);
1890 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1891 IOVA_PFN(base + size) - 1)) {
1892 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1893 ret = -ENOMEM;
1894 goto error;
1895 }
1896
1897 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1898 size, base, pci_name(pdev));
1899 /*
1900 * RMRR range might have overlap with physical memory range,
1901 * clear it first
1902 */
1903 dma_pte_clear_range(domain, base, base + size);
1904
1905 ret = domain_page_mapping(domain, base, base, size,
1906 DMA_PTE_READ|DMA_PTE_WRITE);
1907 if (ret)
1908 goto error;
1909
1910 /* context entry init */
1911 ret = domain_context_mapping(domain, pdev);
1912 if (!ret)
1913 return 0;
1914 error:
1915 domain_exit(domain);
1916 return ret;
1917
1918 }
1919
1920 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1921 struct pci_dev *pdev)
1922 {
1923 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1924 return 0;
1925 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1926 rmrr->end_address + 1);
1927 }
1928
1929 #ifdef CONFIG_DMAR_GFX_WA
1930 struct iommu_prepare_data {
1931 struct pci_dev *pdev;
1932 int ret;
1933 };
1934
1935 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1936 unsigned long end_pfn, void *datax)
1937 {
1938 struct iommu_prepare_data *data;
1939
1940 data = (struct iommu_prepare_data *)datax;
1941
1942 data->ret = iommu_prepare_identity_map(data->pdev,
1943 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1944 return data->ret;
1945
1946 }
1947
1948 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1949 {
1950 int nid;
1951 struct iommu_prepare_data data;
1952
1953 data.pdev = pdev;
1954 data.ret = 0;
1955
1956 for_each_online_node(nid) {
1957 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1958 if (data.ret)
1959 return data.ret;
1960 }
1961 return data.ret;
1962 }
1963
1964 static void __init iommu_prepare_gfx_mapping(void)
1965 {
1966 struct pci_dev *pdev = NULL;
1967 int ret;
1968
1969 for_each_pci_dev(pdev) {
1970 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1971 !IS_GFX_DEVICE(pdev))
1972 continue;
1973 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1974 pci_name(pdev));
1975 ret = iommu_prepare_with_active_regions(pdev);
1976 if (ret)
1977 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1978 }
1979 }
1980 #else /* !CONFIG_DMAR_GFX_WA */
1981 static inline void iommu_prepare_gfx_mapping(void)
1982 {
1983 return;
1984 }
1985 #endif
1986
1987 #ifdef CONFIG_DMAR_FLOPPY_WA
1988 static inline void iommu_prepare_isa(void)
1989 {
1990 struct pci_dev *pdev;
1991 int ret;
1992
1993 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1994 if (!pdev)
1995 return;
1996
1997 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1998 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1999
2000 if (ret)
2001 printk("IOMMU: Failed to create 0-64M identity map, "
2002 "floppy might not work\n");
2003
2004 }
2005 #else
2006 static inline void iommu_prepare_isa(void)
2007 {
2008 return;
2009 }
2010 #endif /* CONFIG_DMAR_FLOPPY_WA */
2011
2012 static int __init init_dmars(void)
2013 {
2014 struct dmar_drhd_unit *drhd;
2015 struct dmar_rmrr_unit *rmrr;
2016 struct pci_dev *pdev;
2017 struct intel_iommu *iommu;
2018 int i, ret, unit = 0;
2019
2020 /*
2021 * for each drhd
2022 * allocate root
2023 * initialize and program root entry to not present
2024 * endfor
2025 */
2026 for_each_drhd_unit(drhd) {
2027 g_num_of_iommus++;
2028 /*
2029 * No lock needed: this is only incremented in the single-
2030 * threaded kernel __init code path; all other accesses are
2031 * read-only.
2032 */
2033 }
2034
2035 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2036 GFP_KERNEL);
2037 if (!g_iommus) {
2038 printk(KERN_ERR "Allocating global iommu array failed\n");
2039 ret = -ENOMEM;
2040 goto error;
2041 }
2042
2043 deferred_flush = kzalloc(g_num_of_iommus *
2044 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2045 if (!deferred_flush) {
2046 kfree(g_iommus);
2047 ret = -ENOMEM;
2048 goto error;
2049 }
2050
2051 for_each_drhd_unit(drhd) {
2052 if (drhd->ignored)
2053 continue;
2054
2055 iommu = drhd->iommu;
2056 g_iommus[iommu->seq_id] = iommu;
2057
2058 ret = iommu_init_domains(iommu);
2059 if (ret)
2060 goto error;
2061
2062 /*
2063 * TBD:
2064 * we could share the same root & context tables
2065 * among all IOMMUs. This needs to be split out later.
2066 */
2067 ret = iommu_alloc_root_entry(iommu);
2068 if (ret) {
2069 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2070 goto error;
2071 }
2072 }
2073
2074 for_each_drhd_unit(drhd) {
2075 if (drhd->ignored)
2076 continue;
2077
2078 iommu = drhd->iommu;
2079 if (dmar_enable_qi(iommu)) {
2080 /*
2081 * Queued Invalidation is not enabled; fall back to
2082 * Register-Based Invalidation.
2083 */
2084 iommu->flush.flush_context = __iommu_flush_context;
2085 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2086 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2087 "invalidation\n",
2088 (unsigned long long)drhd->reg_base_addr);
2089 } else {
2090 iommu->flush.flush_context = qi_flush_context;
2091 iommu->flush.flush_iotlb = qi_flush_iotlb;
2092 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2093 "invalidation\n",
2094 (unsigned long long)drhd->reg_base_addr);
2095 }
2096 }
2097
2098 /*
2099 * For each rmrr
2100 * for each dev attached to rmrr
2101 * do
2102 * locate drhd for dev, alloc domain for dev
2103 * allocate free domain
2104 * allocate page table entries for rmrr
2105 * if context not allocated for bus
2106 * allocate and init context
2107 * set present in root table for this bus
2108 * init context with domain, translation etc
2109 * endfor
2110 * endfor
2111 */
2112 for_each_rmrr_units(rmrr) {
2113 for (i = 0; i < rmrr->devices_cnt; i++) {
2114 pdev = rmrr->devices[i];
2115 /* some BIOSes list non-existent devices in the DMAR table */
2116 if (!pdev)
2117 continue;
2118 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2119 if (ret)
2120 printk(KERN_ERR
2121 "IOMMU: mapping reserved region failed\n");
2122 }
2123 }
2124
2125 iommu_prepare_gfx_mapping();
2126
2127 iommu_prepare_isa();
2128
2129 /*
2130 * for each drhd
2131 * enable fault log
2132 * global invalidate context cache
2133 * global invalidate iotlb
2134 * enable translation
2135 */
2136 for_each_drhd_unit(drhd) {
2137 if (drhd->ignored)
2138 continue;
2139 iommu = drhd->iommu;
2140 sprintf(iommu->name, "dmar%d", unit++);
2141
2142 iommu_flush_write_buffer(iommu);
2143
2144 ret = dmar_set_interrupt(iommu);
2145 if (ret)
2146 goto error;
2147
2148 iommu_set_root_entry(iommu);
2149
2150 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2151 0);
2152 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2153 0);
2154 iommu_disable_protect_mem_regions(iommu);
2155
2156 ret = iommu_enable_translation(iommu);
2157 if (ret)
2158 goto error;
2159 }
2160
2161 return 0;
2162 error:
2163 for_each_drhd_unit(drhd) {
2164 if (drhd->ignored)
2165 continue;
2166 iommu = drhd->iommu;
2167 free_iommu(iommu);
2168 }
2169 kfree(g_iommus);
2170 return ret;
2171 }
2172
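/*
 * Round the offset of host_addr within its page plus size up to a whole
 * number of pages.  E.g., with 4KB pages, host_addr = 0x1234 and
 * size = 0x100 gives PAGE_ALIGN(0x334) = 0x1000, i.e. one full page.
 */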
2173 static inline u64 aligned_size(u64 host_addr, size_t size)
2174 {
2175 u64 addr;
2176 addr = (host_addr & (~PAGE_MASK)) + size;
2177 return PAGE_ALIGN(addr);
2178 }
2179
2180 struct iova *
2181 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2182 {
2183 struct iova *piova;
2184
2185 /* Make sure it's in range */
2186 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2187 if (!size || (IOVA_START_ADDR + size > end))
2188 return NULL;
2189
2190 piova = alloc_iova(&domain->iovad,
2191 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2192 return piova;
2193 }
2194
2195 static struct iova *
2196 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2197 size_t size, u64 dma_mask)
2198 {
2199 struct pci_dev *pdev = to_pci_dev(dev);
2200 struct iova *iova = NULL;
2201
2202 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2203 iova = iommu_alloc_iova(domain, size, dma_mask);
2204 else {
2205 /*
2206 * First try to allocate an io virtual address in
2207 * DMA_32BIT_MASK and if that fails then try allocating
2208 * from higher range
2209 */
2210 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2211 if (!iova)
2212 iova = iommu_alloc_iova(domain, size, dma_mask);
2213 }
2214
2215 if (!iova) {
2216 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2217 return NULL;
2218 }
2219
2220 return iova;
2221 }
2222
2223 static struct dmar_domain *
2224 get_valid_domain_for_dev(struct pci_dev *pdev)
2225 {
2226 struct dmar_domain *domain;
2227 int ret;
2228
2229 domain = get_domain_for_dev(pdev,
2230 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2231 if (!domain) {
2232 printk(KERN_ERR
2233 "Allocating domain for %s failed\n", pci_name(pdev));
2234 return NULL;
2235 }
2236
2237 /* make sure context mapping is ok */
2238 if (unlikely(!domain_context_mapped(pdev))) {
2239 ret = domain_context_mapping(domain, pdev);
2240 if (ret) {
2241 printk(KERN_ERR
2242 "Domain context map for %s failed\n",
2243 pci_name(pdev));
2244 return NULL;
2245 }
2246 }
2247
2248 return domain;
2249 }
2250
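/*
 * Common single-mapping path: find (or create) the device's domain,
 * allocate an IOVA range large enough for the aligned size, derive the
 * PTE protection bits from the DMA direction (forcing read access when
 * the hardware cannot do zero-length reads), install the page mappings
 * and then flush the IOTLB for the newly present range.
 */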
2251 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2252 size_t size, int dir, u64 dma_mask)
2253 {
2254 struct pci_dev *pdev = to_pci_dev(hwdev);
2255 struct dmar_domain *domain;
2256 phys_addr_t start_paddr;
2257 struct iova *iova;
2258 int prot = 0;
2259 int ret;
2260 struct intel_iommu *iommu;
2261
2262 BUG_ON(dir == DMA_NONE);
2263 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2264 return paddr;
2265
2266 domain = get_valid_domain_for_dev(pdev);
2267 if (!domain)
2268 return 0;
2269
2270 iommu = domain_get_iommu(domain);
2271 size = aligned_size((u64)paddr, size);
2272
2273 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2274 if (!iova)
2275 goto error;
2276
2277 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2278
2279 /*
2280 * Check if the DMAR hardware supports zero-length reads on
2281 * write-only mappings.
2282 */
2283 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2284 !cap_zlr(iommu->cap))
2285 prot |= DMA_PTE_READ;
2286 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2287 prot |= DMA_PTE_WRITE;
2288 /*
2289 * The range paddr .. paddr + size may cover only part of a page, but
2290 * we must map whole pages. Note: if two parts of one page are mapped
2291 * separately, we might end up with two guest addresses mapping to the
2292 * same host paddr; this is not a big problem.
2293 */
2294 ret = domain_page_mapping(domain, start_paddr,
2295 ((u64)paddr) & PAGE_MASK, size, prot);
2296 if (ret)
2297 goto error;
2298
2299 /* it's a non-present to present mapping */
2300 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2301 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2302 if (ret)
2303 iommu_flush_write_buffer(iommu);
2304
2305 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2306
2307 error:
2308 if (iova)
2309 __free_iova(&domain->iovad, iova);
2310 printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2311 pci_name(pdev), size, (unsigned long long)paddr, dir);
2312 return 0;
2313 }
2314
2315 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2316 size_t size, int dir)
2317 {
2318 return __intel_map_single(hwdev, paddr, size, dir,
2319 to_pci_dev(hwdev)->dma_mask);
2320 }
2321
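/*
 * Deferred unmapping: unless intel_iommu_strict is set, unmapped IOVAs
 * are queued per IOMMU in deferred_flush[] and released in batches by
 * flush_unmaps(), which issues one global IOTLB flush per IOMMU with
 * pending entries.  The batch is drained either when HIGH_WATER_MARK
 * entries accumulate or when the 10ms unmap_timer fires.
 */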
2322 static void flush_unmaps(void)
2323 {
2324 int i, j;
2325
2326 timer_on = 0;
2327
2328 /* just flush them all */
2329 for (i = 0; i < g_num_of_iommus; i++) {
2330 struct intel_iommu *iommu = g_iommus[i];
2331 if (!iommu)
2332 continue;
2333
2334 if (deferred_flush[i].next) {
2335 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2336 DMA_TLB_GLOBAL_FLUSH, 0);
2337 for (j = 0; j < deferred_flush[i].next; j++) {
2338 __free_iova(&deferred_flush[i].domain[j]->iovad,
2339 deferred_flush[i].iova[j]);
2340 }
2341 deferred_flush[i].next = 0;
2342 }
2343 }
2344
2345 list_size = 0;
2346 }
2347
2348 static void flush_unmaps_timeout(unsigned long data)
2349 {
2350 unsigned long flags;
2351
2352 spin_lock_irqsave(&async_umap_flush_lock, flags);
2353 flush_unmaps();
2354 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2355 }
2356
2357 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2358 {
2359 unsigned long flags;
2360 int next, iommu_id;
2361 struct intel_iommu *iommu;
2362
2363 spin_lock_irqsave(&async_umap_flush_lock, flags);
2364 if (list_size == HIGH_WATER_MARK)
2365 flush_unmaps();
2366
2367 iommu = domain_get_iommu(dom);
2368 iommu_id = iommu->seq_id;
2369
2370 next = deferred_flush[iommu_id].next;
2371 deferred_flush[iommu_id].domain[next] = dom;
2372 deferred_flush[iommu_id].iova[next] = iova;
2373 deferred_flush[iommu_id].next++;
2374
2375 if (!timer_on) {
2376 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2377 timer_on = 1;
2378 }
2379 list_size++;
2380 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2381 }
2382
2383 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2384 int dir)
2385 {
2386 struct pci_dev *pdev = to_pci_dev(dev);
2387 struct dmar_domain *domain;
2388 unsigned long start_addr;
2389 struct iova *iova;
2390 struct intel_iommu *iommu;
2391
2392 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2393 return;
2394 domain = find_domain(pdev);
2395 BUG_ON(!domain);
2396
2397 iommu = domain_get_iommu(domain);
2398
2399 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2400 if (!iova)
2401 return;
2402
2403 start_addr = iova->pfn_lo << PAGE_SHIFT;
2404 size = aligned_size((u64)dev_addr, size);
2405
2406 pr_debug("Device %s unmapping: %lx@%llx\n",
2407 pci_name(pdev), size, (unsigned long long)start_addr);
2408
2409 /* clear the whole page */
2410 dma_pte_clear_range(domain, start_addr, start_addr + size);
2411 /* free page tables */
2412 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2413 if (intel_iommu_strict) {
2414 if (iommu_flush_iotlb_psi(iommu,
2415 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2416 iommu_flush_write_buffer(iommu);
2417 /* free iova */
2418 __free_iova(&domain->iovad, iova);
2419 } else {
2420 add_unmap(domain, iova);
2421 /*
2422 * Queue up the release of this unmap: batching the IOTLB flush
2423 * saves the roughly 1/6th of a CPU that per-unmap flushes cost.
2424 */
2425 }
2426 }
2427
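/*
 * Coherent allocations are plain zeroed pages; the returned DMA handle
 * comes from mapping them bidirectionally through __intel_map_single()
 * against the device's coherent DMA mask.
 */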
2428 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2429 dma_addr_t *dma_handle, gfp_t flags)
2430 {
2431 void *vaddr;
2432 int order;
2433
2434 size = PAGE_ALIGN(size);
2435 order = get_order(size);
2436 flags &= ~(GFP_DMA | GFP_DMA32);
2437
2438 vaddr = (void *)__get_free_pages(flags, order);
2439 if (!vaddr)
2440 return NULL;
2441 memset(vaddr, 0, size);
2442
2443 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2444 DMA_BIDIRECTIONAL,
2445 hwdev->coherent_dma_mask);
2446 if (*dma_handle)
2447 return vaddr;
2448 free_pages((unsigned long)vaddr, order);
2449 return NULL;
2450 }
2451
2452 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2453 dma_addr_t dma_handle)
2454 {
2455 int order;
2456
2457 size = PAGE_ALIGN(size);
2458 order = get_order(size);
2459
2460 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2461 free_pages((unsigned long)vaddr, order);
2462 }
2463
2464 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2465
2466 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2467 int nelems, int dir)
2468 {
2469 int i;
2470 struct pci_dev *pdev = to_pci_dev(hwdev);
2471 struct dmar_domain *domain;
2472 unsigned long start_addr;
2473 struct iova *iova;
2474 size_t size = 0;
2475 void *addr;
2476 struct scatterlist *sg;
2477 struct intel_iommu *iommu;
2478
2479 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2480 return;
2481
2482 domain = find_domain(pdev);
2483 BUG_ON(!domain);
2484
2485 iommu = domain_get_iommu(domain);
2486
2487 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2488 if (!iova)
2489 return;
2490 for_each_sg(sglist, sg, nelems, i) {
2491 addr = SG_ENT_VIRT_ADDRESS(sg);
2492 size += aligned_size((u64)addr, sg->length);
2493 }
2494
2495 start_addr = iova->pfn_lo << PAGE_SHIFT;
2496
2497 /* clear the whole page */
2498 dma_pte_clear_range(domain, start_addr, start_addr + size);
2499 /* free page tables */
2500 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2501
2502 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2503 size >> VTD_PAGE_SHIFT, 0))
2504 iommu_flush_write_buffer(iommu);
2505
2506 /* free iova */
2507 __free_iova(&domain->iovad, iova);
2508 }
2509
2510 static int intel_nontranslate_map_sg(struct device *hwdev,
2511 struct scatterlist *sglist, int nelems, int dir)
2512 {
2513 int i;
2514 struct scatterlist *sg;
2515
2516 for_each_sg(sglist, sg, nelems, i) {
2517 BUG_ON(!sg_page(sg));
2518 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2519 sg->dma_length = sg->length;
2520 }
2521 return nelems;
2522 }
2523
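/*
 * Map a scatterlist with a single IOVA allocation: the aligned sizes of
 * all entries are summed, one IOVA range is allocated to cover them,
 * and each entry is then mapped back to back at page granularity, with
 * sg->dma_address pointing at its offset within that range.
 */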
2524 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2525 int dir)
2526 {
2527 void *addr;
2528 int i;
2529 struct pci_dev *pdev = to_pci_dev(hwdev);
2530 struct dmar_domain *domain;
2531 size_t size = 0;
2532 int prot = 0;
2533 size_t offset = 0;
2534 struct iova *iova = NULL;
2535 int ret;
2536 struct scatterlist *sg;
2537 unsigned long start_addr;
2538 struct intel_iommu *iommu;
2539
2540 BUG_ON(dir == DMA_NONE);
2541 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2542 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2543
2544 domain = get_valid_domain_for_dev(pdev);
2545 if (!domain)
2546 return 0;
2547
2548 iommu = domain_get_iommu(domain);
2549
2550 for_each_sg(sglist, sg, nelems, i) {
2551 addr = SG_ENT_VIRT_ADDRESS(sg);
2552 addr = (void *)virt_to_phys(addr);
2553 size += aligned_size((u64)addr, sg->length);
2554 }
2555
2556 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2557 if (!iova) {
2558 sglist->dma_length = 0;
2559 return 0;
2560 }
2561
2562 /*
2563 * Check if the DMAR hardware supports zero-length reads on
2564 * write-only mappings.
2565 */
2566 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2567 !cap_zlr(iommu->cap))
2568 prot |= DMA_PTE_READ;
2569 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2570 prot |= DMA_PTE_WRITE;
2571
2572 start_addr = iova->pfn_lo << PAGE_SHIFT;
2573 offset = 0;
2574 for_each_sg(sglist, sg, nelems, i) {
2575 addr = SG_ENT_VIRT_ADDRESS(sg);
2576 addr = (void *)virt_to_phys(addr);
2577 size = aligned_size((u64)addr, sg->length);
2578 ret = domain_page_mapping(domain, start_addr + offset,
2579 ((u64)addr) & PAGE_MASK,
2580 size, prot);
2581 if (ret) {
2582 /* clear the page */
2583 dma_pte_clear_range(domain, start_addr,
2584 start_addr + offset);
2585 /* free page tables */
2586 dma_pte_free_pagetable(domain, start_addr,
2587 start_addr + offset);
2588 /* free iova */
2589 __free_iova(&domain->iovad, iova);
2590 return 0;
2591 }
2592 sg->dma_address = start_addr + offset +
2593 ((u64)addr & (~PAGE_MASK));
2594 sg->dma_length = sg->length;
2595 offset += size;
2596 }
2597
2598 /* it's a non-present to present mapping */
2599 if (iommu_flush_iotlb_psi(iommu, domain->id,
2600 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2601 iommu_flush_write_buffer(iommu);
2602 return nelems;
2603 }
2604
2605 static struct dma_mapping_ops intel_dma_ops = {
2606 .alloc_coherent = intel_alloc_coherent,
2607 .free_coherent = intel_free_coherent,
2608 .map_single = intel_map_single,
2609 .unmap_single = intel_unmap_single,
2610 .map_sg = intel_map_sg,
2611 .unmap_sg = intel_unmap_sg,
2612 };
2613
2614 static inline int iommu_domain_cache_init(void)
2615 {
2616 int ret = 0;
2617
2618 iommu_domain_cache = kmem_cache_create("iommu_domain",
2619 sizeof(struct dmar_domain),
2620 0,
2621 SLAB_HWCACHE_ALIGN,
2622 NULL);
2623
2624 if (!iommu_domain_cache) {
2625 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2626 ret = -ENOMEM;
2627 }
2628
2629 return ret;
2630 }
2631
2632 static inline int iommu_devinfo_cache_init(void)
2633 {
2634 int ret = 0;
2635
2636 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2637 sizeof(struct device_domain_info),
2638 0,
2639 SLAB_HWCACHE_ALIGN,
2640 NULL);
2641 if (!iommu_devinfo_cache) {
2642 printk(KERN_ERR "Couldn't create devinfo cache\n");
2643 ret = -ENOMEM;
2644 }
2645
2646 return ret;
2647 }
2648
2649 static inline int iommu_iova_cache_init(void)
2650 {
2651 int ret = 0;
2652
2653 iommu_iova_cache = kmem_cache_create("iommu_iova",
2654 sizeof(struct iova),
2655 0,
2656 SLAB_HWCACHE_ALIGN,
2657 NULL);
2658 if (!iommu_iova_cache) {
2659 printk(KERN_ERR "Couldn't create iova cache\n");
2660 ret = -ENOMEM;
2661 }
2662
2663 return ret;
2664 }
2665
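/*
 * Create the three slab caches (iova, domain, devinfo) used by the
 * driver; on failure, the caches created so far are destroyed in
 * reverse order.
 */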
2666 static int __init iommu_init_mempool(void)
2667 {
2668 int ret;
2669 ret = iommu_iova_cache_init();
2670 if (ret)
2671 return ret;
2672
2673 ret = iommu_domain_cache_init();
2674 if (ret)
2675 goto domain_error;
2676
2677 ret = iommu_devinfo_cache_init();
2678 if (!ret)
2679 return ret;
2680
2681 kmem_cache_destroy(iommu_domain_cache);
2682 domain_error:
2683 kmem_cache_destroy(iommu_iova_cache);
2684
2685 return -ENOMEM;
2686 }
2687
2688 static void __init iommu_exit_mempool(void)
2689 {
2690 kmem_cache_destroy(iommu_devinfo_cache);
2691 kmem_cache_destroy(iommu_domain_cache);
2692 kmem_cache_destroy(iommu_iova_cache);
2693
2694 }
2695
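/*
 * Mark DMAR units that cover no PCI devices as ignored.  Unless
 * dmar_map_gfx is set, units that serve only graphics devices are also
 * ignored and their devices are tagged DUMMY_DEVICE_DOMAIN_INFO so the
 * DMA API bypasses translation for them.
 */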
2696 static void __init init_no_remapping_devices(void)
2697 {
2698 struct dmar_drhd_unit *drhd;
2699
2700 for_each_drhd_unit(drhd) {
2701 if (!drhd->include_all) {
2702 int i;
2703 for (i = 0; i < drhd->devices_cnt; i++)
2704 if (drhd->devices[i] != NULL)
2705 break;
2706 /* ignore this DMAR unit if no PCI devices exist under it */
2707 if (i == drhd->devices_cnt)
2708 drhd->ignored = 1;
2709 }
2710 }
2711
2712 if (dmar_map_gfx)
2713 return;
2714
2715 for_each_drhd_unit(drhd) {
2716 int i;
2717 if (drhd->ignored || drhd->include_all)
2718 continue;
2719
2720 for (i = 0; i < drhd->devices_cnt; i++)
2721 if (drhd->devices[i] &&
2722 !IS_GFX_DEVICE(drhd->devices[i]))
2723 break;
2724
2725 if (i < drhd->devices_cnt)
2726 continue;
2727
2728 /* bypass IOMMU if it is just for gfx devices */
2729 drhd->ignored = 1;
2730 for (i = 0; i < drhd->devices_cnt; i++) {
2731 if (!drhd->devices[i])
2732 continue;
2733 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2734 }
2735 }
2736 }
2737
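/*
 * Driver entry point: parse the DMAR table and device scopes, bail out
 * if remapping is disabled or swiotlb is active, set up the mempools
 * and reserved IOVA ranges, program every IOMMU via init_dmars(), then
 * install intel_dma_ops and register the generic iommu_ops.
 */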
2738 int __init intel_iommu_init(void)
2739 {
2740 int ret = 0;
2741
2742 if (dmar_table_init())
2743 return -ENODEV;
2744
2745 if (dmar_dev_scope_init())
2746 return -ENODEV;
2747
2748 /*
2749 * Check the need for DMA-remapping initialization now.
2750 * The initialization above is also used by interrupt remapping.
2751 */
2752 if (no_iommu || swiotlb || dmar_disabled)
2753 return -ENODEV;
2754
2755 iommu_init_mempool();
2756 dmar_init_reserved_ranges();
2757
2758 init_no_remapping_devices();
2759
2760 ret = init_dmars();
2761 if (ret) {
2762 printk(KERN_ERR "IOMMU: dmar init failed\n");
2763 put_iova_domain(&reserved_iova_list);
2764 iommu_exit_mempool();
2765 return ret;
2766 }
2767 printk(KERN_INFO
2768 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2769
2770 init_timer(&unmap_timer);
2771 force_iommu = 1;
2772 dma_ops = &intel_dma_ops;
2773
2774 register_iommu(&intel_iommu_ops);
2775
2776 return 0;
2777 }
2778
2779 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2780 struct pci_dev *pdev)
2781 {
2782 struct device_domain_info *info;
2783 unsigned long flags;
2784
2785 info = alloc_devinfo_mem();
2786 if (!info)
2787 return -ENOMEM;
2788
2789 info->bus = pdev->bus->number;
2790 info->devfn = pdev->devfn;
2791 info->dev = pdev;
2792 info->domain = domain;
2793
2794 spin_lock_irqsave(&device_domain_lock, flags);
2795 list_add(&info->link, &domain->devices);
2796 list_add(&info->global, &device_domain_list);
2797 pdev->dev.archdata.iommu = info;
2798 spin_unlock_irqrestore(&device_domain_lock, flags);
2799
2800 return 0;
2801 }
2802
2803 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2804 struct pci_dev *pdev)
2805 {
2806 struct device_domain_info *info;
2807 struct intel_iommu *iommu;
2808 unsigned long flags;
2809 int found = 0;
2810 struct list_head *entry, *tmp;
2811
2812 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2813 if (!iommu)
2814 return;
2815
2816 spin_lock_irqsave(&device_domain_lock, flags);
2817 list_for_each_safe(entry, tmp, &domain->devices) {
2818 info = list_entry(entry, struct device_domain_info, link);
2819 if (info->bus == pdev->bus->number &&
2820 info->devfn == pdev->devfn) {
2821 list_del(&info->link);
2822 list_del(&info->global);
2823 if (info->dev)
2824 info->dev->dev.archdata.iommu = NULL;
2825 spin_unlock_irqrestore(&device_domain_lock, flags);
2826
2827 iommu_detach_dev(iommu, info->bus, info->devfn);
2828 free_devinfo_mem(info);
2829
2830 spin_lock_irqsave(&device_domain_lock, flags);
2831
2832 if (found)
2833 break;
2834 else
2835 continue;
2836 }
2837
2838 /* if there are no other devices under the same iommu
2839 * owned by this domain, clear this iommu in iommu_bmp and
2840 * update the iommu count and coherency
2841 */
2842 if (device_to_iommu(info->bus, info->devfn) == iommu)
2843 found = 1;
2844 }
2845
2846 if (found == 0) {
2847 unsigned long tmp_flags;
2848 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2849 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2850 domain->iommu_count--;
2851 domain_update_iommu_cap(domain);
2852 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2853 }
2854
2855 spin_unlock_irqrestore(&device_domain_lock, flags);
2856 }
2857
2858 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2859 {
2860 struct device_domain_info *info;
2861 struct intel_iommu *iommu;
2862 unsigned long flags1, flags2;
2863
2864 spin_lock_irqsave(&device_domain_lock, flags1);
2865 while (!list_empty(&domain->devices)) {
2866 info = list_entry(domain->devices.next,
2867 struct device_domain_info, link);
2868 list_del(&info->link);
2869 list_del(&info->global);
2870 if (info->dev)
2871 info->dev->dev.archdata.iommu = NULL;
2872
2873 spin_unlock_irqrestore(&device_domain_lock, flags1);
2874
2875 iommu = device_to_iommu(info->bus, info->devfn);
2876 iommu_detach_dev(iommu, info->bus, info->devfn);
2877
2878 /* clear this iommu in iommu_bmp, update iommu count
2879 * and capabilities
2880 */
2881 spin_lock_irqsave(&domain->iommu_lock, flags2);
2882 if (test_and_clear_bit(iommu->seq_id,
2883 &domain->iommu_bmp)) {
2884 domain->iommu_count--;
2885 domain_update_iommu_cap(domain);
2886 }
2887 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2888
2889 free_devinfo_mem(info);
2890 spin_lock_irqsave(&device_domain_lock, flags1);
2891 }
2892 spin_unlock_irqrestore(&device_domain_lock, flags1);
2893 }
2894
2895 /* domain id for virtual machine; it is never set in a context entry */
2896 static unsigned long vm_domid;
2897
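/*
 * Return the smallest adjusted guest address width among the IOMMUs set
 * in the domain's iommu_bmp (starting from the domain's own agaw); this
 * bounds the highest address the domain may map.
 */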
2898 static int vm_domain_min_agaw(struct dmar_domain *domain)
2899 {
2900 int i;
2901 int min_agaw = domain->agaw;
2902
2903 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2904 for (; i < g_num_of_iommus; ) {
2905 if (min_agaw > g_iommus[i]->agaw)
2906 min_agaw = g_iommus[i]->agaw;
2907
2908 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2909 }
2910
2911 return min_agaw;
2912 }
2913
2914 static struct dmar_domain *iommu_alloc_vm_domain(void)
2915 {
2916 struct dmar_domain *domain;
2917
2918 domain = alloc_domain_mem();
2919 if (!domain)
2920 return NULL;
2921
2922 domain->id = vm_domid++;
2923 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2924 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2925
2926 return domain;
2927 }
2928
2929 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2930 {
2931 int adjust_width;
2932
2933 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2934 spin_lock_init(&domain->mapping_lock);
2935 spin_lock_init(&domain->iommu_lock);
2936
2937 domain_reserve_special_ranges(domain);
2938
2939 /* calculate AGAW */
2940 domain->gaw = guest_width;
2941 adjust_width = guestwidth_to_adjustwidth(guest_width);
2942 domain->agaw = width_to_agaw(adjust_width);
2943
2944 INIT_LIST_HEAD(&domain->devices);
2945
2946 domain->iommu_count = 0;
2947 domain->iommu_coherency = 0;
2948 domain->max_addr = 0;
2949
2950 /* always allocate the top pgd */
2951 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2952 if (!domain->pgd)
2953 return -ENOMEM;
2954 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2955 return 0;
2956 }
2957
2958 static void iommu_free_vm_domain(struct dmar_domain *domain)
2959 {
2960 unsigned long flags;
2961 struct dmar_drhd_unit *drhd;
2962 struct intel_iommu *iommu;
2963 unsigned long i;
2964 unsigned long ndomains;
2965
2966 for_each_drhd_unit(drhd) {
2967 if (drhd->ignored)
2968 continue;
2969 iommu = drhd->iommu;
2970
2971 ndomains = cap_ndoms(iommu->cap);
2972 i = find_first_bit(iommu->domain_ids, ndomains);
2973 for (; i < ndomains; ) {
2974 if (iommu->domains[i] == domain) {
2975 spin_lock_irqsave(&iommu->lock, flags);
2976 clear_bit(i, iommu->domain_ids);
2977 iommu->domains[i] = NULL;
2978 spin_unlock_irqrestore(&iommu->lock, flags);
2979 break;
2980 }
2981 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2982 }
2983 }
2984 }
2985
2986 static void vm_domain_exit(struct dmar_domain *domain)
2987 {
2988 u64 end;
2989
2990 /* Domain 0 is reserved, so don't process it */
2991 if (!domain)
2992 return;
2993
2994 vm_domain_remove_all_dev_info(domain);
2995 /* destroy iovas */
2996 put_iova_domain(&domain->iovad);
2997 end = DOMAIN_MAX_ADDR(domain->gaw);
2998 end = end & (~VTD_PAGE_MASK);
2999
3000 /* clear ptes */
3001 dma_pte_clear_range(domain, 0, end);
3002
3003 /* free page tables */
3004 dma_pte_free_pagetable(domain, 0, end);
3005
3006 iommu_free_vm_domain(domain);
3007 free_domain_mem(domain);
3008 }
3009
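/*
 * Generic IOMMU API glue: the callbacks below back the intel_iommu_ops
 * structure registered from intel_iommu_init() and operate on the
 * "virtual machine" domains allocated above.
 */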
3010 static int intel_iommu_domain_init(struct iommu_domain *domain)
3011 {
3012 struct dmar_domain *dmar_domain;
3013
3014 dmar_domain = iommu_alloc_vm_domain();
3015 if (!dmar_domain) {
3016 printk(KERN_ERR
3017 "intel_iommu_domain_init: dmar_domain == NULL\n");
3018 return -ENOMEM;
3019 }
3020 if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3021 printk(KERN_ERR
3022 "intel_iommu_domain_init() failed\n");
3023 vm_domain_exit(dmar_domain);
3024 return -ENOMEM;
3025 }
3026 domain->priv = dmar_domain;
3027
3028 return 0;
3029 }
3030
3031 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3032 {
3033 struct dmar_domain *dmar_domain = domain->priv;
3034
3035 domain->priv = NULL;
3036 vm_domain_exit(dmar_domain);
3037 }
3038
3039 static int intel_iommu_attach_device(struct iommu_domain *domain,
3040 struct device *dev)
3041 {
3042 struct dmar_domain *dmar_domain = domain->priv;
3043 struct pci_dev *pdev = to_pci_dev(dev);
3044 struct intel_iommu *iommu;
3045 int addr_width;
3046 u64 end;
3047 int ret;
3048
3049 /* normally pdev is not mapped */
3050 if (unlikely(domain_context_mapped(pdev))) {
3051 struct dmar_domain *old_domain;
3052
3053 old_domain = find_domain(pdev);
3054 if (old_domain) {
3055 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3056 vm_domain_remove_one_dev_info(old_domain, pdev);
3057 else
3058 domain_remove_dev_info(old_domain);
3059 }
3060 }
3061
3062 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3063 if (!iommu)
3064 return -ENODEV;
3065
3066 /* check if this iommu agaw is sufficient for max mapped address */
3067 addr_width = agaw_to_width(iommu->agaw);
3068 end = DOMAIN_MAX_ADDR(addr_width);
3069 end = end & VTD_PAGE_MASK;
3070 if (end < dmar_domain->max_addr) {
3071 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3072 "sufficient for the mapped address (%llx)\n",
3073 __func__, iommu->agaw, dmar_domain->max_addr);
3074 return -EFAULT;
3075 }
3076
3077 ret = domain_context_mapping(dmar_domain, pdev);
3078 if (ret)
3079 return ret;
3080
3081 ret = vm_domain_add_dev_info(dmar_domain, pdev);
3082 return ret;
3083 }
3084
3085 static void intel_iommu_detach_device(struct iommu_domain *domain,
3086 struct device *dev)
3087 {
3088 struct dmar_domain *dmar_domain = domain->priv;
3089 struct pci_dev *pdev = to_pci_dev(dev);
3090
3091 vm_domain_remove_one_dev_info(dmar_domain, pdev);
3092 }
3093
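/*
 * Translate IOMMU_READ/IOMMU_WRITE into DMA PTE bits and, if the new
 * mapping raises the domain's max_addr, first verify that the smallest
 * AGAW among the attached IOMMUs still covers the new top address.
 */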
3094 static int intel_iommu_map_range(struct iommu_domain *domain,
3095 unsigned long iova, phys_addr_t hpa,
3096 size_t size, int iommu_prot)
3097 {
3098 struct dmar_domain *dmar_domain = domain->priv;
3099 u64 max_addr;
3100 int addr_width;
3101 int prot = 0;
3102 int ret;
3103
3104 if (iommu_prot & IOMMU_READ)
3105 prot |= DMA_PTE_READ;
3106 if (iommu_prot & IOMMU_WRITE)
3107 prot |= DMA_PTE_WRITE;
3108
3109 max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3110 if (dmar_domain->max_addr < max_addr) {
3111 int min_agaw;
3112 u64 end;
3113
3114 /* check if minimum agaw is sufficient for mapped address */
3115 min_agaw = vm_domain_min_agaw(dmar_domain);
3116 addr_width = agaw_to_width(min_agaw);
3117 end = DOMAIN_MAX_ADDR(addr_width);
3118 end = end & VTD_PAGE_MASK;
3119 if (end < max_addr) {
3120 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3121 "sufficient for the mapped address (%llx)\n",
3122 __func__, min_agaw, max_addr);
3123 return -EFAULT;
3124 }
3125 dmar_domain->max_addr = max_addr;
3126 }
3127
3128 ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3129 return ret;
3130 }
3131
3132 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3133 unsigned long iova, size_t size)
3134 {
3135 struct dmar_domain *dmar_domain = domain->priv;
3136 dma_addr_t base;
3137
3138 /* The address might not be aligned */
3139 base = iova & VTD_PAGE_MASK;
3140 size = VTD_PAGE_ALIGN(size);
3141 dma_pte_clear_range(dmar_domain, base, base + size);
3142
3143 if (dmar_domain->max_addr == base + size)
3144 dmar_domain->max_addr = base;
3145 }
3146
3147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3148 unsigned long iova)
3149 {
3150 struct dmar_domain *dmar_domain = domain->priv;
3151 struct dma_pte *pte;
3152 u64 phys = 0;
3153
3154 pte = addr_to_dma_pte(dmar_domain, iova);
3155 if (pte)
3156 phys = dma_pte_addr(pte);
3157
3158 return phys;
3159 }
3160
3161 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3162 unsigned long cap)
3163 {
3164 struct dmar_domain *dmar_domain = domain->priv;
3165
3166 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3167 return dmar_domain->iommu_snooping;
3168
3169 return 0;
3170 }
3171
3172 static struct iommu_ops intel_iommu_ops = {
3173 .domain_init = intel_iommu_domain_init,
3174 .domain_destroy = intel_iommu_domain_destroy,
3175 .attach_dev = intel_iommu_attach_device,
3176 .detach_dev = intel_iommu_detach_device,
3177 .map = intel_iommu_map_range,
3178 .unmap = intel_iommu_unmap_range,
3179 .iova_to_phys = intel_iommu_iova_to_phys,
3180 .domain_has_cap = intel_iommu_domain_has_cap,
3181 };
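
/*
 * A minimal usage sketch of the interface exported above, as seen from
 * a consumer such as KVM device assignment.  The wrapper names below
 * assume the generic API in include/linux/iommu.h of this kernel and
 * may differ in other trees:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	if (dom && !iommu_attach_device(dom, &pdev->dev))
 *		iommu_map_range(dom, iova, hpa, size,
 *				IOMMU_READ | IOMMU_WRITE);
 *
 * Each call dispatches to the corresponding intel_iommu_* callback in
 * intel_iommu_ops.
 */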
3182
3183 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3184 {
3185 /*
3186 * Mobile 4 Series Chipset neglects to set RWBF capability,
3187 * but needs it:
3188 */
3189 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3190 rwbf_quirk = 1;
3191 }
3192
3193 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);