/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)     "DMAR: " fmt
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/dma-direct.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/crash_dump.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)

/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
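/*
 * Worked example (added, not in the original source): with a 48-bit guest
 * address width, __DOMAIN_MAX_PFN(48) is 2^36 - 1.  On 64-bit kernels the
 * min_t() clamp is a no-op; on 32-bit kernels it caps the result at
 * ULONG_MAX so PFN arithmetic can safely stay in 'unsigned long'.
 */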
/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
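/*
 * Note (added for clarity): ~0xFFFUL advertises every power-of-two size
 * from 4KiB upward to the IOMMU core; the mapping code below still only
 * uses real 2MiB/1GiB superpages where hardware capability and alignment
 * allow (see hardware_largepage_caps()).
 */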
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
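/*
 * Illustration (added): each page-table level resolves LEVEL_STRIDE (9)
 * bits of the DMA PFN, so a 48-bit address width needs (48 - 12) / 9 = 4
 * levels; correspondingly width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
 */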
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
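/*
 * Note (added for clarity): on x86 PAGE_SHIFT equals VTD_PAGE_SHIFT (12),
 * so these conversions are no-ops; they only shift when the CPU page size
 * is larger than the 4KiB VT-d page, which is why VT-d pages must never
 * be bigger than MM pages.
 */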
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
/*
 * Root entry layout:
 * 12-63: Context Ptr (12 - (haw-1))
 */
struct root_entry {
	u64	lo;
	u64	hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	return re->hi & VTD_PAGE_MASK;
}
/*
 * Context entry, low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

static inline bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
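/*
 * Example (added for illustration, with hypothetical values): programming
 * a context entry for a 4-level (agaw 2) table rooted at pgd_phys in
 * domain 42 would look roughly like:
 *
 *	context_clear_entry(ce);
 *	context_set_domain_id(ce, 42);
 *	context_set_address_root(ce, pgd_phys);
 *	context_set_address_width(ce, 2);
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * which mirrors the sequence used later in domain_context_mapping_one().
 */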
/*
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & DMA_PTE_LARGE_PAGE);
}

static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/*
 * Domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)

#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
	struct iommu_resv_region *resv;	/* reserved region handle */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
				 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);
static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int intel_iommu_ecs = 1;
static int iommu_identity_mapping;

#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

#define ecs_enabled(iommu)	(intel_iommu_ecs && ecap_ecs(iommu->ecap))
#define pasid_enabled(iommu)	(ecs_enabled(iommu) && ecap_pasid(iommu->ecap))

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

const struct iommu_ops intel_iommu_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
/* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "ecs_off", 7)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable extended context table support\n");
			intel_iommu_ecs = 0;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			printk(KERN_INFO
				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
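/*
 * Usage (added for illustration): options are comma separated, e.g.
 * booting with "intel_iommu=on,strict" enables the IOMMU and disables
 * batched IOTLB flushing; each token is matched by the strncmp() chain
 * above and the parser then skips past the next comma.
 */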
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}
static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
				DOMAIN_FLAG_STATIC_IDENTITY);
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
					unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
/* This function only returns a single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain_type_is_vm_or_si(domain));
	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
661 static void domain_update_iommu_coherency(struct dmar_domain
*domain
)
663 struct dmar_drhd_unit
*drhd
;
664 struct intel_iommu
*iommu
;
668 domain
->iommu_coherency
= 1;
670 for_each_domain_iommu(i
, domain
) {
672 if (!ecap_coherent(g_iommus
[i
]->ecap
)) {
673 domain
->iommu_coherency
= 0;
680 /* No hardware attached; use lowest common denominator */
682 for_each_active_iommu(iommu
, drhd
) {
683 if (!ecap_coherent(iommu
->ecap
)) {
684 domain
->iommu_coherency
= 0;
691 static int domain_update_iommu_snooping(struct intel_iommu
*skip
)
693 struct dmar_drhd_unit
*drhd
;
694 struct intel_iommu
*iommu
;
698 for_each_active_iommu(iommu
, drhd
) {
700 if (!ecap_sc_support(iommu
->ecap
)) {
711 static int domain_update_iommu_superpage(struct intel_iommu
*skip
)
713 struct dmar_drhd_unit
*drhd
;
714 struct intel_iommu
*iommu
;
717 if (!intel_iommu_superpage
) {
721 /* set iommu_superpage to the smallest common denominator */
723 for_each_active_iommu(iommu
, drhd
) {
725 mask
&= cap_super_page_val(iommu
->cap
);
735 /* Some capabilities may be different across iommus */
736 static void domain_update_iommu_cap(struct dmar_domain
*domain
)
738 domain_update_iommu_coherency(domain
);
739 domain
->iommu_snooping
= domain_update_iommu_snooping(NULL
);
740 domain
->iommu_superpage
= domain_update_iommu_superpage(NULL
);
743 static inline struct context_entry
*iommu_context_addr(struct intel_iommu
*iommu
,
744 u8 bus
, u8 devfn
, int alloc
)
746 struct root_entry
*root
= &iommu
->root_entry
[bus
];
747 struct context_entry
*context
;
751 if (ecs_enabled(iommu
)) {
759 context
= phys_to_virt(*entry
& VTD_PAGE_MASK
);
761 unsigned long phy_addr
;
765 context
= alloc_pgtable_page(iommu
->node
);
769 __iommu_flush_cache(iommu
, (void *)context
, CONTEXT_SIZE
);
770 phy_addr
= virt_to_phys((void *)context
);
771 *entry
= phy_addr
| 1;
772 __iommu_flush_cache(iommu
, entry
, sizeof(*entry
));
774 return &context
[devfn
];
777 static int iommu_dummy(struct device
*dev
)
779 return dev
->archdata
.iommu
== DUMMY_DEVICE_DOMAIN_INFO
;
782 static struct intel_iommu
*device_to_iommu(struct device
*dev
, u8
*bus
, u8
*devfn
)
784 struct dmar_drhd_unit
*drhd
= NULL
;
785 struct intel_iommu
*iommu
;
787 struct pci_dev
*ptmp
, *pdev
= NULL
;
791 if (iommu_dummy(dev
))
794 if (dev_is_pci(dev
)) {
795 struct pci_dev
*pf_pdev
;
797 pdev
= to_pci_dev(dev
);
800 /* VMD child devices currently cannot be handled individually */
801 if (is_vmd(pdev
->bus
))
805 /* VFs aren't listed in scope tables; we need to look up
806 * the PF instead to find the IOMMU. */
807 pf_pdev
= pci_physfn(pdev
);
809 segment
= pci_domain_nr(pdev
->bus
);
810 } else if (has_acpi_companion(dev
))
811 dev
= &ACPI_COMPANION(dev
)->dev
;
814 for_each_active_iommu(iommu
, drhd
) {
815 if (pdev
&& segment
!= drhd
->segment
)
818 for_each_active_dev_scope(drhd
->devices
,
819 drhd
->devices_cnt
, i
, tmp
) {
821 /* For a VF use its original BDF# not that of the PF
822 * which we used for the IOMMU lookup. Strictly speaking
823 * we could do this for all PCI devices; we only need to
824 * get the BDF# from the scope table for ACPI matches. */
825 if (pdev
&& pdev
->is_virtfn
)
828 *bus
= drhd
->devices
[i
].bus
;
829 *devfn
= drhd
->devices
[i
].devfn
;
833 if (!pdev
|| !dev_is_pci(tmp
))
836 ptmp
= to_pci_dev(tmp
);
837 if (ptmp
->subordinate
&&
838 ptmp
->subordinate
->number
<= pdev
->bus
->number
&&
839 ptmp
->subordinate
->busn_res
.end
>= pdev
->bus
->number
)
843 if (pdev
&& drhd
->include_all
) {
845 *bus
= pdev
->bus
->number
;
846 *devfn
= pdev
->devfn
;
857 static void domain_flush_cache(struct dmar_domain
*domain
,
858 void *addr
, int size
)
860 if (!domain
->iommu_coherency
)
861 clflush_cache_range(addr
, size
);
864 static int device_context_mapped(struct intel_iommu
*iommu
, u8 bus
, u8 devfn
)
866 struct context_entry
*context
;
870 spin_lock_irqsave(&iommu
->lock
, flags
);
871 context
= iommu_context_addr(iommu
, bus
, devfn
, 0);
873 ret
= context_present(context
);
874 spin_unlock_irqrestore(&iommu
->lock
, flags
);
878 static void free_context_table(struct intel_iommu
*iommu
)
882 struct context_entry
*context
;
884 spin_lock_irqsave(&iommu
->lock
, flags
);
885 if (!iommu
->root_entry
) {
888 for (i
= 0; i
< ROOT_ENTRY_NR
; i
++) {
889 context
= iommu_context_addr(iommu
, i
, 0, 0);
891 free_pgtable_page(context
);
893 if (!ecs_enabled(iommu
))
896 context
= iommu_context_addr(iommu
, i
, 0x80, 0);
898 free_pgtable_page(context
);
901 free_pgtable_page(iommu
->root_entry
);
902 iommu
->root_entry
= NULL
;
904 spin_unlock_irqrestore(&iommu
->lock
, flags
);
907 static struct dma_pte
*pfn_to_dma_pte(struct dmar_domain
*domain
,
908 unsigned long pfn
, int *target_level
)
910 struct dma_pte
*parent
, *pte
= NULL
;
911 int level
= agaw_to_level(domain
->agaw
);
914 BUG_ON(!domain
->pgd
);
916 if (!domain_pfn_supported(domain
, pfn
))
917 /* Address beyond IOMMU's addressing capabilities. */
920 parent
= domain
->pgd
;
925 offset
= pfn_level_offset(pfn
, level
);
926 pte
= &parent
[offset
];
927 if (!*target_level
&& (dma_pte_superpage(pte
) || !dma_pte_present(pte
)))
929 if (level
== *target_level
)
932 if (!dma_pte_present(pte
)) {
935 tmp_page
= alloc_pgtable_page(domain
->nid
);
940 domain_flush_cache(domain
, tmp_page
, VTD_PAGE_SIZE
);
941 pteval
= ((uint64_t)virt_to_dma_pfn(tmp_page
) << VTD_PAGE_SHIFT
) | DMA_PTE_READ
| DMA_PTE_WRITE
;
942 if (cmpxchg64(&pte
->val
, 0ULL, pteval
))
943 /* Someone else set it while we were thinking; use theirs. */
944 free_pgtable_page(tmp_page
);
946 domain_flush_cache(domain
, pte
, sizeof(*pte
));
951 parent
= phys_to_virt(dma_pte_addr(pte
));
956 *target_level
= level
;
962 /* return address's pte at specific level */
963 static struct dma_pte
*dma_pfn_level_pte(struct dmar_domain
*domain
,
965 int level
, int *large_page
)
967 struct dma_pte
*parent
, *pte
= NULL
;
968 int total
= agaw_to_level(domain
->agaw
);
971 parent
= domain
->pgd
;
972 while (level
<= total
) {
973 offset
= pfn_level_offset(pfn
, total
);
974 pte
= &parent
[offset
];
978 if (!dma_pte_present(pte
)) {
983 if (dma_pte_superpage(pte
)) {
988 parent
= phys_to_virt(dma_pte_addr(pte
));
/* clear last level pte; a TLB flush should follow */
995 static void dma_pte_clear_range(struct dmar_domain
*domain
,
996 unsigned long start_pfn
,
997 unsigned long last_pfn
)
999 unsigned int large_page
= 1;
1000 struct dma_pte
*first_pte
, *pte
;
1002 BUG_ON(!domain_pfn_supported(domain
, start_pfn
));
1003 BUG_ON(!domain_pfn_supported(domain
, last_pfn
));
1004 BUG_ON(start_pfn
> last_pfn
);
1006 /* we don't need lock here; nobody else touches the iova range */
1009 first_pte
= pte
= dma_pfn_level_pte(domain
, start_pfn
, 1, &large_page
);
1011 start_pfn
= align_to_level(start_pfn
+ 1, large_page
+ 1);
1016 start_pfn
+= lvl_to_nr_pages(large_page
);
1018 } while (start_pfn
<= last_pfn
&& !first_pte_in_page(pte
));
1020 domain_flush_cache(domain
, first_pte
,
1021 (void *)pte
- (void *)first_pte
);
1023 } while (start_pfn
&& start_pfn
<= last_pfn
);
1026 static void dma_pte_free_level(struct dmar_domain
*domain
, int level
,
1027 int retain_level
, struct dma_pte
*pte
,
1028 unsigned long pfn
, unsigned long start_pfn
,
1029 unsigned long last_pfn
)
1031 pfn
= max(start_pfn
, pfn
);
1032 pte
= &pte
[pfn_level_offset(pfn
, level
)];
1035 unsigned long level_pfn
;
1036 struct dma_pte
*level_pte
;
1038 if (!dma_pte_present(pte
) || dma_pte_superpage(pte
))
1041 level_pfn
= pfn
& level_mask(level
);
1042 level_pte
= phys_to_virt(dma_pte_addr(pte
));
1045 dma_pte_free_level(domain
, level
- 1, retain_level
,
1046 level_pte
, level_pfn
, start_pfn
,
1051 * Free the page table if we're below the level we want to
1052 * retain and the range covers the entire table.
1054 if (level
< retain_level
&& !(start_pfn
> level_pfn
||
1055 last_pfn
< level_pfn
+ level_size(level
) - 1)) {
1057 domain_flush_cache(domain
, pte
, sizeof(*pte
));
1058 free_pgtable_page(level_pte
);
1061 pfn
+= level_size(level
);
1062 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
1066 * clear last level (leaf) ptes and free page table pages below the
1067 * level we wish to keep intact.
1069 static void dma_pte_free_pagetable(struct dmar_domain
*domain
,
1070 unsigned long start_pfn
,
1071 unsigned long last_pfn
,
1074 BUG_ON(!domain_pfn_supported(domain
, start_pfn
));
1075 BUG_ON(!domain_pfn_supported(domain
, last_pfn
));
1076 BUG_ON(start_pfn
> last_pfn
);
1078 dma_pte_clear_range(domain
, start_pfn
, last_pfn
);
1080 /* We don't need lock here; nobody else touches the iova range */
1081 dma_pte_free_level(domain
, agaw_to_level(domain
->agaw
), retain_level
,
1082 domain
->pgd
, 0, start_pfn
, last_pfn
);
1085 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
1086 free_pgtable_page(domain
->pgd
);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1097 static struct page
*dma_pte_list_pagetables(struct dmar_domain
*domain
,
1098 int level
, struct dma_pte
*pte
,
1099 struct page
*freelist
)
1103 pg
= pfn_to_page(dma_pte_addr(pte
) >> PAGE_SHIFT
);
1104 pg
->freelist
= freelist
;
1110 pte
= page_address(pg
);
1112 if (dma_pte_present(pte
) && !dma_pte_superpage(pte
))
1113 freelist
= dma_pte_list_pagetables(domain
, level
- 1,
1116 } while (!first_pte_in_page(pte
));
1121 static struct page
*dma_pte_clear_level(struct dmar_domain
*domain
, int level
,
1122 struct dma_pte
*pte
, unsigned long pfn
,
1123 unsigned long start_pfn
,
1124 unsigned long last_pfn
,
1125 struct page
*freelist
)
1127 struct dma_pte
*first_pte
= NULL
, *last_pte
= NULL
;
1129 pfn
= max(start_pfn
, pfn
);
1130 pte
= &pte
[pfn_level_offset(pfn
, level
)];
1133 unsigned long level_pfn
;
1135 if (!dma_pte_present(pte
))
1138 level_pfn
= pfn
& level_mask(level
);
1140 /* If range covers entire pagetable, free it */
1141 if (start_pfn
<= level_pfn
&&
1142 last_pfn
>= level_pfn
+ level_size(level
) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1145 if (level
> 1 && !dma_pte_superpage(pte
))
1146 freelist
= dma_pte_list_pagetables(domain
, level
- 1, pte
, freelist
);
1152 } else if (level
> 1) {
1153 /* Recurse down into a level that isn't *entirely* obsolete */
1154 freelist
= dma_pte_clear_level(domain
, level
- 1,
1155 phys_to_virt(dma_pte_addr(pte
)),
1156 level_pfn
, start_pfn
, last_pfn
,
1160 pfn
+= level_size(level
);
1161 } while (!first_pte_in_page(++pte
) && pfn
<= last_pfn
);
1164 domain_flush_cache(domain
, first_pte
,
1165 (void *)++last_pte
- (void *)first_pte
);
1170 /* We can't just free the pages because the IOMMU may still be walking
1171 the page tables, and may have cached the intermediate levels. The
1172 pages can only be freed after the IOTLB flush has been done. */
1173 static struct page
*domain_unmap(struct dmar_domain
*domain
,
1174 unsigned long start_pfn
,
1175 unsigned long last_pfn
)
1177 struct page
*freelist
= NULL
;
1179 BUG_ON(!domain_pfn_supported(domain
, start_pfn
));
1180 BUG_ON(!domain_pfn_supported(domain
, last_pfn
));
1181 BUG_ON(start_pfn
> last_pfn
);
1183 /* we don't need lock here; nobody else touches the iova range */
1184 freelist
= dma_pte_clear_level(domain
, agaw_to_level(domain
->agaw
),
1185 domain
->pgd
, 0, start_pfn
, last_pfn
, NULL
);
1188 if (start_pfn
== 0 && last_pfn
== DOMAIN_MAX_PFN(domain
->gaw
)) {
1189 struct page
*pgd_page
= virt_to_page(domain
->pgd
);
1190 pgd_page
->freelist
= freelist
;
1191 freelist
= pgd_page
;
1199 static void dma_free_pagelist(struct page
*freelist
)
1203 while ((pg
= freelist
)) {
1204 freelist
= pg
->freelist
;
1205 free_pgtable_page(page_address(pg
));
1209 static void iova_entry_free(unsigned long data
)
1211 struct page
*freelist
= (struct page
*)data
;
1213 dma_free_pagelist(freelist
);
1216 /* iommu handling */
1217 static int iommu_alloc_root_entry(struct intel_iommu
*iommu
)
1219 struct root_entry
*root
;
1220 unsigned long flags
;
1222 root
= (struct root_entry
*)alloc_pgtable_page(iommu
->node
);
1224 pr_err("Allocating root entry for %s failed\n",
1229 __iommu_flush_cache(iommu
, root
, ROOT_SIZE
);
1231 spin_lock_irqsave(&iommu
->lock
, flags
);
1232 iommu
->root_entry
= root
;
1233 spin_unlock_irqrestore(&iommu
->lock
, flags
);
1238 static void iommu_set_root_entry(struct intel_iommu
*iommu
)
1244 addr
= virt_to_phys(iommu
->root_entry
);
1245 if (ecs_enabled(iommu
))
1246 addr
|= DMA_RTADDR_RTT
;
1248 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1249 dmar_writeq(iommu
->reg
+ DMAR_RTADDR_REG
, addr
);
1251 writel(iommu
->gcmd
| DMA_GCMD_SRTP
, iommu
->reg
+ DMAR_GCMD_REG
);
1253 /* Make sure hardware complete it */
1254 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1255 readl
, (sts
& DMA_GSTS_RTPS
), sts
);
1257 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1260 static void iommu_flush_write_buffer(struct intel_iommu
*iommu
)
1265 if (!rwbf_quirk
&& !cap_rwbf(iommu
->cap
))
1268 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1269 writel(iommu
->gcmd
| DMA_GCMD_WBF
, iommu
->reg
+ DMAR_GCMD_REG
);
1271 /* Make sure hardware complete it */
1272 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1273 readl
, (!(val
& DMA_GSTS_WBFS
)), val
);
1275 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
1279 static void __iommu_flush_context(struct intel_iommu
*iommu
,
1280 u16 did
, u16 source_id
, u8 function_mask
,
1287 case DMA_CCMD_GLOBAL_INVL
:
1288 val
= DMA_CCMD_GLOBAL_INVL
;
1290 case DMA_CCMD_DOMAIN_INVL
:
1291 val
= DMA_CCMD_DOMAIN_INVL
|DMA_CCMD_DID(did
);
1293 case DMA_CCMD_DEVICE_INVL
:
1294 val
= DMA_CCMD_DEVICE_INVL
|DMA_CCMD_DID(did
)
1295 | DMA_CCMD_SID(source_id
) | DMA_CCMD_FM(function_mask
);
1300 val
|= DMA_CCMD_ICC
;
1302 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1303 dmar_writeq(iommu
->reg
+ DMAR_CCMD_REG
, val
);
1305 /* Make sure hardware complete it */
1306 IOMMU_WAIT_OP(iommu
, DMAR_CCMD_REG
,
1307 dmar_readq
, (!(val
& DMA_CCMD_ICC
)), val
);
1309 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
/* return value determines if we need a write buffer flush */
1313 static void __iommu_flush_iotlb(struct intel_iommu
*iommu
, u16 did
,
1314 u64 addr
, unsigned int size_order
, u64 type
)
1316 int tlb_offset
= ecap_iotlb_offset(iommu
->ecap
);
1317 u64 val
= 0, val_iva
= 0;
1321 case DMA_TLB_GLOBAL_FLUSH
:
		/* global flush doesn't need to set IVA_REG */
1323 val
= DMA_TLB_GLOBAL_FLUSH
|DMA_TLB_IVT
;
1325 case DMA_TLB_DSI_FLUSH
:
1326 val
= DMA_TLB_DSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1328 case DMA_TLB_PSI_FLUSH
:
1329 val
= DMA_TLB_PSI_FLUSH
|DMA_TLB_IVT
|DMA_TLB_DID(did
);
1330 /* IH bit is passed in as part of address */
1331 val_iva
= size_order
| addr
;
1336 /* Note: set drain read/write */
1339 * This is probably to be super secure.. Looks like we can
1340 * ignore it without any impact.
1342 if (cap_read_drain(iommu
->cap
))
1343 val
|= DMA_TLB_READ_DRAIN
;
1345 if (cap_write_drain(iommu
->cap
))
1346 val
|= DMA_TLB_WRITE_DRAIN
;
1348 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1349 /* Note: Only uses first TLB reg currently */
1351 dmar_writeq(iommu
->reg
+ tlb_offset
, val_iva
);
1352 dmar_writeq(iommu
->reg
+ tlb_offset
+ 8, val
);
1354 /* Make sure hardware complete it */
1355 IOMMU_WAIT_OP(iommu
, tlb_offset
+ 8,
1356 dmar_readq
, (!(val
& DMA_TLB_IVT
)), val
);
1358 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1360 /* check IOTLB invalidation granularity */
1361 if (DMA_TLB_IAIG(val
) == 0)
1362 pr_err("Flush IOTLB failed\n");
1363 if (DMA_TLB_IAIG(val
) != DMA_TLB_IIRG(type
))
1364 pr_debug("TLB flush request %Lx, actual %Lx\n",
1365 (unsigned long long)DMA_TLB_IIRG(type
),
1366 (unsigned long long)DMA_TLB_IAIG(val
));
1369 static struct device_domain_info
*
1370 iommu_support_dev_iotlb (struct dmar_domain
*domain
, struct intel_iommu
*iommu
,
1373 struct device_domain_info
*info
;
1375 assert_spin_locked(&device_domain_lock
);
1380 list_for_each_entry(info
, &domain
->devices
, link
)
1381 if (info
->iommu
== iommu
&& info
->bus
== bus
&&
1382 info
->devfn
== devfn
) {
1383 if (info
->ats_supported
&& info
->dev
)
1391 static void domain_update_iotlb(struct dmar_domain
*domain
)
1393 struct device_domain_info
*info
;
1394 bool has_iotlb_device
= false;
1396 assert_spin_locked(&device_domain_lock
);
1398 list_for_each_entry(info
, &domain
->devices
, link
) {
1399 struct pci_dev
*pdev
;
1401 if (!info
->dev
|| !dev_is_pci(info
->dev
))
1404 pdev
= to_pci_dev(info
->dev
);
1405 if (pdev
->ats_enabled
) {
1406 has_iotlb_device
= true;
1411 domain
->has_iotlb_device
= has_iotlb_device
;
1414 static void iommu_enable_dev_iotlb(struct device_domain_info
*info
)
1416 struct pci_dev
*pdev
;
1418 assert_spin_locked(&device_domain_lock
);
1420 if (!info
|| !dev_is_pci(info
->dev
))
1423 pdev
= to_pci_dev(info
->dev
);
1424 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1425 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1426 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1427 * reserved, which should be set to 0.
1429 if (!ecap_dit(info
->iommu
->ecap
))
1432 struct pci_dev
*pf_pdev
;
1434 /* pdev will be returned if device is not a vf */
1435 pf_pdev
= pci_physfn(pdev
);
1436 info
->pfsid
= PCI_DEVID(pf_pdev
->bus
->number
, pf_pdev
->devfn
);
1439 #ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to use it. */
1445 if (info
->pasid_supported
&& !pci_enable_pasid(pdev
, info
->pasid_supported
& ~1))
1446 info
->pasid_enabled
= 1;
1448 if (info
->pri_supported
&& !pci_reset_pri(pdev
) && !pci_enable_pri(pdev
, 32))
1449 info
->pri_enabled
= 1;
1451 if (info
->ats_supported
&& !pci_enable_ats(pdev
, VTD_PAGE_SHIFT
)) {
1452 info
->ats_enabled
= 1;
1453 domain_update_iotlb(info
->domain
);
1454 info
->ats_qdep
= pci_ats_queue_depth(pdev
);
1458 static void iommu_disable_dev_iotlb(struct device_domain_info
*info
)
1460 struct pci_dev
*pdev
;
1462 assert_spin_locked(&device_domain_lock
);
1464 if (!dev_is_pci(info
->dev
))
1467 pdev
= to_pci_dev(info
->dev
);
1469 if (info
->ats_enabled
) {
1470 pci_disable_ats(pdev
);
1471 info
->ats_enabled
= 0;
1472 domain_update_iotlb(info
->domain
);
1474 #ifdef CONFIG_INTEL_IOMMU_SVM
1475 if (info
->pri_enabled
) {
1476 pci_disable_pri(pdev
);
1477 info
->pri_enabled
= 0;
1479 if (info
->pasid_enabled
) {
1480 pci_disable_pasid(pdev
);
1481 info
->pasid_enabled
= 0;
1486 static void iommu_flush_dev_iotlb(struct dmar_domain
*domain
,
1487 u64 addr
, unsigned mask
)
1490 unsigned long flags
;
1491 struct device_domain_info
*info
;
1493 if (!domain
->has_iotlb_device
)
1496 spin_lock_irqsave(&device_domain_lock
, flags
);
1497 list_for_each_entry(info
, &domain
->devices
, link
) {
1498 if (!info
->ats_enabled
)
1501 sid
= info
->bus
<< 8 | info
->devfn
;
1502 qdep
= info
->ats_qdep
;
1503 qi_flush_dev_iotlb(info
->iommu
, sid
, info
->pfsid
,
1506 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1509 static void iommu_flush_iotlb_psi(struct intel_iommu
*iommu
,
1510 struct dmar_domain
*domain
,
1511 unsigned long pfn
, unsigned int pages
,
1514 unsigned int mask
= ilog2(__roundup_pow_of_two(pages
));
1515 uint64_t addr
= (uint64_t)pfn
<< VTD_PAGE_SHIFT
;
1516 u16 did
= domain
->iommu_did
[iommu
->seq_id
];
	 * Fallback to domain selective flush if no PSI support or the size
	 * is too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size.
1528 if (!cap_pgsel_inv(iommu
->cap
) || mask
> cap_max_amask_val(iommu
->cap
))
1529 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0,
1532 iommu
->flush
.flush_iotlb(iommu
, did
, addr
| ih
, mask
,
1536 * In caching mode, changes of pages from non-present to present require
1537 * flush. However, device IOTLB doesn't need to be flushed in this case.
1539 if (!cap_caching_mode(iommu
->cap
) || !map
)
1540 iommu_flush_dev_iotlb(domain
, addr
, mask
);
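/*
 * Worked example (added): the invalidation size is rounded up to a power
 * of two, e.g. flushing 5 pages gives mask = ilog2(__roundup_pow_of_two(5))
 * = 3, i.e. an 8-page (32KiB) PSI whose base address must be 32KiB aligned,
 * as the comment above requires.
 */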
1543 /* Notification for newly created mappings */
1544 static inline void __mapping_notify_one(struct intel_iommu
*iommu
,
1545 struct dmar_domain
*domain
,
1546 unsigned long pfn
, unsigned int pages
)
1548 /* It's a non-present to present mapping. Only flush if caching mode */
1549 if (cap_caching_mode(iommu
->cap
))
1550 iommu_flush_iotlb_psi(iommu
, domain
, pfn
, pages
, 0, 1);
1552 iommu_flush_write_buffer(iommu
);
1555 static void iommu_flush_iova(struct iova_domain
*iovad
)
1557 struct dmar_domain
*domain
;
1560 domain
= container_of(iovad
, struct dmar_domain
, iovad
);
1562 for_each_domain_iommu(idx
, domain
) {
1563 struct intel_iommu
*iommu
= g_iommus
[idx
];
1564 u16 did
= domain
->iommu_did
[iommu
->seq_id
];
1566 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0, DMA_TLB_DSI_FLUSH
);
1568 if (!cap_caching_mode(iommu
->cap
))
1569 iommu_flush_dev_iotlb(get_iommu_domain(iommu
, did
),
1570 0, MAX_AGAW_PFN_WIDTH
);
1574 static void iommu_disable_protect_mem_regions(struct intel_iommu
*iommu
)
1577 unsigned long flags
;
1579 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1580 pmen
= readl(iommu
->reg
+ DMAR_PMEN_REG
);
1581 pmen
&= ~DMA_PMEN_EPM
;
1582 writel(pmen
, iommu
->reg
+ DMAR_PMEN_REG
);
1584 /* wait for the protected region status bit to clear */
1585 IOMMU_WAIT_OP(iommu
, DMAR_PMEN_REG
,
1586 readl
, !(pmen
& DMA_PMEN_PRS
), pmen
);
1588 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1591 static void iommu_enable_translation(struct intel_iommu
*iommu
)
1594 unsigned long flags
;
1596 raw_spin_lock_irqsave(&iommu
->register_lock
, flags
);
1597 iommu
->gcmd
|= DMA_GCMD_TE
;
1598 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1600 /* Make sure hardware complete it */
1601 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1602 readl
, (sts
& DMA_GSTS_TES
), sts
);
1604 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flags
);
1607 static void iommu_disable_translation(struct intel_iommu
*iommu
)
1612 raw_spin_lock_irqsave(&iommu
->register_lock
, flag
);
1613 iommu
->gcmd
&= ~DMA_GCMD_TE
;
1614 writel(iommu
->gcmd
, iommu
->reg
+ DMAR_GCMD_REG
);
1616 /* Make sure hardware complete it */
1617 IOMMU_WAIT_OP(iommu
, DMAR_GSTS_REG
,
1618 readl
, (!(sts
& DMA_GSTS_TES
)), sts
);
1620 raw_spin_unlock_irqrestore(&iommu
->register_lock
, flag
);
1624 static int iommu_init_domains(struct intel_iommu
*iommu
)
1626 u32 ndomains
, nlongs
;
1629 ndomains
= cap_ndoms(iommu
->cap
);
1630 pr_debug("%s: Number of Domains supported <%d>\n",
1631 iommu
->name
, ndomains
);
1632 nlongs
= BITS_TO_LONGS(ndomains
);
1634 spin_lock_init(&iommu
->lock
);
1636 iommu
->domain_ids
= kcalloc(nlongs
, sizeof(unsigned long), GFP_KERNEL
);
1637 if (!iommu
->domain_ids
) {
1638 pr_err("%s: Allocating domain id array failed\n",
1643 size
= (ALIGN(ndomains
, 256) >> 8) * sizeof(struct dmar_domain
**);
1644 iommu
->domains
= kzalloc(size
, GFP_KERNEL
);
1646 if (iommu
->domains
) {
1647 size
= 256 * sizeof(struct dmar_domain
*);
1648 iommu
->domains
[0] = kzalloc(size
, GFP_KERNEL
);
1651 if (!iommu
->domains
|| !iommu
->domains
[0]) {
1652 pr_err("%s: Allocating domain array failed\n",
1654 kfree(iommu
->domain_ids
);
1655 kfree(iommu
->domains
);
1656 iommu
->domain_ids
= NULL
;
1657 iommu
->domains
= NULL
;
1664 * If Caching mode is set, then invalid translations are tagged
1665 * with domain-id 0, hence we need to pre-allocate it. We also
1666 * use domain-id 0 as a marker for non-allocated domain-id, so
1667 * make sure it is not used for a real domain.
1669 set_bit(0, iommu
->domain_ids
);
1674 static void disable_dmar_iommu(struct intel_iommu
*iommu
)
1676 struct device_domain_info
*info
, *tmp
;
1677 unsigned long flags
;
1679 if (!iommu
->domains
|| !iommu
->domain_ids
)
1683 spin_lock_irqsave(&device_domain_lock
, flags
);
1684 list_for_each_entry_safe(info
, tmp
, &device_domain_list
, global
) {
1685 struct dmar_domain
*domain
;
1687 if (info
->iommu
!= iommu
)
1690 if (!info
->dev
|| !info
->domain
)
1693 domain
= info
->domain
;
1695 __dmar_remove_one_dev_info(info
);
1697 if (!domain_type_is_vm_or_si(domain
)) {
1699 * The domain_exit() function can't be called under
1700 * device_domain_lock, as it takes this lock itself.
			 * So release the lock here and re-run the loop later.
1704 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1705 domain_exit(domain
);
1709 spin_unlock_irqrestore(&device_domain_lock
, flags
);
1711 if (iommu
->gcmd
& DMA_GCMD_TE
)
1712 iommu_disable_translation(iommu
);
1715 static void free_dmar_iommu(struct intel_iommu
*iommu
)
1717 if ((iommu
->domains
) && (iommu
->domain_ids
)) {
1718 int elems
= ALIGN(cap_ndoms(iommu
->cap
), 256) >> 8;
1721 for (i
= 0; i
< elems
; i
++)
1722 kfree(iommu
->domains
[i
]);
1723 kfree(iommu
->domains
);
1724 kfree(iommu
->domain_ids
);
1725 iommu
->domains
= NULL
;
1726 iommu
->domain_ids
= NULL
;
1729 g_iommus
[iommu
->seq_id
] = NULL
;
1731 /* free context mapping */
1732 free_context_table(iommu
);
1734 #ifdef CONFIG_INTEL_IOMMU_SVM
1735 if (pasid_enabled(iommu
)) {
1736 if (ecap_prs(iommu
->ecap
))
1737 intel_svm_finish_prq(iommu
);
1738 intel_svm_free_pasid_tables(iommu
);
1743 static struct dmar_domain
*alloc_domain(int flags
)
1745 struct dmar_domain
*domain
;
1747 domain
= alloc_domain_mem();
1751 memset(domain
, 0, sizeof(*domain
));
1753 domain
->flags
= flags
;
1754 domain
->has_iotlb_device
= false;
1755 INIT_LIST_HEAD(&domain
->devices
);
1760 /* Must be called with iommu->lock */
1761 static int domain_attach_iommu(struct dmar_domain
*domain
,
1762 struct intel_iommu
*iommu
)
1764 unsigned long ndomains
;
1767 assert_spin_locked(&device_domain_lock
);
1768 assert_spin_locked(&iommu
->lock
);
1770 domain
->iommu_refcnt
[iommu
->seq_id
] += 1;
1771 domain
->iommu_count
+= 1;
1772 if (domain
->iommu_refcnt
[iommu
->seq_id
] == 1) {
1773 ndomains
= cap_ndoms(iommu
->cap
);
1774 num
= find_first_zero_bit(iommu
->domain_ids
, ndomains
);
1776 if (num
>= ndomains
) {
1777 pr_err("%s: No free domain ids\n", iommu
->name
);
1778 domain
->iommu_refcnt
[iommu
->seq_id
] -= 1;
1779 domain
->iommu_count
-= 1;
1783 set_bit(num
, iommu
->domain_ids
);
1784 set_iommu_domain(iommu
, num
, domain
);
1786 domain
->iommu_did
[iommu
->seq_id
] = num
;
1787 domain
->nid
= iommu
->node
;
1789 domain_update_iommu_cap(domain
);
1795 static int domain_detach_iommu(struct dmar_domain
*domain
,
1796 struct intel_iommu
*iommu
)
1798 int num
, count
= INT_MAX
;
1800 assert_spin_locked(&device_domain_lock
);
1801 assert_spin_locked(&iommu
->lock
);
1803 domain
->iommu_refcnt
[iommu
->seq_id
] -= 1;
1804 count
= --domain
->iommu_count
;
1805 if (domain
->iommu_refcnt
[iommu
->seq_id
] == 0) {
1806 num
= domain
->iommu_did
[iommu
->seq_id
];
1807 clear_bit(num
, iommu
->domain_ids
);
1808 set_iommu_domain(iommu
, num
, NULL
);
1810 domain_update_iommu_cap(domain
);
1811 domain
->iommu_did
[iommu
->seq_id
] = 0;
1817 static struct iova_domain reserved_iova_list
;
1818 static struct lock_class_key reserved_rbtree_key
;
1820 static int dmar_init_reserved_ranges(void)
1822 struct pci_dev
*pdev
= NULL
;
1826 init_iova_domain(&reserved_iova_list
, VTD_PAGE_SIZE
, IOVA_START_PFN
);
1828 lockdep_set_class(&reserved_iova_list
.iova_rbtree_lock
,
1829 &reserved_rbtree_key
);
1831 /* IOAPIC ranges shouldn't be accessed by DMA */
1832 iova
= reserve_iova(&reserved_iova_list
, IOVA_PFN(IOAPIC_RANGE_START
),
1833 IOVA_PFN(IOAPIC_RANGE_END
));
1835 pr_err("Reserve IOAPIC range failed\n");
1839 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1840 for_each_pci_dev(pdev
) {
1843 for (i
= 0; i
< PCI_NUM_RESOURCES
; i
++) {
1844 r
= &pdev
->resource
[i
];
1845 if (!r
->flags
|| !(r
->flags
& IORESOURCE_MEM
))
1847 iova
= reserve_iova(&reserved_iova_list
,
1851 pr_err("Reserve iova failed\n");
1859 static void domain_reserve_special_ranges(struct dmar_domain
*domain
)
1861 copy_reserved_iova(&reserved_iova_list
, &domain
->iovad
);
1864 static inline int guestwidth_to_adjustwidth(int gaw
)
1867 int r
= (gaw
- 12) % 9;
1878 static int domain_init(struct dmar_domain
*domain
, struct intel_iommu
*iommu
,
1881 int adjust_width
, agaw
;
1882 unsigned long sagaw
;
1885 init_iova_domain(&domain
->iovad
, VTD_PAGE_SIZE
, IOVA_START_PFN
);
1887 err
= init_iova_flush_queue(&domain
->iovad
,
1888 iommu_flush_iova
, iova_entry_free
);
1892 domain_reserve_special_ranges(domain
);
1894 /* calculate AGAW */
1895 if (guest_width
> cap_mgaw(iommu
->cap
))
1896 guest_width
= cap_mgaw(iommu
->cap
);
1897 domain
->gaw
= guest_width
;
1898 adjust_width
= guestwidth_to_adjustwidth(guest_width
);
1899 agaw
= width_to_agaw(adjust_width
);
1900 sagaw
= cap_sagaw(iommu
->cap
);
1901 if (!test_bit(agaw
, &sagaw
)) {
1902 /* hardware doesn't support it, choose a bigger one */
1903 pr_debug("Hardware doesn't support agaw %d\n", agaw
);
1904 agaw
= find_next_bit(&sagaw
, 5, agaw
);
1908 domain
->agaw
= agaw
;
1910 if (ecap_coherent(iommu
->ecap
))
1911 domain
->iommu_coherency
= 1;
1913 domain
->iommu_coherency
= 0;
1915 if (ecap_sc_support(iommu
->ecap
))
1916 domain
->iommu_snooping
= 1;
1918 domain
->iommu_snooping
= 0;
1920 if (intel_iommu_superpage
)
1921 domain
->iommu_superpage
= fls(cap_super_page_val(iommu
->cap
));
1923 domain
->iommu_superpage
= 0;
1925 domain
->nid
= iommu
->node
;
1927 /* always allocate the top pgd */
1928 domain
->pgd
= (struct dma_pte
*)alloc_pgtable_page(domain
->nid
);
1931 __iommu_flush_cache(iommu
, domain
->pgd
, PAGE_SIZE
);
1935 static void domain_exit(struct dmar_domain
*domain
)
1937 struct page
*freelist
= NULL
;
	/* Domain 0 is reserved, so don't process it */
1943 /* Remove associated devices and clear attached or cached domains */
1945 domain_remove_dev_info(domain
);
1949 put_iova_domain(&domain
->iovad
);
1951 freelist
= domain_unmap(domain
, 0, DOMAIN_MAX_PFN(domain
->gaw
));
1953 dma_free_pagelist(freelist
);
1955 free_domain_mem(domain
);
1958 static int domain_context_mapping_one(struct dmar_domain
*domain
,
1959 struct intel_iommu
*iommu
,
1962 u16 did
= domain
->iommu_did
[iommu
->seq_id
];
1963 int translation
= CONTEXT_TT_MULTI_LEVEL
;
1964 struct device_domain_info
*info
= NULL
;
1965 struct context_entry
*context
;
1966 unsigned long flags
;
1967 struct dma_pte
*pgd
;
1972 if (hw_pass_through
&& domain_type_is_si(domain
))
1973 translation
= CONTEXT_TT_PASS_THROUGH
;
1975 pr_debug("Set context mapping for %02x:%02x.%d\n",
1976 bus
, PCI_SLOT(devfn
), PCI_FUNC(devfn
));
1978 BUG_ON(!domain
->pgd
);
1980 spin_lock_irqsave(&device_domain_lock
, flags
);
1981 spin_lock(&iommu
->lock
);
1984 context
= iommu_context_addr(iommu
, bus
, devfn
, 1);
1989 if (context_present(context
))
1993 * For kdump cases, old valid entries may be cached due to the
1994 * in-flight DMA and copied pgtable, but there is no unmapping
1995 * behaviour for them, thus we need an explicit cache flush for
1996 * the newly-mapped device. For kdump, at this point, the device
1997 * is supposed to finish reset at its driver probe stage, so no
1998 * in-flight DMA will exist, and we don't need to worry anymore
2001 if (context_copied(context
)) {
2002 u16 did_old
= context_domain_id(context
);
2004 if (did_old
< cap_ndoms(iommu
->cap
)) {
2005 iommu
->flush
.flush_context(iommu
, did_old
,
2006 (((u16
)bus
) << 8) | devfn
,
2007 DMA_CCMD_MASK_NOBIT
,
2008 DMA_CCMD_DEVICE_INVL
);
2009 iommu
->flush
.flush_iotlb(iommu
, did_old
, 0, 0,
2016 context_clear_entry(context
);
2017 context_set_domain_id(context
, did
);
	 * Skip top levels of page tables for an iommu which has less agaw
	 * than the default. Unnecessary for PT mode.
2023 if (translation
!= CONTEXT_TT_PASS_THROUGH
) {
2024 for (agaw
= domain
->agaw
; agaw
!= iommu
->agaw
; agaw
--) {
2026 pgd
= phys_to_virt(dma_pte_addr(pgd
));
2027 if (!dma_pte_present(pgd
))
2031 info
= iommu_support_dev_iotlb(domain
, iommu
, bus
, devfn
);
2032 if (info
&& info
->ats_supported
)
2033 translation
= CONTEXT_TT_DEV_IOTLB
;
2035 translation
= CONTEXT_TT_MULTI_LEVEL
;
2037 context_set_address_root(context
, virt_to_phys(pgd
));
2038 context_set_address_width(context
, iommu
->agaw
);
2041 * In pass through mode, AW must be programmed to
2042 * indicate the largest AGAW value supported by
2043 * hardware. And ASR is ignored by hardware.
2045 context_set_address_width(context
, iommu
->msagaw
);
2048 context_set_translation_type(context
, translation
);
2049 context_set_fault_enable(context
);
2050 context_set_present(context
);
2051 domain_flush_cache(domain
, context
, sizeof(*context
));
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
2059 if (cap_caching_mode(iommu
->cap
)) {
2060 iommu
->flush
.flush_context(iommu
, 0,
2061 (((u16
)bus
) << 8) | devfn
,
2062 DMA_CCMD_MASK_NOBIT
,
2063 DMA_CCMD_DEVICE_INVL
);
2064 iommu
->flush
.flush_iotlb(iommu
, did
, 0, 0, DMA_TLB_DSI_FLUSH
);
2066 iommu_flush_write_buffer(iommu
);
2068 iommu_enable_dev_iotlb(info
);
2073 spin_unlock(&iommu
->lock
);
2074 spin_unlock_irqrestore(&device_domain_lock
, flags
);
2079 struct domain_context_mapping_data
{
2080 struct dmar_domain
*domain
;
2081 struct intel_iommu
*iommu
;
2084 static int domain_context_mapping_cb(struct pci_dev
*pdev
,
2085 u16 alias
, void *opaque
)
2087 struct domain_context_mapping_data
*data
= opaque
;
2089 return domain_context_mapping_one(data
->domain
, data
->iommu
,
2090 PCI_BUS_NUM(alias
), alias
& 0xff);
2094 domain_context_mapping(struct dmar_domain
*domain
, struct device
*dev
)
2096 struct intel_iommu
*iommu
;
2098 struct domain_context_mapping_data data
;
2100 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
2104 if (!dev_is_pci(dev
))
2105 return domain_context_mapping_one(domain
, iommu
, bus
, devfn
);
2107 data
.domain
= domain
;
2110 return pci_for_each_dma_alias(to_pci_dev(dev
),
2111 &domain_context_mapping_cb
, &data
);
2114 static int domain_context_mapped_cb(struct pci_dev
*pdev
,
2115 u16 alias
, void *opaque
)
2117 struct intel_iommu
*iommu
= opaque
;
2119 return !device_context_mapped(iommu
, PCI_BUS_NUM(alias
), alias
& 0xff);
2122 static int domain_context_mapped(struct device
*dev
)
2124 struct intel_iommu
*iommu
;
2127 iommu
= device_to_iommu(dev
, &bus
, &devfn
);
2131 if (!dev_is_pci(dev
))
2132 return device_context_mapped(iommu
, bus
, devfn
);
2134 return !pci_for_each_dma_alias(to_pci_dev(dev
),
2135 domain_context_mapped_cb
, iommu
);
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
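/*
 * Worked example (added): with 4KiB pages, mapping 6000 bytes at a host
 * address whose sub-page offset is 0xff0 gives PAGE_ALIGN(0xff0 + 6000)
 * = 12288, so the helper returns 3 VT-d pages, covering the partial first
 * and last pages of the buffer.
 */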
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
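/*
 * Worked example (added): if iov_pfn and phy_pfn are both 2MiB aligned
 * (low 9 bits of the merged PFN clear) and at least 512 pages remain to
 * be mapped, this returns level 2, so __domain_mapping() can use one
 * 2MiB superpage PTE instead of 512 4KiB PTEs.
 */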
2174 static int __domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2175 struct scatterlist
*sg
, unsigned long phys_pfn
,
2176 unsigned long nr_pages
, int prot
)
2178 struct dma_pte
*first_pte
= NULL
, *pte
= NULL
;
2179 phys_addr_t
uninitialized_var(pteval
);
2180 unsigned long sg_res
= 0;
2181 unsigned int largepage_lvl
= 0;
2182 unsigned long lvl_pages
= 0;
2184 BUG_ON(!domain_pfn_supported(domain
, iov_pfn
+ nr_pages
- 1));
2186 if ((prot
& (DMA_PTE_READ
|DMA_PTE_WRITE
)) == 0)
2189 prot
&= DMA_PTE_READ
| DMA_PTE_WRITE
| DMA_PTE_SNP
;
2193 pteval
= ((phys_addr_t
)phys_pfn
<< VTD_PAGE_SHIFT
) | prot
;
2196 while (nr_pages
> 0) {
2200 unsigned int pgoff
= sg
->offset
& ~PAGE_MASK
;
2202 sg_res
= aligned_nrpages(sg
->offset
, sg
->length
);
2203 sg
->dma_address
= ((dma_addr_t
)iov_pfn
<< VTD_PAGE_SHIFT
) + pgoff
;
2204 sg
->dma_length
= sg
->length
;
2205 pteval
= (sg_phys(sg
) - pgoff
) | prot
;
2206 phys_pfn
= pteval
>> VTD_PAGE_SHIFT
;
2210 largepage_lvl
= hardware_largepage_caps(domain
, iov_pfn
, phys_pfn
, sg_res
);
2212 first_pte
= pte
= pfn_to_dma_pte(domain
, iov_pfn
, &largepage_lvl
);
		/* It is a large page */
2216 if (largepage_lvl
> 1) {
2217 unsigned long nr_superpages
, end_pfn
;
2219 pteval
|= DMA_PTE_LARGE_PAGE
;
2220 lvl_pages
= lvl_to_nr_pages(largepage_lvl
);
2222 nr_superpages
= sg_res
/ lvl_pages
;
2223 end_pfn
= iov_pfn
+ nr_superpages
* lvl_pages
- 1;
2226 * Ensure that old small page tables are
2227 * removed to make room for superpage(s).
2228 * We're adding new large pages, so make sure
2229 * we don't remove their parent tables.
2231 dma_pte_free_pagetable(domain
, iov_pfn
, end_pfn
,
2234 pteval
&= ~(uint64_t)DMA_PTE_LARGE_PAGE
;
2238 /* We don't need lock here, nobody else
2239 * touches the iova range
2241 tmp
= cmpxchg64_local(&pte
->val
, 0ULL, pteval
);
2243 static int dumps
= 5;
2244 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2245 iov_pfn
, tmp
, (unsigned long long)pteval
);
2248 debug_dma_dump_mappings(NULL
);
2253 lvl_pages
= lvl_to_nr_pages(largepage_lvl
);
2255 BUG_ON(nr_pages
< lvl_pages
);
2256 BUG_ON(sg_res
< lvl_pages
);
2258 nr_pages
-= lvl_pages
;
2259 iov_pfn
+= lvl_pages
;
2260 phys_pfn
+= lvl_pages
;
2261 pteval
+= lvl_pages
* VTD_PAGE_SIZE
;
2262 sg_res
-= lvl_pages
;
2264 /* If the next PTE would be the first in a new page, then we
2265 need to flush the cache on the entries we've just written.
2266 And then we'll need to recalculate 'pte', so clear it and
2267 let it get set again in the if (!pte) block above.
2269 If we're done (!nr_pages) we need to flush the cache too.
2271 Also if we've been setting superpages, we may need to
2272 recalculate 'pte' and switch back to smaller pages for the
2273 end of the mapping, if the trailing size is not enough to
2274 use another superpage (i.e. sg_res < lvl_pages). */
2276 if (!nr_pages
|| first_pte_in_page(pte
) ||
2277 (largepage_lvl
> 1 && sg_res
< lvl_pages
)) {
2278 domain_flush_cache(domain
, first_pte
,
2279 (void *)pte
- (void *)first_pte
);
2283 if (!sg_res
&& nr_pages
)
2289 static int domain_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2290 struct scatterlist
*sg
, unsigned long phys_pfn
,
2291 unsigned long nr_pages
, int prot
)
2294 struct intel_iommu
*iommu
;
2296 /* Do the real mapping first */
2297 ret
= __domain_mapping(domain
, iov_pfn
, sg
, phys_pfn
, nr_pages
, prot
);
2301 /* Notify about the new mapping */
2302 if (domain_type_is_vm(domain
)) {
		/* VM typed domains can have more than one IOMMU */
2305 for_each_domain_iommu(iommu_id
, domain
) {
2306 iommu
= g_iommus
[iommu_id
];
2307 __mapping_notify_one(iommu
, domain
, iov_pfn
, nr_pages
);
2310 /* General domains only have one IOMMU */
2311 iommu
= domain_get_iommu(domain
);
2312 __mapping_notify_one(iommu
, domain
, iov_pfn
, nr_pages
);
2318 static inline int domain_sg_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2319 struct scatterlist
*sg
, unsigned long nr_pages
,
2322 return domain_mapping(domain
, iov_pfn
, sg
, 0, nr_pages
, prot
);
2325 static inline int domain_pfn_mapping(struct dmar_domain
*domain
, unsigned long iov_pfn
,
2326 unsigned long phys_pfn
, unsigned long nr_pages
,
2329 return domain_mapping(domain
, iov_pfn
, NULL
, phys_pfn
, nr_pages
, prot
);
static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	unsigned long flags;
	struct context_entry *context;
	u16 did_old;

	if (!iommu)
		return;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (!context) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return;
	}
	did_old = context_domain_id(context);
	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
	spin_unlock_irqrestore(&iommu->lock, flags);
	iommu->flush.flush_context(iommu,
				   did_old,
				   (((u16)bus) << 8) | devfn,
				   DMA_CCMD_MASK_NOBIT,
				   DMA_CCMD_DEVICE_INVL);
	iommu->flush.flush_iotlb(iommu,
				 did_old,
				 0,
				 0,
				 DMA_TLB_DSI_FLUSH);
}
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}

static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}

/*
 * Note: the device_domain_info is stored in struct device->archdata.iommu.
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (likely(info))
		return info->domain;
	return NULL;
}

static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    u8 bus, u8 devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;

	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (!pci_ats_disabled() &&
		    ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (ecs_enabled(iommu)) {
			if (pasid_enabled(iommu)) {
				int features = pci_pasid_features(pdev);

				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	if (!found) {
		struct device_domain_info *info2;

		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2) {
			found = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (dev && domain_context_mapping(domain, dev)) {
		pr_err("Domain context map for %s failed\n", dev_name(dev));
		dmar_remove_one_dev_info(domain, dev);
		return NULL;
	}

	return domain;
}

static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
{
	*(u16 *)opaque = alias;
	return 0;
}
static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
{
	struct device_domain_info *info = NULL;
	struct dmar_domain *domain = NULL;
	struct intel_iommu *iommu;
	u16 dma_alias;
	unsigned long flags;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		spin_lock_irqsave(&device_domain_lock, flags);
		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
						      PCI_BUS_NUM(dma_alias),
						      dma_alias & 0xff);
		if (info) {
			iommu = info->iommu;
			domain = info->domain;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);

		/* DMA alias already has a domain, use it */
		if (info)
			goto out;
	}

	/* Allocate and initialize new domain for the device */
	domain = alloc_domain(0);
	if (!domain)
		return NULL;
	if (domain_init(domain, iommu, gaw)) {
		domain_exit(domain);
		return NULL;
	}

out:
	return domain;
}

static struct dmar_domain *set_domain_for_dev(struct device *dev,
					      struct dmar_domain *domain)
{
	struct intel_iommu *iommu;
	struct dmar_domain *tmp;
	u16 req_id, dma_alias;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return NULL;

	req_id = ((u16)bus << 8) | devfn;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);

		/* register PCI DMA alias device */
		if (req_id != dma_alias) {
			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
						       dma_alias & 0xff, NULL, domain);
			if (!tmp || tmp != domain)
				return tmp;
		}
	}

	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (!tmp || tmp != domain)
		return tmp;

	return domain;
}

static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
{
	struct dmar_domain *domain, *tmp;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, gaw);
	if (!domain)
		goto out;

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:
	return domain;
}
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		pr_err("Reserving iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn, NULL,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE);
}

static int domain_prepare_identity_map(struct device *dev,
				       struct dmar_domain *domain,
				       unsigned long long start,
				       unsigned long long end)
{
	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, so which didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
			dev_name(dev), start, end);
		return 0;
	}

	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
		dev_name(dev), start, end);

	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		return -EIO;
	}

	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		return -EIO;
	}

	return iommu_domain_identity_map(domain, start, end);
}
static int iommu_prepare_identity_map(struct device *dev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	ret = domain_prepare_identity_map(dev, domain, start, end);
	if (ret)
		domain_exit(domain);

	return ret;
}

static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
					 struct device *dev)
{
	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(dev, rmrr->base_address,
					  rmrr->end_address);
}

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
	if (ret)
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	pci_dev_put(pdev);
}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
static int md_domain_init(struct dmar_domain *domain, int guest_width);

static int __init si_domain_init(int hw)
{
	int nid, ret;

	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	pr_debug("Identity mapping domain allocated\n");

	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int identity_mapping(struct device *dev)
{
	struct device_domain_info *info;

	if (likely(!iommu_identity_mapping))
		return 0;

	info = dev->archdata.iommu;
	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
		return (info->domain == si_domain);

	return 0;
}
static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
{
	struct dmar_domain *ndomain;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
	if (ndomain != domain)
		return -EBUSY;

	return 0;
}

static bool device_has_rmrr(struct device *dev)
{
	struct dmar_rmrr_unit *rmrr;
	struct device *tmp;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		/*
		 * Return TRUE if this RMRR contains the device that
		 * is passed in.
		 */
		for_each_active_dev_scope(rmrr->devices,
					  rmrr->devices_cnt, i, tmp)
			if (tmp == dev) {
				rcu_read_unlock();
				return true;
			}
	}
	rcu_read_unlock();
	return false;
}
/*
 * There are a couple cases where we need to restrict the functionality of
 * devices associated with RMRRs.  The first is when evaluating a device for
 * identity mapping because problems exist when devices are moved in and out
 * of domains and their respective RMRR information is lost.  This means that
 * a device with associated RMRRs will never be in a "passthrough" domain.
 * The second is use of the device through the IOMMU API.  This interface
 * expects to have full control of the IOVA space for the device.  We cannot
 * satisfy both the requirement that RMRR access is maintained and have an
 * unencumbered IOVA space.  We also have no ability to quiesce the device's
 * use of the RMRR space or even inform the IOMMU API user of the restriction.
 * We therefore prevent devices associated with an RMRR from participating in
 * the IOMMU API, which eliminates them from device assignment.
 *
 * In both cases we assume that PCI USB devices with RMRRs have them largely
 * for historical reasons and that the RMRR space is not actively used post
 * boot.  This exclusion may change if vendors begin to abuse it.
 *
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 */
static bool device_is_rmrr_locked(struct device *dev)
{
	if (!device_has_rmrr(dev))
		return false;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
			return false;
	}

	return true;
}
static int iommu_should_identity_map(struct device *dev, int startup)
{
	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (device_is_rmrr_locked(dev))
			return 0;

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return 1;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return 1;

		if (!(iommu_identity_mapping & IDENTMAP_ALL))
			return 0;

		/*
		 * We want to start off with all devices in the 1:1 domain, and
		 * take them out later if we find they can't access all of memory.
		 *
		 * However, we can't do this for PCI devices behind bridges,
		 * because all PCI devices behind the same bridge will end up
		 * with the same source-id on their transactions.
		 *
		 * Practically speaking, we can't change things around for these
		 * devices at run-time, because we can't be sure there'll be no
		 * DMA transactions in flight for any of their siblings.
		 *
		 * So PCI devices (unless they're on the root bus) as well as
		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
		 * the 1:1 domain, just in _case_ one of their siblings turns out
		 * not to be able to map all of memory.
		 */
		if (!pci_is_pcie(pdev)) {
			if (!pci_is_root_bus(pdev->bus))
				return 0;
			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				return 0;
		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
	} else {
		if (device_has_rmrr(dev))
			return 0;
	}

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = *dev->dma_mask;

		if (dev->coherent_dma_mask &&
		    dev->coherent_dma_mask < dma_mask)
			dma_mask = dev->coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(dev);
	}

	return 1;
}
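/*
 * Illustrative example (added): a device that advertises only a 32-bit
 * dma_mask on a machine with more than 4GiB of RAM fails the
 * dma_get_required_mask() test above and is therefore kept out of the
 * static 1:1 (identity) domain.
 */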
static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
{
	int ret;

	if (!iommu_should_identity_map(dev, 1))
		return 0;

	ret = domain_add_dev_info(si_domain, dev);
	if (!ret)
		pr_info("%s identity mapping for device %s\n",
			hw ? "Hardware" : "Software", dev_name(dev));
	else if (ret == -ENODEV)
		/* device not associated with an iommu */
		ret = 0;

	return ret;
}

static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	struct device *dev;
	int i;
	int ret = 0;

	for_each_pci_dev(pdev) {
		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
		if (ret)
			return ret;
	}

	for_each_active_iommu(iommu, drhd)
		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn, &adev->physical_node_list, node) {
				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);
			if (ret)
				return ret;
		}

	return 0;
}
static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued Invalidate not enabled, use Register Based Invalidate
		 */
		iommu->flush.flush_context = __iommu_flush_context;
		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
		pr_info("%s: Using Register based invalidation\n",
			iommu->name);
	} else {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
	}
}
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					  MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!__context_present(&ce))
			continue;

		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		/*
		 * We need a marker for copied context entries. This
		 * marker needs to work for the old format as well as
		 * for extended context entries.
		 *
		 * Bit 67 of the context entry is used. In the old
		 * format this bit is available to software, in the
		 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
		 * So disable PASIDs first and then mark the entry
		 * copied. This means that we don't copy PASID
		 * translations from the old kernel, but this is fine as
		 * faults there are not fatal.
		 */
		context_clear_pasid_enable(&ce);
		context_set_copied(&ce);

		new_ce[idx] = ce;
	}

	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
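/*
 * Clarifying note (added): with extended context entries ('ext'), every bus
 * owns two context tables and every devfn two entries, hence the bus * 2
 * and devfn * 2 indexing above; e.g. bus 3 ends up in tbl[6] and tbl[7].
 */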
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	unsigned long flags;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
	new_ext    = !!ecap_ecs(iommu->ecap);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
			       iommu->name, bus);
			continue;
		}
	}

	spin_lock_irqsave(&iommu->lock, flags);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock_irqrestore(&iommu->lock, flags);

	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	bool copied_tables = false;
	struct device *dev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path all other access are read
		 * only
		 */
		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
	}

	/* Preallocate enough resources for IOMMU hot-addition */
	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
		g_num_of_iommus = DMAR_UNITS_SUPPORTED;

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			   GFP_KERNEL);
	if (!g_iommus) {
		pr_err("Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	for_each_active_iommu(iommu, drhd) {
		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_enabled(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		g_iommus[iommu->seq_id] = iommu;

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
				copied_tables = true;
			}
		}

		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_enabled(iommu))
			intel_svm_alloc_pasid_tables(iommu);
#endif
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	}

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	check_tylersburg_isoch();

	if (iommu_identity_mapping) {
		ret = si_domain_init(hw_pass_through);
		if (ret)
			goto free_iommu;
	}

	/*
	 * If we copied translations from a previous kernel in the kdump
	 * case, we can not assign the devices to domains now, as that
	 * would eliminate the old mappings. So skip this part and defer
	 * the assignment to device driver initialization time.
	 */
	if (copied_tables)
		goto domains_done;

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			pr_crit("Failed to setup IOMMU pass-through\n");
			goto free_iommu;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	pr_info("Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		/* some BIOS lists non-exist devices in DMAR table. */
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			ret = iommu_prepare_rmrr_dev(rmrr, dev);
			if (ret)
				pr_err("Mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();

domains_done:

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
			ret = intel_svm_enable_prq(iommu);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;

		if (!translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	kfree(g_iommus);

error:
	return ret;
}
/* This takes a number of _MM_ pages, not VTD pages */
static unsigned long intel_alloc_iova(struct device *dev,
				      struct dmar_domain *domain,
				      unsigned long nrpages, uint64_t dma_mask)
{
	unsigned long iova_pfn = 0;

	/* Restrict dma_mask to the width that the iommu can handle */
	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
	/* Ensure we reserve the whole size-aligned region */
	nrpages = __roundup_pow_of_two(nrpages);

	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_BIT_MASK(32) and if that fails then try allocating
		 * from higher range
		 */
		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
					   IOVA_PFN(DMA_BIT_MASK(32)), false);
		if (iova_pfn)
			return iova_pfn;
	}
	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
				   IOVA_PFN(dma_mask), true);
	if (unlikely(!iova_pfn)) {
		pr_err("Allocating %ld-page iova for %s failed",
		       nrpages, dev_name(dev));
		return 0;
	}

	return iova_pfn;
}
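/*
 * Illustrative note (added): for a 64-bit capable device the first
 * alloc_iova_fast() call above is constrained to IOVA_PFN(DMA_BIT_MASK(32)),
 * i.e. below 4GiB; only if that attempt fails does the second call allocate
 * from the full, IOMMU-limited dma_mask range.
 */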
struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
{
	struct dmar_domain *domain, *tmp;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i, ret;

	domain = find_domain(dev);
	if (domain)
		goto out;

	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		goto out;

	/* We have a new domain - setup possible RMRRs for the device */
	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != dev)
				continue;

			ret = domain_prepare_identity_map(dev, domain,
							  rmrr->base_address,
							  rmrr->end_address);
			if (ret)
				dev_err(dev, "Mapping reserved region failed\n");
		}
	}
	rcu_read_unlock();

	tmp = set_domain_for_dev(dev, domain);
	if (!tmp || domain != tmp) {
		domain_exit(domain);
		domain = tmp;
	}

out:
	if (!domain)
		pr_err("Allocating domain for %s failed\n", dev_name(dev));

	return domain;
}

/* Check if the dev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	int found;

	if (iommu_dummy(dev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(dev);
	if (found) {
		if (iommu_should_identity_map(dev, 0))
			return 1;

		/*
		 * 32 bit DMA is removed from si_domain and fall back
		 * to non-identity mapping.
		 */
		dmar_remove_one_dev_info(si_domain, dev);
		pr_info("32bit %s uses non-identity mapping\n",
			dev_name(dev));
		return 0;
	}

	/*
	 * In case of a detached 64 bit DMA device from vm, the device
	 * is put into si_domain for identity mapping.
	 */
	if (iommu_should_identity_map(dev, 0)) {
		int ret;

		ret = domain_add_dev_info(si_domain, dev);
		if (!ret) {
			pr_info("64bit %s uses identity mapping\n",
				dev_name(dev));
			return 1;
		}
	}

	return 0;
}
static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	unsigned long iova_pfn;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(dev))
		return paddr;

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	size = aligned_nrpages(paddr, size);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova_pfn)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr through (paddr + size) may cover only part of a page, but we
	 * must map the whole page.  Note: if two parts of one page are mapped
	 * separately, we may end up with two guest addresses mapping to the
	 * same host paddr, but this is not a big problem.
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova_pfn)
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
	       dev_name(dev), size, (unsigned long long)paddr, dir);
	return 0;
}

static dma_addr_t intel_map_page(struct device *dev, struct page *page,
				 unsigned long offset, size_t size,
				 enum dma_data_direction dir,
				 unsigned long attrs)
{
	return __intel_map_single(dev, page_to_phys(page) + offset, size,
				  dir, *dev->dma_mask);
}
static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
{
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	unsigned long nrpages;
	unsigned long iova_pfn;
	struct intel_iommu *iommu;
	struct page *freelist;

	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(dev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova_pfn = IOVA_PFN(dev_addr);

	nrpages = aligned_nrpages(dev_addr, size);
	start_pfn = mm_to_dma_pfn(iova_pfn);
	last_pfn = start_pfn + nrpages - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 dev_name(dev), start_pfn, last_pfn);

	freelist = domain_unmap(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
				      nrpages, !freelist, 0);
		/* free iova */
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
		dma_free_pagelist(freelist);
	} else {
		queue_iova(&domain->iovad, iova_pfn, nrpages,
			   (unsigned long)freelist);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}

static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     unsigned long attrs)
{
	intel_unmap(dev, dev_addr, size);
}
static void *intel_alloc_coherent(struct device *dev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  unsigned long attrs)
{
	void *vaddr;

	vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
	if (iommu_no_mapping(dev) || !vaddr)
		return vaddr;

	*dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
			PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
			dev->coherent_dma_mask);
	if (!*dma_handle)
		goto out_free_pages;
	return vaddr;

out_free_pages:
	dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
	return NULL;
}

static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
				dma_addr_t dma_handle, unsigned long attrs)
{
	if (!iommu_no_mapping(dev))
		intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
	dma_direct_free(dev, size, vaddr, dma_handle, attrs);
}

static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   unsigned long attrs)
{
	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
	unsigned long nrpages = 0;
	struct scatterlist *sg;
	int i;

	for_each_sg(sglist, sg, nelems, i) {
		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
	}

	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
}

static int intel_nontranslate_map_sg(struct device *hddev,
				     struct scatterlist *sglist, int nelems, int dir)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sglist, sg, nelems, i) {
		BUG_ON(!sg_page(sg));
		sg->dma_address = sg_phys(sg);
		sg->dma_length = sg->length;
	}
	return nelems;
}
static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, unsigned long attrs)
{
	int i;
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	unsigned long iova_pfn;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(dev))
		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
				    *dev->dma_mask);
	if (!iova_pfn) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova_pfn);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1,
				       agaw_to_level(domain->agaw) + 1);
		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
		return 0;
	}

	return nelems;
}

static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return !dma_addr;
}

const struct dma_map_ops intel_dma_ops = {
	.alloc = intel_alloc_coherent,
	.free = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
	.dma_supported = dma_direct_supported,
};
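/*
 * Clarifying note (added): this dma_map_ops table is installed as the global
 * 'dma_ops' from intel_iommu_init() below once initialization succeeds, so
 * the intel_* callbacks here back the streaming and coherent DMA API.
 */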
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					       sizeof(struct dmar_domain),
					       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_domain_cache) {
		pr_err("Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
						sizeof(struct device_domain_info),
						0, SLAB_HWCACHE_ALIGN, NULL);
	if (!iommu_devinfo_cache) {
		pr_err("Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}

static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iova_cache_get();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	iova_cache_put();

	return -ENOMEM;
}

static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	iova_cache_put();
}
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);

static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
#ifdef CONFIG_SUSPEND
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}

static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}

static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
					     GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}

static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
		       iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
		       iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
		       iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
		       iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}

static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};

static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}

#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_PM */
int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
{
	struct acpi_dmar_reserved_memory *rmrr;
	int prot = DMA_PTE_READ|DMA_PTE_WRITE;
	struct dmar_rmrr_unit *rmrru;
	size_t length;

	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
	if (!rmrru)
		goto out;

	rmrru->hdr = header;
	rmrr = (struct acpi_dmar_reserved_memory *)header;
	rmrru->base_address = rmrr->base_address;
	rmrru->end_address = rmrr->end_address;

	length = rmrr->end_address - rmrr->base_address + 1;
	rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
					      IOMMU_RESV_DIRECT);
	if (!rmrru->resv)
		goto free_rmrru;

	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				&rmrru->devices_cnt);
	if (rmrru->devices_cnt && rmrru->devices == NULL)
		goto free_all;

	list_add(&rmrru->list, &dmar_rmrr_units);

	return 0;
free_all:
	kfree(rmrru->resv);
free_rmrru:
	kfree(rmrru);
out:
	return -ENOMEM;
}

static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
{
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *tmp;

	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
		if (atsr->segment != tmp->segment)
			continue;
		if (atsr->header.length != tmp->header.length)
			continue;
		if (memcmp(atsr, tmp, atsr->header.length) == 0)
			return atsru;
	}

	return NULL;
}

int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}

static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}

int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru) {
		list_del_rcu(&atsru->list);
		synchronize_rcu();
		intel_iommu_free_atsr(atsru);
	}

	return 0;
}

int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	if (g_iommus[iommu->seq_id])
		return 0;

	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	sp = domain_update_iommu_superpage(iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_enabled(iommu))
		intel_svm_alloc_pasid_tables(iommu);
#endif

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}

int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
{
	int ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	if (!intel_iommu_enabled)
		return 0;
	if (iommu == NULL)
		return -EINVAL;

	if (insert) {
		ret = intel_iommu_add(dmaru);
	} else {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}

static void intel_iommu_free_dmars(void)
{
	struct dmar_rmrr_unit *rmrru, *rmrr_n;
	struct dmar_atsr_unit *atsru, *atsr_n;

	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
		list_del(&rmrru->list);
		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
		kfree(rmrru);
	}

	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
		list_del(&atsru->list);
		intel_iommu_free_atsr(atsru);
	}
}
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	dev = pci_physfn(dev);
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}

int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}

	return 0;
}
/*
 * Here we only respond to action of unbound device from driver.
 *
 * Added device is not attached to its DMAR domain here yet. That will happen
 * when mapping the device to iova.
 */
static int device_notifier(struct notifier_block *nb,
			   unsigned long action, void *data)
{
	struct device *dev = data;
	struct dmar_domain *domain;

	if (iommu_dummy(dev))
		return 0;

	if (action != BUS_NOTIFY_REMOVED_DEVICE)
		return 0;

	domain = find_domain(dev);
	if (!domain)
		return 0;

	dmar_remove_one_dev_info(domain, dev);
	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
		domain_exit(domain);

	return 0;
}

static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};

static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long long start, end;
	unsigned long start_vpfn, last_vpfn;

	switch (val) {
	case MEM_GOING_ONLINE:
		start = mhp->start_pfn << PAGE_SHIFT;
		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
		if (iommu_domain_identity_map(si_domain, start, end)) {
			pr_warn("Failed to build identity map for [%llx-%llx]\n",
				start, end);
			return NOTIFY_BAD;
		}
		break;

	case MEM_CANCEL_ONLINE:
		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
		while (start_vpfn <= last_vpfn) {
			struct iova *iova;
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			struct page *freelist;

			iova = find_iova(&si_domain->iovad, start_vpfn);
			if (iova == NULL) {
				pr_debug("Failed get IOVA for PFN %lx\n",
					 start_vpfn);
				break;
			}

			iova = split_and_remove_iova(&si_domain->iovad, iova,
						     start_vpfn, last_vpfn);
			if (iova == NULL) {
				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
					start_vpfn, last_vpfn);
				return NOTIFY_BAD;
			}

			freelist = domain_unmap(si_domain, iova->pfn_lo,
						iova->pfn_hi);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					iova->pfn_lo, iova_size(iova),
					!freelist, 0);
			rcu_read_unlock();
			dma_free_pagelist(freelist);

			start_vpfn = iova->pfn_hi + 1;
			free_iova_mem(iova);
		}
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
static void free_all_cpu_cached_iovas(unsigned int cpu)
{
	int i;

	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		struct dmar_domain *domain;
		int did;

		if (!iommu)
			continue;

		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
			domain = get_iommu_domain(iommu, (u16)did);

			if (!domain)
				continue;
			free_cpu_cached_iovas(cpu, &domain->iovad);
		}
	}
}

static int intel_iommu_cpu_dead(unsigned int cpu)
{
	free_all_cpu_cached_iovas(cpu);
	return 0;
}

static void intel_disable_iommus(void)
{
	struct intel_iommu *iommu = NULL;
	struct dmar_drhd_unit *drhd;

	for_each_iommu(iommu, drhd)
		iommu_disable_translation(iommu);
}

static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
{
	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);

	return container_of(iommu_dev, struct intel_iommu, iommu);
}
static ssize_t intel_iommu_show_version(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	u32 ver = readl(iommu->reg + DMAR_VER_REG);
	return sprintf(buf, "%d:%d\n",
		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
}
static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);

static ssize_t intel_iommu_show_address(struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->reg_phys);
}
static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);

static ssize_t intel_iommu_show_cap(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->cap);
}
static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);

static ssize_t intel_iommu_show_ecap(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%llx\n", iommu->ecap);
}
static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);

static ssize_t intel_iommu_show_ndoms(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
}
static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);

static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
					   struct device_attribute *attr,
					   char *buf)
{
	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
						  cap_ndoms(iommu->cap)));
}
static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);

static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};

static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};

const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);
	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	for_each_active_iommu(iommu, drhd) {
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
		iommu_device_register(&iommu->iommu);
	}

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	bus_register_notifier(&pci_bus_type, &device_nb);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);
	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
			  intel_iommu_cpu_dead);
	intel_iommu_enabled = 1;

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}

/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices.  If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}

static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;

	if (info->dev) {
		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(info->domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}

static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
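
/*
 * Worked example (illustrative, not part of the original file, assuming 4KiB
 * VT-d pages): mapping hpa = 0x12345ffc with size = 0x8 crosses a page
 * boundary, so aligned_nrpages() above yields 2 pages even though the
 * requested size is smaller than one page.  The caller's IOMMU_READ and
 * IOMMU_WRITE bits become DMA_PTE_READ/DMA_PTE_WRITE in the hardware PTEs,
 * and DMA_PTE_SNP is added only when IOMMU_CACHE was requested and the
 * domain supports snoop control.
 */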
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
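
/*
 * Illustrative example (not from the original source): if the caller asks to
 * unmap a single 4KiB page that was actually installed as part of a 2MiB
 * superpage, level_to_offset_bits() reports the superpage level and the size
 * is rounded up to the full 2MiB above, so the value returned can exceed the
 * size the caller passed in.  The IOMMU core is expected to cope with that.
 */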
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);

	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
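
/*
 * Illustrative note (not part of the original file): for a typical device
 * this reports any RMRR ranges the firmware associated with it as reserved
 * regions, plus the IOAPIC/MSI window (IOAPIC_RANGE_START..IOAPIC_RANGE_END)
 * as an IOMMU_RESV_MSI region, so the IOMMU core keeps those IOVAs out of
 * its allocator for this device.
 */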
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/*
	 * Convert ecap_pss to extend context entry pts encoding, also
	 * respect the soft pasid_max value set by the iommu.
	 * - number of PASID bits = ecap_pss + 1
	 * - number of PASID table entries = 2^(pts + 5)
	 * Therefore, pts = ecap_pss - 4
	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
	 */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as actual number of entries not the bits */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			MAX_NR_PASID_BITS) - 5;
}
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked. Which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}

		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;

		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	if (!iommu->pasid_table) {
		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.map_sg			= default_iommu_map_sg,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
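
/*
 * Illustrative usage (not part of the original file): callers never invoke
 * these handlers directly; they go through the generic IOMMU API, which
 * dispatches through this ops table, e.g.:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, size, IOMMU_READ | IOMMU_WRITE);
 *
 * which end up in intel_iommu_domain_alloc(), intel_iommu_attach_device()
 * and intel_iommu_map() respectively.
 */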
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",